### Loading your data from Mongo

In [2]:
from pymongo import MongoClient
# Connect with pymongo's default connection settings, then select the
# `nyt_dump` database and its `articles` collection by name.
client = MongoClient()
db = client["nyt_dump"]
coll = db["articles"]

In [3]:
# One lowercase string per stored article: the pieces of each record's
# 'content' field (presumably a list of paragraph strings -- confirm against
# the collection schema) are joined with single spaces.
documents = [
    ' '.join(record['content']).lower()
    for record in coll.find()
]

### Text Processing Pipeline 
A text processing pipeline involves tokenization, stripping stopwords, and stemming.

#### 0. possible text mining inputs

In [None]:
import unicodedata

def remove_accents(input_str):
    """Return `input_str` reduced to plain ASCII.

    The string is NFKD-normalized so accented characters decompose into a
    base character plus combining marks; encoding to ASCII with
    errors ignored then drops the marks together with any other character
    that has no ASCII form.

    INPUT: string
    OUTPUT: string containing only ASCII characters
    """
    return (
        unicodedata.normalize('NFKD', input_str)
        .encode('ASCII', 'ignore')
        .decode()
    )

# NOTE(review): `paragraph` is not defined in any cell visible here -- confirm
# it is assigned (e.g. to a sample text) before this cell is run.
input_string = remove_accents(paragraph)
# Bare last expression so the notebook displays the accent-stripped text.
input_string

In [None]:
import nltk
# NLTK data used by this notebook:
#   stopwords -- the English stopword list
#   punkt     -- tokenizer models used by word_tokenize (older NLTK releases)
#   punkt_tab -- tokenizer tables that newer NLTK releases require for
#                word_tokenize; on releases that do not know this package
#                the download just reports an error and the loop continues
#   wordnet   -- lemmatizer data (not used by the stemming cells shown here)
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

#### 1. Tokenization 

In [None]:
from nltk.tokenize import word_tokenize
# One token list per document; each text is lowercased before tokenizing.
tokenized = [
    word_tokenize(text.lower())
    for text in documents
]


In [None]:
from nltk.corpus import stopwords

# Build the stopword set once so each membership test below is a hash lookup.
stopwords_ = set(stopwords.words('english'))
# Keep only the tokens that are not English stopwords. The filter must test
# against `stopwords_` (the set built above); the previous name `stop` was
# never defined and raised a NameError.
docs = [[word for word in words if word not in stopwords_]
        for words in tokenized]

#### 2. Stemming/Lemmatization 

In [None]:
from nltk.stem.snowball import SnowballStemmer
# English Snowball stemmer, applied to every remaining token of every document.
snowball = SnowballStemmer('english')
docs_snowball = [list(map(snowball.stem, doc_tokens)) for doc_tokens in docs]


#### 3. Bag Of Words and TFIDF

#### 3.1. Create your vocab, a set of words UNIQUE over the whole corpus
Here the corpus is the list of stemmed documents: `my_docs = docs_snowball`

In [None]:
# The corpus for the bag-of-words steps is the list of stemmed token lists.
my_docs = docs_snowball

# Every distinct token across all documents.
vocab_set = {token for tokens in my_docs for token in tokens}
# Sort so word ids (list positions) are the same on every run; iterating a
# set of strings directly gives an order that can change between kernels.
vocab = sorted(vocab_set)

#### 3.2. Create a reverse lookup for the vocab list.
 This is a dictionary whose keys are the words and values are the indices of the words (the word id). This will make things much faster than using the list `index` function.

In [None]:
# Reverse lookup: word -> its position (word id) in `vocab`.
vocab_dict = dict(zip(vocab, range(len(vocab))))

#### 3.3. Now let's create our word count vectors manually.
Create a numpy matrix where each row corresponds to a document and each column a word. The value should be the count of the number of times that word appeared in that document.

In [None]:
# numpy is not imported by any earlier cell shown in this notebook.
import numpy as np

# Document-term count matrix: one row per document, one column per vocab word.
word_counts = np.zeros((len(my_docs), len(vocab)))
for doc_id, words in enumerate(my_docs):
    for word in words:
        # Single (row, column) index instead of chained [row][column] indexing.
        word_counts[doc_id, vocab_dict[word]] += 1

#### 3.4. Create the document frequencies. 
For each word, get a count of the number of documents the word appears in (different from the number of times the word appears!).

In [None]:
# Document frequency: for each word (column), the number of documents (rows)
# whose count for that word is greater than zero.
df = (word_counts > 0).sum(axis=0)

#### 3.5. Normalize the word count matrix to get the term frequencies. 
This means dividing each count by the L1 norm of its document's row (the sum of all the counts in that document). This makes each row a vector of term frequencies.

In [None]:
# Total token count per document (the L1 norm of each row).
tf_norm = word_counts.sum(axis=1)
# Documents with no tokens would divide by zero; use 1 so their rows stay 0.
tf_norm[tf_norm == 0] = 1
# Reshape to a column so each row is divided by its own document total.
tf = word_counts / tf_norm.reshape(-1, 1)

### Using sklearn

#### 1. Write the tokenize function.
It should use nltk's `word_tokenize`.

In [None]:
def tokenize(doc):
    '''
    Lowercase the document, split it into tokens with word_tokenize and
    stem every token with the module-level `snowball` stemmer.

    INPUT: string
    OUTPUT: list of strings
    '''
    stems = []
    for token in word_tokenize(doc.lower()):
        stems.append(snowball.stem(token))
    return stems
    

#### 2. Apply the CountVectorizer 
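You can use `vect.get_feature_names()` (`vect.get_feature_names_out()` in scikit-learn 1.0 and later) to get the word for each column id; `vect.vocabulary_` maps each word to its column id.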

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Count vectorizer that tokenizes with the custom `tokenize` function and
# uses sklearn's built-in English stopword list.
countvect = CountVectorizer(
    tokenizer=tokenize,
    stop_words='english',
)
# Learn the vocabulary and build the document-term count matrix in one pass.
count_vectorized = countvect.fit_transform(documents)

In [64]:
# Convert the vectorizer output to a dense array so the notebook displays it.
# NOTE(review): this materializes the whole matrix; consider slicing first if
# the corpus is large.
count_vectorized.toarray()

array([[0, 0, 2, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 4, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [67]:
# Compare my results:

# `vocabulary_` maps each term to its column index. It gives a constant-time
# lookup and works on every scikit-learn version, unlike get_feature_names(),
# which has been removed from recent releases.
# `words` lists the terms in column order, as get_feature_names() did.
words = sorted(countvect.vocabulary_, key=countvect.vocabulary_.get)
print("sklearn count of 'dinner':", count_vectorized[0, countvect.vocabulary_['dinner']])
print("my count of 'dinner':", word_counts[0, vocab_dict['dinner']])


sklearn count of 'dinner': 2
my count of 'dinner': 2.0


#### 3. Apply the TfidfVectorizer. Compare it to your tfidf results from above.

In [65]:
# TF-IDF vectorizer configured like the count vectorizer: custom `tokenize`
# function plus sklearn's built-in English stopword list.
tfidfvect = TfidfVectorizer(
    tokenizer=tokenize,
    stop_words='english',
)
# Learn vocabulary and IDF weights and build the TF-IDF matrix in one pass.
tfidf_vectorized = tfidfvect.fit_transform(documents)

In [70]:
# Terms in column order, built from `vocabulary_` (term -> column index) so
# the cell does not depend on get_feature_names(), which has been removed
# from recent scikit-learn releases.
words_tfidf = sorted(tfidfvect.vocabulary_, key=tfidfvect.vocabulary_.get)
print("sklearn tfidf of 'dinner':", tfidf_vectorized[0, tfidfvect.vocabulary_['dinner']])
# NOTE(review): the manual comparison stays disabled because no `tfidf`
# matrix is computed in the cells shown in this notebook.
# print("my tfidf of 'dinner':", tfidf[0, vocab_dict['dinner']])

sklearn tfidf of 'dinner': 0.0572326357301963


## Understanding the Centroids from K-Means

#### 1) Import Data and apply KMeans

In [None]:
# pandas and KMeans are not imported by any earlier cell shown in this notebook.
import pandas as pd
from sklearn.cluster import KMeans

# NOTE: unpickling can execute arbitrary code -- only load pickle files from
# a source you trust.
articles_df = pd.read_pickle("data/articles.pkl")
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(articles_df['content'])
# Terms in column order, built from `vocabulary_` (term -> column index);
# get_feature_names() has been removed from recent scikit-learn releases.
features = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
# Fixed seed so the clustering (and everything printed from it) is
# reproducible across kernel restarts.
kmeans = KMeans(random_state=42)
kmeans.fit(X)

#### 2) Print centroids as vectors

In [None]:
# Label line followed by the raw centroid matrix from the fitted model.
print("cluster centers:", kmeans.cluster_centers_, sep="\n")

#### 3) Find the top 10 features for each cluster.

In [None]:
# Sort each centroid's weights in ascending order, then take the last ten
# column indices in reverse so the heaviest feature comes first.
top_centroids = kmeans.cluster_centers_.argsort(axis=1)[:, :-11:-1]
print("\n3) top features (words) for each cluster:")
for cluster_id, feature_ids in enumerate(top_centroids):
    top_words = ', '.join(features[idx] for idx in feature_ids)
    print(f"{cluster_id}, {top_words}")

#### 7) Set k = # sections. Find and count sections for each group
The clusters are not a perfect map to the sections.

In [None]:
# Fixed seed so the cluster assignments are reproducible across kernel
# restarts.
kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(X)
# predict() returns the index of the closest centroid for each row, which is
# what taking the argmin over the transform() distance matrix computed by hand.
assigned_cluster = kmeans.predict(X)


In [8]:
# Counter and numpy are not imported by any earlier cell shown in this notebook.
from collections import Counter

import numpy as np

for i in range(kmeans.n_clusters):
    # Row positions (0..n-1, matching the rows of X) assigned to cluster i.
    cluster = np.arange(0, X.shape[0])[assigned_cluster == i]
    # `cluster` holds positions, not index labels, so select with .iloc;
    # .loc only gives the same rows when the frame has a default 0..n-1 index.
    # NOTE(review): dropna() without `subset` drops a row if ANY column is
    # missing, not just section_name -- confirm that is intended.
    topics = articles_df.iloc[cluster].dropna()['section_name']
    most_common = Counter(topics).most_common()
    print(f"Cluster {i}:")
    for section, count in most_common:
        print(f"     {section} ({count} articles)")

Cluster 0:
     Business Day (34 articles)
     Opinion (11 articles)
     World (9 articles)
     U.S. (7 articles)
     Sports (1 articles)
     Arts (1 articles)
Cluster 1:
     Arts (89 articles)
     Opinion (3 articles)
     Business Day (2 articles)
     World (1 articles)
Cluster 2:
     World (30 articles)
     Business Day (2 articles)
     U.S. (1 articles)
Cluster 3:
     World (22 articles)
     Opinion (3 articles)
     U.S. (1 articles)
Cluster 4:
     World (145 articles)
     U.S. (6 articles)
     Opinion (4 articles)
     Business Day (2 articles)
     Sports (2 articles)
Cluster 5:
     Sports (30 articles)
Cluster 6:
     Sports (92 articles)
     World (3 articles)
     Arts (1 articles)
     Opinion (1 articles)
     Business Day (1 articles)
Cluster 7:
     Sports (72 articles)
     Arts (1 articles)
Cluster 8:
     Business Day (27 articles)
     Arts (24 articles)
     Sports (23 articles)
     Opinion (13 articles)
     World (5 articles)
     U.S. (1 article