In [1]:
#Import packages
import pandas as pd
import os

In [2]:
# Load people_wiki.csv from the current working directory
path_to_data = os.path.join(os.getcwd(), 'people_wiki.csv')
articlesURLWiki = pd.read_csv(path_to_data)

# Report the resolved file location and the (rows, columns) shape of the frame
print(path_to_data, articlesURLWiki.shape, sep='\n')

/Users/roberthommes/PycharmProjects/nmfCaseExample/people_wiki.csv
(42786, 3)


In [3]:
# Preview the first five rows of the raw dataset
articlesURLWiki.head(n=5)

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [4]:
# Find the article(s) we need for the recommendation.
POI_name = "George W. Bush"

# regex=False matches the name literally: with the default regex=True the '.'
# in "W." is a wildcard that matches any character.
# na=False turns missing names into False, replacing the former `== True` trick.
articlesURLWiki[articlesURLWiki['name'].str.contains(POI_name, regex=False, na=False)]

Unnamed: 0,URI,name,text
10042,<http://dbpedia.org/resource/Public_image_of_G...,Public image of George W. Bush,george w bush the 43rd president of the united...
28441,<http://dbpedia.org/resource/George_W._Bush>,George W. Bush,george walker bush born july 6 1946 is an amer...


In [5]:
# Filter dataset: discard the URI column, then collect the article texts in a list
articlesWiki = articlesURLWiki.drop(columns=['URI'])
articles = articlesWiki['text'].tolist()

In [6]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer: tfidf
# NOTE(review): the (?=\s+) lookahead only accepts a token that is followed by
# whitespace, so a word at the very end of a text with no trailing whitespace
# is not tokenised -- confirm this is intended.
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'
tfidf = TfidfVectorizer(token_pattern=TOKENS_ALPHANUMERIC)

# Apply fit_transform to the documents: csr_mat
csr_mat = tfidf.fit_transform(articles)

print('-- shape csr_matrix --')
print(csr_mat.shape)

# Get the words: words
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
# .tolist() keeps `words` a plain Python list, as it was before.
if hasattr(tfidf, 'get_feature_names_out'):
    words = tfidf.get_feature_names_out().tolist()
else:
    words = tfidf.get_feature_names()

-- shape csr_matrix --
(42786, 435484)


In [7]:
%%time
# Import NMF
from sklearn.decomposition import NMF

# Create an NMF instance: model
model = NMF(n_components=40)

# Fit the model to articles
model.fit(csr_mat)

# Transform the articles: nmf_features
nmf_features = model.transform(csr_mat)

print('-- shape nmf_features --')
print(nmf_features.shape)

-- shape nmf_features --
(42786, 40)
CPU times: user 7min 58s, sys: 25.4 s, total: 8min 24s
Wall time: 7min 24s


In [8]:
# Build a DataFrame of the NMF features, with rows labelled by article name
dfNMF_features = pd.DataFrame(
    data=nmf_features,
    index=articlesWiki['name'].tolist(),
)

# Show the feature row that belongs to the person of interest
print(dfNMF_features.loc[POI_name])

0     0.009346
1     0.000000
2     0.000954
3     0.001075
4     0.011438
5     0.000000
6     0.041117
7     0.000000
8     0.000000
9     0.003085
10    0.000000
11    0.025544
12    0.000184
13    0.000000
14    0.001014
15    0.000000
16    0.011103
17    0.001243
18    0.000000
19    0.000000
20    0.003751
21    0.007536
22    0.000000
23    0.000000
24    0.000000
25    0.000000
26    0.016640
27    0.001862
28    0.000795
29    0.011989
30    0.000000
31    0.000000
32    0.002403
33    0.000000
34    0.000000
35    0.000066
36    0.000000
37    0.001431
38    0.013998
39    0.000000
Name: George W. Bush, dtype: float64


In [9]:
# Import normalize
from sklearn.preprocessing import normalize

# Rescale the NMF features with normalize's default settings: norm_features
norm_features = normalize(nmf_features)

# Wrap the normalised features in a DataFrame labelled by article name: df
df = pd.DataFrame(
    data=norm_features,
    index=articlesWiki['name'].tolist(),
)

# Pick out the normalised feature row of the person of interest: article
article = df.loc[POI_name]

# Dot every row with that row to get the similarity scores: similarities
similarities = df.dot(article)

# Show the three highest-scoring entries
print(similarities.nlargest(n=3))

George W. Bush    1.000000
Joe Biden         0.968051
Jimmy Carter      0.965332
dtype: float64


In [10]:
# Print the shapes of the TF-IDF matrix, the normalised NMF features and the
# NMF components, one per line
print(
    *(matrix.shape for matrix in (csr_mat, norm_features, model.components_)),
    sep='\n',
)

(42786, 435484)
(42786, 40)
(40, 435484)
