### Recommend articles similar to article being read by a customer

In [11]:
### using NMF on word frequency array , nfm features describes topics, similar docs have similar features
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from sklearn.decomposition import NMF
#NMF features = topics.
df=pd.read_csv('wikipedia-vectors.csv')
df.head()


Unnamed: 0.1,Unnamed: 0,HTTP 404,Alexa Internet,Internet Explorer,HTTP cookie,Google Search,Tumblr,Hypertext Transfer Protocol,Social search,Firefox,...,Chad Kroeger,Nate Ruess,The Wanted,Stevie Nicks,Arctic Monkeys,Black Sabbath,Skrillex,Red Hot Chili Peppers,Sepsis,Adam Levine
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.008878,0.0,0.0,0.049502,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00611,0.0
2,2,0.0,0.029607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005646,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
#Each row = one document/article (e.g., row 0 is document 0),Each value = how much that article is related to a word, Each column = a Wikipedia article title
df = df.drop(columns=['Unnamed: 0'])
articles= csr_matrix(df.transpose()) #word frequency matrix
#The reason for taking this transpose is that without it, there would be 13,000 columns (corresponding to the 13,000 words in the file), which is a lot of columns for a CSV to have.
titles = list(df.columns)

In [25]:
'''
When you apply NMF to a TF-IDF matrix, it breaks it into:
1-Document–Topic Matrix:Each column here is a feature (topic) used to describe the document
2-Topic–Word Matrix:Tells you: Which words are important in each topic
So after NMF, the features it creates are 'topics' that it has already identified from the data
'''

print(titles,articles[:5, :10])



['HTTP 404', 'Alexa Internet', 'Internet Explorer', 'HTTP cookie', 'Google Search', 'Tumblr', 'Hypertext Transfer Protocol', 'Social search', 'Firefox', 'LinkedIn', 'Global warming', 'Nationally Appropriate Mitigation Action', 'Nigel Lawson', 'Connie Hedegaard', 'Climate change', 'Kyoto Protocol', '350.org', 'Greenhouse gas emissions by the United States', '2010 United Nations Climate Change Conference', '2007 United Nations Climate Change Conference', 'Angelina Jolie', 'Michael Fassbender', 'Denzel Washington', 'Catherine Zeta-Jones', 'Jessica Biel', 'Russell Crowe', 'Mila Kunis', 'Dakota Fanning', 'Anne Hathaway', 'Jennifer Aniston', 'France national football team', 'Cristiano Ronaldo', 'Arsenal F.C.', 'Radamel Falcao', 'Zlatan Ibrahimović', 'Colombia national football team', '2014 FIFA World Cup qualification', 'Football', 'Neymar', 'Franck Ribéry', 'Tonsillitis', 'Hepatitis B', 'Doxycycline', 'Leukemia', 'Gout', 'Hepatitis C', 'Prednisone', 'Fever', 'Gabapentin', 'Lymphoma', 'Chad 

In [22]:
nmf=NMF(n_components=6) # find 6 hidden topics
nmf_features=nmf.fit_transform(articles) #here where we find the topics
norm_feature=normalize(nmf_features)
#if user reading this article
idx = titles.index("Global warming")
current_article = norm_feature[idx, :]
cosine_similiarity=norm_feature.dot(current_article)
'''
The resulting cosine similarity array is a 1D array where:
Each element corresponds to one article
The order matches the order of articles in your original dataset
'''
print(cosine_similiarity)

[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 2.30142751e-02 5.82471262e-02 0.00000000e+00
 1.06928735e-02 9.17339009e-02 1.00000000e+00 9.98574666e-01
 9.78731258e-01 9.97824050e-01 9.99316376e-01 9.98574666e-01
 9.93154300e-01 9.98574666e-01 9.98441100e-01 9.98574666e-01
 4.59716814e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 9.78741601e-03 0.00000000e+00 0.00000000e+00
 0.00000000e+00 1.21762424e-03 4.94038091e-03 9.62050332e-04
 3.31726290e-03 0.00000000e+00 0.00000000e+00 6.18025023e-04
 4.98519098e-02 4.74630491e-02 0.00000000e+00 0.00000000e+00
 5.33726180e-02 5.33726180e-02 5.89862369e-02 5.33697345e-02
 8.66599677e-02 5.33726180e-02 6.21728211e-02 2.07278684e-01
 1.01853495e-01 5.33725984e-02 0.00000000e+00 0.00000000e+00
 9.49537728e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 6.61804748e-04 6.17849482e-02 1.86495470e-03]


In [23]:
similarities_series = pd.Series(cosine_similiarity, index=titles) # the way we mapped bc py keeps memory of articles and the way it reurned hidden topic the topics 'titles our columns' are ordered with indeces in order
print(similarities_series.nlargest(5))

Global warming                                   1.000000
Climate change                                   0.999316
Nationally Appropriate Mitigation Action         0.998575
Kyoto Protocol                                   0.998575
Greenhouse gas emissions by the United States    0.998575
dtype: float64
