In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
  
# Read the shared-articles dataset from the working directory into a DataFrame
df_articles = pd.read_csv(filepath_or_buffer='shared_articles.csv')

# Preview the first five rows (the default for head) as the cell output
df_articles.head(n=5)

: 

In [None]:
# Sanity check of the loaded frame: print its (rows, columns) dimensions,
# then show the number of missing values per column as the cell output.
print(df_articles.shape)
df_articles.isnull().sum()

In [None]:
# Replace missing author metadata with an explicit 'unknown' label.
# One DataFrame-level fillna with a column -> value mapping is used instead of
# `df_articles.<column>.fillna(..., inplace=True)`: that form mutates a column
# object pulled out of the frame, which triggers a chained-assignment warning
# in pandas 2.x and does not update the frame at all under Copy-on-Write.
df_articles = df_articles.fillna({
    'authorUserAgent': 'unknown',
    'authorRegion': 'unknown',
    'authorCountry': 'unknown',
})

# Very important step: renumber the rows 0..n-1 so that index labels line up
# with row positions. The previous index is kept as a new column (no drop),
# so re-running this cell on its own adds another such column.
df_articles = df_articles.reset_index()

# Confirm that the filled columns no longer report missing values
print(df_articles.isna().sum(), '\n')
df_articles.shape

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
          
# Vectorizer that ignores common English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the article text and encode every article
# as one row of a sparse TF-IDF matrix (articles x terms)
tfidf_matrix = tfidf.fit_transform(df_articles['text'])

# Report the matrix dimensions
print(tfidf_matrix.shape)

# For inspection only (not needed later): transpose to terms x articles and
# label rows with the vocabulary and columns with the article titles.
# NOTE: todense() materialises the full matrix in memory.
df_tfidf = pd.DataFrame(
    tfidf_matrix.T.todense(),
    index=tfidf.get_feature_names_out(),
    columns=df_articles['title'],
)
df_tfidf.iloc[2221:2226]

# Shape recorded from an earlier run (may not match the current dataset):
# (8803, 18891)

In [None]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise similarity between the TF-IDF vectors of all articles.
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# returned by linear_kernel equals the cosine similarity.
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

# Wrap the array in a DataFrame purely to get a readable preview
pd.DataFrame(cosine_sim)

In [None]:
# Rank every article by its similarity to article 0 (column 0 of the
# similarity matrix), most similar first.
df_sorted = pd.DataFrame(cosine_sim).sort_values(by=[0], ascending=False)

# Print the four highest-ranked rows with their titles. The loop variable is
# deliberately not called `id`, which would shadow the built-in id() for the
# rest of the notebook session.
for article_idx in df_sorted.index[:4]:
    print(article_idx, '\t', df_articles.loc[article_idx, 'title'])

display(df_sorted)

In [None]:
def get_recommendations(contentId, sim_matrix, n=10, messages=True):
    """Return the ``n`` items most similar to the item at ``contentId``.

    Parameters
    ----------
    contentId : int
        Row position of the query item in ``sim_matrix``.
    sim_matrix : 2-D indexable (e.g. list of lists or NumPy array)
        ``sim_matrix[i][j]`` holds the similarity between items ``i`` and ``j``.
    n : int, default 10
        Maximum number of recommendations. Values <= 0 give an empty result.
    messages : bool, default True
        When true, print the recommended item IDs and their scores.

    Returns
    -------
    dict
        ``{item_index: similarity}`` ordered from most to least similar.
        The query item itself is never included.
    """
    # Similarity of the query item to every item (including itself)
    row = sim_matrix[contentId]

    # Resolve a negative (from-the-end) position so the self-match below works
    self_index = contentId if contentId >= 0 else len(row) + contentId

    # Exclude the query item by index rather than by rank: when another item
    # ties with it (e.g. duplicate text), a stable sort may not place the query
    # item first, so skipping "the first result" would drop the wrong item.
    candidates = [(idx, score) for idx, score in enumerate(row) if idx != self_index]

    # Highest similarity first; ties keep their original index order
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Keep at most n entries; a non-positive n yields nothing
    rec_dict = dict(candidates[:max(n, 0)])

    if messages:
        print(f"The top recommended item IDs are: {list(rec_dict.keys())}")
        print(f"Their similarity scores are:\t  {list(rec_dict.values())}")

    # Return the top n most similar items
    return rec_dict

In [None]:
# Change this value to any title you'd like to get recommendations
title = "Google Data Center 360° Tour"

# Index labels of all articles whose title matches exactly (computed once and
# reused for both the validity check and the title -> item ID conversion)
matching_idx = df_articles.index[df_articles['title'] == title]

# Check if the title is valid; if not, suggest alternatives and use the last one for recommendations
if len(matching_idx) > 0:
    # First match wins. The name avoids `id`, which would shadow the built-in id().
    article_idx = matching_idx[0]
else:
    print(f"\"{title}\" is not in the data set. Try one of these:\n")
    # Up to 10 random titles; capped so frames with fewer than 10 rows do not raise
    n_suggestions = min(10, len(df_articles))
    for row in df_articles.sample(n=n_suggestions).itertuples():
        article_idx = row[0]  # itertuples puts the index label first
        title = row.title
        print(f'\t{title}')

print(f"\nIf you like \"{title},\" then you may also like:\n")

# Call the function and return the dictionary; print out the dictionary if you want to see what it is
recommend_dict = get_recommendations(article_idx, cosine_sim, n=10, messages=False)

# Add the dictionary to a new DataFrame; this isn't necessary, but it helps to see what articles are recommended
df_similarity = pd.DataFrame(
    data=recommend_dict.values(),
    columns=['similarity'],
    index=recommend_dict.keys(),
)

# Create a subset of the original DataFrame with only the recommended articles
is_recommended = df_articles.index.isin(recommend_dict.keys())
df_recommendations = df_articles.loc[is_recommended, ['title', 'text']]

# Join the articles with their similarity scores so that the list can be sorted and viewed
df_recommendations.join(df_similarity).sort_values(by=['similarity'], ascending=False)

In [None]:
import joblib

# Persist the pairwise similarity matrix to the working directory
joblib.dump(value=cosine_sim, filename='cosine_similarity_matrix.sav')

# Persist the fitted TF-IDF vectorizer alongside it.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(value=tfidf, filename='tfidf_vectorizer.sav')
