In [None]:
# Import
import pandas as pd

# NOTE(review): the three CSV paths below are empty-string placeholders; fill them in
# before running, otherwise read_csv cannot open anything.
df = pd.read_csv('')
df_users = pd.read_csv('')
df_item = pd.read_csv('')

# Later cells look up titles with `df_articles.loc[<contentId>, 'title']` and
# `df_articles.at[<contentId>, 'title']`, but nothing in this notebook defines
# `df_articles`, so a fresh kernel fails with NameError. Build it once here from the
# item table, indexed by item ID.
# NOTE(review): assumes df_item has a 'contentId' column holding the same IDs as
# df_users['contentId'] — confirm against the real file.
# Duplicate IDs are dropped (first row kept) because `.at[...]` needs a unique index
# to return a single title.
df_articles = df_item.drop_duplicates(subset='contentId').set_index('contentId')

In [None]:
# Data exploration: preview the first rows of each table
display(df.head(), df_item.head(), df_users.head())

# Distribution of the number of rows per personId (computed once, reported five ways)
rows_per_person = df_users['personId'].value_counts()
five_number_summary = (
  ('Min:\t\t', rows_per_person.min()),
  ('Quartile 1:\t', rows_per_person.quantile(.25)),
  ('Median:\t\t', rows_per_person.quantile(.5)),
  ('Quartile 3:\t', rows_per_person.quantile(.75)),
  ('Max:\t\t', rows_per_person.max()),
)
for label, value in five_number_summary:
  print(label, value)

# use these values to determine what data we keep

In [None]:
# Count the rows recorded for each contentId and keep only the rows whose contentId
# occurs at least 9 times.
# NOTE(review): the exploration cell summarises counts per personId, while this filter
# thresholds counts per contentId — confirm the threshold was chosen for the right column.
value_counts = df_users['contentId'].value_counts()
keep_list = value_counts[value_counts.ge(9)]
is_kept = df_users['contentId'].isin(keep_list.index)
df_users_reduced = df_users.loc[is_kept]

In [None]:
# Set up matrix and mapper
def create_matrix(df, user, item, rating):
  """Build a sparse item-by-user rating matrix plus ID <-> index lookup tables.

  Args:
    df: DataFrame with one row per (user, item, rating) observation.
    user: name of the column holding user IDs.
    item: name of the column holding item IDs.
    rating: name of the column holding the value stored in the matrix.

  Returns:
    A 5-tuple ``(X, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper)``:
    X is a scipy CSR matrix of shape (n_items, n_users); the two mappers translate
    original IDs to row/column positions and the two inverse mappers translate
    positions back to original IDs. Positions follow the sorted order of the IDs.
  """
  import numpy as np
  from scipy.sparse import csr_matrix

  n_users = df[user].nunique()
  n_items = df[item].nunique()

  # Sorted distinct IDs; an ID's position in these arrays is its matrix index
  user_ids = np.unique(df[user])
  item_ids = np.unique(df[item])

  # ID -> matrix index
  user_mapper = {uid: pos for uid, pos in zip(user_ids, range(n_users))}
  item_mapper = {iid: pos for iid, pos in zip(item_ids, range(n_items))}

  # matrix index -> ID
  user_inv_mapper = {pos: uid for pos, uid in zip(range(n_users), user_ids)}
  item_inv_mapper = {pos: iid for pos, iid in zip(range(n_items), item_ids)}

  # Translate every observation's IDs into column (user) and row (item) positions
  user_index = list(map(user_mapper.__getitem__, df[user]))
  item_index = list(map(item_mapper.__getitem__, df[item]))

  # Rows are items, columns are users, stored values come from the rating column.
  # NOTE(review): repeated (item, user) pairs are presumably summed by scipy when the
  # matrix is assembled — confirm that is the intended aggregation.
  X = csr_matrix((df[rating], (item_index, user_index)), shape=(n_items, n_users))

  return X, user_mapper, item_mapper, user_inv_mapper, item_inv_mapper

In [None]:
# Build the item-by-user matrix and its lookup tables from the reduced interactions:
# users come from 'personId', items from 'contentId', stored values from 'eventType'.
# NOTE(review): 'eventType' is used as the rating value — presumably numeric; confirm.
(
  X,
  user_mapper,
  item_mapper,
  user_inv_mapper,
  item_inv_mapper,
) = create_matrix(df_users_reduced, user='personId', item='contentId', rating='eventType')

In [None]:
# Recommender based on item
def recommend(itemId, X, item_mapper, item_inv_mapper, k, metric='cosine', messages=True):
  """Return the nearest-neighbour items of `itemId` in the item-by-user matrix `X`.

  Args:
    itemId: original item ID; must be a key of `item_mapper`.
    X: matrix with one row per item (rows are compared with each other).
    item_mapper: dict mapping original item IDs to row positions in `X`.
    item_inv_mapper: dict mapping row positions in `X` back to original item IDs.
    k: number of recommendations wanted.
    metric: distance metric handed to NearestNeighbors.
    messages: when True, print the neighbour indices, IDs and distances.

  Returns:
    ``(rec_ids, rec_distances)``: a list of original item IDs and a numpy array with
    the matching distances, closest first (smaller distance = more similar). Both
    have at most `k` entries; fewer when `X` has fewer than ``k + 1`` rows.

  Raises:
    KeyError: if `itemId` is not a key of `item_mapper`.
  """
  # Function-scope import: numpy is not otherwise in scope for this function
  import numpy as np
  from sklearn.neighbors import NearestNeighbors

  item = item_mapper[itemId]  # Row position of the requested item
  item_vector = X[item]       # That item's row of ratings across all users

  # Ask for one extra neighbour because the query item is part of the fitted data and
  # is normally returned as its own neighbour; never ask for more rows than X has.
  n_neighbors = min(k + 1, X.shape[0])
  knn = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric).fit(X)

  distances, indices = knn.kneighbors(item_vector.reshape(1,-1), return_distance=True)
  rec_indeces = indices[0]    # Row positions of the neighbours, closest first
  all_distances = distances[0]  # Distance from the query item to each neighbour

  # Remove the query item by position lookup rather than assuming it is entry 0:
  # another item with an identical row ties at distance 0 and may be listed first.
  is_query = rec_indeces == item
  if is_query.any():
    keep = ~is_query
  else:
    # The query item was pushed out by ties; drop the farthest neighbour so that
    # only k results are returned.
    keep = np.ones(len(rec_indeces), dtype=bool)
    keep[-1] = False
  rec_distances = all_distances[keep]

  # Translate the remaining row positions back to original item IDs
  rec_ids = [item_inv_mapper[pos] for pos in rec_indeces[keep]]

  if messages:
    print(f'List of recommended item indeces:\n{rec_indeces}\n')
    print(f'List of recommended item IDs:\n{rec_ids}\n')
    print(f'List of recommended item similarity to selected item:\n{rec_distances}\n')

  # Return two sequences: the original item IDs of the recommendations and their distances
  return rec_ids, rec_distances

content_Id = 1 # Change here for testing
rec_ids, rec_distances = recommend(
  content_Id,
  X,
  item_mapper,
  item_inv_mapper,
  k=10,
  messages=False,
)

# Change dataframe here for predictions
liked_title = df_articles.loc[content_Id, 'title']
print(f"If you like {liked_title}, you may also enjoy:\n")
# Last expression of the cell: titles of the recommended items
df_articles.loc[rec_ids, 'title']

In [None]:
# Recommender based on user
user_id = -1130272294246983140
# NOTE(review): `k` is assigned here but never read in this cell — recommend() below is
# called with a literal k=10. Confirm which neighbourhood size is intended.
k = 20

# Keep only this user's rows whose eventType equals the user's maximum eventType
df_user_ratings = df_users_reduced.loc[df_users_reduced['personId']==user_id].sort_values(by=['eventType'], ascending=False)
max_rating = df_user_ratings.eventType.max()
df_user_ratings = df_user_ratings.loc[df_user_ratings.eventType==max_rating]

# For every top-rated item, fetch its neighbours and remember the smallest distance
# seen for each recommended item. Assigning row by row with `.loc` kept whichever
# distance happened to be written last, so an item close to one liked item could be
# ranked by its distance to a different, less similar one.
best_distance = {}
for i in df_user_ratings.contentId.unique():  # each liked item only needs one lookup
  rec_ids, rec_distances = recommend(i, X, item_mapper, item_inv_mapper, k=10, messages=False)
  for rec_id, distance in zip(rec_ids, rec_distances):
    if rec_id not in best_distance or distance < best_distance[rec_id]:
      best_distance[rec_id] = distance

# Build the frame once (index = recommended item ID) instead of growing it per row
df_rec_list = pd.DataFrame({'Distance': pd.Series(best_distance, dtype=float)})

# Closest (most similar) recommendations first
df_rec_list.sort_values(by=['Distance'])

In [None]:
# Create recommendation on item: one row per item with the titles of its k nearest items
k = 5

rec_columns = [f'Recommendation {rank}' for rank in range(1, (k + 1))]

# Collect one record per item and build the frame once, instead of writing cell by cell
rows = []
for content_id in item_mapper:
  rec_ids, rec_distances = recommend(content_id, X, item_mapper, item_inv_mapper, k=k, messages=False)

  row = {'If you enjoyed': df_articles.at[content_id, 'title']}
  for column, rec_id in zip(rec_columns, rec_ids):
    row[column] = df_articles.at[rec_id, 'title']
  rows.append(row)

df_recommendations = pd.DataFrame(
  rows,
  index=list(item_mapper),
  columns=['If you enjoyed', *rec_columns],
)

# NOTE(review): the file name is spelled 'collabortive'; left unchanged in case another
# step reads this exact path — confirm before renaming.
df_recommendations.to_csv('collabortive_predictions.csv', index=False)

# Keep the frame as the cell's last expression so the notebook actually renders it
# (a bare expression followed by another statement is never displayed).
df_recommendations