# User similarity

The purpose of this simple NearestNeighbor model is to produce recommendations by identifying similar users.

The most popular 10 movies from each set of similar users is used to generate the final recommendations. This will be further improved by using a second model to predict movies from the subset of similar users.

Note that the user data has been synthetically generated, so it won't be useful in making meaningful recommendations on the MovieLens dataset.

In [1235]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [1236]:
DATA_DIR="../../data/recommendations"
users_path = 'users.csv'
users_df = pd.read_csv(os.path.join(DATA_DIR, users_path), dtype={'post_code':str})
users_df.head()

Unnamed: 0,userId,city,post_code,job,downloads,contract_months
0,1,Lawsonview,9820,Learning mentor,9164,40
1,2,Sandrafurt,46742,Diagnostic radiographer,7331,58
2,3,Lake Brettfort,36388,Comptroller,7287,43
3,4,New Jeffreyhaven,8294,"Psychotherapist, child",10176,60
4,5,Port Ryanside,46511,Tree surgeon,12032,47


In [1237]:
ratings_path = 'ml-latest-small/ratings.csv'
ratings_df = pd.read_csv(os.path.join(DATA_DIR, ratings_path))
# ignore ratings < 3
ratings_df = ratings_df[ratings_df['rating'] >= 3]
ratings_df = ratings_df.drop(columns=['rating', 'timestamp'])
ratings_df.head()

Unnamed: 0,userId,movieId
0,1,1
1,1,3
2,1,6
3,1,47
4,1,50


In [1238]:
# copy users df
users_df_copy = users_df.copy()

# quick and dirty normalisation
max_downloads = users_df_copy['downloads'].max()
max_contract_months = users_df_copy['contract_months'].max()
                                 
users_df_copy['downloads'] = users_df_copy['downloads'] / max_downloads
users_df_copy['contract_months'] = users_df_copy['contract_months'] / max_contract_months

users_df_copy.head()

Unnamed: 0,userId,city,post_code,job,downloads,contract_months
0,1,Lawsonview,9820,Learning mentor,0.596459,0.666667
1,2,Sandrafurt,46742,Diagnostic radiographer,0.477154,0.966667
2,3,Lake Brettfort,36388,Comptroller,0.474291,0.716667
3,4,New Jeffreyhaven,8294,"Psychotherapist, child",0.662328,1.0
4,5,Port Ryanside,46511,Tree surgeon,0.783129,0.783333


In [1239]:
dummy_cols=['city', 'post_code', 'job', 'contract_months']
users_df_copy = pd.get_dummies(users_df_copy, columns=dummy_cols)
users_df_copy.head()

Unnamed: 0,userId,downloads,city_Aguirretown,city_Annaland,city_Baileyfurt,city_Barbaraberg,city_Collinshaven,city_Coxhaven,city_Crystalshire,city_Cynthiatown,...,contract_months_0.85,contract_months_0.8666666666666667,contract_months_0.8833333333333333,contract_months_0.9,contract_months_0.9166666666666666,contract_months_0.9333333333333333,contract_months_0.95,contract_months_0.9666666666666667,contract_months_0.9833333333333333,contract_months_1.0
0,1,0.596459,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0.477154,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,0.474291,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0.662328,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,0.783129,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1240]:
from sklearn.model_selection import train_test_split

X_users = np.array(users_df_copy.drop(columns=['userId']))
y_users = np.array(users_df_copy['userId'])

X_users_train, X_users_test, y_users_train, y_users_test = train_test_split(X_users, y_users, random_state=0)

In [1241]:
from sklearn.neighbors import NearestNeighbors

class Recommendations():
    def __init__(self, ratings_df, n_neighbors=10):
        self.ratings_df = ratings_df.sample(frac=1)
        self.users_model = NearestNeighbors(n_neighbors=n_neighbors,
                         metric='cosine',
                         algorithm='brute',
                         n_jobs=-1)
    
    def fit(self, X_users, y_users=None):
        self.users_model.fit(X_users, y_users)
        
    def predict_items(self, user, similar_users):
        ratings = self.ratings_df[self.ratings_df['userId'].isin(similar_users)]
        popular_movies = ratings['movieId'].value_counts().head(10).index.tolist()
        return popular_movies
        
    def predict(self, X_users, y_users=None):
        similar_users_dist, similar_users = self.users_model.kneighbors(X_users, return_distance=True)
        user_pairing = zip(y_users, similar_users)
        similar_items = [self.predict_items(user, users) for user, users in user_pairing]
        return similar_items

In [1242]:
from sklearn.neighbors import NearestNeighbors

model = Recommendations(ratings_df=ratings_df)

model.fit(X_users_train, y_users_train)
predictions = model.predict(X_users_test, y_users_test)

In [1243]:
correct_predictions = []

for idx, user in enumerate(y_users_test):
    real_user_interests = ratings_df[ratings_df['userId'] == user]['movieId'].tolist()
    user_predictions = predictions[idx]
    correct = set(real_user_interests).intersection(set(user_predictions))
    correct_predictions.append(len(correct))

print('predictions: true positive rate')
sum(correct_predictions) / len(correct_predictions) * 10

predictions: true positive rate


33.529411764705884

In [1244]:
# random sampling
random_predictions = []

for idx, user in enumerate(y_users_test):
    real_user_interests = ratings_df[ratings_df['userId'] == user]['movieId'].tolist()
    random_movies = ratings_df.sample(10)['movieId'].tolist()
    correct = set(real_user_interests).intersection(set(random_movies))
    random_predictions.append(len(correct))

print('random sampling: true positive rate')
sum(random_predictions) / len(random_predictions) * 10

random sampling: true positive rate


8.692810457516341