# User similarity

The purpose of this simple NearestNeighbor model is to produce recommendations by identifying similar users.

The most popular 10 movies from each set of similar users is used to generate the final recommendations. This will be further improved by using a second model to predict movies from the subset of similar users.

Note that the user data has been synthetically generated, so it won't be useful in making meaningful recommendations on the MovieLens dataset.

In [5]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
DATA_DIR="data/recommendations"
users_path = 'users.csv'
users_df = pd.read_csv(os.path.join(DATA_DIR, users_path), dtype={'post_code':str})
users_df.head()

Unnamed: 0,userId,city,post_code,job,downloads,contract_months
0,1,Thomasburgh,27555,Arts administrator,8005,55
1,2,Hoodfurt,41877,Trade mark attorney,13769,48
2,3,Hornetown,25544,"Therapist, occupational",10632,38
3,4,Samanthatown,32562,Midwife,10711,58
4,5,Mannshire,78137,Pension scheme manager,12163,51


In [7]:
ratings_path = 'ml-latest-small/ratings.csv'
ratings_df = pd.read_csv(os.path.join(DATA_DIR, ratings_path))
# ignore ratings < 3
ratings_df = ratings_df[ratings_df['rating'] >= 3]
ratings_df = ratings_df.drop(columns=['rating', 'timestamp'])
ratings_df.head()

Unnamed: 0,userId,movieId
0,1,1
1,1,3
2,1,6
3,1,47
4,1,50


In [8]:
# copy users df
users_df_copy = users_df.copy()

# quick and dirty normalisation
max_downloads = users_df_copy['downloads'].max()
max_contract_months = users_df_copy['contract_months'].max()
                                 
users_df_copy['downloads'] = users_df_copy['downloads'] / max_downloads
users_df_copy['contract_months'] = users_df_copy['contract_months'] / max_contract_months

users_df_copy.head()

Unnamed: 0,userId,city,post_code,job,downloads,contract_months
0,1,Thomasburgh,27555,Arts administrator,0.468156,0.916667
1,2,Hoodfurt,41877,Trade mark attorney,0.805252,0.8
2,3,Hornetown,25544,"Therapist, occupational",0.621791,0.633333
3,4,Samanthatown,32562,Midwife,0.626411,0.966667
4,5,Mannshire,78137,Pension scheme manager,0.711328,0.85


In [9]:
dummy_cols=['city', 'post_code', 'job', 'contract_months']
users_df_copy = pd.get_dummies(users_df_copy, columns=dummy_cols)
users_df_copy.head()

Unnamed: 0,userId,downloads,city_Allenfurt,city_Andersonland,city_Andersonton,city_Bondland,city_Brooksside,city_Bryantfort,city_Bullockberg,city_Chadchester,...,contract_months_0.85,contract_months_0.8666666666666667,contract_months_0.8833333333333333,contract_months_0.9,contract_months_0.9166666666666666,contract_months_0.9333333333333333,contract_months_0.95,contract_months_0.9666666666666667,contract_months_0.9833333333333333,contract_months_1.0
0,1,0.468156,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,2,0.805252,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0.621791,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0.626411,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,5,0.711328,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [10]:
from sklearn.model_selection import train_test_split

X_users = np.array(users_df_copy.drop(columns=['userId']))
y_users = np.array(users_df_copy['userId'])

X_users_train, X_users_test, y_users_train, y_users_test = train_test_split(X_users, y_users, random_state=0)

In [11]:
from sklearn.neighbors import NearestNeighbors

model = NearestNeighbors(n_neighbors=10,
                         metric='cosine',
                         algorithm='brute',
                         n_jobs=-1)

model.fit(X_users_train, y_users_train)
similar_users = model.kneighbors(X_users_test, return_distance=False)

In [12]:
def predict_items(user, similar_users):
    ratings = ratings_df[ratings_df['userId'].isin(similar_users)]
    popular_movies = ratings['movieId'].value_counts().head(10).index.tolist()
    return popular_movies

user_pairing = zip(y_users, similar_users)
similar_items = [predict_items(user, users) for user, users in user_pairing]

In [13]:
correct_predictions = []

for idx, user in enumerate(y_users_test):
    real_user_interests = ratings_df[ratings_df['userId'] == user]['movieId'].tolist()
    user_predictions = similar_items[idx]
    correct = set(real_user_interests).intersection(set(user_predictions))
    if len(real_user_interests) == 0:
        correct_ratio = 0.5
    elif len(real_user_interests) >= len(user_predictions):
        correct_ratio = len(correct) / 10
    else:
        correct_ratio = len(correct) / len(real_user_interests)
    correct_predictions.append(correct_ratio)

print('predictions: true positive rate')
sum(correct_predictions) / len(correct_predictions) * 100

predictions: true positive rate


34.11764705882354

In [14]:
# random sampling
random_predictions = []

for idx, user in enumerate(y_users_test):
    real_user_interests = ratings_df[ratings_df['userId'] == user]['movieId'].tolist()
    random_movies = ratings_df.sample(10)['movieId'].tolist()
    correct = set(real_user_interests).intersection(set(random_movies))
    if len(real_user_interests) == 0:
        correct_ratio = 0.5
    elif len(real_user_interests) >= len(random_movies):
        correct_ratio = len(correct) / 10
    else:
        correct_ratio = len(correct) / len(real_user_interests)
    random_predictions.append(correct_ratio)

print('random sampling: true positive rate')
sum(random_predictions) / len(random_predictions) * 100

random sampling: true positive rate


9.73856209150326