# User similarity

The purpose of this simple KNN model is to identify similar users.

A secondary model (using collaborative filtering) will make recommendations based on the preferences of these similar users, rather than all users. This helps avoild all recommendations being "popular".

Note that the user data has been synthetically generated, so it won't be useful in making meaningful recommendations on the MovieLens dataset.

In [54]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [55]:
DATA_DIR="../../data/recommendations"
users_path = 'users.csv'
users_df = pd.read_csv(os.path.join(DATA_DIR, users_path), dtype={'post_code':str})
users_df

Unnamed: 0,userId,city,post_code,job,downloads,contract_months
0,1,Lawsonview,09820,Learning mentor,9164,40
1,2,Sandrafurt,46742,Diagnostic radiographer,7331,58
2,3,Lake Brettfort,36388,Comptroller,7287,43
3,4,New Jeffreyhaven,08294,"Psychotherapist, child",10176,60
4,5,Port Ryanside,46511,Tree surgeon,12032,47
...,...,...,...,...,...,...
605,606,South Jamesville,37286,Systems developer,9451,49
606,607,North Michaelfort,08590,Training and development officer,10843,48
607,608,Hernandezport,10175,Games developer,8800,36
608,609,New Jessicaview,07831,Games developer,7500,58


In [56]:
def normalize(df):
    return (df-df.min())/(df.max()-df.min())

for column in ['downloads', 'contract_months']:
    users_df[column] = normalize(users_df[column])
                                 
users_df

Unnamed: 0,userId,city,post_code,job,downloads,contract_months
0,1,Lawsonview,09820,Learning mentor,0.265403,0.655172
1,2,Sandrafurt,46742,Diagnostic radiographer,0.048223,0.965517
2,3,Lake Brettfort,36388,Comptroller,0.043009,0.706897
3,4,New Jeffreyhaven,08294,"Psychotherapist, child",0.385308,1.000000
4,5,Port Ryanside,46511,Tree surgeon,0.605213,0.775862
...,...,...,...,...,...,...
605,606,South Jamesville,37286,Systems developer,0.299408,0.810345
606,607,North Michaelfort,08590,Training and development officer,0.464336,0.793103
607,608,Hernandezport,10175,Games developer,0.222275,0.586207
608,609,New Jessicaview,07831,Games developer,0.068246,0.965517


In [57]:
df = pd.get_dummies(users_df)
df

Unnamed: 0,userId,downloads,contract_months,city_Aguirretown,city_Annaland,city_Baileyfurt,city_Barbaraberg,city_Collinshaven,city_Coxhaven,city_Crystalshire,...,"job_Surveyor, quantity",job_Systems developer,job_Technical author,job_Television production assistant,"job_Therapist, music",job_Tourism officer,job_Training and development officer,job_Translator,job_Tree surgeon,job_Water quality scientist
0,1,0.265403,0.655172,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0.048223,0.965517,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0.043009,0.706897,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0.385308,1.000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0.605213,0.775862,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,606,0.299408,0.810345,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
606,607,0.464336,0.793103,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
607,608,0.222275,0.586207,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
608,609,0.068246,0.965517,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
from sklearn.model_selection import train_test_split

X = np.array(df.drop(labels=['userId'],axis=1))
y = np.array(df['userId'])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [59]:
from sklearn.neighbors import NearestNeighbors

model = NearestNeighbors(n_neighbors=10,
                         metric='cosine',
                         algorithm='brute',
                         n_jobs=-1)


model.fit(X_train)
dist, neighbours = model.kneighbors(X_test, n_neighbors=10, return_distance=True)
neighbours

array([[319, 171, 413, ..., 158, 235, 435],
       [185, 213, 364, ..., 104, 308, 362],
       [140, 156, 352, ..., 287, 414, 234],
       ...,
       [ 52, 148, 421, ...,   4,  11, 248],
       [  3, 286, 158, ...,  97, 293, 235],
       [268, 202,  18, ..., 162,  55, 347]])

In [60]:
test_user_id=y_test[0]

test_user = users_df[users_df['userId'] == test_user_id]
test_user

Unnamed: 0,userId,city,post_code,job,downloads,contract_months
577,578,Crystalshire,76320,Food technologist,0.128555,0.793103


In [61]:
test_user_similar_ids = neighbours[0]
test_user_similar = users_df[users_df['userId'].isin(test_user_similar_ids)]
test_user_similar

Unnamed: 0,userId,city,post_code,job,downloads,contract_months
24,25,South Jeffery,95612,Graphic designer,0.267891,0.965517
157,158,West Edwardberg,34045,Location manager,0.313863,0.034483
168,169,Murraymouth,74847,Games developer,0.210071,0.793103
170,171,Hughesmouth,23526,Diagnostic radiographer,0.07109,0.689655
176,177,East Brittneyhaven,90411,"Engineer, land",0.280924,0.672414
197,198,Raymondland,87769,Technical author,0.861611,0.37931
234,235,Fuentesstad,13923,Applications developer,0.314218,0.637931
318,319,Donaldside,37391,Learning mentor,0.533057,0.775862
412,413,West Heidi,55053,Nutritional therapist,0.70936,0.931034
434,435,Meganfurt,58963,Clothing/textile technologist,0.255687,0.844828
