# User similarity

The purpose of this simple KNN model is to identify similar users.

A secondary model (using collaborative filtering) will make recommendations based on the preferences of these similar users, rather than all users. This helps avoild all recommendations being "popular".

Note that the user data has been synthetically generated, so it won't be useful in making meaningful recommendations on the MovieLens dataset.

In [13]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [14]:
DATA_DIR="../../data/recommendations"
users_path = 'users.csv'
users_df = pd.read_csv(os.path.join(DATA_DIR, users_path), dtype={'post_code':str})
df = users_df.copy()
df

Unnamed: 0,userId,city,post_code,job,downloads,contract_months
0,1,Lawsonview,09820,Learning mentor,9164,40
1,2,Sandrafurt,46742,Diagnostic radiographer,7331,58
2,3,Lake Brettfort,36388,Comptroller,7287,43
3,4,New Jeffreyhaven,08294,"Psychotherapist, child",10176,60
4,5,Port Ryanside,46511,Tree surgeon,12032,47
...,...,...,...,...,...,...
605,606,South Jamesville,37286,Systems developer,9451,49
606,607,North Michaelfort,08590,Training and development officer,10843,48
607,608,Hernandezport,10175,Games developer,8800,36
608,609,New Jessicaview,07831,Games developer,7500,58


In [15]:
# quick and dirty normalisation
max_downloads = df['downloads'].max()
max_contract_months = df['contract_months'].max()
                                 
df['downloads'] = df['downloads'] / max_downloads
df['contract_months'] = df['contract_months'] / max_contract_months

df

Unnamed: 0,userId,city,post_code,job,downloads,contract_months
0,1,Lawsonview,09820,Learning mentor,0.596459,0.666667
1,2,Sandrafurt,46742,Diagnostic radiographer,0.477154,0.966667
2,3,Lake Brettfort,36388,Comptroller,0.474291,0.716667
3,4,New Jeffreyhaven,08294,"Psychotherapist, child",0.662328,1.000000
4,5,Port Ryanside,46511,Tree surgeon,0.783129,0.783333
...,...,...,...,...,...,...
605,606,South Jamesville,37286,Systems developer,0.615139,0.816667
606,607,North Michaelfort,08590,Training and development officer,0.705741,0.800000
607,608,Hernandezport,10175,Games developer,0.572768,0.600000
608,609,New Jessicaview,07831,Games developer,0.488154,0.966667


In [16]:
dummy_cols=['city', 'post_code', 'job', 'contract_months']
df = pd.get_dummies(users_df, columns=dummy_cols)
df

Unnamed: 0,userId,downloads,city_Aguirretown,city_Annaland,city_Baileyfurt,city_Barbaraberg,city_Collinshaven,city_Coxhaven,city_Crystalshire,city_Cynthiatown,...,contract_months_51,contract_months_52,contract_months_53,contract_months_54,contract_months_55,contract_months_56,contract_months_57,contract_months_58,contract_months_59,contract_months_60
0,1,9164,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,7331,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,7287,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,10176,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,12032,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,606,9451,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
606,607,10843,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
607,608,8800,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
608,609,7500,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [17]:
from sklearn.model_selection import train_test_split

X = np.array(df.drop(labels=['userId'],axis=1))
y = np.array(df['userId'])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [21]:
from sklearn.neighbors import NearestNeighbors

model = NearestNeighbors(n_neighbors=10,
                         metric='cosine',
                         algorithm='brute',
                         n_jobs=-1)


model.fit(X_train)
dist, neighbours = model.kneighbors(X_test, n_neighbors=10, return_distance=True)
dist, neighbours

(array([[3.27594495e-08, 3.28057056e-08, 3.66756323e-08, ...,
         4.00521237e-08, 4.03191998e-08, 4.03191998e-08],
        [1.14317968e-08, 2.13558017e-08, 2.30446976e-08, ...,
         2.59348619e-08, 2.59519320e-08, 2.89862899e-08],
        [1.55278810e-08, 1.61484014e-08, 1.71668412e-08, ...,
         1.96379203e-08, 1.96379203e-08, 1.98985607e-08],
        ...,
        [1.68809902e-08, 2.28155893e-08, 2.39133631e-08, ...,
         2.49603149e-08, 2.57497472e-08, 2.58018610e-08],
        [2.24457779e-08, 2.34098270e-08, 2.55514637e-08, ...,
         3.46758305e-08, 3.49122452e-08, 3.49122452e-08],
        [2.09980990e-08, 2.42640287e-08, 2.62403846e-08, ...,
         2.97152815e-08, 3.09207097e-08, 3.13203377e-08]]),
 array([[407, 357,  53, ...,  66, 438, 347],
        [213, 364, 185, ..., 156,  99, 316],
        [140, 156, 437, ...,  66, 241, 406],
        ...,
        [148,  52, 438, ...,  44, 102, 301],
        [286,   3, 158, ..., 386, 364, 131],
        [268, 241, 191, ...

In [19]:
test_user_id=y_test[0]

test_user = users_df[users_df['userId'] == test_user_id]
test_user

Unnamed: 0,userId,city,post_code,job,downloads,contract_months
577,578,Crystalshire,76320,Food technologist,8009,48


In [20]:
test_user_similar_ids = neighbours[0]
test_user_similar = users_df[users_df['userId'].isin(test_user_similar_ids)]
test_user_similar

Unnamed: 0,userId,city,post_code,job,downloads,contract_months
19,20,Meganfurt,37731,General practice doctor,8812,57
52,53,Meganfurt,72562,Building services engineer,9316,55
65,66,North Christinahaven,567,Applications developer,7475,42
75,76,Collinshaven,31191,Housing manager/officer,9194,54
168,169,Murraymouth,74847,Games developer,8697,48
240,241,West Edwardberg,97561,Charity officer,7328,59
346,347,Karenbury,56952,Radio broadcast assistant,11433,56
356,357,South Craigview,42110,Learning mentor,9146,46
406,407,Juanland,45624,Chief Technology Officer,10180,52
437,438,Donaldside,97598,Hotel manager,9091,35
