# User similarity

The purpose of this simple KNN model is to identify similar users.

A secondary model (using collaborative filtering) will make recommendations based on the preferences of these similar users, rather than all users. This helps avoild all recommendations being "popular".

Note that the user data has been synthetically generated, so it won't be useful in making meaningful recommendations on the MovieLens dataset.

In [290]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [291]:
DATA_DIR="../../data/recommendations"
users_path = 'users.csv'
users_df = pd.read_csv(os.path.join(DATA_DIR, users_path), dtype={'post_code':str})
users_df.head()

Unnamed: 0,userId,city,post_code,job,downloads,contract_months
0,1,Lawsonview,9820,Learning mentor,9164,40
1,2,Sandrafurt,46742,Diagnostic radiographer,7331,58
2,3,Lake Brettfort,36388,Comptroller,7287,43
3,4,New Jeffreyhaven,8294,"Psychotherapist, child",10176,60
4,5,Port Ryanside,46511,Tree surgeon,12032,47


In [292]:
ratings_path = 'ml-latest-small/ratings.csv'
ratings_df = pd.read_csv(os.path.join(DATA_DIR, ratings_path))
# ignore ratings < 3
ratings_df = ratings_df[ratings_df['rating'] >= 3]
ratings_df = ratings_df.drop(columns=['rating', 'timestamp'])
ratings_df.head()

Unnamed: 0,userId,movieId
0,1,1
1,1,3
2,1,6
3,1,47
4,1,50


In [293]:
# copy ratings df
ratings_df_copy = ratings_df.copy()
# this won't work, as we need to be working on a filtered set of users
# ratings_df_copy = pd.get_dummies(ratings_df_copy, columns=['userId'])
# ratings_df_copy.head()

In [294]:
# copy users df
users_df_copy = users_df.copy()

# quick and dirty normalisation
max_downloads = users_df_copy['downloads'].max()
max_contract_months = users_df_copy['contract_months'].max()
                                 
users_df_copy['downloads'] = users_df_copy['downloads'] / max_downloads
users_df_copy['contract_months'] = users_df_copy['contract_months'] / max_contract_months

users_df_copy.head()

Unnamed: 0,userId,city,post_code,job,downloads,contract_months
0,1,Lawsonview,9820,Learning mentor,0.596459,0.666667
1,2,Sandrafurt,46742,Diagnostic radiographer,0.477154,0.966667
2,3,Lake Brettfort,36388,Comptroller,0.474291,0.716667
3,4,New Jeffreyhaven,8294,"Psychotherapist, child",0.662328,1.0
4,5,Port Ryanside,46511,Tree surgeon,0.783129,0.783333


In [295]:
dummy_cols=['city', 'post_code', 'job', 'contract_months']
users_df_copy = pd.get_dummies(users_df_copy, columns=dummy_cols)
users_df_copy.head()

Unnamed: 0,userId,downloads,city_Aguirretown,city_Annaland,city_Baileyfurt,city_Barbaraberg,city_Collinshaven,city_Coxhaven,city_Crystalshire,city_Cynthiatown,...,contract_months_0.85,contract_months_0.8666666666666667,contract_months_0.8833333333333333,contract_months_0.9,contract_months_0.9166666666666666,contract_months_0.9333333333333333,contract_months_0.95,contract_months_0.9666666666666667,contract_months_0.9833333333333333,contract_months_1.0
0,1,0.596459,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0.477154,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,0.474291,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0.662328,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,0.783129,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [296]:
from sklearn.model_selection import train_test_split

X_users = np.array(users_df_copy.drop(columns=['userId']))
y_users = np.array(users_df_copy['userId'])

X_users_train, X_users_test, y_users_train, y_users_test = train_test_split(X_users, y_users, random_state=0)

In [297]:
from sklearn.neighbors import NearestNeighbors

class Recommendations():
    def __init__(self, ratings_df, user_neighbours=10, rating_neighbours=10):
        self.ratings_df = ratings_df
        self.users_model = NearestNeighbors(n_neighbors=user_neighbours,
                         metric='cosine',
                         algorithm='brute',
                         n_jobs=-1)
        self.items_model = NearestNeighbors(n_neighbors=rating_neighbours,
                         metric='cosine',
                         algorithm='brute',
                         n_jobs=-1)
    
    def fit(self, X_users, y_users=None):
        self.users_model.fit(X_users, y_users)
        
    def predict_items(self, similar_users):
        ratings = self.ratings_df[self.ratings_df['userId'].isin(similar_users)]
        ratings = pd.get_dummies(ratings, columns=['userId'])
        X_ratings = np.array(ratings.drop(columns=['movieId']))
        y_ratings = np.array(ratings['movieId'])
        print("X_ratings", X_ratings.shape)
        print("y_ratings", y_ratings.shape)
        # TODO: decide what the items_model should be fitted on
        # similar_items_dist, similar_items = self.items_model.kneighbors(X_ratings, return_distance=True)
        # print('similar_items', similar_items)
        
    def predict(self, X_users, y=None):
        similar_users_dist, similar_users = self.users_model.kneighbors(X_users, return_distance=True)
        similar_users_0 = similar_users[0].tolist()
        self.predict_items(similar_users_0)
        return similar_users_dist, similar_users

In [298]:
from sklearn.neighbors import NearestNeighbors

model = Recommendations(ratings_df=ratings_df_copy)

model.fit(X_users_train, y_users_train)
dist, neighbours = model.predict(X_users_test, y_users_test)
print('neighbours', neighbours.shape)
dist, neighbours

X_ratings (1816, 10)
y_ratings (1816,)
neighbours (153, 10)


(array([[0.67649434, 0.676678  , 0.6879414 , ..., 0.69593667, 0.69593667,
         0.69593667],
        [0.22880224, 0.45475496, 0.45791075, ..., 0.66366162, 0.66373943,
         0.67468297],
        [0.62253372, 0.62669532, 0.6327238 , ..., 0.64997119, 0.65002452,
         0.65193765],
        ...,
        [0.44106043, 0.45749263, 0.65594967, ..., 0.66135088, 0.66478883,
         0.66500031],
        [0.45598278, 0.45806403, 0.46168039, ..., 0.68630992, 0.68677764,
         0.68677764],
        [0.45408713, 0.65637198, 0.66593358, ..., 0.67699741, 0.68089697,
         0.6809149 ]]),
 array([[407, 357,  53, ..., 198, 177, 435],
        [213, 364, 185, ..., 156,  99, 316],
        [140, 156, 437, ..., 240, 287, 318],
        ...,
        [148,  52, 438, ...,  44, 102, 301],
        [286,   3, 158, ..., 386, 131, 364],
        [268, 241, 191, ...,  22,  16, 186]]))

In [299]:
test_user_id=y_test[0]

test_user = users_df[users_df['userId'] == test_user_id]
test_user

Unnamed: 0,userId,city,post_code,job,downloads,contract_months
577,578,Crystalshire,76320,Food technologist,8009,48


In [300]:
test_user_similar_ids = neighbours[0]
test_user_similar = users_df[users_df['userId'].isin(test_user_similar_ids)]
test_user_similar

Unnamed: 0,userId,city,post_code,job,downloads,contract_months
52,53,Meganfurt,72562,Building services engineer,9316,55
75,76,Collinshaven,31191,Housing manager/officer,9194,54
168,169,Murraymouth,74847,Games developer,8697,48
170,171,Hughesmouth,23526,Diagnostic radiographer,7524,42
176,177,East Brittneyhaven,90411,"Engineer, land",9295,41
197,198,Raymondland,87769,Technical author,14196,24
234,235,Fuentesstad,13923,Applications developer,9576,39
356,357,South Craigview,42110,Learning mentor,9146,46
406,407,Juanland,45624,Chief Technology Officer,10180,52
434,435,Meganfurt,58963,Clothing/textile technologist,9082,51
