
<center><u><H1>Recommender Systems with Item Based & User Based and KNN for finding similar users based

In [1]:
import pandas as pd
import numpy as np

In [2]:
header = ['user_id', 'movie_id', 'rating', 'timestamp']
df_ratings = pd.read_csv('../Data/ml-100k/u.data', sep='\t', names=header)
df_ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
movie_details = df_ratings.groupby('movie_id').agg({'rating':
                                                   [np.size, np.mean]})
movie_details.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2
1,452,3.878319
2,131,3.206107
3,90,3.033333
4,209,3.550239
5,86,3.302326


In [4]:
# For Visualizing the title as well we are merging two data sets 

cols = ['movie_id', 'title']
movies = pd.read_csv('../Data/ml-100k/u.item', sep='|', names=cols, usecols=range(2), encoding='latin-1')
ratings_titles = pd.merge(movies, df_ratings).sort_values(['rating'], ascending=False)
ratings_titles.head()

Unnamed: 0,movie_id,title,user_id,rating,timestamp
33615,218,Cape Fear (1991),388,5,886441083
53455,357,One Flew Over the Cuckoo's Nest (1975),532,5,892519935
53463,357,One Flew Over the Cuckoo's Nest (1975),499,5,885599372
31674,208,Young Frankenstein (1974),643,5,891448136
17608,125,Phenomenon (1996),109,5,880564534


## Similarity matrix

In [5]:
ratings_ = df_ratings.pivot(index = 'user_id', columns ='movie_id', values = 'rating').fillna(0)
ratings_.head(5)

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
matrix = ratings_.as_matrix()

  """Entry point for launching an IPython kernel.


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
ratings_train, ratings_test = train_test_split(matrix, test_size=0.3, random_state=42)

In [9]:
ratings_train.shape

(660, 1682)

In [10]:
ratings_test.shape

(283, 1682)

## Finding the top N nearest neighbors

In [11]:
from sklearn.neighbors import NearestNeighbors
k = 5

In [12]:
knn = NearestNeighbors(k, 'cosine')

In [13]:
knn.fit(ratings_train)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2,
         radius='cosine')

In [14]:
top_k_dist, top_k_users = knn.kneighbors(ratings_train, return_distance=True)

In [15]:
top_k_dist.shape
top_k_users.shape

(660, 5)

## Top 5, similar users to user 1:

In [16]:
top_k_users[0][:5]

array([  0, 211,  16, 583, 428], dtype=int64)

In [17]:
top_k_dist[0][:5]

array([ 0.        , 41.71330723, 43.3474336 , 45.04442252, 45.04442252])

## Prediction matrix for rating movies:
### Selecting the top 5 users for each user and use their rating info
### while predicting the ratings using the weighted sum of all of the
### ratings of the top 5 similar users.

In [18]:
user_pred = np.zeros(ratings_train.shape)
for i in range(ratings_train.shape[0]):
    user_pred[i,:] = top_k_dist[i].T.dot(ratings_train[top_k_users][i])/np.array([np.abs(top_k_dist[i].T).sum(axis=0)]).T

user_pred.shape

(660, 1682)

In [19]:
df = pd.DataFrame(user_pred, columns=ratings_.columns)
df.head(10)

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.230096,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.753043,0.499327,0.249663,1.500827,0.0,0.0,1.753989,0.499327,0.753749,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,0.0,0.0,0.0,0.0,2.733508,0.0,0.767198,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,3.758043,2.251767,0.0,2.759742,0.0,0.0,3.49635,0.743135,0.748233,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Evaluating the model:

In [20]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [21]:
def rmse(pred,actual):
    prediction = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, actual))

In [22]:
rmse(user_pred, ratings_train)

2.860210336806541

In [23]:
rmse(user_pred, ratings_test)

3.367330398504572

## Item-based recommendations

In [24]:
# defining k as the number of movies
k_ = ratings_train.shape[1]
k_

1682

In [25]:
neigh_ = NearestNeighbors(k_, 'cosine')

In [26]:
neigh_.fit(ratings_train.T)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=1682, p=2,
         radius='cosine')

In [27]:
top_k_dist_, top_k_movies_ = neigh_.kneighbors(ratings_train.T, return_distance=True)

## Similar movies to movie 1:

In [28]:
top_k_movies_[0]

array([  0, 120, 404, ..., 285, 312, 301], dtype=int64)

In [29]:
top_k_dist_[0]

array([ 0.        , 53.12249994, 54.38749856, ..., 77.08436936,
       77.39509028, 79.3473377 ])

## Predicting the movie ratings

In [30]:
item_predictions = ratings_train.dot(top_k_dist_) / np.array([np.abs(top_k_dist_).sum(axis=1)])

In [31]:
df_predictions = pd.DataFrame(item_predictions, columns=ratings_.columns)
df_predictions.head(10)
#Because the query set matches the training set, the nearest neighbor of each point is the point itself, at a distance of zero.

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
0,0.0,0.318426,0.370093,0.262237,0.369791,0.462891,0.188307,0.240255,0.208729,0.352278,...,1.298808,1.292362,1.36519,1.380232,1.393514,1.408556,1.4268,1.448504,1.45664,1.545308
1,0.0,0.054538,0.06228,0.044045,0.061896,0.077263,0.031369,0.039994,0.034737,0.058565,...,0.182118,0.180352,0.189936,0.19175,0.193745,0.195906,0.19783,0.200787,0.201121,0.213325
2,0.0,0.035225,0.040731,0.028824,0.040595,0.050637,0.020564,0.026209,0.022772,0.038423,...,0.166232,0.165331,0.174985,0.177631,0.179116,0.180139,0.181806,0.184569,0.186547,0.200763
3,0.0,0.087291,0.101434,0.07196,0.101578,0.126977,0.051637,0.0658,0.057136,0.096375,...,0.331993,0.330009,0.347356,0.351866,0.354874,0.358003,0.360961,0.365639,0.367251,0.390919
4,0.0,0.421538,0.488463,0.347368,0.490747,0.614978,0.250371,0.319556,0.277829,0.469281,...,1.790325,1.779009,1.881473,1.902814,1.922278,1.941175,1.968537,1.997288,2.013134,2.126436
5,0.0,0.062268,0.071446,0.05052,0.071088,0.088917,0.036149,0.04614,0.040088,0.067561,...,0.205173,0.203762,0.214561,0.216829,0.219402,0.221169,0.223249,0.225526,0.22653,0.240815
6,0.0,0.127591,0.14961,0.105822,0.149263,0.186985,0.076077,0.097033,0.084321,0.142406,...,0.449655,0.446358,0.470311,0.475094,0.478856,0.48246,0.487072,0.491807,0.490226,0.521446
7,0.0,0.047315,0.05484,0.038746,0.054376,0.067811,0.027574,0.035161,0.03052,0.051402,...,0.156799,0.155842,0.164196,0.165506,0.166585,0.168679,0.170884,0.172929,0.171849,0.185213
8,0.0,0.038242,0.044113,0.031148,0.043863,0.054703,0.022212,0.028351,0.024608,0.041484,...,0.132831,0.131897,0.138646,0.140299,0.141655,0.142829,0.144261,0.146185,0.145816,0.156544
9,0.0,0.521024,0.606827,0.431687,0.609992,0.763399,0.310718,0.396555,0.344655,0.581956,...,2.121418,2.108141,2.229936,2.254356,2.276204,2.297575,2.326097,2.360383,2.365945,2.521556


In [32]:
df_predictions.shape

(660, 1682)

## Evaluating the model:

In [33]:
rmse(item_predictions, ratings_train)

3.342538761948114

In [34]:
rmse(item_predictions, ratings_test)

3.44626980973761

## Reference:

http://scikit-learn.org/stable/modules/neighbors.html