## Importing libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
plt.figure(figsize=(12,6))
%matplotlib inline
import seaborn as sns

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

## Importing Dataset

In [2]:
ori_user = pd.read_csv('user_rating.tsv', delimiter='\t', quoting=3, names=['user_id', 'item_id', 'rating', 'timestamp'])
ori_movie = pd.read_csv('movie_id_titles.csv')

In [3]:
ori_user.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742


In [4]:
ori_movie.head()

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [5]:
print('User Rating:', ori_user.shape)
print('Movie:', ori_movie.shape)

User Rating: (100003, 4)
Movie: (1682, 2)


In [6]:
ori = ori_user.merge(ori_movie, on='item_id', how='inner')
ori.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,0,50,5,881250949,Star Wars (1977)
1,290,50,5,880473582,Star Wars (1977)
2,79,50,4,891271545,Star Wars (1977)
3,2,50,5,888552084,Star Wars (1977)
4,8,50,5,879362124,Star Wars (1977)


In [7]:
ori.dtypes

user_id       int64
item_id       int64
rating        int64
timestamp     int64
title        object
dtype: object

In [8]:
ori.isnull().sum()

user_id      0
item_id      0
rating       0
timestamp    0
title        0
dtype: int64

In [9]:
df = pd.DataFrame(ori.groupby('title')['rating'].mean())

In [10]:
df['no. of users rated'] = pd.DataFrame(df.groupby('title')['rating'].count())

In [11]:
df.head()

Unnamed: 0_level_0,rating,no. of users rated
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),2.333333,1
1-900 (1994),2.6,1
101 Dalmatians (1996),2.908257,1
12 Angry Men (1957),4.344,1
187 (1997),3.02439,1


# Collaborative Recommender System

Make use Ratings from other Users!

In [12]:
def standardise(row):
    return (row-row.mean())/(row.max()-row.min())

## Based on Correlation

In [13]:
def correlate(data, x):
    dt = pd.DataFrame(data.corrwith(data[x], axis=0, method='pearson'), columns=['Correlation '+str(x)])
    return dt.sort_values(dt.columns[0], ascending=False, axis=0, kind='quicksort').dropna().reset_index()

![image.jpg](Correlation.jpg)

Various methods are available for correlation & everyone of them has different formula/concept. The above one is a general formula. Correlation ranges from -1 to 1.
<hr/>

### Item-Based

In [14]:
df1 = ori

In [15]:
df1 = df1.pivot_table(index='user_id', columns='title', values='rating', aggfunc='mean')
df1.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [16]:
df1.fillna(0, inplace=True)
df1.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
df1 = df1.apply(standardise)
df1.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.005561,-0.003443,-0.067161,-0.115042,-0.026271,-0.063559,-0.05339,-0.217797,-0.005297,-0.050636,...,-0.001059,-0.004873,-0.001059,-0.167161,-0.068644,-0.025847,-0.029025,-0.00459,-0.006568,-0.001059
1,-0.005561,-0.003443,0.332839,0.884958,-0.026271,-0.063559,0.54661,0.582203,-0.005297,-0.050636,...,-0.001059,-0.004873,-0.001059,0.832839,0.531356,-0.025847,-0.029025,-0.00459,0.793432,-0.001059
2,-0.005561,-0.003443,-0.067161,-0.115042,-0.026271,-0.063559,-0.05339,-0.217797,0.994703,-0.050636,...,-0.001059,-0.004873,-0.001059,-0.167161,-0.068644,-0.025847,-0.029025,-0.00459,-0.006568,-0.001059
3,-0.005561,-0.003443,-0.067161,-0.115042,0.373729,-0.063559,-0.05339,-0.217797,-0.005297,-0.050636,...,-0.001059,-0.004873,-0.001059,-0.167161,-0.068644,-0.025847,-0.029025,-0.00459,-0.006568,-0.001059
4,-0.005561,-0.003443,-0.067161,-0.115042,-0.026271,-0.063559,-0.05339,-0.217797,-0.005297,-0.050636,...,-0.001059,-0.004873,-0.001059,-0.167161,-0.068644,-0.025847,-0.029025,-0.00459,-0.006568,-0.001059


In [18]:
# 'Star Wars (1977)' is taken as input from user
correlate(df1, 'Star Wars (1977)').head()

Unnamed: 0,title,Correlation Star Wars (1977)
0,Star Wars (1977),1.0
1,Return of the Jedi (1983),0.746797
2,"Empire Strikes Back, The (1980)",0.555233
3,Raiders of the Lost Ark (1981),0.540455
4,Indiana Jones and the Last Crusade (1989),0.467523


### User-Based

In [19]:
df2 = ori

In [20]:
df2 = df2.pivot_table(index='title', columns='user_id', values='rating', aggfunc='mean')
df2.head()

user_id,0,1,2,3,4,5,6,7,8,9,...,934,935,936,937,938,939,940,941,942,943
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
1-900 (1994),,,,,,,,,,,...,,,,,,,,,,
101 Dalmatians (1996),,2.0,,,,2.0,,,,,...,2.0,,,2.0,4.0,,,,,
12 Angry Men (1957),,5.0,,,,,4.0,4.0,,,...,,,,,,,,,,
187 (1997),,,,2.0,,,,,,,...,,,,,,,,,,


In [21]:
df2.fillna(0, inplace=True)
df2.head()

user_id,0,1,2,3,4,5,6,7,8,9,...,934,935,936,937,938,939,940,941,942,943
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1-900 (1994),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians (1996),0.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,5.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
187 (1997),0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
df2 = df2.apply(standardise)
df2.head()

user_id,0,1,2,3,4,5,6,7,8,9,...,934,935,936,937,938,939,940,941,942,943
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),-0.001322,-0.117428,-0.027163,-0.017668,-0.0125,-0.060457,-0.090986,-0.190986,-0.026923,-0.011298,...,-0.076863,-0.018389,-0.063462,-0.015745,-0.042428,-0.02488,-0.044471,-0.010697,-0.039904,-0.06887
1-900 (1994),-0.001322,-0.117428,-0.027163,-0.017668,-0.0125,-0.060457,-0.090986,-0.190986,-0.026923,-0.011298,...,-0.076863,-0.018389,-0.063462,-0.015745,-0.042428,-0.02488,-0.044471,-0.010697,-0.039904,-0.06887
101 Dalmatians (1996),-0.001322,0.282572,-0.027163,-0.017668,-0.0125,0.339543,-0.090986,-0.190986,-0.026923,-0.011298,...,0.323137,-0.018389,-0.063462,0.384255,0.757572,-0.02488,-0.044471,-0.010697,-0.039904,-0.06887
12 Angry Men (1957),-0.001322,0.882572,-0.027163,-0.017668,-0.0125,-0.060457,0.709014,0.609014,-0.026923,-0.011298,...,-0.076863,-0.018389,-0.063462,-0.015745,-0.042428,-0.02488,-0.044471,-0.010697,-0.039904,-0.06887
187 (1997),-0.001322,-0.117428,-0.027163,0.382332,-0.0125,-0.060457,-0.090986,-0.190986,-0.026923,-0.011298,...,-0.076863,-0.018389,-0.063462,-0.015745,-0.042428,-0.02488,-0.044471,-0.010697,-0.039904,-0.06887


In [23]:
# '3' is taken as input from user as user_id
correlate(df2, 3).head()

Unnamed: 0,user_id,Correlation 3
0,3,1.0
1,489,0.479135
2,863,0.470898
3,784,0.461843
4,317,0.461101


In [24]:
similar_user = correlate(df2, 3)['user_id'][1]
print(similar_user)

489


In [25]:
df2[similar_user].dropna().sort_values(ascending=False, kind='quicksort').head().reset_index()

Unnamed: 0,title,489
0,"Jackal, The (1997)",0.952043
1,Kull the Conqueror (1997),0.952043
2,Contact (1997),0.952043
3,Kiss the Girls (1997),0.952043
4,Dante's Peak (1997),0.952043


## Based on Cosine Distance/Similarity

![image.png](cosine.png)

### Using item-based cosine similarity

In [26]:
df3 = ori

In [27]:
df3 = df3.pivot_table(index='user_id', columns='title', values='rating', aggfunc='mean')
df3.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [28]:
df3.fillna(0, inplace=True)
df3.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
df3 = df3.apply(standardise)
df3.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-0.005561,-0.003443,-0.067161,-0.115042,-0.026271,-0.063559,-0.05339,-0.217797,-0.005297,-0.050636,...,-0.001059,-0.004873,-0.001059,-0.167161,-0.068644,-0.025847,-0.029025,-0.00459,-0.006568,-0.001059
1,-0.005561,-0.003443,0.332839,0.884958,-0.026271,-0.063559,0.54661,0.582203,-0.005297,-0.050636,...,-0.001059,-0.004873,-0.001059,0.832839,0.531356,-0.025847,-0.029025,-0.00459,0.793432,-0.001059
2,-0.005561,-0.003443,-0.067161,-0.115042,-0.026271,-0.063559,-0.05339,-0.217797,0.994703,-0.050636,...,-0.001059,-0.004873,-0.001059,-0.167161,-0.068644,-0.025847,-0.029025,-0.00459,-0.006568,-0.001059
3,-0.005561,-0.003443,-0.067161,-0.115042,0.373729,-0.063559,-0.05339,-0.217797,-0.005297,-0.050636,...,-0.001059,-0.004873,-0.001059,-0.167161,-0.068644,-0.025847,-0.029025,-0.00459,-0.006568,-0.001059
4,-0.005561,-0.003443,-0.067161,-0.115042,-0.026271,-0.063559,-0.05339,-0.217797,-0.005297,-0.050636,...,-0.001059,-0.004873,-0.001059,-0.167161,-0.068644,-0.025847,-0.029025,-0.00459,-0.006568,-0.001059


In [30]:
# Taking transpose since we want similarity between items which we need to be in rows
item_similarity = cosine_similarity(df3.T)
print(item_similarity)

[[ 1.         -0.00587776 -0.0045618  ... -0.00671374  0.10263022
  -0.00296024]
 [-0.00587776  1.         -0.00680396 ...  0.07524803  0.03651169
  -0.00210559]
 [-0.0045618  -0.00680396  1.         ...  0.15728435  0.01692501
  -0.01095536]
 ...
 [-0.00671374  0.07524803  0.15728435 ...  1.         -0.0069193
  -0.00240506]
 [ 0.10263022  0.03651169  0.01692501 ... -0.0069193   1.
  -0.00305088]
 [-0.00296024 -0.00210559 -0.01095536 ... -0.00240506 -0.00305088
   1.        ]]


In [31]:
item_similarity_df = pd.DataFrame(item_similarity, index=df3.columns, columns=df3.columns)
item_similarity_df.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),1.0,-0.005878,-0.004562,0.072144,0.171589,0.139004,-0.025532,0.007325,-0.006633,0.011575,...,-0.00296,-0.006929,-0.00296,-0.014438,0.095536,0.127084,0.011623,-0.006714,0.10263,-0.00296
1-900 (1994),-0.005878,1.0,-0.006804,-0.014894,-0.005317,-0.015471,-0.007695,0.039896,-0.004718,-0.016314,...,0.150792,0.010632,-0.002106,0.045223,-0.002224,0.01054,-0.006065,0.075248,0.036512,-0.002106
101 Dalmatians (1996),-0.004562,-0.006804,1.0,0.059375,-0.001026,0.052983,0.128832,0.078451,0.004693,0.015592,...,-0.010955,-0.002402,0.021669,0.155599,0.1153,0.039243,-0.005846,0.157284,0.016925,-0.010955
12 Angry Men (1957),0.072144,-0.014894,0.059375,1.0,-0.014261,0.066459,0.230361,0.298878,0.050011,0.33926,...,0.052807,-0.011592,-0.012527,0.290332,0.165072,0.079418,0.038188,0.013382,0.028636,0.074586
187 (1997),0.171589,-0.005317,-0.001026,-0.014261,1.0,0.078831,-0.010273,-0.039807,0.029612,-0.021359,...,-0.006461,0.13006,-0.006461,-0.021764,0.006881,0.053885,0.063828,0.011091,0.127496,0.09192


In [32]:
def recommend(movie, rating):
    # 2.5 is the mean of rating range i.e., [0, 5]
    res = item_similarity_df[movie]*(rating-2.5)
    return res.sort_values(ascending=False, axis=0, kind='quicksort').reset_index()

In [33]:
# 'Star Wars (1977)' & '1' are taken as input from user as movie & its rating resp.
recommend('Star Wars (1977)', 1).head()

Unnamed: 0,title,Star Wars (1977)
0,Seven Years in Tibet (1997),0.22452
1,Kiss the Girls (1997),0.215894
2,I Know What You Did Last Summer (1997),0.198227
3,"Smile Like Yours, A (1997)",0.18069
4,Anna Karenina (1997),0.174465


In [34]:
# user_data is taken as input from user
user_data = [('Star Wars (1977)', 5), ('12 Angry Men (1957)', 4), ('187 (1997)', 1)]

c = 0
for movie, rating in user_data:
    if c==0:
        similar_movies = recommend(movie, rating)
        c = 1
    else:
        similar_movies = similar_movies.merge(recommend(movie, rating), on='title')

similar_movies.set_index('title', inplace=True)
similar_movies.sum(axis=1).sort_values(ascending=False, axis=0, kind='quicksort').reset_index().head()

Unnamed: 0,title,0
0,Star Wars (1977),2.871054
1,Return of the Jedi (1983),2.134488
2,12 Angry Men (1957),2.052848
3,"Empire Strikes Back, The (1980)",1.839049
4,Raiders of the Lost Ark (1981),1.80608


### Using item-based Nearest Neighbors

In [35]:
df4 = ori

In [36]:
df4 = df4.pivot_table(index='title', columns='user_id', values='rating', aggfunc='mean')
df4.head()

user_id,0,1,2,3,4,5,6,7,8,9,...,934,935,936,937,938,939,940,941,942,943
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
1-900 (1994),,,,,,,,,,,...,,,,,,,,,,
101 Dalmatians (1996),,2.0,,,,2.0,,,,,...,2.0,,,2.0,4.0,,,,,
12 Angry Men (1957),,5.0,,,,,4.0,4.0,,,...,,,,,,,,,,
187 (1997),,,,2.0,,,,,,,...,,,,,,,,,,


In [37]:
df4.fillna(0, inplace=True)
df4.head()

user_id,0,1,2,3,4,5,6,7,8,9,...,934,935,936,937,938,939,940,941,942,943
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1-900 (1994),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
101 Dalmatians (1996),0.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,5.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
187 (1997),0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
df4 = df4.apply(standardise)
df4.head()

user_id,0,1,2,3,4,5,6,7,8,9,...,934,935,936,937,938,939,940,941,942,943
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),-0.001322,-0.117428,-0.027163,-0.017668,-0.0125,-0.060457,-0.090986,-0.190986,-0.026923,-0.011298,...,-0.076863,-0.018389,-0.063462,-0.015745,-0.042428,-0.02488,-0.044471,-0.010697,-0.039904,-0.06887
1-900 (1994),-0.001322,-0.117428,-0.027163,-0.017668,-0.0125,-0.060457,-0.090986,-0.190986,-0.026923,-0.011298,...,-0.076863,-0.018389,-0.063462,-0.015745,-0.042428,-0.02488,-0.044471,-0.010697,-0.039904,-0.06887
101 Dalmatians (1996),-0.001322,0.282572,-0.027163,-0.017668,-0.0125,0.339543,-0.090986,-0.190986,-0.026923,-0.011298,...,0.323137,-0.018389,-0.063462,0.384255,0.757572,-0.02488,-0.044471,-0.010697,-0.039904,-0.06887
12 Angry Men (1957),-0.001322,0.882572,-0.027163,-0.017668,-0.0125,-0.060457,0.709014,0.609014,-0.026923,-0.011298,...,-0.076863,-0.018389,-0.063462,-0.015745,-0.042428,-0.02488,-0.044471,-0.010697,-0.039904,-0.06887
187 (1997),-0.001322,-0.117428,-0.027163,0.382332,-0.0125,-0.060457,-0.090986,-0.190986,-0.026923,-0.011298,...,-0.076863,-0.018389,-0.063462,-0.015745,-0.042428,-0.02488,-0.044471,-0.010697,-0.039904,-0.06887


In [39]:
df4_csr = csr_matrix(df4.values)
df4_csr.toarray()

array([[-0.00132212, -0.11742788, -0.02716346, ..., -0.01069712,
        -0.03990385, -0.06887019],
       [-0.00132212, -0.11742788, -0.02716346, ..., -0.01069712,
        -0.03990385, -0.06887019],
       [-0.00132212,  0.28257212, -0.02716346, ..., -0.01069712,
        -0.03990385, -0.06887019],
       ...,
       [-0.00132212, -0.11742788, -0.02716346, ..., -0.01069712,
        -0.03990385, -0.06887019],
       [-0.00132212,  0.68257212, -0.02716346, ..., -0.01069712,
        -0.03990385, -0.06887019],
       [-0.00132212, -0.11742788, -0.02716346, ..., -0.01069712,
        -0.03990385, -0.06887019]])

In [40]:
# Unsupervised Learning Method
model_knn = NearestNeighbors(metric='cosine', n_neighbors=5, n_jobs=-1)
model_knn.fit(df4_csr)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=5, p=2, radius=1.0)

In [41]:
# 'Star Wars (1977)' is taken as input from user
pos = df4.index.get_loc('Star Wars (1977)')
print(pos)

1398


In [42]:
distances, indices = model_knn.kneighbors(df4_csr[pos, :], n_neighbors=6)

In [43]:
print('Recommendations for {0}:\n'.format(df4.index[pos]))

for i in range(0, len(distances.flatten())):
    print('{0}: {1}, with distance of {2}:'.format(i, df4.index[indices.flatten()[i]], distances.flatten()[i]))

Recommendations for Star Wars (1977):

0: Star Wars (1977), with distance of 0.0:
1: Return of the Jedi (1983), with distance of 0.13490930269895018:
2: Raiders of the Lost Ark (1981), with distance of 0.2746225839147497:
3: Empire Strikes Back, The (1980), with distance of 0.2921526864476328:
4: Toy Story (1995), with distance of 0.3087503034807333:
5: Godfather, The (1972), with distance of 0.349482568104476:


<hr/>
User-based methods can also be used!
Generally item-based methods are better!

## Based on Matrix Factorization

In [44]:
df5 = ori

In [45]:
df5 = df5.pivot_table(index='user_id', columns='title', values='rating', aggfunc='mean')
df5.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [46]:
df5.fillna(0, inplace=True)
df5.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
df5.apply(standardise)
df5.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
# Taking transpose since we want similarity between items which we need to be in rows
SVD = TruncatedSVD(n_components=12)
matrix = SVD.fit_transform(df5.T)
matrix.shape

(1664, 12)

In [49]:
corr = np.corrcoef(matrix)
corr.shape
# Just like item-based cosine similarity above (items x items)

(1664, 1664)

In [50]:
# 'Star Wars (1977)' is taken as input from user
pos = df5.columns.get_loc('Star Wars (1977)')

In [51]:
corr[pos]

array([0.35523263, 0.42836068, 0.59050517, ..., 0.33719314, 0.62974331,
       0.26959791])

In [52]:
res = pd.DataFrame(data=corr[pos], index=df5.columns, columns=['Score'])
res.sort_values('Score', ascending=False, axis=0, kind='quicksort').head()

Unnamed: 0_level_0,Score
title,Unnamed: 1_level_1
Star Wars (1977),1.0
Return of the Jedi (1983),0.988103
Terminator 2: Judgment Day (1991),0.943186
Toy Story (1995),0.932889
"Terminator, The (1984)",0.932262
