# Collaborative Filtering

In [3]:
from zipfile import ZipFile
file = 'movies_data.zip'
with ZipFile(file,'r') as zip:
    zip.printdir()
    zip.extractall()

File Name                                             Modified             Size
ml-latest-small/                               2018-09-26 15:50:12            0
ml-latest-small/links.csv                      2018-09-26 15:50:10       197979
ml-latest-small/tags.csv                       2018-09-26 15:49:40       118660
ml-latest-small/ratings.csv                    2018-09-26 15:49:38      2483723
ml-latest-small/README.txt                     2018-09-26 15:50:12         8342
ml-latest-small/movies.csv                     2018-09-26 15:49:56       494431


In [4]:
import zipfile
zf = zipfile.ZipFile('movies_data.zip')


In [5]:
import pandas as pd
import numpy as np

In [6]:
movies_df = pd.read_csv('ml-latest-small/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
ratings_df = pd.read_csv('ml-latest-small/ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [8]:
ratings_df = ratings_df.drop('timestamp',1)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


We'll remove the year from title column and put it in separate year column

In [9]:
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))',expand=False)
movies_df['year'].head()

0    (1995)
1    (1995)
2    (1995)
3    (1995)
4    (1995)
Name: year, dtype: object

In [10]:
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)',expand=False)
movies_df['year'].head()

0    1995
1    1995
2    1995
3    1995
4    1995
Name: year, dtype: object

In [11]:
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))','')
movies_df['title'] = movies_df['title'].apply(lambda x : x.strip())
movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


we'll also drop genre because there is no use of genre in collaborative filtering

In [12]:
movies_df = movies_df.drop('genres',1)
movies_df.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In collborating filtering, we recommend items which have been rated by other users to input user where users have similar set of interest.
We'll be  using pearson correlation to find the similarity of other users to input user.

The process for creating a User Based recommendation system is as follows:

- Select a user with the movies the user has watched
- Based on his rating to movies, find the top X neighbours
- Get the watched movie record of the user for each neighbour.
- Calculate a similarity score using some formula
- Recommend the items with the highest score

In [13]:
userInput = [
            {'title':'Breakfast Club, The', 'rating':5},
            {'title':'Toy Story', 'rating':3.5},
            {'title':'Jumanji', 'rating':2},
            {'title':"Pulp Fiction", 'rating':5},
            {'title':'Akira', 'rating':4.5},
            {'title':'Heat', 'rating' : 4},
            {'title' :'Sabrina','rating' : 3.5}
         ] 
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,rating,title
0,5.0,"Breakfast Club, The"
1,3.5,Toy Story
2,2.0,Jumanji
3,5.0,Pulp Fiction
4,4.5,Akira
5,4.0,Heat
6,3.5,Sabrina


let's extract the input movies's ID's from the movies dataframe and add them into it.

In [14]:
inputId = movies_df[movies_df['title'].isin(inputMovies['title']).tolist()]
inputId

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
5,6,Heat,1995
6,7,Sabrina,1995
257,296,Pulp Fiction,1994
697,915,Sabrina,1954
973,1274,Akira,1988
1445,1968,"Breakfast Club, The",1985


In [15]:
inputMovies = pd.merge(inputId,inputMovies)
inputMovies

Unnamed: 0,movieId,title,year,rating
0,1,Toy Story,1995,3.5
1,2,Jumanji,1995,2.0
2,6,Heat,1995,4.0
3,7,Sabrina,1995,3.5
4,915,Sabrina,1954,3.5
5,296,Pulp Fiction,1994,5.0
6,1274,Akira,1988,4.5
7,1968,"Breakfast Club, The",1985,5.0


In [16]:
inputMovies= inputMovies.drop('year',1)
inputMovies

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,6,Heat,4.0
3,7,Sabrina,3.5
4,915,Sabrina,3.5
5,296,Pulp Fiction,5.0
6,1274,Akira,4.5
7,1968,"Breakfast Club, The",5.0


Now, with given movieId we'll find the subset of users who have reviewed the movie

In [17]:
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
userSubset.head(10)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
2,1,6,4.0
16,1,296,3.0
320,4,296,1.0
422,4,1968,4.0
516,5,1,4.0
533,5,296,5.0
560,6,2,4.0
564,6,6,4.0
565,6,7,4.0


In [18]:
userSubset.tail()

Unnamed: 0,userId,movieId,rating
99534,610,1,5.0
99535,610,6,5.0
99552,610,296,5.0
99636,610,1274,5.0
99664,610,1968,4.0


In [19]:
#now we groupby user id
userSubsetGroup = userSubset.groupby(['userId'])
userSubsetGroup.head(10)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
2,1,6,4.0
16,1,296,3.0
320,4,296,1.0
422,4,1968,4.0
516,5,1,4.0
533,5,296,5.0
560,6,2,4.0
564,6,6,4.0
565,6,7,4.0


In [20]:
userSubsetGroup.tail()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
2,1,6,4.0
16,1,296,3.0
320,4,296,1.0
422,4,1968,4.0
516,5,1,4.0
533,5,296,5.0
560,6,2,4.0
564,6,6,4.0
565,6,7,4.0


let's look at one of the user

In [21]:
userSubsetGroup.get_group(605)

Unnamed: 0,userId,movieId,rating
97143,605,1,4.0
97144,605,2,3.5
97151,605,296,2.0


In [22]:
#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)

In [23]:
userSubsetGroup[0:5]

[(414,        userId  movieId  rating
  62294     414        1     4.0
  62295     414        2     3.0
  62298     414        6     3.0
  62299     414        7     3.0
  62425     414      296     5.0
  62637     414      915     4.0
  62769     414     1274     4.0
  62957     414     1968     5.0), (474,        userId  movieId  rating
  73092     474        1     4.0
  73093     474        2     3.0
  73095     474        6     3.0
  73096     474        7     3.0
  73172     474      296     4.0
  73298     474      915     4.0
  73466     474     1274     2.0
  73641     474     1968     3.5), (177,        userId  movieId  rating
  24900     177        1     5.0
  24901     177        2     3.5
  24902     177        7     1.0
  24930     177      296     5.0
  24997     177      915     5.0
  25069     177     1274     2.0
  25129     177     1968     3.5), (599,        userId  movieId  rating
  92623     599        1     3.0
  92624     599        2     2.5
  92626     599     

now we'll use pearson correlation to find the similarity of users to our specified user and find the most common users.

formula of pearson correlation :

![alt text](https://wikimedia.org/api/rest_v1/media/math/render/svg/bd1ccc2979b0fd1c1aec96e386f686ae874f9ec0 "Pearson Correlation")

value of r varies from -1 to 1. r=1 means users have same interest while r=-1 means users have opposite interest.

In [24]:
#we'll not select every user, rather use a subset of users

userSubsetGroup = userSubsetGroup[0:100]
userSubsetGroup

[(414,        userId  movieId  rating
  62294     414        1     4.0
  62295     414        2     3.0
  62298     414        6     3.0
  62299     414        7     3.0
  62425     414      296     5.0
  62637     414      915     4.0
  62769     414     1274     4.0
  62957     414     1968     5.0), (474,        userId  movieId  rating
  73092     474        1     4.0
  73093     474        2     3.0
  73095     474        6     3.0
  73096     474        7     3.0
  73172     474      296     4.0
  73298     474      915     4.0
  73466     474     1274     2.0
  73641     474     1968     3.5), (177,        userId  movieId  rating
  24900     177        1     5.0
  24901     177        2     3.5
  24902     177        7     1.0
  24930     177      296     5.0
  24997     177      915     5.0
  25069     177     1274     2.0
  25129     177     1968     3.5), (599,        userId  movieId  rating
  92623     599        1     3.0
  92624     599        2     2.5
  92626     599     

![alt text](https://wikimedia.org/api/rest_v1/media/math/render/svg/435a23c499a2450f0752112e69a9b808336a7cce)

rearranging the pearson correlation's formula

In [94]:
import numpy as np
from math import*
def square_rooted(x):
    return (sqrt(sum([a*a for a in x])))
def pearson_correlation(x,y):
    x_m =np.mean(x)
    y_m =np.mean(y)
    for i in range(len(x)):
        x[i]=x[i] - x_m
    for j in range(len(y)):
        y[j]=y[j] - y_m
    if(square_rooted(x)*square_rooted(y) == 0):
        return 0
    return (np.dot(x,y) / float(square_rooted(x )*square_rooted(y )))

In [95]:
import math
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

for name,group in userSubsetGroup:
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')
    n= len(group)
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    tempRating = temp_df['rating'].tolist()
    #print(tempRating)
    groupRating = group['rating'].tolist()
    #print(groupRating)
    pearsonCorrelationDict[name] =pearson_correlation(tempRating,groupRating)


In [96]:
pearsonCorrelationDict.items()

dict_items([(414, 0.7557086549031783), (474, 0.06399221901922213), (177, 0.05594542388644596), (599, 0.7231450959435695), (57, -0.6640906067234306), (68, 0.0983611844205792), (91, 0.38240032376574457), (219, 0.43452409462674085), (274, 0.7136412401400631), (480, 0.7136412401400632), (483, 0.1353903836185097), (600, 0.19425717247145283), (606, 0.6406221326384731), (19, -0.3952847075210475), (45, 0.6993786061802353), (182, 0.9432422182837987), (202, 0.43994134506406), (217, 0.2566324512873683), (298, 0.9592712306918569), (318, 0.4466298001344307), (357, 0.5345224838248487), (434, 0.9749789778183842), (469, 0.7333587976225691), (470, -0.05157106231293968), (477, 0.43852900965351466), (489, 0.5345224838248488), (561, 0.4939145805736311), (590, 0.9174063600174063), (603, -0.07585826061362601), (608, 0.9207368843792512), (610, -0.5144957554275266), (6, -0.7333333333333333), (18, 0.9400193421607682), (32, 0.7385489458759965), (50, 0.15713484026367722), (64, -0.17407765595569785), (84, 0.81649

In [73]:
len(pearsonCorrelationDict.items())

100

In [100]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head(10)

Unnamed: 0,similarityIndex,userId
0,0.755709,414
1,0.063992,474
2,0.055945,177
3,0.723145,599
4,-0.664091,57
5,0.098361,68
6,0.3824,91
7,0.434524,219
8,0.713641,274
9,0.713641,480


now take the top 50 similar users

In [102]:
top50 = pearsonDF.sort_values(by ='similarityIndex',ascending = False)[0:50]
top50

Unnamed: 0,similarityIndex,userId
90,1.0,169
88,1.0,144
40,0.980196,132
21,0.974979,434
53,0.965581,307
84,0.960769,112
18,0.959271,298
78,0.944911,62
94,0.944911,191
46,0.94388,226


## Rating of selected users to all movies
We're going to do this by taking the weighted average of the ratings of the movies using the Pearson Correlation as the weight. But to do this, we first need to get the movies watched by the users in our pearsonDF from the ratings dataframe and then store their correlation in a new column called "similarityIndex". This is achieved below by merging of these two tables.

In [106]:
topUsersRating=top50.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
topUsersRating.head(50)

Unnamed: 0,similarityIndex,userId,movieId,rating
0,1.0,169,1,4.5
1,1.0,169,2,4.0
2,1.0,169,3,5.0
3,1.0,169,5,5.0
4,1.0,169,7,4.5
5,1.0,169,11,4.0
6,1.0,169,34,4.0
7,1.0,169,39,3.5
8,1.0,169,48,3.5
9,1.0,169,60,4.0


Now  simply multiply the movie rating by its weight (The similarity index), then sum up the new ratings and divide it by the sum of the weights.

In [107]:
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,169,1,4.5,4.5
1,1.0,169,2,4.0,4.0
2,1.0,169,3,5.0,5.0
3,1.0,169,5,5.0,5.0
4,1.0,169,7,4.5,4.5


In [110]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,30.630623,110.066343
2,25.292611,72.91312
3,10.626151,34.693173
4,0.816497,2.44949
5,8.256908,25.740715


In [115]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted average'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
#recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average
movieId,Unnamed: 1_level_1
1,3.593343
2,2.882783
3,3.264886
4,3.0
5,3.117476


In [116]:
recommendation_df = recommendation_df.sort_values(by='weighted average', ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average
movieId,Unnamed: 1_level_1
4298,5.0
3653,5.0
3296,5.0
80124,5.0
104780,5.0
158027,5.0
126430,5.0
127052,5.0
1596,5.0
347,5.0


#### Top 20 movies to be recommended

In [119]:
movies_df.loc[movies_df['movieId'].isin(recommendation_df.head(20).index.tolist())]

Unnamed: 0,movieId,title,year
305,347,Bitter Moon,1992
1198,1596,Career Girls,1997
2364,3134,Grand Illusion (La grande illusion),1937
2474,3296,To Sir with Love,1967
2721,3653,"Endless Summer, The",1966
3068,4117,Hope and Glory,1987
3189,4298,Rififi (Du rififi chez les hommes),1955
3406,4634,Penn & Teller Get Killed,1989
7404,80124,Sisters (Syostry),2001
7500,83134,Tucker & Dale vs Evil,2010
