In [1]:
# Importing necesaary modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

In [2]:
# Reading the dataset
movie = pd.read_csv('data/movie.csv')
rating = pd.read_csv('data/rating.csv')

In [3]:
movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movie.shape

(27278, 3)

In [5]:
# Checking for missing values
movie.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [6]:
rating.shape

(20000263, 4)

In [7]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [8]:
rating.rating.value_counts()

rating
4.0    5561926
3.0    4291193
5.0    2898660
3.5    2200156
4.5    1534824
2.0    1430997
2.5     883398
1.0     680732
1.5     279252
0.5     239125
Name: count, dtype: int64

In [9]:
# Checking for missing values
rating.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [10]:
# For collaborative filterings there is no need for genre since recommendations are done based on the similar users 
movie.drop('genres', inplace = True, axis = 1)
movie.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


### Separating year column

In [11]:
# Extracting the year from title
movie['year'] = movie.title.str.extract('(\(\d\d\d\d\))',expand=False)
# Removing the parenthesis
movie['year'] = movie.year.str.extract('(\d\d\d\d)', expand = False)
movie.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story (1995),1995
1,2,Jumanji (1995),1995
2,3,Grumpier Old Men (1995),1995
3,4,Waiting to Exhale (1995),1995
4,5,Father of the Bride Part II (1995),1995


In [12]:
movie = movie.replace(to_replace = '(\(\d\d\d\d\))', value = '', regex = True)
movie.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [13]:
# Removing white spaces
movie['title'] = movie['title'].apply(lambda x: x.strip())
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [14]:
# Timestamp is irrelevant
rating.drop('timestamp', inplace = True, axis = 1)
rating.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


### Selecting a user who have watched some movies and based on its reading finding neighbours

In [15]:
user = [
            {'title':'Wonderful Ice Cream Suit, The', 'rating':4},
            {'title':'Toy Story', 'rating':2.5},
            {'title':'Jumanji', 'rating':3},
            {'title':"Dark Shadows", 'rating':4.5},
            {'title':'Akira', 'rating':5}
         ] 
userMovie = pd.DataFrame(user)
userMovie.head()

Unnamed: 0,title,rating
0,"Wonderful Ice Cream Suit, The",4.0
1,Toy Story,2.5
2,Jumanji,3.0
3,Dark Shadows,4.5
4,Akira,5.0


In [16]:
Id = movie[movie.title.isin(userMovie.title).tolist()]
Id

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
1246,1274,Akira,1988
7599,7985,"Wonderful Ice Cream Suit, The",1998
18995,94478,Dark Shadows,2012


In [17]:
userMovie = pd.merge(Id, userMovie)
userMovie

Unnamed: 0,movieId,title,year,rating
0,1,Toy Story,1995,2.5
1,2,Jumanji,1995,3.0
2,1274,Akira,1988,5.0
3,7985,"Wonderful Ice Cream Suit, The",1998,4.0
4,94478,Dark Shadows,2012,4.5


In [18]:
userMovie.drop('year', inplace = True, axis = 1)
userMovie

Unnamed: 0,movieId,title,rating
0,1,Toy Story,2.5
1,2,Jumanji,3.0
2,1274,Akira,5.0
3,7985,"Wonderful Ice Cream Suit, The",4.0
4,94478,Dark Shadows,4.5


In [19]:
#Filtering out users that have watched movies that the input has watched and storing it
neighbors = rating[rating['movieId'].isin(userMovie['movieId'].tolist())]
neighbors.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
236,3,1,4.0
451,5,2,3.0
517,6,1,5.0
817,8,1,4.0


In [20]:
neighbors.shape

(80768, 3)

In [21]:
neighborsGroup = neighbors.groupby(['userId'])
neighborsGroup.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
236,3,1,4.0
451,5,2,3.0
517,6,1,5.0
817,8,1,4.0
...,...,...,...
19999573,138488,1,3.0
19999574,138488,2,3.0
19999786,138491,1,2.0
19999890,138493,1,3.5


In [22]:
neighborsGroup.get_group(3029)

Unnamed: 0,userId,movieId,rating
444462,3029,1,3.5
445213,3029,94478,2.5


In [23]:
#Sorting it so users with movie most in common with the input will have priority
neighborsGroup = sorted(neighborsGroup,  key=lambda x: len(x[1]), reverse=True)

In [24]:
neighborsGroup = neighborsGroup[0:77]

In [25]:
neighborsGroup[:3]

[((18611,),
           userId  movieId  rating
  2745113   18611        1     4.0
  2745114   18611        2     3.0
  2745646   18611     1274     2.0
  2747524   18611     7985     3.0
  2748692   18611    94478     2.0),
 ((440,),
         userId  movieId  rating
  62454     440        1     4.0
  62455     440        2     3.5
  62515     440     1274     4.0
  62748     440    94478     4.0),
 ((2988,),
          userId  movieId  rating
  439191    2988        1     3.0
  439192    2988        2     2.5
  439352    2988     1274     2.5
  439979    2988    94478     3.5)]

In [26]:
def pearson_correlation(x, y):
    ''' Calculates Pearson Correlation between two list x and y'''
    n = len(x)
    sum_x = sum(x)
    sum_y = sum(y)
    sum_x2 = sum(i**2 for i in x)
    sum_y2 = sum(i**2 for i in y)
    sum_xy = sum(i * j for i, j in zip(x, y))
    
    if sum_x2 * sum_y2 == 0:
        return 0

    denominator = math.sqrt(n*sum_x2 - sum_x**2) * math.sqrt(n*sum_y2 - sum_y**2)

    if denominator == 0:
        return 0

    return float((n * sum_xy - (sum_x * sum_y)) / float(denominator))

In [27]:
# Calculating pearson correlation and saving in into a dictionary
pearsonCorrelation = {}

for name, group in neighborsGroup:
    group = group.sort_values(by = 'movieId')
    userMovie = userMovie.sort_values(by = 'movieId')
    
    # Getting the common movies
    tempMovie = userMovie[userMovie['movieId'].isin(group['movieId']).tolist()]
    tempMovieRating = tempMovie['rating'].tolist()
    groupRating = group['rating'].tolist()
    
    correlation = pearson_correlation(tempMovieRating, groupRating)
    pearsonCorrelation[name] = correlation

In [28]:
pearsonCorrelation

{(18611,): -0.922224668326776,
 (440,): 0.420084025208403,
 (2988,): 0.07312724241271307,
 (6431,): 0.6859943405700353,
 (8364,): 0.599933529053139,
 (9797,): -0.6708203932499369,
 (10443,): -0.9848916356764204,
 (11517,): 0.6305282887554631,
 (12793,): 0.2970442628930023,
 (15030,): 0.12964074471043288,
 (15701,): 0.5118906968889915,
 (18706,): -0.03157544889753363,
 (19711,): -0.21693045781865616,
 (19881,): -0.30678599553894814,
 (21672,): 0.4338609156373123,
 (23101,): -0.2970442628930023,
 (24688,): 0.22102814228273543,
 (25180,): -0.7463466005026361,
 (31921,): -0.9429080709444059,
 (32094,): 0.140028008402801,
 (32632,): -0.2782074420373286,
 (36281,): 0.8868440532177395,
 (42032,): -0.5118906968889915,
 (44070,): -0.36563621206356534,
 (45096,): -0.05564148840746571,
 (45919,): -0.12964074471043288,
 (46470,): -0.16980890270283108,
 (46645,): -0.05564148840746571,
 (48527,): 0.6123000982747524,
 (49668,): 0.420084025208403,
 (51703,): -0.6859943405700353,
 (57420,): -0.21938172

In [29]:
df = pd.DataFrame.from_dict(pearsonCorrelation, orient = 'index')
df.columns = ['similarities']

In [30]:
df['userId'] = df.index

In [31]:
df.index = range(len(df))
df.head()

Unnamed: 0,similarities,userId
0,-0.922225,"(18611,)"
1,0.420084,"(440,)"
2,0.073127,"(2988,)"
3,0.685994,"(6431,)"
4,0.599934,"(8364,)"


In [32]:
df.isna().sum()

similarities    0
userId          0
dtype: int64

In [33]:
df.userId = df['userId'].str[0]

In [34]:
df.dtypes

similarities    float64
userId            int64
dtype: object

In [35]:
df = df.sort_values(by='similarities', ascending=False)
df.head()

Unnamed: 0,similarities,userId
32,0.942908,58184
36,0.899735,61168
21,0.886844,36281
38,0.808452,66533
50,0.8044,82927


In [36]:
df = df.merge(rating)
df.head()

Unnamed: 0,similarities,userId,movieId,rating
0,0.942908,58184,1,2.0
1,0.942908,58184,2,3.0
2,0.942908,58184,31,3.0
3,0.942908,58184,32,5.0
4,0.942908,58184,47,3.5


In [37]:
df['weightedRatings'] = df.similarities*df.rating
df.head()

Unnamed: 0,similarities,userId,movieId,rating,weightedRatings
0,0.942908,58184,1,2.0,1.885816
1,0.942908,58184,2,3.0,2.828724
2,0.942908,58184,31,3.0,2.828724
3,0.942908,58184,32,5.0,4.71454
4,0.942908,58184,47,3.5,3.300178


In [38]:
df_sum = abs(df.groupby('userId').sum())[['similarities', 'weightedRatings']]
df_sum.columns = ['similarities_sum', 'weightedRatings_sum']
df_sum.head()

Unnamed: 0_level_0,similarities_sum,weightedRatings_sum
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
440,124.764955,458.101629
2988,60.622484,225.853488
6431,409.538621,1516.39049
8364,405.555066,1262.260145
9797,499.761193,1932.298143


In [39]:
recommendation = pd.DataFrame()
recommendation['score'] = df_sum.weightedRatings_sum/df_sum.similarities_sum
recommendation['movieId'] = df.movieId.astype(int)
recommendation.head()

Unnamed: 0_level_0,score,movieId
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
440,3.671717,91485.0
2988,3.725573,74324.0
6431,3.70268,68358.0
8364,3.112426,36363.0
9797,3.866443,3914.0


In [40]:
recommendation = recommendation.sort_values(by='score', ascending=False)
recommendation.head(10)

Unnamed: 0_level_0,score,movieId
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
131894,4.220777,
36281,4.204595,31878.0
58424,4.055921,69644.0
59971,3.925789,44022.0
98808,3.911051,77427.0
134497,3.897033,
9797,3.866443,3914.0
44070,3.792279,73344.0
117315,3.751565,4520.0
2988,3.725573,74324.0


In [42]:
movie.loc[movie['movieId'].isin(recommendation.head(10)['movieId'].tolist())]

Unnamed: 0,movieId,title,year
3821,3914,"Broken Hearts Club, The",2000
4425,4520,License to Drive,1988
9781,31878,Kung Fu Hustle (Gong fu),2004
10873,44022,Ice Age 2: The Meltdown,2006
13890,69644,Ice Age: Dawn of the Dinosaurs,2009
14677,73344,"Prophet, A (Un Prophète)",2009
14846,74324,Temple Grandin,2010
15183,77427,"Human Centipede, The (First Sequence)",2009
