# Recommendation system(Lesson 2)
## Collaborative based recommender(collaborative filtering)

### First of all import needed packages

In [10]:
from math import sqrt
import pandas as pd
import numpy as np
# Use this to show matplotlib and plots properly(But not essential)
%matplotlib inline

### Read the csv(Comma seperated value) file with pandas

In [11]:
Movies_data_frame = pd.read_csv('Movies.csv')
Ratings_data_frame = pd.read_csv('Ratings.csv')

In [12]:
# show the first 5 lines of this data frame
Movies_data_frame.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Cleaning data

In [13]:
# Findind the years in parantheses(not those which are in the movie's names)
Movies_data_frame['year'] = Movies_data_frame.title.str.extract(r'\((\d{4})\)',expand=False)
# Removing the parentheses
Movies_data_frame['year'] = Movies_data_frame.year.str.extract(r'(\d{4})',expand=False)
# Removing the years from the title column
Movies_data_frame['title'] = Movies_data_frame.title.str.replace(r'\(\d{4}\)','',regex=True)
# The strip function : to get rid of any ending whitespace characters that may have appeared
Movies_data_frame['title'] = Movies_data_frame['title'].apply(lambda x : x.strip())
Movies_data_frame.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [14]:
# Removing the genres column
Movies_data_frame = Movies_data_frame.drop('genres',axis=1)
Movies_data_frame.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [15]:
# Show the first 5 lines of this data frame
Ratings_data_frame.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [16]:
# Removing the timestamp column 
Ratings_data_frame = Ratings_data_frame.drop('timestamp',axis=1)
Ratings_data_frame.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [18]:
# Make sample user(guy) to make a recommendation later on
# This user has seen five movies with give ratings
User_inputs = [
            {'title':'Breakfast Club, The','rating':5},
            {'title':'Toy Story','rating':3.5},
            {'title':'Jumanji','rating':2},
            {'title':"Pulp Fiction",'rating':5},
            {'title':'Akira','rating':4.5}
         ]
# Create a data frame with those inputs
User_input_movies_data_frame = pd.DataFrame(User_inputs)
User_input_movies_data_frame.head()

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [20]:
# Filtering out the movies by title
Input_id = Movies_data_frame[Movies_data_frame['title'].isin(User_input_movies_data_frame['title'].tolist())]
User_input_movies_data_frame = pd.merge(Input_id,User_input_movies_data_frame)
User_input_movies_data_frame = User_input_movies_data_frame.drop('year',axis=1)
User_input_movies_data_frame.head()

Unnamed: 0,movieId,title,rating
0,1,Toy Story,3.5
1,2,Jumanji,2.0
2,296,Pulp Fiction,5.0
3,1274,Akira,4.5
4,1968,"Breakfast Club, The",5.0


In [21]:
User_subset = Ratings_data_frame[Ratings_data_frame['movieId'].isin(User_input_movies_data_frame['movieId'].tolist())]
User_subset.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
16,1,296,3.0
320,4,296,1.0
422,4,1968,4.0
516,5,1,4.0


In [22]:
print(f'{User_subset.shape[0]} people are at least matched with the sample guy in case of movies they have seen')

784 people are at least matched with the sample guy in case of movies they have seen


In [23]:
User_subset_group = User_subset.groupby(['userId'])

In [24]:
User_subset_group.get_group((4,))

Unnamed: 0,userId,movieId,rating
320,4,296,1.0
422,4,1968,4.0


In [25]:
User_subset_group = sorted(User_subset_group,key=lambda x : len(x[1]),reverse=True)

In [26]:
User_subset_group[:3]

[((91,),
         userId  movieId  rating
  14121      91        1     4.0
  14122      91        2     3.0
  14173      91      296     4.5
  14316      91     1274     5.0
  14383      91     1968     3.0),
 ((177,),
         userId  movieId  rating
  24900     177        1     5.0
  24901     177        2     3.5
  24930     177      296     5.0
  25069     177     1274     2.0
  25129     177     1968     3.5),
 ((219,),
         userId  movieId  rating
  31524     219        1     3.5
  31525     219        2     2.5
  31554     219      296     4.0
  31628     219     1274     2.5
  31680     219     1968     3.0)]

In [27]:
print(f'User {User_subset_group[0][0][0]} has seen at least one similar movies as the sample user')
print('Maximum similar seen movies cannot be more than 5, because the sample user has seen 5')

User 91 has seen at least one similar movies as the sample user
Maximum similar seen movies cannot be more than 5, because the sample user has seen 5


In [28]:
User_subset_group_ = User_subset_group[0:100]

In [29]:
# Store the pearson correlation in a dictionary, where the key is the user ID and the value is the coefficient
Pearson_correlation_dictionary = {}
# For every user group in our subset
for name , group in User_subset_group_ :
    # start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    input_movies = User_input_movies_data_frame.sort_values(by='movieId')
    # n_ratings is for pearson formula 
    n_ratings = len(group)
    # get the review scores for the movies that they both have in common
    temporary_data_frame = input_movies[input_movies['movieId'].isin(group['movieId'].tolist())]
    # then store them in a temporary buffer variable in a list format to facilitate future calculations
    temporary_rating_list = temporary_data_frame['rating'].tolist()
    # put the current user group reviews in a list format
    temporary_group_list = group['rating'].tolist()
    # calculate the pearson correlation between two users, so called, x and y
    Sxx = sum([i**2 for i in temporary_rating_list])-pow(sum(temporary_rating_list),2)/float(n_ratings)
    Syy = sum([i**2 for i in temporary_group_list])-pow(sum(temporary_group_list),2)/float(n_ratings)
    Sxy = sum(i*j for i , j in zip(temporary_rating_list,temporary_group_list)) - sum(temporary_rating_list) * sum(temporary_group_list)/float(n_ratings)
    # if the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0 :
        Pearson_correlation_dictionary[name] = Sxy/sqrt(Sxx*Syy)
    else :
        Pearson_correlation_dictionary[name] = 0

In [30]:
print(f'Similarity of user 91 to sample guy :',Pearson_correlation_dictionary[((91,))])

Similarity of user 91 to sample guy : 0.43852900965351443


In [31]:
Pearson_correlation_dictionary

{(91,): 0.43852900965351443,
 (177,): 0.0,
 (219,): 0.45124262819713973,
 (274,): 0.716114874039432,
 (298,): 0.9592712306918567,
 (414,): 0.9376144618769914,
 (474,): 0.11720180773462392,
 (477,): 0.4385290096535153,
 (480,): 0.7844645405527362,
 (483,): 0.08006407690254357,
 (599,): 0.7666866491579839,
 (608,): 0.920736884379251,
 (50,): 0.15713484026367722,
 (57,): -0.7385489458759964,
 (68,): 0.0,
 (103,): 0.5222329678670935,
 (135,): 0.8703882797784892,
 (182,): 0.9428090415820635,
 (202,): 0.5222329678670935,
 (217,): 0.30151134457776363,
 (226,): 0.9438798074485389,
 (288,): 0.6005325641789633,
 (307,): 0.9655810287305759,
 (318,): 0.44486512077567225,
 (322,): 0.5057805388588731,
 (330,): 0.9035942578600878,
 (357,): 0.5606119105813882,
 (434,): 0.9864036607532465,
 (448,): 0.30151134457776363,
 (469,): 0.8164965809277261,
 (561,): 0.5222329678670935,
 (600,): 0.18442777839082938,
 (606,): 0.9146591207600472,
 (610,): -0.47140452079103173,
 (18,): 1.0,
 (19,): -0.5,
 (21,): 0,


In [32]:
Pearson_correlation_data_frame = pd.DataFrame.from_dict(Pearson_correlation_dictionary,orient='index')
Pearson_correlation_data_frame.columns = ['similarityIndex']
Pearson_correlation_data_frame['userId'] = Pearson_correlation_data_frame.index
Pearson_correlation_data_frame.index = range(len(Pearson_correlation_data_frame))
Pearson_correlation_data_frame.head()

Unnamed: 0,similarityIndex,userId
0,0.438529,"(91,)"
1,0.0,"(177,)"
2,0.451243,"(219,)"
3,0.716115,"(274,)"
4,0.959271,"(298,)"


In [33]:
Pearson_correlation_data_frame['userId'] = Pearson_correlation_data_frame['userId'].apply(lambda x:x[0] if isinstance(x,tuple) else x).astype('int64')

In [34]:
Pearson_correlation_data_frame.head()

Unnamed: 0,similarityIndex,userId
0,0.438529,91
1,0.0,177
2,0.451243,219
3,0.716115,274
4,0.959271,298


In [35]:
Top_users = Pearson_correlation_data_frame.sort_values(by='similarityIndex',ascending=False)[0:50]
Top_users.head()

Unnamed: 0,similarityIndex,userId
43,1.0,132
34,1.0,18
63,1.0,305
82,1.0,489
86,1.0,525


In [36]:
Top_users['userId'] = Top_users['userId'].apply(lambda x:int(x[0][0]) if isinstance(x,tuple) and isinstance(x[0],tuple) else int(x[0]) if isinstance(x,tuple) else int(x))

In [37]:
Top_user_ratings = Top_users.merge(Ratings_data_frame, left_on='userId', right_on='userId', how='inner')
Top_user_ratings.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,1.0,132,1,2.0
1,1.0,132,17,3.0
2,1.0,132,29,2.0
3,1.0,132,32,3.0
4,1.0,132,34,1.5


In [38]:
Top_user_ratings.shape

(27831, 4)

In [40]:
# multiplies the similarity by the user's ratings
Top_user_ratings['weightedRating'] = Top_user_ratings['similarityIndex']*Top_user_ratings['rating']
Top_user_ratings.head(100)

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,132,1,2.0,2.0
1,1.0,132,17,3.0,3.0
2,1.0,132,29,2.0,2.0
3,1.0,132,32,3.0,3.0
4,1.0,132,34,1.5,1.5
...,...,...,...,...,...
95,1.0,132,1258,4.0,4.0
96,1.0,132,1259,3.0,3.0
97,1.0,132,1270,3.5,3.5
98,1.0,132,1271,3.5,3.5


In [41]:
# Applies a sum to the top users after grouping it up by userId
Temporary_top_user_ratings = Top_user_ratings.groupby(by='movieId').sum()[['similarityIndex','weightedRating']]
Temporary_top_user_ratings.columns = ['sum_similarityIndex','sum_weightedRating']
Temporary_top_user_ratings.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,36.354096,133.167946
2,31.005292,94.904257
3,8.783859,26.381456
4,0.866025,1.732051
5,7.165336,19.775255


In [42]:
# Create an empty dataframe
Recommendation_data_frame = pd.DataFrame()
# Now we take the weighted average
Recommendation_data_frame['weighted average recommendation score'] = Temporary_top_user_ratings['sum_weightedRating']/Temporary_top_user_ratings['sum_similarityIndex']
Recommendation_data_frame['movieId'] = Temporary_top_user_ratings.index
Recommendation_data_frame.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.66308,1
2,3.060905,2
3,3.003402,3
4,2.0,4
5,2.75985,5


In [43]:
Recommendation_data_frame = Recommendation_data_frame.sort_values(by='weighted average recommendation score')
Recommendation_data_frame.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
26696,0.5,26696
80166,0.5,80166
1322,0.5,1322
136297,0.5,136297
1453,0.5,1453
80693,0.5,80693
2017,0.5,2017
70641,0.5,70641
44243,0.5,44243
27595,0.5,27595


In [44]:
Movies_data_frame.loc[Movies_data_frame['movieId'].isin(Recommendation_data_frame.head(10)['movieId'].tolist())]

Unnamed: 0,movieId,title,year
1011,1322,Amityville 1992: It's About Time,1992
1114,1453,"Beautician and the Beast, The",1997
1492,2017,Babes in Toyland,1961
5554,26696,Lionheart,1990
5662,27595,Jesus Christ Vampire Hunter,2001
6160,44243,Leprechaun 4: In Space,1997
7105,70641,Miss March,2009
7408,80166,"Switch, The",2010
7424,80693,It's Kind of a Funny Story,2010
8931,136297,Mortal Kombat: The Journey Begins,1995
