# Collaborative filtering

In [1]:
#importing necessary packages
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

# Step 1 - Data Preparation

In [2]:
# Reading u.data (the main ratings file) from the extracted ml-100k data folder.
# The folder is not tied to one machine: set the ML100K_DIR environment variable to point
# at your copy; when it is unset, the original author's location is used as the default.
DATA_DIR = Path(os.environ.get('ML100K_DIR', 'C:\\Users\\RIYANRIYA\\Desktop\\ML\\ml-100k'))

# The file is tab-separated; column names are supplied here rather than read from the file.
my_df = pd.read_csv(DATA_DIR / 'u.data', sep='\t', names=['user_id','item_id','rating','timestamp'])

In [3]:
# Peek at the first five rows to sanity-check the load.
my_df.head(n=5)

# How to read a row: the row with index 3 (the 4th row) says that user 196 gave movie 242 a rating of 3.
# user_id identifies an individual user; item_id identifies a movie.

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742


In [4]:
# Importing the movie titles and joining them onto the main ratings data.
movie_titles = pd.read_csv('Movie_Titles.csv',encoding= 'unicode_escape')

# Merge from the four raw rating columns only. Merging my_df with itself unchanged makes the
# cell fail on a second run: the titles get merged in again as title_x / title_y, which breaks
# every later groupby('title'). Selecting the raw columns first keeps the cell re-runnable.
rating_columns = ['user_id', 'item_id', 'rating', 'timestamp']
my_df = pd.merge(my_df[rating_columns], movie_titles, on='item_id')
my_df.head()

# item_id identified each movie in the previous dataframe; after the merge every rating row
# also carries the movie title. For example, Star Wars (1977) has item_id 50, and the row with
# index 1 shows that user 290 gave it a rating of 5.

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,0,50,5,881250949,Star Wars (1977)
1,290,50,5,880473582,Star Wars (1977)
2,79,50,4,891271545,Star Wars (1977)
3,2,50,5,888552084,Star Wars (1977)
4,8,50,5,879362124,Star Wars (1977)


In [5]:
#The Unicode Standard assigns a unique number (code point) to every character, regardless of platform, device, application or language. It has been adopted by all modern software providers, so text can be moved between different platforms,
#devices and applications without corruption.
#What does unicode_escape mean?
#unicode_escape is a Python-specific codec. When encoding, it writes non-ASCII characters as backslash escapes
#(\xXX, \uXXXX or \UXXXXXXXX); when decoding - as pd.read_csv does here - it reads the bytes as Latin-1 and interprets any such escapes.

# user_id - the ID of the user who rated the movie.
# item_id - the ID of the movie.
# rating - The rating the user gave the movie, between 1 and 5.
# timestamp - The time the movie was rated.
# title - The title of the movie.

# Step 2 - Data exploration

In [6]:
# Summary statistics for the numeric columns of the merged ratings data.
summary_stats = my_df.describe()
summary_stats

# The output shows a mean rating of about 3.5 out of 5, with a minimum rating of 1 and a maximum of 5.

Unnamed: 0,user_id,item_id,rating,timestamp
count,100003.0,100003.0,100003.0,100003.0
mean,462.470876,425.520914,3.529864,883528800.0
std,266.622454,330.797791,1.125704,5343791.0
min,0.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


In [7]:
# Build the mean-rating table: one row per movie title.
mean_rating_by_title = my_df.groupby('title')['rating'].mean()
ratings = pd.DataFrame(mean_rating_by_title)
ratings.head(n=5)

# Each value is the average of all the ratings that users gave to that movie.
# The rows are grouped by movie title, so the title becomes the index of the ratings dataframe.

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
'Til There Was You (1997),2.333333
1-900 (1994),2.6
101 Dalmatians (1996),2.908257
12 Angry Men (1957),4.344
187 (1997),3.02439


In [None]:
# Count how many times each movie was rated. The idea is that a very popular movie
# tends to collect more ratings than a less popular one.
ratings_per_title = my_df.groupby('title')['rating'].count()
ratings['number_of_ratings'] = ratings_per_title
ratings.head(n=5)

In [None]:
#Plotting the jointplot
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.jointplot(x='rating', y='number_of_ratings', data=ratings)


## In the jointplot each point is one movie. The histogram on the right is the distribution of 'number of ratings': most movies have received only a few ratings, and as we go higher up we see that fewer and fewer movies have been rated by many users. The histogram on the top is the distribution of the mean rating: very few movies have a mean rating of 5, and most movies have a mean rating of around 3.5. In the scatter plot in the centre the points are densest around a mean rating of 3.5, i.e. most movies (not users) are rated about 3.5 on average.

# Creating User - Item interaction matrix

In [None]:
# Build the user-item interaction matrix by pivoting the merged dataframe:
# one row per user, one column per movie title, each cell holding the rating.
movie_matrix = my_df.pivot_table(
    values='rating',
    index='user_id',
    columns='title',
)
movie_matrix.head(n=5)

# For example, user 1 rated 101 Dalmatians a 2 and 12 Angry Men a 5. The many NaN values
# are missing ratings: most users have not rated most movies.

In [None]:
# Sort by number of ratings in descending order so the most-rated movies come first.
# In the author's run Star Wars has the highest number of ratings, i.e. 584.
most_rated_first = ratings.sort_values(by='number_of_ratings', ascending=False)
most_rated_first.head(10)

# Making recommendation - Example movie- SCREAM

In [None]:
# Fetch the column of user ratings for Scream from the user-item matrix.
Scream_user_rating = movie_matrix.loc[:, 'Scream (1996)']

In [None]:
# Scream ratings indexed by user_id; NaN marks users who have not rated the movie.
Scream_user_rating
#e.g. in the author's output user 2 has given a rating of 3, user 3 a rating of 2, and so on

In [None]:
# To find movies that are rated similarly to Scream, correlate every movie column of the
# user-item matrix with the Scream ratings column. corrwith returns one correlation value
# per movie title (a Series of correlations rather than a full correlation matrix).
similar_to_Scream = movie_matrix.corrwith(other=Scream_user_rating, axis=0)

In [None]:
# First five entries of the correlation-with-Scream result.
similar_to_Scream.head(n=5)

# Since we want to recommend only movies that truly make sense, we will set a threshold for the minimum number of ratings, and only recommend movies that have more than that number of ratings

In [None]:
# Wrap the correlation result in a one-column dataframe and discard the movies whose
# correlation with Scream is NaN.
corr_Scream = pd.DataFrame(similar_to_Scream, columns=['Correlation']).dropna()
corr_Scream.head(n=5)

# dropna() returns a new dataframe without the NaN rows (inplace=False is the default), and
# corr_Scream is bound to that result. With inplace=True the dataframe would instead be
# modified in place and the call would return nothing.

In [None]:
# Join the 'number_of_ratings' column from the ratings dataframe onto the correlations (both are
# indexed by movie title), so each similar movie is listed with its correlation and its number of ratings.
# Only the 'Correlation' column is taken from the left side: joining corr_Scream onto itself unchanged
# raises "columns overlap but no suffix specified" when this cell is run a second time.
corr_Scream = corr_Scream[['Correlation']].join(ratings['number_of_ratings'])

corr_Scream.head()

In [None]:
# Keep only the movies that have more than 30 ratings, then list the ten with the highest
# correlation to Scream. In the author's output these include Scream 2 and
# I Know What You Did Last Summer, which are highly correlated with Scream.
min_ratings = 30
has_enough_ratings = corr_Scream['number_of_ratings'] > min_ratings

corr_Scream.loc[has_enough_ratings].sort_values(by='Correlation', ascending=False).head(10)