In [1]:
#Import Libraries
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from difflib import get_close_matches
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the movie catalogue and preview its first ten rows.
# NOTE(review): raw_data is not referenced by any later cell shown here;
# the same file is read again into `movies` further down.
raw_data = pd.read_csv(filepath_or_buffer="movies.csv")
raw_data.head(n=10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [3]:
# Read the two source tables: the ratings and the movie catalogue.
ratings, movies = pd.read_csv('ratings.csv'), pd.read_csv('movies.csv')

In [4]:
# Attach title and genres to every rating by joining the two tables on
# movieId (inner join, the pandas default).
data = ratings.merge(movies, on='movieId')
data.head(n=1000)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
995,374,110,5.0,849089153,Braveheart (1995),Action|Drama|War
996,376,110,3.5,1364993995,Braveheart (1995),Action|Drama|War
997,379,110,5.0,847397381,Braveheart (1995),Action|Drama|War
998,380,110,4.0,1493419805,Braveheart (1995),Action|Drama|War


In [5]:
# Keep only the columns used by the rest of the notebook.
data = data.loc[:, ['movieId', 'title', 'userId', 'rating']]
data.head(n=5)

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [6]:
# Count the distinct users and the distinct movie ids in the ratings.
n_users, n_items = data['userId'].nunique(), data['movieId'].nunique()

print('Num. of Users: ', n_users, sep='')
print('Num of Movies: ', n_items, sep='')

Num. of Users: 610
Num of Movies: 9724


In [7]:
# Count how many ratings each title received. reset_index() turns `title`
# back into an ordinary column, and the count column is renamed to
# 'total rating count'.
movie_rating_count = (
    data.groupby('title')['rating']
    .count()
    .reset_index()
    .rename(columns={'rating': 'total rating count'})
)
movie_rating_count.head(n=5)

Unnamed: 0,title,total rating count
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [8]:
# Add each title's rating count to every individual rating row by joining
# the per-title counts back onto the ratings.
rating_with_totalRatingCount = data.merge(movie_rating_count, on='title')
rating_with_totalRatingCount.head(n=1000)

Unnamed: 0,movieId,title,userId,rating,total rating count
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215
...,...,...,...,...,...
995,110,Braveheart (1995),374,5.0,237
996,110,Braveheart (1995),376,3.5,237
997,110,Braveheart (1995),379,5.0,237
998,110,Braveheart (1995),380,4.0,237


In [9]:
# Keep only the ratings of titles that received at least 5 ratings.
# NOTE(review): an earlier comment described this cut-off as 50, but the
# code compares against 5 — confirm which threshold is intended.
rating_popular_movie = rating_with_totalRatingCount.loc[
    rating_with_totalRatingCount['total rating count'].ge(5)
]

In [10]:
# Preview the ratings that passed the rating-count filter.
rating_popular_movie.head(n=1000)

Unnamed: 0,movieId,title,userId,rating,total rating count
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215
...,...,...,...,...,...
995,110,Braveheart (1995),374,5.0,237
996,110,Braveheart (1995),376,3.5,237
997,110,Braveheart (1995),379,5.0,237
998,110,Braveheart (1995),380,4.0,237


In [11]:

# Build the user-by-title rating matrix; (user, title) pairs without a
# rating are filled with 0.
# NOTE(review): this pivots rating_with_totalRatingCount, not the filtered
# rating_popular_movie, so the rating-count filter has no effect on the
# matrix — confirm whether the filtered frame was meant to be used here.
movie_feature_df = pd.pivot_table(
    rating_with_totalRatingCount,
    index='userId',
    columns='title',
    values='rating',
).fillna(0)
movie_feature_df.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Pairwise Pearson correlation between the columns of movie_feature_df.
# The columns are movie titles, so despite its name this is a
# title-by-title matrix.
user_similarity = movie_feature_df.corr(method='pearson')

In [13]:
def get_recomandation(movie_name, ratings, similarity=None, neutral_rating=2.5):
    """Score every title against ``movie_name``, weighted by the user's rating.

    Each title's similarity to ``movie_name`` is multiplied by
    ``ratings - neutral_rating``. A rating above ``neutral_rating`` therefore
    ranks the most similar titles first, while a rating below it flips the
    sign and ranks them last.

    Parameters
    ----------
    movie_name : str
        Title to look up; must be a column label of ``similarity``.
    ratings : float
        The single rating the user gave ``movie_name``.
    similarity : pandas.DataFrame, optional
        Title-by-title similarity matrix. Defaults to the notebook-level
        ``user_similarity`` correlation matrix.
    neutral_rating : float, optional
        Rating that yields a weight of zero (default 2.5).

    Returns
    -------
    pandas.Series
        Weighted scores indexed by title, sorted in descending order. The
        queried title itself is part of the result.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of ``similarity``; the message
        lists close title matches when any exist.
    """
    if similarity is None:
        # Fall back to the correlation matrix computed earlier in the notebook.
        similarity = user_similarity

    if movie_name not in similarity.columns:
        # Turn the bare pandas KeyError into an actionable one with suggestions.
        close = get_close_matches(
            str(movie_name), [str(title) for title in similarity.columns], n=3
        )
        hint = ' Did you mean one of {}?'.format(close) if close else ''
        raise KeyError('Unknown movie title {!r}.{}'.format(movie_name, hint))

    weighted = similarity[movie_name] * (ratings - neutral_rating)
    return weighted.sort_values(ascending=False)

In [14]:
# Score all titles for a user who rated Toy Story 2 and show the ten highest.
# A rating of 2 is below the 2.5 offset used by get_recomandation, so the
# weight is negative and the top rows are the titles whose correlation with
# Toy Story is the most negative.
get_recomandation(movie_name='Toy Story (1995)', ratings=2).head(n=10)

title
Man from Earth, The (2007)                       0.040539
Wild Tales (2014)                                0.039545
Intouchables (2011)                              0.037117
Hunt, The (Jagten) (2012)                        0.033501
Incendies (2010)                                 0.032223
Planet Earth II (2016)                           0.031829
The Lair of the White Worm (1988)                0.028170
Departures (Okuribito) (2008)                    0.025979
Sacrifice, The (Offret - Sacraficatio) (1986)    0.025034
Single Man, A (2009)                             0.025027
Name: Toy Story (1995), dtype: float64

In [15]:
# Cosine similarity between titles: the matrix is transposed so that each
# row passed to cosine_similarity is one title's vector of user ratings.
item_similarity = cosine_similarity(movie_feature_df.transpose())
# Wrap the raw array in a DataFrame labelled by title on both axes.
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=movie_feature_df.columns,
    columns=movie_feature_df.columns,
)

In [16]:
def get_recomandation2(movie_name, ratings, similarity=None, neutral_rating=2.5):
    """Score every title against ``movie_name`` using cosine similarity.

    Each title's similarity to ``movie_name`` is multiplied by
    ``ratings - neutral_rating``. A rating above ``neutral_rating`` therefore
    ranks the most similar titles first, while a rating below it flips the
    sign and ranks them last.

    Parameters
    ----------
    movie_name : str
        Title to look up; must be a column label of ``similarity``.
    ratings : float
        The single rating the user gave ``movie_name``.
    similarity : pandas.DataFrame, optional
        Title-by-title similarity matrix. Defaults to the notebook-level
        ``item_similarity_df`` cosine-similarity matrix.
    neutral_rating : float, optional
        Rating that yields a weight of zero (default 2.5).

    Returns
    -------
    pandas.Series
        Weighted scores indexed by title, sorted in descending order. The
        queried title itself is part of the result.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of ``similarity``; the message
        lists close title matches when any exist.
    """
    if similarity is None:
        # Fall back to the cosine-similarity matrix computed earlier in the notebook.
        similarity = item_similarity_df

    if movie_name not in similarity.columns:
        # Turn the bare pandas KeyError into an actionable one with suggestions.
        close = get_close_matches(
            str(movie_name), [str(title) for title in similarity.columns], n=3
        )
        hint = ' Did you mean one of {}?'.format(close) if close else ''
        raise KeyError('Unknown movie title {!r}.{}'.format(movie_name, hint))

    weighted = similarity[movie_name] * (ratings - neutral_rating)
    return weighted.sort_values(ascending=False)

In [17]:
# Score all titles for a user who rated Grumpier Old Men 3 and show the ten
# highest. A rating of 3 gives a weight of 0.5 (3 - 2.5), so the queried
# title itself comes out on top with a score of 0.5.
get_recomandation2(movie_name='Grumpier Old Men (1995)', ratings=3).head(n=10)

title
Grumpier Old Men (1995)                0.500000
Grumpy Old Men (1993)                  0.223993
Striptease (1996)                      0.223392
Nutty Professor, The (1996)            0.222541
Twister (1996)                         0.218123
Father of the Bride Part II (1995)     0.208901
Broken Arrow (1996)                    0.205067
Bio-Dome (1996)                        0.203702
Truth About Cats & Dogs, The (1996)    0.201541
Sabrina (1995)                         0.201416
Name: Grumpier Old Men (1995), dtype: float64

In [18]:
# Persist the ratings together with each title's total rating count.
# The frame written is rating_with_totalRatingCount, matching the file name;
# `data` only holds movieId/title/userId/rating and lacks the
# 'total rating count' column.
rating_with_totalRatingCount.to_csv('rating_with_totalRatingCount.csv', index=False)