In [24]:
import pandas as pd

In [25]:
movies = pd.read_csv('dataset/movies.csv')
movie_ratings = pd.read_csv('dataset/ratings.csv')
tags = pd.read_csv('dataset/tags.csv')

#### Movie Recommendation using Content Based Filtering

In [26]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [27]:
movie_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [28]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [29]:
movies['genres'] = movies['genres'].str.replace('|',' ')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


In [30]:
len(movies.movieId.unique())

9742

In [31]:
#filtering out the users who have rated more than 25 movies
ratings_f = movie_ratings.groupby('userId').filter(lambda x: len(x) >= 25)

#list of the movies that remained pulled through the filtering
movies_list = ratings_f.movieId.unique().tolist()

In [32]:
#filter the movies data frame
movies = movies[movies.movieId.isin(movies_list)]

In [33]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


In [34]:
#link movie to its id
Mapping_file = dict(zip(movies.title.tolist(), movies.movieId.tolist()))

In [35]:
tags.drop(['timestamp'],1, inplace = True)
ratings_f.drop(['timestamp'],1, inplace = True)

In [36]:
#create a merged dataframe of movies, genres and all the tags given to the movies
merged_dataset = pd.merge(movies, tags, on = 'movieId', how = 'left')
merged_dataset.head()

Unnamed: 0,movieId,title,genres,userId,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336.0,pixar
1,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,474.0,pixar
2,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,567.0,fun
3,2,Jumanji (1995),Adventure Children Fantasy,62.0,fantasy
4,2,Jumanji (1995),Adventure Children Fantasy,62.0,magic board game


In [37]:
#create metadata from tags and genres
merged_dataset.fillna("", inplace = True)
merged_dataset = pd.DataFrame(merged_dataset.groupby('movieId')['tag'].apply(
                                                        lambda x:"%s" % ' '.join(x)))
Final = pd.merge(movies, merged_dataset, on = 'movieId', how = 'left')
Final ['metadata'] = Final[['tag', 'genres']].apply(
                                            lambda x:' '.join(x), axis = 1)
Final[['movieId', 'title', 'metadata']].head()

Unnamed: 0,movieId,title,metadata
0,1,Toy Story (1995),pixar pixar fun Adventure Animation Children C...
1,2,Jumanji (1995),fantasy magic board game Robin Williams game A...
2,3,Grumpier Old Men (1995),moldy old Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),pregnancy remake Comedy


In [38]:
#Create count matrix from this new combined column
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
count_matrix = cv.fit_transform(Final["metadata"])

In [39]:
# Now Compute the Cosine Similarity based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(count_matrix)

In [40]:
# This Function takes movie title as input and return 15 most similar movies.
def get_recommendation(title):
    
    # Get the index of the movie that matches the title
    idx = Final['title'][Final['title']==title].index[0]
    
    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x:x[1],reverse=True)
    
    # Get the scores of the 15 most similar movies
    sim_scores = sim_scores[0:16]
    
    for i in sim_scores:
        movie_index = i[0]
        print(Final['title'].iloc[movie_index])

In [41]:
# Now lets make predictions
get_recommendation("Toy Story (1995)")

Toy Story (1995)
Bug's Life, A (1998)
Toy Story 2 (1999)
Antz (1998)
Adventures of Rocky and Bullwinkle, The (2000)
Emperor's New Groove, The (2000)
Monsters, Inc. (2001)
Wild, The (2006)
Shrek the Third (2007)
Tale of Despereaux, The (2008)
Asterix and the Vikings (Astérix et les Vikings) (2006)
Turbo (2013)
The Good Dinosaur (2015)
Moana (2016)
Twelve Tasks of Asterix, The (Les douze travaux d'Astérix) (1976)
Valiant (2005)


#### Movie Recommendation using Collaborative Filtering

In [42]:
ratings = pd.read_csv('dataset/ratings.csv')
ratings = pd.merge(movies,ratings).drop(['genres','timestamp'],axis=1)
print(ratings.shape)
ratings.head()

(100822, 4)


Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


In [43]:
#create a merged dataframe of movies and all the ratings given to the movies by the users
userRatings = ratings.pivot_table(index=['userId'],columns=['title'],values='rating')
print("Before: ",userRatings.shape)
userRatings = userRatings.dropna(thresh=10, axis=1).fillna(0,axis=1)
#userRatings.fillna(0, inplace=True)
print("After: ",userRatings.shape)
userRatings.head()

Before:  (610, 9705)
After:  (610, 2269)


title,"'burbs, The (1989)",(500) Days of Summer (2009),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),12 Years a Slave (2013),127 Hours (2010),...,Zack and Miri Make a Porno (2008),Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
#using cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(userRatings.T)

cosine_sim_df = pd.DataFrame(cosine_sim, index = userRatings.columns, columns =userRatings.columns)
cosine_sim_df.head()

title,"'burbs, The (1989)",(500) Days of Summer (2009),10 Cloverfield Lane (2016),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),12 Years a Slave (2013),127 Hours (2010),...,Zack and Miri Make a Porno (2008),Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",1.0,0.099735,0.0,0.180401,0.036417,0.125357,0.254537,0.078676,0.033942,0.034303,...,0.045145,0.055071,0.155553,0.184692,0.14075,0.091918,0.038152,0.211467,0.089634,0.372876
(500) Days of Summer (2009),0.099735,1.0,0.173843,0.326122,0.225255,0.206167,0.198959,0.222581,0.169654,0.233237,...,0.401963,0.208675,0.10434,0.450463,0.402251,0.306076,0.260308,0.097935,0.276512,0.169385
10 Cloverfield Lane (2016),0.0,0.173843,1.0,0.036935,0.133272,0.045497,0.022956,0.074102,0.0,0.291018,...,0.262842,0.119092,0.0,0.297371,0.271721,0.227363,0.341125,0.200462,0.12177,0.032083
10 Things I Hate About You (1999),0.180401,0.326122,0.036935,1.0,0.276708,0.282549,0.27049,0.095586,0.132512,0.088857,...,0.28066,0.142059,0.169472,0.154749,0.227712,0.34059,0.110769,0.167085,0.17672,0.160426
"10,000 BC (2008)",0.036417,0.225255,0.133272,0.276708,1.0,0.265326,0.155047,0.102822,0.0,0.113494,...,0.281309,0.109188,0.117015,0.21519,0.274656,0.272574,0.126702,0.114869,0.226588,0.112168


In [45]:
def get_similar(movie_name,rating):
    similar_ratings = cosine_sim_df[movie_name]*(rating-2.5)
    similar_ratings = similar_ratings.sort_values(ascending=False)
    #print(type(similar_ratings))
    return similar_ratings

In [46]:
animated_movie_lover = [("Toy Story (1995)",5),("Moana (2016)",4),("Toy Story 3 (2010)",5),("2 Fast 2 Furious (Fast and the Furious 2, The) (2003)",1)]
similar_movies = pd.DataFrame()
for movie,rating in animated_movie_lover:
    similar_movies = similar_movies.append(get_similar(movie,rating),ignore_index = True)

similar_movies.head()
print(similar_movies.sum().sort_values(ascending=False).head(15))

Toy Story 3 (2010)                                               3.429892
Toy Story (1995)                                                 3.377559
Up (2009)                                                        2.331299
Toy Story 2 (1999)                                               2.274904
Monsters, Inc. (2001)                                            2.250337
Incredibles, The (2004)                                          2.223306
Inside Out (2015)                                                2.218249
Finding Nemo (2003)                                              2.175824
Shrek (2001)                                                     2.159670
Pirates of the Caribbean: The Curse of the Black Pearl (2003)    2.158750
Moana (2016)                                                     2.116675
Lion King, The (1994)                                            2.101769
Forrest Gump (1994)                                              2.078319
WALL·E (2008)                         