### Certification Project

### Content-Based Movie Recommender System

In [40]:
#import necessary libraries

import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [41]:
# loading Movies data
# 'movies.csv' is a relative path, so the file must sit in the notebook's
# working directory.

movies_data=pd.read_csv('movies.csv')

In [42]:
# preview the first rows of the movie catalogue (movieId, title, genres)
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [43]:
# creating new index column
# Adds a 0 .. n-1 'index' column as the first column of the frame.
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to re-run without restarting the kernel.

idx = 0
if 'index' not in movies_data.columns:
  movies_data.insert(idx, 'index', value=np.arange(len(movies_data)))

In [44]:
# number of rows and columns in the movies data frame
# (the column count includes the 'index' column inserted earlier)

movies_data.shape

(10329, 4)

In [45]:
# loading ratings data
# NOTE(review): in the cells visible here ratings_data is only previewed; the
# similarity model below is built from genres and titles alone.

ratings_data=pd.read_csv('ratings.csv')

In [46]:
# preview the first rows of the ratings table (userId, movieId, rating, timestamp)
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [47]:
# number of rows and columns in the ratings data frame
# (one row per individual rating)

ratings_data.shape

(105339, 4)

In [48]:
# Turn the pipe-separated genre string into space-separated words so the
# vectorizer sees each genre as its own token. Missing values are filled first:
# a missing genre is NaN (a float), and calling .replace on it inside a lambda
# raises AttributeError. regex=False treats '|' as a literal character rather
# than regex alternation.
movies_data['genres'] = movies_data['genres'].fillna('').str.replace('|', ' ', regex=False)

In [49]:
# selecting the relevant features for recommendation
# (this list drives the null-filling loop in the next cell)

selected_features = ['genres']
print(selected_features)

['genres']


In [50]:
# replace missing values in every selected feature column with an empty string

movies_data[selected_features] = movies_data[selected_features].fillna('')

In [51]:
# Build one text document per movie: the genre words followed by the title.
# 'title' is not part of selected_features, so it is never null-filled; a
# missing title would turn the whole concatenation into NaN, and TfidfVectorizer
# rejects NaN documents. Filling both operands guarantees every row is a string.

combined_features = movies_data['genres'].fillna('') + ' ' + movies_data['title'].fillna('')

In [52]:
# inspect the combined "genres + title" text, one entry per movie
print(combined_features)

0        Adventure Animation Children Comedy Fantasy To...
1                Adventure Children Fantasy Jumanji (1995)
2                   Comedy Romance Grumpier Old Men (1995)
3            Comedy Drama Romance Waiting to Exhale (1995)
4                Comedy Father of the Bride Part II (1995)
                               ...                        
10324    Animation Children Comedy Cosmic Scrat-tastrop...
10325                    Comedy Le Grand Restaurant (1966)
10326                Comedy A Very Murray Christmas (2015)
10327                           Drama The Big Short (2015)
10328    (no genres listed) Marco Polo: One Hundred Eye...
Length: 10329, dtype: object


In [53]:
# converting the text data to feature vectors
# (TfidfVectorizer is created with its default settings; it is fitted in the next cell)

vectorizer = TfidfVectorizer()

In [54]:
# learn the vocabulary and IDF weights from the combined text and encode every
# movie as one row of a sparse TF-IDF matrix
feature_vectors = vectorizer.fit_transform(combined_features)

In [55]:
# sparse matrix printout: (row, vocabulary column) followed by the TF-IDF weight
print(feature_vectors)

  (0, 111)	0.3362698867204709
  (0, 8108)	0.43265717692986444
  (0, 8624)	0.6128595248435812
  (0, 3010)	0.2707878348138484
  (0, 1895)	0.1506742792993607
  (0, 1695)	0.2831287189871249
  (0, 493)	0.3079404316718635
  (0, 282)	0.2304681375927779
  (1, 4578)	0.7746211424208029
  (1, 111)	0.3761309147202813
  (1, 3010)	0.30288669912427474
  (1, 1695)	0.3166904568672631
  (1, 282)	0.2577875534799532
  (2, 5457)	0.41683593991106527
  (2, 6068)	0.49744684518089693
  (2, 3687)	0.6508918226536827
  (2, 7148)	0.18759152435485202
  (2, 111)	0.3160519681577258
  (2, 1895)	0.14161512643234506
  (3, 2924)	0.6452233972204463
  (3, 8547)	0.3226061062636555
  (3, 9110)	0.5605803893909848
  (3, 2601)	0.11365604308767434
  (3, 7148)	0.18595784494653286
  (3, 111)	0.3132995645290772
  :	:
  (10324, 493)	0.22994250312518305
  (10325, 7007)	0.6165987088179458
  (10325, 3629)	0.5231200278026358
  (10325, 82)	0.41024276778508634
  (10325, 4921)	0.397773340816688
  (10325, 1895)	0.14010249680635486
  (10326,

In [56]:
# getting the similarity scores using linear_kernel similarity
# linear_kernel takes the dot product between every pair of rows; with
# TfidfVectorizer's default L2 row normalisation this equals cosine similarity
# (the diagonal printed in the next cell is 1.0).
# NOTE: the result is a dense n_movies x n_movies array. At 10329 x 10329
# entries that is roughly 0.85 GB, assuming float64 values.

item_sim = linear_kernel(feature_vectors, feature_vectors)

In [57]:
# entry [i, j] is the similarity between movie i and movie j (symmetric matrix)
print(item_sim)

[[1.         0.35757551 0.12761652 ... 0.02041226 0.         0.        ]
 [0.35757551 1.         0.11887692 ... 0.         0.         0.        ]
 [0.12761652 0.11887692 1.         ... 0.01918499 0.         0.        ]
 ...
 [0.02041226 0.         0.01918499 ... 1.         0.16400297 0.08303514]
 [0.         0.         0.         ... 0.16400297 1.         0.10730401]
 [0.         0.         0.         ... 0.08303514 0.10730401 1.        ]]


In [58]:
# one row and one column per movie
print(item_sim.shape)

(10329, 10329)


Getting the movie name from the user

In [59]:
# getting the movie name from the user
# (free text; it is fuzzy-matched against the dataset titles further down, so it
# does not have to be an exact title)

movie_name = input(' Enter your favourite movie name : ')

 Enter your favourite movie name : Toy Story


In [60]:
# creating a list with all the movie names given in the dataset
# NOTE(review): the print below emits every title in the dataset, which makes
# for a very long cell output.

list_of_all_titles = movies_data['title'].tolist()
print(list_of_all_titles)

['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)', 'Waiting to Exhale (1995)', 'Father of the Bride Part II (1995)', 'Heat (1995)', 'Sabrina (1995)', 'Tom and Huck (1995)', 'Sudden Death (1995)', 'GoldenEye (1995)', 'American President, The (1995)', 'Dracula: Dead and Loving It (1995)', 'Balto (1995)', 'Nixon (1995)', 'Cutthroat Island (1995)', 'Casino (1995)', 'Sense and Sensibility (1995)', 'Four Rooms (1995)', 'Ace Ventura: When Nature Calls (1995)', 'Money Train (1995)', 'Get Shorty (1995)', 'Copycat (1995)', 'Assassins (1995)', 'Powder (1995)', 'Leaving Las Vegas (1995)', 'Othello (1995)', 'Now and Then (1995)', 'Persuasion (1995)', 'City of Lost Children, The (Cité des enfants perdus, La) (1995)', 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)', 'Dangerous Minds (1995)', 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)', 'Wings of Courage (1995)', 'Babe (1995)', 'Carrington (1995)', 'Dead Man Walking (1995)', 'Clueless (1995)', 'Cry, the Beloved Country (1995)', '

In [61]:
# finding the close match for the movie name given by the user
# difflib.get_close_matches compares characters exactly (case-sensitive) and,
# with its default arguments, returns at most 3 candidates, best match first.
# The list is empty when no title is similar enough.

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
print(find_close_match)

['Toy Story (1995)', 'Toy Story 3 (2010)', 'Toy Story 2 (1999)']


In [62]:
# Take the best fuzzy match. get_close_matches returns an empty list when no
# title is similar enough, so stop with a readable message instead of letting
# [0] raise a bare IndexError.
if not find_close_match:
  raise ValueError('No movie title close to ' + repr(movie_name) + ' was found in the dataset')
close_match = find_close_match[0]
print(close_match)

Toy Story (1995)


In [63]:
# look up the value of the 'index' column stored for the matched title
# (the first row wins if the title occurs more than once)

title_mask = movies_data['title'] == close_match
index_of_the_movie = movies_data.loc[title_mask, 'index'].values[0]
print(index_of_the_movie)

0


In [64]:
# getting a list of similar movies
# Each entry is (row position, similarity to the chosen movie); enumerate()
# keeps the position so it survives the sort in a later cell.

item_sim_score = list(enumerate(item_sim[index_of_the_movie]))
print(item_sim_score)

[(0, 1.0), (1, 0.357575514248231), (2, 0.12761651664326176), (3, 0.12650514193633663), (4, 0.12569368306403525), (5, 0.1523635450891477), (6, 0.1730909217142068), (7, 0.23035656108200742), (8, 0.12816970036275546), (9, 0.19576481423701436), (10, 0.15113618151134164), (11, 0.112672391256531), (12, 0.3765937743855592), (13, 0.1568197665320454), (14, 0.16495908162700249), (15, 0.1502725149025609), (16, 0.10691483010050959), (17, 0.14775304810860546), (18, 0.09591395595907606), (19, 0.14915912047003596), (20, 0.13846661559612405), (21, 0.12495051814350032), (22, 0.1428888057384346), (23, 0.13559808675215745), (24, 0.09684176503282721), (25, 0.15007855227252148), (26, 0.19429286697311238), (27, 0.1511366643398459), (28, 0.1900893023761166), (29, 0.04362182031775797), (30, 0.12382554785948309), (31, 0.0653038134975415), (32, 0.15300802553150566), (33, 0.24945042651039923), (34, 0.14053743653229495), (35, 0.12543013709088463), (36, 0.16774311846488454), (37, 0.10423355514451531), (38, 0.12956

In [65]:
# sanity check: one similarity score per movie in the dataset
len(item_sim_score)

10329

In [66]:
# rank every (position, score) pair from most to least similar

sorted_similar_movies = sorted(
    item_sim_score,
    key=lambda pair: pair[1],
    reverse=True,
)
print(sorted_similar_movies)

[(0, 1.0), (2496, 0.8846447646351113), (8599, 0.8324188277533814), (3838, 0.5176654230116386), (1716, 0.3956781610164634), (6997, 0.3922477667421378), (4403, 0.39078721595335586), (12, 0.3765937743855592), (112, 0.36384961322222253), (2696, 0.3612340526686145), (1, 0.357575514248231), (10208, 0.3519461750433348), (9732, 0.3508164875819181), (3379, 0.34819046612901294), (2245, 0.3444764598638045), (1815, 0.33861901099341707), (1667, 0.33528493827240063), (10052, 0.3316799327879598), (214, 0.32989476678391516), (7558, 0.3282898164529055), (6718, 0.3245367204423873), (6471, 0.3177458271798628), (1662, 0.31771069721109485), (210, 0.30873741398945365), (8120, 0.3081778101918886), (1322, 0.306932414726796), (500, 0.3064675557605359), (3811, 0.3013022888817072), (45, 0.300930834078455), (9896, 0.29855650001765704), (6521, 0.29771948811168714), (8954, 0.2974628153757696), (7382, 0.29641958652928474), (6626, 0.2948007298370276), (9946, 0.2939897243987354), (9394, 0.29359308689655406), (1717, 0.

In [67]:
# print the name of similar movies based on the index
# NOTE: the top-ranked entry is normally the queried movie itself (similarity 1.0).

print('Movies suggested for you : \n')

rec_value = int(input(' Enter Number of Recommendation : '))
# Only the first rec_value entries are ever printed, so slice the ranking once
# instead of walking all of it. max(..., 0) keeps "print nothing" for values
# <= 0 (a negative slice bound would otherwise count from the end).
for i, (index, _score) in enumerate(sorted_similar_movies[:max(rec_value, 0)], start=1):
  # enumerate() positions are row positions, so read the title positionally
  # rather than building a full boolean mask on every iteration
  title_from_index = movies_data['title'].iloc[index]
  print(i, '.',title_from_index)

Movies suggested for you : 

 Enter Number of Recommendation : 5
1 . Toy Story (1995)
2 . Toy Story 2 (1999)
3 . Toy Story 3 (2010)
4 . Toy, The (1982)
5 . NeverEnding Story, The (1984)


**Content-Based Movie Recommendation System**

In [72]:
# End-to-end recommender: fuzzy-match the title the user typed, rank every movie
# by similarity to it, then print the requested number of titles.

movie_name = input(' Enter your favourite movie name : ')

list_of_all_titles = movies_data['title'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

# get_close_matches returns an empty list when no title is similar enough;
# stop with a readable message instead of an IndexError on [0].
if not find_close_match:
  raise ValueError('No movie title close to ' + repr(movie_name) + ' was found in the dataset')

close_match = find_close_match[0]

index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]

# (row position, similarity to the chosen movie) for every movie
item_sim_score = list(enumerate(item_sim[index_of_the_movie]))

sorted_similar_movies = sorted(item_sim_score, key = lambda x:x[1], reverse = True)

print('Movies suggested for you : \n')

rec_value = int(input(' Enter Number of Recommendation : '))
# Only the first rec_value entries are ever printed, so slice the ranking once
# instead of walking all of it. max(..., 0) keeps "print nothing" for values
# <= 0 (a negative slice bound would otherwise count from the end).
for i, (index, _score) in enumerate(sorted_similar_movies[:max(rec_value, 0)], start=1):
  # enumerate() positions are row positions, so read the title positionally
  # rather than building a full boolean mask on every iteration
  title_from_index = movies_data['title'].iloc[index]
  print(i, '.',title_from_index)

 Enter your favourite movie name : 2 Fast 2 Furious (Fast and the Furious 2, The)
Movies suggested for you : 

 Enter Number of Recommendation : 10
1 . 2 Fast 2 Furious (Fast and the Furious 2, The) (2003)
2 . Fast & Furious (Fast and the Furious 4, The) (2009)
3 . Fast & Furious 6 (Fast and the Furious 6, The) (2013)
4 . Fast and the Furious, The (2001)
5 . Fast and the Furious: Tokyo Drift, The (Fast and the Furious 3, The) (2006)
6 . Fast Five (Fast and the Furious 5, The) (2011)
7 . Furious 7 (2015)
8 . Fast Food, Fast Women (2000)
9 . Fast Runner, The (Atanarjuat) (2001)
10 . Fast Food Nation (2006)
