In [31]:
## Lets create a Movies recommendation system with machine learning
## Import the required libraries

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


In [32]:
# load the dataset
movies_data =  pd.read_csv('../movies.csv')


In [33]:
movies_data.describe()


Unnamed: 0,Audience score %,Profitability,Rotten Tomatoes %,Year
count,77.0,77.0,77.0,77.0
mean,63.727273,4.599483,46.701299,2009.077922
std,13.657113,8.03199,26.095001,1.354974
min,35.0,0.0,3.0,2007.0
25%,52.0,1.751351,26.0,2008.0
50%,64.0,2.642353,45.0,2009.0
75%,76.0,5.103117,64.0,2010.0
max,89.0,66.934,96.0,2011.0


In [34]:
movies_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Film               77 non-null     object 
 1   Genre              77 non-null     object 
 2   Lead Studio        77 non-null     object 
 3   Audience score %   77 non-null     int64  
 4   Profitability      77 non-null     float64
 5   Rotten Tomatoes %  77 non-null     int64  
 6   Worldwide Gross    77 non-null     object 
 7   Year               77 non-null     int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 4.9+ KB


In [35]:
movies_data.head()


Unnamed: 0,Film,Genre,Lead Studio,Audience score %,Profitability,Rotten Tomatoes %,Worldwide Gross,Year
0,Zack and Miri Make a Porno,Romance,The Weinstein Company,70,1.747542,64,$41.94,2008
1,Youth in Revolt,Comedy,The Weinstein Company,52,1.09,68,$19.62,2010
2,You Will Meet a Tall Dark Stranger,Comedy,Independent,35,1.211818,43,$26.66,2010
3,When in Rome,Comedy,Disney,44,0.0,15,$43.04,2010
4,What Happens in Vegas,Comedy,Fox,72,6.267647,28,$219.37,2008


In [36]:
movies_data.shape


(77, 8)

In [37]:
# Feionatures selections that impact recommendation system
feature_selection =['Genre']
print(feature_selection)


['Genre']


In [38]:
## check the null and missing values and handle them
for features in feature_selection:
   movies_data[features]= movies_data[features].fillna('')


In [39]:
# Combine all respection features
combine_features = movies_data['Genre']
print(combine_features)


0     Romance
1      Comedy
2      Comedy
3      Comedy
4      Comedy
       ...   
72    romance
73      Drama
74      Drama
75     Comedy
76     comedy
Name: Genre, Length: 77, dtype: object


In [40]:
# text to vector
text_to_vector= TfidfVectorizer()


In [41]:
feature_vectorizer = text_to_vector.fit_transform(combine_features)


In [42]:
print(feature_vectorizer)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 77 stored elements and shape (77, 8)>
  Coords	Values
  (0, 6)	1.0
  (1, 3)	1.0
  (2, 3)	1.0
  (3, 3)	1.0
  (4, 3)	1.0
  (5, 4)	1.0
  (6, 1)	1.0
  (7, 6)	1.0
  (8, 6)	1.0
  (9, 3)	1.0
  (10, 6)	1.0
  (11, 6)	1.0
  (12, 6)	1.0
  (13, 3)	1.0
  (14, 4)	1.0
  (15, 4)	1.0
  (16, 3)	1.0
  (17, 3)	1.0
  (18, 3)	1.0
  (19, 4)	1.0
  (20, 5)	1.0
  (21, 3)	1.0
  (22, 1)	1.0
  (23, 6)	1.0
  (24, 3)	1.0
  :	:
  (52, 3)	1.0
  (53, 3)	1.0
  (54, 3)	1.0
  (55, 0)	1.0
  (56, 3)	1.0
  (57, 6)	1.0
  (58, 3)	1.0
  (59, 3)	1.0
  (60, 3)	1.0
  (61, 3)	1.0
  (62, 3)	1.0
  (63, 3)	1.0
  (64, 1)	1.0
  (65, 1)	1.0
  (66, 3)	1.0
  (67, 3)	1.0
  (68, 4)	1.0
  (69, 3)	1.0
  (70, 4)	1.0
  (71, 3)	1.0
  (72, 6)	1.0
  (73, 4)	1.0
  (74, 4)	1.0
  (75, 3)	1.0
  (76, 3)	1.0


In [43]:
## which movies data is similar
similarity = cosine_similarity(feature_vectorizer)


In [44]:
similarity.shape


(77, 77)

In [45]:
# get /input / recommendation from user
movie_name = 'A Dangerous Method '


In [46]:
list_movies_names=  movies_data['Film'].tolist()
print(list_movies_names)


['Zack and Miri Make a Porno', 'Youth in Revolt', 'You Will Meet a Tall Dark Stranger', 'When in Rome', 'What Happens in Vegas', 'Water For Elephants', 'WALL-E', 'Waitress', 'Waiting For Forever', "Valentine's Day", "Tyler Perry's Why Did I get Married", 'Twilight: Breaking Dawn', 'Twilight', 'The Ugly Truth', 'The Twilight Saga: New Moon', "The Time Traveler's Wife", 'The Proposal', 'The Invention of Lying', 'The Heartbreak Kid', 'The Duchess', 'The Curious Case of Benjamin Button', 'The Back-up Plan', 'Tangled', 'Something Borrowed', "She's Out of My League", 'Sex and the City Two', 'Sex and the City 2', 'Sex and the City', 'Remember Me', 'Rachel Getting Married', 'Penelope', 'P.S. I Love You', 'Over Her Dead Body', 'Our Family Wedding', 'One Day', 'Not Easily Broken', 'No Reservations', "Nick and Norah's Infinite Playlist", "New Year's Eve", 'My Week with Marilyn', 'Music and Lyrics', 'Monte Carlo', 'Miss Pettigrew Lives for a Day', 'Midnight in Paris', 'Marley and Me', 'Mamma Mia!'

In [47]:
import difflib
find_close_matcher = difflib.get_close_matches(movie_name, list(movie_name))
print(find_close_matcher)


[]


In [48]:
## find the index of movie with titles
movie_index = movies_data[movies_data.Film == find_close_matcher[0]['index']].values[0]
print(movie_index)


IndexError: list index out of range

In [49]:
similarity_list = list(enumerate(similarity[movie_index]))
print(similarity_list)


NameError: name 'movie_index' is not defined

In [None]:
## sort the similarity index
sort_similarity_list = sorted(similarity_list, key=lambda, x:x[1],reverse=True)


In [50]:
print(sort_similarity_list)


NameError: name 'sort_similarity_list' is not defined

In [51]:
# print movies based on there index
print('movies name for me')
i=1

for movie in sort_similarity_list:
   index= movie[0]
   movie_title = movies_data[movies_data.index==index]['title'].valuesif
   if(i<40):
     print(i, movie_title)

movies name for me


NameError: name 'sort_similarity_list' is not defined