In [None]:
#Description: Building a content-based movie recommendation system using Python (cosine similarity over actors, director, genre and title)

In [1]:
#Importing libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#Reading the movie dataset from its CSV file into a DataFrame
data_path = 'C:/Users/vinee/Desktop/recommendation_system/movie-data.csv'
df = pd.read_csv(
    data_path,
    low_memory = False,
)

In [4]:
#Previewing the first rows of the loaded data
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


In [5]:
#Providing movie id to the dataset: one consecutive id (0, 1, 2, ...) per row.
#The ids are derived from the number of rows instead of a fixed 1000, so the
#cell keeps working when the dataset has a different size.
df['Movie_id'] = range(len(df))

In [6]:
#Previewing the data again, now including the Movie_id column
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Movie_id
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,1
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,2
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,3
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,4


In [7]:
#Checking the size of the dataset as (number of rows/movies, number of columns)
df.shape

(1000, 13)

In [8]:
#Columns whose text is combined to describe each movie
imp_columns = [
    'Actors',
    'Director',
    'Genre',
    'Title',
]

In [9]:
#Previewing the first two rows of the important columns only
df[imp_columns].head(2)

Unnamed: 0,Actors,Director,Genre,Title
0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",James Gunn,"Action,Adventure,Sci-Fi",Guardians of the Galaxy
1,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",Ridley Scott,"Adventure,Mystery,Sci-Fi",Prometheus


In [10]:
#Checking for null values: True if at least one cell of the important columns is null
df[imp_columns].isnull().values.any()

False

In [11]:
#Creating a function to combine important columns in a single string
def get_important_features(data):
    """Combine Actors, Director, Genre and Title of every row into one string.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the string columns 'Actors', 'Director', 'Genre' and 'Title'.

    Returns
    -------
    list of str
        One string per row, in row order, built as
        '<Actors> <Director> <Genre> <Title>'.

    Raises
    ------
    TypeError
        If one of the four values of a row is not a string (for example NaN).
    """
    #An empty frame has nothing to combine
    if data.shape[0] == 0:
        return []

    #Walking through the column values by position instead of looking up the
    #labels 0..n-1, so frames whose index is not 0..n-1 (e.g. after filtering)
    #are handled correctly
    rows = zip(data['Actors'], data['Director'], data['Genre'], data['Title'])
    return [' '.join(row) for row in rows]

In [12]:
#Building the combined string of every movie and storing it in a new column
combined_features = get_important_features(df)
df['important_features'] = combined_features

#Previewing the data with the new column
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Movie_id,important_features
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S..."
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,1,"Noomi Rapace, Logan Marshall-Green, Michael Fa..."
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,2,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar..."
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,3,"Matthew McConaughey,Reese Witherspoon, Seth Ma..."
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,4,"Will Smith, Jared Leto, Margot Robbie, Viola D..."


In [13]:
#Turning the combined strings into a matrix of token counts with CountVectorizer
vectorizer = CountVectorizer()
cm = vectorizer.fit_transform(df['important_features'])

In [14]:
#Getting the cosine similarity matrix of the count matrix (stored in cs, not printed here)
cs = cosine_similarity(cm)

In [15]:
#Checking the size of the cosine similarity matrix as (rows, columns)
cs.shape

(1000, 1000)

In [2]:
#Getting the title of movie that the user likes
title = 'Split'
#Finding the movie_id of that title; failing with a clear message when the
#title is not in the dataset instead of an opaque IndexError
matching_ids = df.loc[df['Title'] == title, 'Movie_id']
if matching_ids.empty:
    raise ValueError('Movie title {!r} was not found in the dataset'.format(title))
#If several rows share the title, the first one is used
movie_id = matching_ids.iloc[0]
#Creating a list of (index, similarity) pairs from the row of cs selected by movie_id
scores = list(enumerate(cs[movie_id]))

NameError: name 'df' is not defined

In [24]:
#Sorting the above formed list by similarity, highest first
sorted_scores = sorted(scores, key = lambda x:x[1], reverse = True)
#Removing the liked movie itself by its id so that its name does not get repeated.
#Dropping the first entry instead is not reliable: when another movie ties with
#the top score, the liked movie is not guaranteed to be sorted first.
sorted_scores = [item for item in sorted_scores if item[0] != movie_id]

In [25]:
#Printing the names of the most similar movies
top_n = 7 #Number of recommendations to show; used for both the header and the loop
print('The', top_n, 'most similar movies recommended for', title, 'are:\n')
#sorted_scores is already ordered from most to least similar, so the first
#top_n entries are the recommendations; rank starts at 1 for display
for rank, item in enumerate(sorted_scores[:top_n], start = 1):
    movie_title = df[df.Movie_id == item[0]]['Title'].values[0]
    print(rank, movie_title)

The 7 most similar movies recommended for Split are:

1 Morgan
2 The Conjuring
3 The Visit
4 Victor Frankenstein
5 The VVitch: A New-England Folktale
6 Insidious
7 Mama
