Movie Recommender

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
sb.set()

In [2]:
from google.colab import files
uploaded = files.upload()

Saving IMDB Movies.csv to IMDB Movies.csv


In [8]:
df = pd.read_csv('IMDB Movies.csv')
df[['Rank']] = df[['Rank']] - 1
df.head(3)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,1,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,2,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0


In [9]:
# Get counts of Datasets and Columns
df.shape

(1000, 12)

In [10]:
# Creating a list of important columns for Recommender
print(df.columns)
columns = ['Actors','Director','Genre','Title']

Index(['Rank', 'Title', 'Genre', 'Description', 'Director', 'Actors', 'Year',
       'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)',
       'Metascore'],
      dtype='object')


In [11]:
df[columns].head()

Unnamed: 0,Actors,Director,Genre,Title
0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",James Gunn,"Action,Adventure,Sci-Fi",Guardians of the Galaxy
1,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",Ridley Scott,"Adventure,Mystery,Sci-Fi",Prometheus
2,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",M. Night Shyamalan,"Horror,Thriller",Split
3,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",Christophe Lourdelet,"Animation,Comedy,Family",Sing
4,"Will Smith, Jared Leto, Margot Robbie, Viola D...",David Ayer,"Action,Adventure,Fantasy",Suicide Squad


In [12]:
# Check for Null Values
df[columns].isnull().any()

Actors      False
Director    False
Genre       False
Title       False
dtype: bool

In [13]:
# Function to merge relevant columns
def merging_features(data):
  features = []
  for i in range(0, data.shape[0]):
    features.append(data['Actors'][i] + ' '+data['Director'][i] + ' ' + data['Genre'][i] + ' ' + data['Title'][i])
  return features

In [14]:
# Column with merged columns
df['Merged Features'] = merging_features(df)
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Merged Features
0,0,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S..."
1,1,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,"Noomi Rapace, Logan Marshall-Green, Michael Fa..."
2,2,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar..."
3,3,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,"Matthew McConaughey,Reese Witherspoon, Seth Ma..."
4,4,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,"Will Smith, Jared Leto, Margot Robbie, Viola D..."


In [17]:
# Convert the text to a matrix of token counts 
countmatrix = CountVectorizer().fit_transform(df['Merged Features'])
print(countmatrix.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [18]:
# Get the cosine similarity matrix from count matrix
cosinesimilarity = cosine_similarity(countmatrix)
print(cosinesimilarity)

[[1.         0.1767767  0.06085806 ... 0.0571662  0.06537205 0.        ]
 [0.1767767  1.         0.         ... 0.         0.06933752 0.        ]
 [0.06085806 0.         1.         ... 0.         0.         0.        ]
 ...
 [0.0571662  0.         0.         ... 1.         0.06726728 0.        ]
 [0.06537205 0.06933752 0.         ... 0.06726728 1.         0.07161149]
 [0.         0.         0.         ... 0.         0.07161149 1.        ]]


In [19]:
# Getting the shape of the cosine similarity matrix
cosinesimilarity.shape

(1000, 1000)

In [29]:
# Getting the title of the movie that a user likes
title = 'The Avengers'

In [30]:
# Find the movies ID
movie_id = df[df.Title == title]['Rank'].values[0]
movie_id

76

In [31]:
# Creating a list of enumerations for the similarity score [ (movie_id, similarity score),(...) ]
scores = list(enumerate(cosinesimilarity[movie_id]))


In [32]:
sorted_scores = sorted(scores, key = lambda x: x[1], reverse = True)
# Element at Position 1 is the Similarity Score, reverse = True will sort the list in descending order

In [33]:
# Eliminating the first element which is the movie itself
sorted_scores = sorted_scores[1:]

In [34]:
# Print the Sorted Scores
sorted_scores

[(94, 0.6546536707079773),
 (35, 0.5735393346764044),
 (216, 0.516185401208764),
 (203, 0.3638034375544995),
 (450, 0.3638034375544995),
 (279, 0.3535533905932738),
 (494, 0.3535533905932738),
 (195, 0.34412360080584264),
 (64, 0.3125),
 (315, 0.3125),
 (533, 0.3125),
 (0, 0.2946278254943948),
 (708, 0.2946278254943948),
 (761, 0.2946278254943948),
 (875, 0.2946278254943948),
 (19, 0.2581988897471611),
 (140, 0.2581988897471611),
 (173, 0.2581988897471611),
 (588, 0.2581988897471611),
 (97, 0.25),
 (257, 0.25),
 (281, 0.25),
 (529, 0.25),
 (48, 0.24253562503633297),
 (85, 0.24253562503633297),
 (362, 0.24253562503633297),
 (388, 0.24253562503633297),
 (396, 0.24253562503633297),
 (799, 0.24253562503633297),
 (253, 0.23570226039551587),
 (316, 0.23570226039551587),
 (371, 0.23570226039551587),
 (34, 0.22941573387056174),
 (177, 0.22941573387056174),
 (429, 0.22941573387056174),
 (465, 0.22941573387056174),
 (710, 0.22941573387056174),
 (500, 0.22360679774997896),
 (566, 0.22360679774997

In [35]:
# Create a loop to print the first 7 Similar Movies
j = 0
print("The top 7 recommended movies to", title, 'are:\n')
for item in sorted_scores:
  movie_title = df[df.Rank == item[0]]['Title'].values[0]
  print(j+1, movie_title)
  j+=1
  if(j >= 7):
    break

The top 7 recommended movies to The Avengers are:

1 Avengers: Age of Ultron
2 Captain America: Civil War
3 Captain America: The Winter Soldier
4 Iron Man
5 Iron Man 2
6 Iron Man Three
7 Under the Skin
