# Content Based Simple Movie Recommendation System

In [1]:
# Importing the dependencies
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os
import opendatasets as od

In [2]:
# Loading our csv file into a pandas dataframe.
# The path is relative, so 'movies.csv' must sit in the notebook's working directory.
df = pd.read_csv('movies.csv')

In [3]:
# Dropping duplicate rows from the original dataframe.
# The result is assigned back (no inplace mutation) and the index is reset so
# that row labels equal row positions: later cells index the similarity matrix
# by position and look titles up in `df` with the same numbers.
df = df.drop_duplicates().reset_index(drop=True)

In [4]:
# Peek at the row(s) whose title is exactly 'Avatar'
df.loc[df['title'] == 'Avatar']

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron


In [55]:
# Column labels available in the dataframe
df.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

In [56]:
# (number of rows, number of columns)
df.shape

(4803, 24)

In [57]:
# First two rows, as a quick look at the values in each column
df.head(2)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski


In [58]:
# Text columns whose values are fed to CountVectorizer to build the
# vocabulary that describes each movie
selected_features = [
    'genres',
    'keywords',
    'cast',
    'director',
]

In [59]:
# Filling the null values for the selected features.
# The filled frame is assigned back to the columns: calling
# `df[col].fillna(..., inplace=True)` operates on a column selection, which
# emits a FutureWarning in pandas 2.x and does not update `df` at all once
# Copy-on-Write is enabled.
df[selected_features] = df[selected_features].fillna('')

In [60]:
# Remaining missing values per selected feature (expected: all zero)
df[selected_features].isna().sum()

genres      0
keywords    0
cast        0
director    0
dtype: int64

In [61]:
# Combining the values of every selected feature into a single space-separated
# string per movie. `str.cat` joins the first column with all remaining ones,
# so the expression keeps working if `selected_features` grows or shrinks.
combined_features = df[selected_features[0]].str.cat(df[selected_features[1:]], sep=" ")
combined_features.name = None  # `+` concatenation yielded an unnamed Series; keep it that way

In [62]:
# Confirm the combined text is held in a pandas Series (one entry per movie)
type(combined_features)

pandas.core.series.Series

In [63]:
# Converting the entire series to lowercase so the text is in a single case
# before it is vectorised
combined_features = combined_features.str.lower()

In [64]:
# Show the combined, lower-cased text of each movie (pandas truncates the output)
print(combined_features)

0       action adventure fantasy science fiction cultu...
1       adventure fantasy action ocean drug abuse exot...
2       action adventure crime spy based on novel secr...
3       action crime drama thriller dc comics crime fi...
4       action adventure science fiction based on nove...
                              ...                        
4798    action crime thriller united states\u2013mexic...
4799    comedy romance  edward burns kerry bish\u00e9 ...
4800    comedy drama romance tv movie date love at fir...
4801      daniel henney eliza coupe bill paxton alan r...
4802    documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object


In [65]:
# Creating a CountVectorizer instance with its default settings
# (the class itself is imported in the first cell)
vect = CountVectorizer()

In [66]:
# Learning the vocabulary of the series and encoding every movie as a row of
# token counts; the result is a sparse matrix with one row per movie
feature_vectors = vect.fit_transform(combined_features)

In [67]:
# Summary of the sparse matrix: shape, dtype and number of stored elements
feature_vectors

<4803x14845 sparse matrix of type '<class 'numpy.int64'>'
	with 97547 stored elements in Compressed Sparse Row format>

In [68]:
# convert sparse matrix to a dense matrix (for inspection only; the result is not stored)
# NOTE(review): the dense copy is 4803 x 14845 int64 values, roughly 570 MB of RAM.
# Consider densifying only a slice, e.g. feature_vectors[:5].toarray().
feature_vectors.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

<b>Using the Cosine Similarity</b>

In [69]:
# cosine_similarity compares every movie vector with every other one, so for
# 4803 movies the resulting matrix has dimension 4803 x 4803
similarity = cosine_similarity(feature_vectors)

In [70]:
# Entry [i, j] is the cosine similarity between movie i and movie j
similarity

array([[1.        , 0.10540926, 0.12038585, ..., 0.        , 0.        ,
        0.        ],
       [0.10540926, 1.        , 0.0761387 , ..., 0.03651484, 0.        ,
        0.        ],
       [0.12038585, 0.0761387 , 1.        , ..., 0.        , 0.11145564,
        0.        ],
       ...,
       [0.        , 0.03651484, 0.        , ..., 1.        , 0.        ,
        0.04264014],
       [0.        , 0.        , 0.11145564, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.04264014, 0.        ,
        1.        ]])

In [71]:
# One row and one column per movie
similarity.shape

(4803, 4803)

# Developing the System

In [72]:
# Getting the User's Favourite Movie (blocks until something is typed in)
movie = input("Enter the Name of your Favourite Movie: ")

Enter the Name of your Favourite Movie: the godfather


In [73]:
# Converting user's input into lowercase so it can be compared with the
# lower-cased titles built below
movie = movie.lower()

In [74]:
# Collecting every title of the dataframe into a plain Python list
all_movies = list(df['title'])

In [75]:
# Lower-case every title so matching against the user's input ignores case
all_movies = [title.lower() for title in all_movies]

In [76]:
# Display the lower-cased titles
# NOTE(review): this prints the whole list; all_movies[:10] would keep the output short
all_movies

['avatar',
 "pirates of the caribbean: at world's end",
 'spectre',
 'the dark knight rises',
 'john carter',
 'spider-man 3',
 'tangled',
 'avengers: age of ultron',
 'harry potter and the half-blood prince',
 'batman v superman: dawn of justice',
 'superman returns',
 'quantum of solace',
 "pirates of the caribbean: dead man's chest",
 'the lone ranger',
 'man of steel',
 'the chronicles of narnia: prince caspian',
 'the avengers',
 'pirates of the caribbean: on stranger tides',
 'men in black 3',
 'the hobbit: the battle of the five armies',
 'the amazing spider-man',
 'robin hood',
 'the hobbit: the desolation of smaug',
 'the golden compass',
 'king kong',
 'titanic',
 'captain america: civil war',
 'battleship',
 'jurassic world',
 'skyfall',
 'spider-man 2',
 'iron man 3',
 'alice in wonderland',
 'x-men: the last stand',
 'monsters university',
 'transformers: revenge of the fallen',
 'transformers: age of extinction',
 'oz: the great and powerful',
 'the amazing spider-man 2',

In [77]:
import difflib

This module provides classes and functions for comparing sequences. It can be used for example, for comparing files, and can produce information about file differences in various formats, including HTML and context and unified diffs.

In [78]:
# Fuzzy-match the typed title against the known titles; the best candidates
# are returned first
detecting_similar_movies = difflib.get_close_matches(
    word=movie,
    possibilities=all_movies,
)
detecting_similar_movies

['the godfather', 'the last godfather', 'the others']

In [79]:
# The first candidate is the closest match; this raises IndexError if no
# candidate was found
detecting_similar_movies[0]

'the godfather'

In [80]:
# Lower-case the title column so it can be compared with the matched title.
# NOTE(review): this overwrites the original casing of `title` in `df`;
# the finalized cell further down keeps it by using a separate `titlelower` column.
df['title'] = df.title.str.lower()

In [81]:
# finding the index of the movie
# The similarity matrix is addressed by row position, so the position of the
# matched title in `all_movies` (built from `df` in row order) is used instead
# of the dataset's own `index` column, which is only data.
if not detecting_similar_movies:
    raise ValueError(f"No title close to {movie!r} was found in the dataset")
index_of_the_movie = all_movies.index(detecting_similar_movies[0])
index_of_the_movie

3337

In [82]:
# getting a list of (row position, similarity to the chosen movie) pairs,
# one pair per movie in the similarity matrix
similarity_score = list(enumerate(similarity[index_of_the_movie]))
# NOTE(review): prints one pair per movie; consider similarity_score[:10]
print(similarity_score)

[(0, 0.036369648372665396), (1, 0.06900655593423541), (2, 0.07881104062391006), (3, 0.1853123291652753), (4, 0.0), (5, 0.11338934190276817), (6, 0.0), (7, 0.034503277967117704), (8, 0.04029114820126901), (9, 0.0), (10, 0.03779644730092272), (11, 0.07559289460184544), (12, 0.0), (13, 0.03779644730092272), (14, 0.0), (15, 0.0), (16, 0.03394221166510653), (17, 0.0), (18, 0.0), (19, 0.04029114820126901), (20, 0.11572751247156893), (21, 0.0), (22, 0.04335549847620599), (23, 0.0), (24, 0.03340765523905304), (25, 0.08451542547285165), (26, 0.035714285714285705), (27, 0.0), (28, 0.0), (29, 0.0), (30, 0.11118739749916517), (31, 0.03779644730092272), (32, 0.0), (33, 0.0), (34, 0.0), (35, 0.0), (36, 0.0), (37, 0.042257712736425826), (38, 0.0), (39, 0.0), (40, 0.0), (41, 0.0), (42, 0.0), (43, 0.0), (44, 0.041239304942116126), (45, 0.07412493166611012), (46, 0.035093120317179816), (47, 0.0), (48, 0.0), (49, 0.041239304942116126), (50, 0.0), (51, 0.0), (52, 0.0), (53, 0.04029114820126901), (54, 0.0)

In [83]:
# Order the (position, score) pairs from most to least similar
sorted_similar_movies = sorted(
    similarity_score,
    key=lambda pair: pair[1],
    reverse=True,
)
print(sorted_similar_movies)

[(3337, 0.9999999999999997), (2731, 0.42433421239575275), (867, 0.3706246583305506), (3450, 0.3401680257083045), (1525, 0.33806170189140655), (4638, 0.3273268353539886), (4135, 0.3086066999241838), (1408, 0.29649972666444047), (3831, 0.29580398915498074), (2649, 0.2927700218845599), (1728, 0.28203803740888306), (448, 0.2786391062876764), (2674, 0.27583864218368526), (1018, 0.27003086243366087), (3293, 0.27003086243366087), (4209, 0.2645751311064591), (1874, 0.264575131106459), (3391, 0.264575131106459), (512, 0.2545875386086578), (3012, 0.2545875386086578), (4065, 0.24999999999999994), (3352, 0.24999999999999992), (1394, 0.24743582965269678), (2792, 0.24743582965269678), (2255, 0.24174688920761406), (1765, 0.24174688920761403), (3743, 0.24174688920761403), (3966, 0.24174688920761403), (761, 0.2364331218717302), (877, 0.2364331218717302), (1209, 0.2364331218717302), (1850, 0.2364331218717302), (4370, 0.23622779563076698), (1243, 0.23145502494313788), (1946, 0.2267786838055363), (2218, 0

In [84]:
# Keep asking until a positive whole number is entered.
# Only ValueError (raised by int() on non-numeric text) is caught, so
# Ctrl-C / kernel interrupts still stop the loop. Zero and negative values are
# rejected because they would turn the later `[0:n]` slice into an empty or
# "everything but the last n" selection.
while True:
    try:
        n = int(input('How many Movies would you wish to be recommended?'))
    except ValueError:
        print("Please! Enter a Number")
        continue
    if n > 0:
        break
    print("Please! Enter a Number greater than 0")

How many Movies would you wish to be recommended?5


In [85]:
# Recommending the Movies
print("Movies Recommended For You: \n")

# Each pair holds (row position in the similarity matrix, similarity score).
# The position comes from enumerate(), so the title is looked up with .iloc
# (positional) rather than .loc (label based).
for i, (index, similarity_value) in enumerate(sorted_similar_movies[0:n], start=1):
    selected_movie = df.title.iloc[index]
    percent = "("+str(round(similarity_value*100, 2))+" %)"

    print(i,". ",selected_movie,percent)

Movies Recommended For You: 

1 .  the godfather (100.0 %)
2 .  the godfather: part ii (42.43 %)
3 .  the godfather: part iii (37.06 %)
4 .  west side story (34.02 %)
5 .  apocalypse now (33.81 %)


# Finalized/Developed Movie Recommendation System

In [86]:
# Importing the Dependencies
import difflib

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Reading and Loading the csv file into pandas dataframe.
# The index is reset after de-duplication so that row labels equal row
# positions; the similarity matrix below is addressed by position.
df = pd.read_csv('movies.csv')
df = df.drop_duplicates().reset_index(drop=True)

# Selecting Certain Features
selected_features = ['genres','keywords','cast','director']

# Filling Null values in the selected columns in the df dataframe.
# Assigned back rather than `df[col].fillna(..., inplace=True)`, which acts on
# a column selection and is not guaranteed to modify `df`.
df[selected_features] = df[selected_features].fillna('')

# Creating the Combined Features (one space-separated, lower-cased string per movie)
combined_features = df[selected_features[0]].str.cat(df[selected_features[1:]], sep=" ")
combined_features = combined_features.str.lower()

# Using CountVectorizer to Learn Vocabulary for different Features
vect = CountVectorizer()
feature_vectors = vect.fit_transform(combined_features)

# Using cosine_similarity to get similarity scores
similarity = cosine_similarity(feature_vectors)

# Filling Null values and Converting whole title Feature column into lowercase
# (kept in a separate column so the original casing is available for display)
df['title'] = df['title'].fillna('')
df['titlelower'] = df['title'].str.lower()

# Getting all movies into a list (row order, so list position == row position)
all_movies = df['titlelower'].tolist()

# Asking user for their Favourite Movie for making Recommendations.
# The input is matched fuzzily against the known titles; an exact title is
# always its own best match, and the prompt repeats until something matches.
while True:
    movie = input("Enter the Name of your Favourite Movie: ")
    movie = movie.strip().lower()
    close_matches = difflib.get_close_matches(movie, all_movies, n=1) if movie else []
    if close_matches:
        break
    print("Sorry! No movie with a similar title was found, please try again")

#  Calculating the index (row position) of the Movie
index_of_the_movie = all_movies.index(close_matches[0])
if close_matches[0] != movie:
    print("Showing recommendations for:", df['title'].iloc[index_of_the_movie])

# Calculating the similar movies' indexes and their similarity values in a tuple inside another list
similarity_score = list(enumerate(similarity[index_of_the_movie]))

# Sorting the previous List in accordance to their similarity score using the sorted function.
# The chosen movie is excluded by its index instead of assuming it sorts first.
sorted_similar_movies = sorted(
    (pair for pair in similarity_score if pair[0] != index_of_the_movie),
    key = lambda pair : pair[1],
    reverse = True,
)

# Asking user How Many Movies do You wish to be Recommended?
# Only ValueError is caught so that interrupts still stop the loop, and
# non-positive values are rejected because they would break the slice below.
while True:
    try:
        n = int(input('\nHow many Movies would you wish to be recommended?'))
    except ValueError:
        print("Please! Enter a Number")
        continue
    if n > 0:
        break
    print("Please! Enter a Number greater than 0")

# Recommending the Movies
print("\nMovies Recommended For You:")
print("(It is generally recommended to priotize movies with atleast similarity/accuracy around 40 % due to limitations of model and Dataset)\n")

for i, (index, similarity_value) in enumerate(sorted_similar_movies[0:n], start=1):
    # `index` is a row position, hence .iloc
    selected_movie = df['title'].iloc[index]
    percent = "("+str(round(similarity_value*100, 2))+" %)"
    print(i,". ",selected_movie,percent)

Enter the Name of your Favourite Movie: the departed

How many Movies would you wish to be recommended?5

Movies Recommended For You:
(It is generally recommended to priotize movies with atleast similarity/accuracy around 40 % due to limitations of model and Dataset)

1 .  Shutter Island (35.56 %)
2 .  The Wolf of Wall Street (33.37 %)
3 .  Gangs of New York (31.12 %)
4 .  Catch Me If You Can (30.46 %)
5 .  The Aviator (30.15 %)


# Steps

Pipeline:
    
    1.  Removing duplicates from the dataframe
    2.  combining selected features in lowercase
    3.  Using CountVectorizer to tokenize and learn the vocabulary from the combined features
    4.  Using cosine similarity to calculate the similarities between the count vectors created by the vectorizer, and storing the resulting similarity matrix in a variable
    5.  Asking user for their favourite movie, and converting the input into a lowercase
    6.  Creating a list containing all titles from the dataframe
    7.  Getting index value of the movie user entered from the dataframe
    8.  Building the list of (index, similarity) pairs for the movie found in step 7, and sorting it by the similarity value in descending order
    9.  Asking user how many number of movies to be recommended
    10. Performing final operation
    

In [88]:
import pickle

In [89]:
# Save the dataframe; the `with` block closes the file even if dumping fails
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(df, movies_file)

In [90]:
# Save the dataframe as a plain dict; the `with` block closes the file even if dumping fails
with open('movie_dict.pkl', 'wb') as movie_dict_file:
    pickle.dump(df.to_dict(), movie_dict_file)

In [91]:
# Save the similarity matrix; the `with` block closes the file even if dumping fails
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

# Thank you

### Please Check my other projects as well