# Content Based Movie Recommendation

Dataset Source: - IMDB

In [1]:
# Important Libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

__Helper functions to find movies from index number and index number from movie name__

In [2]:
# Helper Functions
def get_title_from_index(index):
    return df[df.index == index]["title"].values[0]

def get_index_from_title(title):
    return df[df.title == title]["index"].values[0]

In [3]:
df = pd.read_csv('movie_dataset.csv')
df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spoken_languages     

> __The Data contains a total of 4803 Movies and their corresponding information we will not use all the features for our recommender system as popularity in a content based system make no sense nor the budget and few other features too.__

We will use:
- Keywords
- Cast
- Genre
- Director
- Overview
- Title
***
So, lets take all the features and combine all to a single featur __'Combined_Features'__ in order to apply countvectorizer and cosine similarity.

In [5]:
features = ['overview', 'cast', 'genres', 'director', 'keywords']
df['combined_features'] = df['keywords']
#df['Combined_Features'] = df[ features ].agg( ' '.join, axis=1)
for feature in features:
    df[feature] = df[feature].fillna('')
    df['combined_features']+=df[feature]

In [6]:
df['combined_features'].head()

0    culture clash future space war space colony so...
1    ocean drug abuse exotic island east india trad...
2    spy based on novel secret agent sequel mi6A cr...
3    dc comics crime fighter terrorist secret ident...
4    based on novel mars medallion space travel pri...
Name: combined_features, dtype: object

## Count Vectorizer

![alt text](https://www.educative.io/api/edpresso/shot/5197621598617600/image/6596233398321152)

Convert a collection of text documents to a matrix of token counts

[Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)

In [7]:
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['combined_features'].values.astype('U'))

In [21]:
print(count_matrix.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


## Cosiine Similarity
<img src="https://www.machinelearningplus.com/wp-content/uploads/2018/10/3d_projection.png" height=500 width=500 >
Compute cosine similarity between samples in X and Y.
Cosine similarity, or the cosine kernel, computes similarity as the normalized dot product of X and Y.

[Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html)

In [8]:
cosine_similarity = cosine_similarity(count_matrix)

In [9]:
def find_similar_movies(movie_by_user):
    movie_index = get_index_from_title(movie_by_user)
    
    similar_movies = list(enumerate(cosine_similarity[movie_index]))
    return sorted(similar_movies, key=lambda x:x[1], reverse=True)

In [13]:
def display_similar_movies(movie_user_liked):
    movies = find_similar_movies(movie_user_liked)
    ct = 1
    print('You may also like: ')
    for movie in movies:
        if ct == 1: pass
        else:
            print('{}. {} '.format(ct-1 ,get_title_from_index(movie[0])))
        ct+=1
        if ct==12: break

In [14]:
def main():
    movie_name = input('Enter your favourite movie: ')
    display_similar_movies(movie_name)

In [15]:
if __name__=='__main__':
    main()

Enter your favourite movie: Avatar
You may also like: 
1. Sunshine 
2. Aliens 
3. Moonraker 
4. Hellboy II: The Golden Army 
5. Starship Troopers 
6. Apollo 18 
7. Men in Black II 
8. Deep Impact 
9. Jason X 
10. Gravity 
