In [1]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
df1 = pd.read_csv('tmdb_5000_credits.csv')
df2 = pd.read_csv('tmdb_5000_movies.csv')
df1.head(3)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."


In [3]:
df1.columns = ['id','tittle','cast','crew']
df2= df2.merge(df1,on = 'id')

In [4]:
df2.head(5)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,tittle,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


2. User hasn’t liked any movie yet, what suggestions can be given?

In [5]:
C = df2['vote_average'].mean()
m = df2['vote_count'].quantile(0.75)
q_movies = df2.copy().loc[df2['vote_count'] >= m]
print(q_movies.shape)

(1203, 23)


In [6]:
# Calculation based on the IMDB formula

def weighted_rating(x, m = m, C = C):
    
    '''This function will return Weighted Rating(WR)'''
    
    v = x['vote_count']
    R = x['vote_average']
    
    return (v / (v + m) * R) + (m / (m + v) * C)

In [7]:
q_movies['weight'] = q_movies.apply(weighted_rating, axis=1)

In [8]:
q_movies = q_movies.sort_values('weight', ascending=False)

q_movies[['title', 'vote_average', 'vote_count', 'weight']].head(10)

Unnamed: 0,title,vote_average,vote_count,weight
1881,The Shawshank Redemption,8.5,8205,8.301547
3337,The Godfather,8.4,5893,8.143459
662,Fight Club,8.3,9413,8.139688
3232,Pulp Fiction,8.3,8428,8.122458
65,The Dark Knight,8.2,12002,8.078054
809,Forrest Gump,8.2,7927,8.020698
96,Inception,8.1,13752,7.997869
1818,Schindler's List,8.3,4329,7.978806
3865,Whiplash,8.3,4254,7.973979
95,Interstellar,8.1,10867,7.972478


In [9]:
# Calculation based on the popularity

popularity = df2.sort_values('popularity', ascending=False)

popularity[['title', 'vote_average', 'vote_count', 'popularity']].head(10)

Unnamed: 0,title,vote_average,vote_count,popularity
546,Minions,6.4,4571,875.581305
95,Interstellar,8.1,10867,724.247784
788,Deadpool,7.4,10995,514.569956
94,Guardians of the Galaxy,7.9,9742,481.098624
127,Mad Max: Fury Road,7.2,9427,434.278564
28,Jurassic World,6.5,8662,418.708552
199,Pirates of the Caribbean: The Curse of the Bla...,7.5,6985,271.972889
82,Dawn of the Planet of the Apes,7.3,4410,243.791743
200,The Hunger Games: Mockingjay - Part 1,6.6,5584,206.227151
88,Big Hero 6,7.8,6135,203.73459


1. Movie to movie (given that user liked a movie, what other movie can be suggested)

In [10]:
df2['overview'].head()

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

In [11]:
df2['overview'] = df2['overview'].fillna('')
#df2['overview'] = df2['overview'].str.replace(' ','')
df2['overview'] = df2['overview'].str.replace('&','')
df2['overview'] = df2['overview'].str.replace(',','')
df2['overview'] = df2['overview'].str.lower()
df2['overview'] = df2['overview'].str.replace('.','')

In [12]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df2['overview'])
tfidf_matrix.shape

(4803, 21075)

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df2['overview'])
count_matrix.shape

(4803, 21075)

In [14]:
cosine = linear_kernel(tfidf_matrix, tfidf_matrix)

indices = pd.Series(df2.index, index=df2['title']).drop_duplicates()

In [15]:
def recommendations(titles, cosine = cosine):
    ''' This function will return 10 movies based on cosine similarity'''
    sim_scores = []
    for movie in titles:
        i = indices[movie]
        scores = list(enumerate(cosine[i]))
        for i in scores:
            sim_scores.append(i)
    
    scores = sorted(scores, key=lambda x: x[1], reverse=True)
    #scores = scores[:11]   
    scores = scores[1:11]  
    movie_indices = [i[0] for i in scores]

    return df2['title'].iloc[movie_indices]


In [16]:
#let movie liked by person is 'Batman Forever','Titanic','Ghost Ship', 'Supernova' and 'The Matrix'

recommendations(['Batman Forever','Titanic','Ghost Ship','Supernova','The Matrix'])

1281                  Hackers
2996                 Commando
2088                    Pulse
1341     The Inhabited Island
333             Transcendence
0                      Avatar
261     Live Free or Die Hard
775                 Supernova
125       The Matrix Reloaded
2614          The Love Letter
Name: title, dtype: object

In [17]:
recommendations(['Batman Forever','Batman Forever'])

3                         The Dark Knight Rises
119                               Batman Begins
65                              The Dark Knight
428                              Batman Returns
210                              Batman & Robin
3854    Batman: The Dark Knight Returns, Part 2
1359                                     Batman
4343                                   Cry_Wolf
174                         The Incredible Hulk
9            Batman v Superman: Dawn of Justice
Name: title, dtype: object

In [18]:
recommendations(['Slow Burn'])

1181                                  JFK
879                   Law Abiding Citizen
2193                 Secret in Their Eyes
3                   The Dark Knight Rises
1202                         Legal Eagles
1349                Ghosts of Mississippi
4369                                 Slam
3862                          Margin Call
65                        The Dark Knight
906     Anchorman 2: The Legend Continues
Name: title, dtype: object