## Content Based Recommendation 
- making description based and metadata based recommender system 
    - Description Based: taglines and description of movies
    - Metadata Based: genres, keywords, cast 

In [1]:
import pandas as pd 
import numpy as np 

In [2]:
data_path = "../../movie_dataset/movies_metadata.csv"
df_movies = pd.read_csv(data_path, low_memory=False)
df_movies.head(1) 

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0


In [3]:
df_movies.columns 

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

## Implementing description based recommendation

In [4]:
# using tf-idf for the overview of the movies
# making a new df 
df = pd.DataFrame()
df["overview"], df["id"], df['title'] = df_movies["overview"], df_movies["id"], df_movies["original_title"]
df.head() 

Unnamed: 0,overview,id,title
0,"Led by Woody, Andy's toys live happily in his ...",862,Toy Story
1,When siblings Judy and Peter discover an encha...,8844,Jumanji
2,A family wedding reignites the ancient feud be...,15602,Grumpier Old Men
3,"Cheated on, mistreated and stepped on, the wom...",31357,Waiting to Exhale
4,Just when George Banks has recovered from his ...,11862,Father of the Bride Part II


In [5]:
# implementing Tf-idf vectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer 

tfidf = TfidfVectorizer(stop_words='english')
print(tfidf) 

%time 

TfidfVectorizer(stop_words='english')
CPU times: total: 0 ns
Wall time: 0 ns


In [6]:
df["overview"] = df["overview"].fillna('') 
tfidf_matrix = tfidf.fit_transform(df["overview"])
tfidf_matrix, tfidf_matrix.shape 

(<45466x75827 sparse matrix of type '<class 'numpy.float64'>'
 	with 1210882 stored elements in Compressed Sparse Row format>,
 (45466, 75827))

### cell in 'i'th row and 'j'th column represents the similarity score between movies i and j 

### Diagonal element is 1, since it is similar to itself 

In [7]:
# for cosine similarity 
# but denominator is 1, since magnitude of all TF-IDF vectors is 1 
from sklearn.metrics.pairwise import linear_kernel

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cosine_sim.shape) 

%time 

(45466, 45466)
CPU times: total: 0 ns
Wall time: 999 µs


In [8]:
indices = pd.Series(df.index, index= df['title']).drop_duplicates()
indices 

title
Toy Story                          0
Jumanji                            1
Grumpier Old Men                   2
Waiting to Exhale                  3
Father of the Bride Part II        4
                               ...  
رگ خواب                        45461
Siglo ng Pagluluwal            45462
Betrayal                       45463
Satana likuyushchiy            45464
Queerama                       45465
Length: 45466, dtype: int64

In [9]:
def content_recommender(title, cosine_sim=cosine_sim, df=df,indices=indices):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # top 10 movies 
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    
    return df['title'].iloc[movie_indices]

In [10]:
content_recommender('Inception')

44314                                 III
2039                                House
25299                       Borrowed Time
2114                    The Farmer's Wife
44792                            Altitude
22619                    The Monkey's Paw
3424     What Ever Happened to Baby Jane?
349                                  Cobb
37187                      Straight Story
8988                           Stone Cold
Name: title, dtype: object

In [11]:
df_movies.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [12]:
df_movies['revenue'].unique() 

array([3.73554033e+08, 2.62797249e+08, 0.00000000e+00, ...,
       1.32861200e+06, 1.26879300e+06, 1.41300000e+06])

In [13]:
# getting credits and keywords 
credits_df = pd.read_csv('../../movie_dataset/credits.csv')
keywords_df = pd.read_csv('../../movie_dataset/keywords.csv')
print(keywords_df.head(1))
print()
print(credits_df.head(1) ) 

    id                                           keywords
0  862  [{'id': 931, 'name': 'jealousy'}, {'id': 4290,...

                                                cast  \
0  [{'cast_id': 14, 'character': 'Woody (voice)',...   

                                                crew   id  
0  [{'credit_id': '52fe4284c3a36847f8024f49', 'de...  862  


In [14]:
keywords_df.columns , credits_df.columns 

(Index(['id', 'keywords'], dtype='object'),
 Index(['cast', 'crew', 'id'], dtype='object'))

In [15]:
df_movies.dtypes

adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                        object
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object

In [16]:
# turning id of df_movies to integers 
def clean_ids(x):
    try:
        return int(x) 
    except:
        return np.nan 

In [17]:
df_movies["id"]  = df_movies["id"].apply(clean_ids) 
df_movies = df_movies[df_movies["id"].notnull()]
df_movies.dtypes

adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                       float64
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object

In [18]:
df_movies['id'] = df_movies['id'].astype('int')
keywords_df['id'] = keywords_df['id'].astype('int')
credits_df['id'] = credits_df['id'].astype('int')

In [19]:
df_combined = df_movies.merge(keywords_df, on="id")
df_combined = df_combined.merge(credits_df, on="id")
df_combined.head(1) 

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords,cast,crew
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."


In [20]:
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']

for feature in features:
    df_combined[feature] = df_combined[feature][:50].apply(literal_eval)
df_combined.iloc[0]['crew'][:3] 

[{'credit_id': '52fe4284c3a36847f8024f49',
  'department': 'Directing',
  'gender': 2,
  'id': 7879,
  'job': 'Director',
  'name': 'John Lasseter',
  'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f4f',
  'department': 'Writing',
  'gender': 2,
  'id': 12891,
  'job': 'Screenplay',
  'name': 'Joss Whedon',
  'profile_path': '/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg'},
 {'credit_id': '52fe4284c3a36847f8024f55',
  'department': 'Writing',
  'gender': 2,
  'id': 7,
  'job': 'Screenplay',
  'name': 'Andrew Stanton',
  'profile_path': '/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg'}]

In [21]:
df_combined.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords,cast,crew
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[{'id': 818, 'name': 'based on novel'}, {'id':...","[{'cast_id': 1, 'character': 'Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


In [22]:
def get_director(x):
    try:
        for crew_member in x: 
            if crew_member['job']== 'Director':
                return crew_member['name']
    except:
        return np.nan 

In [23]:
df_combined.iloc[0]['crew'][0] 

{'credit_id': '52fe4284c3a36847f8024f49',
 'department': 'Directing',
 'gender': 2,
 'id': 7879,
 'job': 'Director',
 'name': 'John Lasseter',
 'profile_path': '/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg'}

In [24]:
df_combined["director"] = df_combined["crew"].apply(get_director) 
df_combined.director.head(2)

0    John Lasseter
1     Joe Johnston
Name: director, dtype: object

In [25]:
df_combined.head(2) 

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,keywords,cast,crew,director
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,False,7.7,5415.0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",John Lasseter
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",Joe Johnston


In [26]:
df_combined['cast'][0][:1]

[{'cast_id': 14,
  'character': 'Woody (voice)',
  'credit_id': '52fe4284c3a36847f8024f95',
  'gender': 2,
  'id': 31,
  'name': 'Tom Hanks',
  'order': 0,
  'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}]

In [27]:
def get_cast(x):
    artists = [] 
    try: 
        for artist in x[:3]:
            artists.append(artist['name'])
        return artists 
    except: 
        pass 

In [28]:
df_combined["artists"] = df_combined["cast"].apply(get_cast)
df_combined["artists"].head(2) 

0               [Tom Hanks, Tim Allen, Don Rickles]
1    [Robin Williams, Jonathan Hyde, Kirsten Dunst]
Name: artists, dtype: object

In [29]:
df_combined.head(2) 

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,tagline,title,video,vote_average,vote_count,keywords,cast,crew,director,artists
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,,Toy Story,False,7.7,5415.0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",John Lasseter,"[Tom Hanks, Tim Allen, Don Rickles]"
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",Joe Johnston,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]"


In [30]:
df_combined['keywords'][0]

[{'id': 931, 'name': 'jealousy'},
 {'id': 4290, 'name': 'toy'},
 {'id': 5202, 'name': 'boy'},
 {'id': 6054, 'name': 'friendship'},
 {'id': 9713, 'name': 'friends'},
 {'id': 9823, 'name': 'rivalry'},
 {'id': 165503, 'name': 'boy next door'},
 {'id': 170722, 'name': 'new toy'},
 {'id': 187065, 'name': 'toy comes to life'}]

In [31]:
def get_list(row):
    if isinstance(row, list):
        names = [ele['name'] for ele in row]
        if len(names) >3:
            names = names[:3] 
        return names 
    else: 
        return [] 

In [32]:
df_combined["keywords"] = df_combined["keywords"].apply(get_list) 
df_combined["keywords"].head(2) 

0                                 [jealousy, toy, boy]
1    [board game, disappearance, based on children'...
Name: keywords, dtype: object

In [33]:
df_combined.head(2) 

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,tagline,title,video,vote_average,vote_count,keywords,cast,crew,director,artists
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,,Toy Story,False,7.7,5415.0,"[jealousy, toy, boy]","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",John Lasseter,"[Tom Hanks, Tim Allen, Don Rickles]"
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[board game, disappearance, based on children'...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",Joe Johnston,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]"


In [34]:
df_combined['genres'][12]

[{'id': 10751, 'name': 'Family'},
 {'id': 16, 'name': 'Animation'},
 {'id': 12, 'name': 'Adventure'}]

In [35]:
df_combined["genres"] = df_combined["genres"].apply(get_list)
df_combined["genres"].head(2)

0     [Animation, Comedy, Family]
1    [Adventure, Fantasy, Family]
Name: genres, dtype: object

In [36]:
# Removes spaces and converts to lowercase
def sanitize(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [37]:
df_combined.head(2) 

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,tagline,title,video,vote_average,vote_count,keywords,cast,crew,director,artists
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,,Toy Story,False,7.7,5415.0,"[jealousy, toy, boy]","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",John Lasseter,"[Tom Hanks, Tim Allen, Don Rickles]"
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[board game, disappearance, based on children'...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",Joe Johnston,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]"


In [38]:
for feature in ['artists', 'director', 'genres', 'keywords']:
    df_combined[feature] = df_combined[feature].apply(sanitize)

In [39]:
df_combined.head(2) 

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,tagline,title,video,vote_average,vote_count,keywords,cast,crew,director,artists
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[animation, comedy, family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,,Toy Story,False,7.7,5415.0,"[jealousy, toy, boy]","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",johnlasseter,"[tomhanks, timallen, donrickles]"
1,False,,65000000,"[adventure, fantasy, family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[boardgame, disappearance, basedonchildren'sbook]","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",joejohnston,"[robinwilliams, jonathanhyde, kirstendunst]"


In [40]:
df_combined.director[0], type(df_combined.director[0]) 

('johnlasseter', str)

In [41]:
# vectorizer needs to take all combined form of the metadata 
# producing metadata soup 
# ' '.join(row['keywords']) + ' ' +
def create_soup(row):
    return  ' '.join(row['keywords']) + ' ' +' '.join(row["artists"]) + ' ' + row["director"] + ' ' + ' '.join(row["genres"])

In [42]:
df_combined.columns 

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'keywords', 'cast', 'crew', 'director',
       'artists'],
      dtype='object')

In [43]:
df_combined["soup"] = df_combined.apply(create_soup, axis=1)
df_combined["soup"][0] 

'jealousy toy boy tomhanks timallen donrickles johnlasseter animation comedy family'

In [44]:
# using count vectorizer
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(stop_words='english')
count_vect

In [45]:
count_vect_matrix = count_vect.fit_transform(df_combined["soup"]) 
type(count_vect_matrix), count_vect_matrix.shape 

(scipy.sparse._csr.csr_matrix, (46628, 344))

In [46]:
from sklearn.metrics.pairwise import cosine_similarity
import pickle 


cosine_sim2 = cosine_similarity(count_vect_matrix, count_vect_matrix)
print(cosine_sim2.shape) 

with open("cosine_sim.pkl", "wb") as f: 
    pickle.dump(cosine_sim2, f)
    print('pickle dumped for cosine similarity') 

(46628, 46628)
pickle dumped for cosine similarity


In [51]:
def content_recommender(title, cosine_sim=cosine_sim, df=df,indices=indices):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    try:
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    except:
        pass         
    # top 10 movies 
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    
    return df['title'].iloc[movie_indices]

In [52]:
df_combined.head(2) 

Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,title,video,vote_average,vote_count,keywords,cast,crew,director,artists,soup
0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[animation, comedy, family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,...,Toy Story,False,7.7,5415.0,"[jealousy, toy, boy]","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",johnlasseter,"[tomhanks, timallen, donrickles]",jealousy toy boy tomhanks timallen donrickles ...
1,1,False,,65000000,"[adventure, fantasy, family]",,8844,tt0113497,en,Jumanji,...,Jumanji,False,6.9,2413.0,"[boardgame, disappearance, basedonchildren'sbook]","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",joejohnston,"[robinwilliams, jonathanhyde, kirstendunst]",boardgame disappearance basedonchildren'sbook ...


In [58]:
df_combined = df_combined.reset_index() 
indices2 = pd.Series(df_combined.index, index = df_combined['title'])
indices2

ValueError: cannot insert level_0, already exists

In [56]:
with open("indices.pkl", "wb") as f: 
    pickle.dump(indices2, f)
    print("Indices dumped as pickle") 

Indices dumped as pickle


In [62]:
movie_name = 'Se7en'
content_recommender(movie_name, cosine_sim=cosine_sim2, df=df_combined, indices=indices2).tolist()

['Twelve Monkeys',
 'Heat',
 'Get Shorty',
 'The Usual Suspects',
 'Casino',
 'Copycat',
 'GoldenEye',
 'Four Rooms',
 'Ace Ventura: When Nature Calls',
 'Money Train']