In [229]:
import pandas as pd 
import numpy as np
import json
import nltk
import re
import heapq 

# nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

import matplotlib.pyplot as plt
%matplotlib inline

In [230]:
df = pd.read_csv("tmdb_5000_movies.csv")

In [231]:
df.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [232]:
col_drop = ["homepage","genres","keywords","production_companies","production_countries","spoken_languages"]
df.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [233]:
display(df.loc[(df.title == 'Avatar')])     

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [234]:
df['keywords'] = df['keywords'].apply(lambda x: json.loads(x) if isinstance(x, str) else x)
df['genres'] = df['genres'].apply(lambda x: json.loads(x) if isinstance(x, str) else x) 
df['production_companies'] = df['production_companies'].apply(lambda x: json.loads(x) if isinstance(x, str) else x) 
df['production_countries'] = df['production_countries'].apply(lambda x: json.loads(x) if isinstance(x, str) else x) 
df['spoken_languages'] = df['spoken_languages'].apply(lambda x: json.loads(x) if isinstance(x, str) else x) 

print(f"{type(df.genres[0])}, {type(df.keywords[0])}, {type(df.production_companies[0])}")

<class 'list'>, <class 'list'>, <class 'list'>


In [235]:
def clean_cols(df): 
    df['keywords_cl'] = df['keywords'].apply(
        lambda keywords: [item['name'] for item in keywords] if isinstance(keywords, list) else []
    )
    df['genres_cl'] = df['genres'].apply(
        lambda genres: [item['name'] for item in genres] if isinstance(genres, list) else []
    )
    df['production_companies_cl'] = df['production_companies'].apply(
        lambda production_companies: [item['name'] for item in production_companies] if isinstance(production_companies, list) else []
    )
    df['production_countries_cl'] = df['production_countries'].apply(
        lambda production_countries: [item['name'] for item in production_countries] if isinstance(production_countries, list) else []
    )
    df['spoken_languages_cl'] = df['spoken_languages'].apply(
        lambda spoken_languages: [item['name'] for item in spoken_languages] if isinstance(spoken_languages, list) else []
    )
    df['original_language'] = df['spoken_languages_cl'][0][0]

clean_cols(df)
df = df.drop(col_drop, axis=1)

In [290]:
display(df.loc[(df.title == 'The Grand Budapest Hotel')]['genres_cl'])     

1532    [Comedy, Drama]
Name: genres_cl, dtype: object

In [237]:
text_data = df['genres_cl'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

tfidf = TfidfVectorizer(max_features=100)
X = tfidf.fit_transform(text_data).toarray()

In [238]:
knn = NearestNeighbors(n_neighbors=5, metric='cosine')

knn.fit(X)

distances, indices = knn.kneighbors([X[i]])



In [239]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(text_data)

tfidf_matrix.shape
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

cosine_sim[0]

array([1.        , 0.49309367, 0.29270708, ..., 0.        , 0.        ,
       0.        ])

In [240]:
# df = df.reset_index()
titles = df['title']
indices = pd.Series(df.index, index=df['title'])

def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    
    # Extract only the 'title' column and convert to list
    return titles.iloc[movie_indices].tolist()


In [241]:
get_recommendations('The Godfather')

['Wall Street: Money Never Sleeps',
 'Catch Me If You Can',
 'Casino',
 'American Hustle',
 'Mean Streets',
 '21',
 'Black Water Transit',
 'Once Upon a Time in America',
 'GoodFellas',
 'The Shawshank Redemption',
 'The Bad Lieutenant: Port of Call - New Orleans',
 'The Place Beyond the Pines',
 "Things to Do in Denver When You're Dead",
 'The Godfather: Part II',
 'Rounders',
 'Blood Done Sign My Name',
 'The Godfather',
 'Auto Focus',
 'Spring Breakers',
 'Light Sleeper',
 'City of God',
 'Trainspotting',
 'This Is England',
 'Mi America',
 'Kids',
 'In Bruges',
 'Party Monster',
 'The Gunman',
 'The Sweeney',
 'Collateral']

In [242]:
import joblib
import pandas as pd

# Export data
joblib.dump(indices, 'indices.pkl')
joblib.dump(cosine_sim, 'cosine_sim.pkl')
titles.to_csv('titles.csv', index=False)