In [1]:
import pandas as pd
import numpy as np
import ast
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
import os
print(os.listdir())


['.config', 'sample_data']


In [3]:
from google.colab import files
uploaded = files.upload('sample_data')

Saving tmdb_5000_movies.csv to sample_data/tmdb_5000_movies.csv
Saving tmdb_5000_credits.csv to sample_data/tmdb_5000_credits.csv


In [4]:
movies=pd.read_csv('sample_data/tmdb_5000_movies.csv')
credits=pd.read_csv('sample_data/tmdb_5000_credits.csv')

In [5]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [6]:
credits.head(1)['cast'].values

array(['[{"cast_id": 242, "character": "Jake Sully", "credit_id": "5602a8a7c3a3685532001c9a", "gender": 2, "id": 65731, "name": "Sam Worthington", "order": 0}, {"cast_id": 3, "character": "Neytiri", "credit_id": "52fe48009251416c750ac9cb", "gender": 1, "id": 8691, "name": "Zoe Saldana", "order": 1}, {"cast_id": 25, "character": "Dr. Grace Augustine", "credit_id": "52fe48009251416c750aca39", "gender": 1, "id": 10205, "name": "Sigourney Weaver", "order": 2}, {"cast_id": 4, "character": "Col. Quaritch", "credit_id": "52fe48009251416c750ac9cf", "gender": 2, "id": 32747, "name": "Stephen Lang", "order": 3}, {"cast_id": 5, "character": "Trudy Chacon", "credit_id": "52fe48009251416c750ac9d3", "gender": 1, "id": 17647, "name": "Michelle Rodriguez", "order": 4}, {"cast_id": 8, "character": "Selfridge", "credit_id": "52fe48009251416c750ac9e1", "gender": 2, "id": 1771, "name": "Giovanni Ribisi", "order": 5}, {"cast_id": 7, "character": "Norm Spellman", "credit_id": "52fe48009251416c750ac9dd", "ge

In [7]:
movies = movies.merge(credits, on='title', suffixes=('_movie', '_credit'))


In [8]:
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [9]:
def convert(text):
    if isinstance(text, list):
        return text
    try:
        parsed = ast.literal_eval(text)
    except:
        return []
    if isinstance(parsed, list) and all(isinstance(i, dict) and 'name' in i for i in parsed):
        return [i['name'] for i in parsed]
    return parsed

def convert3(obj):
    L=[]
    counter=0
    for i in ast.literal_eval(obj):
        if counter!=3:
            L.append(i['name'])
            counter+=1
        else:
            break
    return L

def fetch_director(obj):
    L=[]
    for i in ast.literal_eval(obj):
        if i['job']=='Director':
            L.append(i['name'])
            break
    return L


In [10]:
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(convert3)
movies['crew'] = movies['crew'].apply(fetch_director)
movies.dropna(inplace=True)

# Clean spaces in names
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

# Split overview into words
movies['overview'] = movies['overview'].apply(lambda x: x.split())

In [11]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [12]:
ps = PorterStemmer()

def stem(text):
    return " ".join([ps.stem(word) for word in text.split()])

new_df = movies[['movie_id','title','tags']].copy()
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))
new_df['tags'] = new_df['tags'].apply(stem)
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


In [13]:
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(new_df['tags']).toarray()
similarity = cosine_similarity(vectors)


In [14]:
movie_dict = movies.to_dict()
pickle.dump(movie_dict, open('movie_dict.pkl','wb'))

In [15]:
pickle.dump(similarity, open('similarity.pkl','wb'))

In [17]:
print("Columns in saved movie_dict.pkl:", movies.columns)
print("Example genres:", movies['genres'][0])
print("Example cast:", movies['cast'][0])
print("Example director:", movies['crew'][0])
print("Similarity shape:", similarity.shape)

Columns in saved movie_dict.pkl: Index(['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew',
       'tags'],
      dtype='object')
Example genres: ['Action', 'Adventure', 'Fantasy', 'ScienceFiction']
Example cast: ['SamWorthington', 'ZoeSaldana', 'SigourneyWeaver']
Example director: ['JamesCameron']
Similarity shape: (4806, 4806)
