In [4]:
import pandas as pd
import numpy as np
import ast  # Abstract Syntax Tree (to handle string representations of lists)
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Load the Data

In [5]:
# Load the datasets
movies = pd.read_csv('../dataset/tmdb_5000_movies.csv')
credits = pd.read_csv('../dataset/tmdb_5000_credits.csv')

# Merge them on the movie title so we have all info in one place
movies = movies.merge(credits, on='title')

# Check what we have
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [6]:
# Keep only these columns
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

print(movies.shape)

(4809, 7)


Data Cleaning 

In [7]:
# Helper function to extract names from the strange JSON format
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

# Apply it to genres and keywords
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

# Helper for Cast: Get top 3 actors only
def convert3(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            L.append(i['name'])
            counter += 1
        else:
            break
    return L

movies['cast'] = movies['cast'].apply(convert3)

# Helper for Crew: Get only the Director
def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

movies['crew'] = movies['crew'].apply(fetch_director)

Formatting for Vectorization

In [8]:
# Remove spaces
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

# Split overview into a list of words
movies['overview'] = movies['overview'].apply(lambda x: x.split() if isinstance(x, str) else [])

# Create a massive "tags" column
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Create a new clean dataframe
new_df = movies[['movie_id', 'title', 'tags']]

# Convert list of tags back to a single string
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

# Lowercase everything (recommended for text analysis)
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

new_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


Vectorization

In [9]:
# Initialize Vectorizer (We pick the 5000 most frequent words)
cv = CountVectorizer(max_features=5000, stop_words='english')

# Convert tags to vectors
vectors = cv.fit_transform(new_df['tags']).toarray()

# See the shape (4806 movies, 5000 words)
print(vectors.shape)

(4809, 5000)


Cell 7: Calculate Similarity
We calculate the Cosine Similarity (angle) between every movie vector.

Score = 1.0 (Identical movies)

Score = 0.0 (Completely different)

In [10]:
similarity = cosine_similarity(vectors)

# Check the similarity of the first movie with all others
print(similarity[0])

[1.         0.08838835 0.05892557 ... 0.02457366 0.02777778 0.        ]


Save the Models (Pickling)

In [11]:
# Save the movie list
pickle.dump(new_df.to_dict(), open('../artifacts/movie_list.pkl', 'wb'))

# Save the similarity matrix
pickle.dump(similarity, open('../artifacts/similarity.pkl', 'wb'))

print("Artifacts saved successfully!")

Artifacts saved successfully!
