#### Import necessary libraries 

In [1]:
import pandas as pd 
import numpy as np 

# Entity featurization and similarity computation
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.feature_extraction.text import TfidfVectorizer

# Libraries used during sorting procedures.
import operator # <-- Convenient item retrieval during iteration 
import heapq # <-- Efficient sorting of large lists

import warnings
warnings.filterwarnings("ignore")

#### Load the dataset

In [2]:
movies_df = pd.read_csv("movies.csv")
movies_df.sample()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
2938,24000000,"[{""id"": 35, ""name"": ""Comedy""}]",,16888,"[{""id"": 293, ""name"": ""female nudity""}, {""id"": ...",en,The Ladies Man,"Because of his salacious language, late-night ...",2.259537,"[{""name"": ""SNL Studios"", ""id"": 2822}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2000-10-13,13700000,84.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,He's cool. He's clean. He's a love machine.,The Ladies Man,5.7,34


In [3]:
## Check the features of the dataset

movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [4]:
credit_df = pd.read_csv("credits.csv")
credit_df.sample()

Unnamed: 0,movie_id,title,cast,crew
411,22538,Scott Pilgrim vs. the World,"[{""cast_id"": 1, ""character"": ""Scott Pilgrim"", ...","[{""credit_id"": ""52fe4445c3a368484e0198d7"", ""de..."


In [5]:
credit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


In [6]:
# Merge the datasets using the title column

movies_credit_df = movies_df.merge(credit_df, on="title")
movies_credit_df.sample()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
1508,32000000,"[{""id"": 18, ""name"": ""Drama""}]",,11978,"[{""id"": 269, ""name"": ""diving""}, {""id"": 4410, ""...",en,Men of Honor,Against formidable odds -- and an old-school d...,24.147269,"[{""name"": ""Fox 2000 Pictures"", ""id"": 711}]",...,129.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,History is made by those who break the rules.,Men of Honor,7.0,544,11978,"[{""cast_id"": 11, ""character"": ""Master Chief Le...","[{""credit_id"": ""52fe44ae9251416c7503d4f7"", ""de..."


#### Using Weighted average for each movie's average rating

W = (Rv + Cm) / (v+m)

W = weighted rating

R = average for the movie as a number from 0 to 10

v = number of votes

m = minimum votes required to be listed in the Top 250 (3000)

C = the mean vote across the whole report (6.9)

Source = "https://trailerpark.weebly.com/imdb-rating.html?source=post_page"

In [7]:
v = movies_credit_df['vote_count']
R = movies_credit_df['vote_average']
m = movies_credit_df['vote_count'].quantile(0.70)
C = movies_credit_df['vote_average'].mean()

movies_credit_df['ratings'] = round(((R*v) + (C*m)) / (v+m),1)
movies_credit_df.sample()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew,ratings
1824,22000000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 36, ""name...",http://www.schindlerslist.com/,424,"[{""id"": 1382, ""name"": ""factory""}, {""id"": 1631,...",en,Schindler's List,The true story of how businessman Oskar Schind...,104.469351,"[{""name"": ""Universal Pictures"", ""id"": 33}, {""n...",...,"[{""iso_639_1"": ""de"", ""name"": ""Deutsch""}, {""iso...",Released,"Whoever saves one life, saves the world entire.",Schindler's List,8.3,4329,424,"[{""cast_id"": 14, ""character"": ""Oskar Schindler...","[{""credit_id"": ""52fe4242c3a36847f8010303"", ""de...",8.0


In [8]:
movies_credit_df['ratings'].unique()

array([7.1, 6.8, 6.3, 7.5, 6.1, 5.9, 7.2, 7.3, 5.7, 5.6, 6.9, 6.5, 6.2,
       6.4, 7. , 7.4, 6.6, 6. , 5.8, 5.3, 7.7, 5.4, 8.1, 7.6, 5.5, 7.8,
       5.1, 8. , 6.7, 5.2, 4.8, 4.7, 7.9, 5. , 8.2, 8.3])

In [9]:
# Pick columns that will be used for recommendation

data =  movies_credit_df[['movie_id','title','overview','genres','keywords','cast','crew','ratings']]
data.sample()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,ratings
281,4982,American Gangster,Following the death of his employer and mentor...,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 80, ""name...","[{""id"": 240, ""name"": ""underdog""}, {""id"": 577, ...","[{""cast_id"": 12, ""character"": ""Frank Lucas"", ""...","[{""credit_id"": ""52fe43ebc3a36847f80783c7"", ""de...",7.0


In [10]:
# Check for null values

data.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
ratings     0
dtype: int64

In [11]:
# Drop null values

data.dropna(inplace=True)
data.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
ratings     0
dtype: int64

In [12]:
# Explore the eatures that will need further cleaning

data.iloc[0]['genres']

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

#### Data Cleaning

In [13]:
import ast

# Function to extract 'name' values from the genres column
def extract_genres_names(genres):
    # Convert the string representation of the list of dictionaries to an actual list of dictionaries
    genres_list = ast.literal_eval(genres)
    # Extract the 'name' values
    names = [genre['name'] for genre in genres_list]
    return names

# Apply the function to the 'genres' column and create a new column 'genre_names'
data['genres'] = data['genres'].apply(extract_genres_names)
data['keywords'] = data['keywords'].apply(extract_genres_names)

data.sample()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,ratings
3697,10914,All Hat,An ex-con returns to his rural Ontario roots a...,"[Action, Comedy, Drama, Western]","[indian territory, horse, ranch, stetson, urba...","[{""cast_id"": 3, ""character"": ""Chrissie"", ""cred...","[{""credit_id"": ""52fe43cf9251416c7501f1ab"", ""de...",6.1


In [14]:
def extract_character(text):
    characters=[]
    count = 0
    for i in ast.literal_eval(text):
        if count < 3:
            characters.append(i['name'])
            
        count += 1
    return characters

data['cast'] = data['cast'].apply(extract_character)
data.sample()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,ratings
3048,14120,End of the Spear,"""End of the Spear"" is the story of Mincayani, ...","[Adventure, Drama]",[drama],"[Louie Leonardo, Chad Allen, Jack Guzman]","[{""credit_id"": ""52fe45cd9251416c75062eb3"", ""de...",6.1


In [15]:
def extract_director(text):
    director=[]
    for i in ast.literal_eval(text):
        if i['job'] == "Director":
            director.append(i['name'])
            break

    return director

data['director'] = data['crew'].apply(extract_director)
data.sample()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,ratings,director
4118,22617,Dinner Rush,Is it just another evening at the hugely popul...,"[Drama, Action, Thriller]",[],"[Danny Aiello, John Rothman, Frank Bongiorno]","[{""credit_id"": ""52fe4448c3a368484e01a66f"", ""de...",6.1,[Bob Giraldi]


In [16]:
def remove_spaces(text):
    l = []
    for i in text:
        l.append(i.replace(" ",""))
    return l

data['cast'] = data['cast'].apply(remove_spaces)
data['director'] = data['director'].apply(remove_spaces)
data['keywords'] = data['keywords'].apply(remove_spaces)
data['genres'] = data['genres'].apply(remove_spaces)

In [17]:
data.sample()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,ratings,director
2734,70829,The Last Godfather,"Young-goo the son of mafia boss Don Carini, is...","[Action, Comedy, Thriller]",[],"[HarveyKeitel, JasonMewes, BlakeClark]","[{""credit_id"": ""52fe4814c3a368484e0e8467"", ""de...",6.1,[ShimHyung-Rae]


In [18]:
data['auth_tag'] = data['genres'] + data['keywords'] + data['cast'] + data['director']

In [19]:
new_df = data[['movie_id','title','auth_tag']]
new_df.iloc[0]['auth_tag']

['Action',
 'Adventure',
 'Fantasy',
 'ScienceFiction',
 'cultureclash',
 'future',
 'spacewar',
 'spacecolony',
 'society',
 'spacetravel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alienplanet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'loveaffair',
 'antiwar',
 'powerrelations',
 'mindandsoul',
 '3d',
 'SamWorthington',
 'ZoeSaldana',
 'SigourneyWeaver',
 'JamesCameron']

In [20]:
new_df['auth_tag'] = new_df['auth_tag'].apply(lambda x:(' ').join(x))
new_df.iloc[0]['auth_tag']

'Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver JamesCameron'

#### Feature Extraction

In [21]:
# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the combined text data to numerical vectors
matrix = vectorizer.fit_transform(new_df["auth_tag"])

In [22]:
matrix.shape

(4806, 17840)

In [23]:
# Compute cosine similarities between the vectors
#cosine_similarities = linear_kernel(matrix, matrix)

from sklearn.metrics.pairwise import cosine_similarity 
cosine_similarities = cosine_similarity(matrix)

In [24]:
cosine_similarities.shape

(4806, 4806)

#### Movie Recommendation

Define a function that recommends top 5 similar movies based on content

In [25]:
def recommend(movie):
    index = new_df[new_df["title"]==movie].index[0]
    distances = sorted(list(enumerate(cosine_similarities[index])), reverse=True, key=lambda x:x[1])
    for i in distances[1:6]:
        print(new_df.iloc[i[0]].title)

In [26]:
# Pick the first movie in the sample for a test

recommend(new_df.iloc[0]['title'])

Star Trek Into Darkness
Aliens
Alien³
Predators
Jupiter Ascending


In [27]:
# Save the model

import pickle

pickle.dump(new_df, open("movie_list.pkl", 'wb'))
pickle.dump(cosine_similarities, open("similarity.pkl", 'wb'))

'# Save the model\n\nimport pickle\n\npickle.dump(new_df, open("movie_list.pkl", \'wb\'))\npickle.dump(cosine_similarities, open("similarity.pkl", \'wb\'))'