# Movie Recommender with Python

#### DF Capstone Project
##### Author: Richard V

### Importing Libraries

In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from ast import literal_eval
from itertools import chain
import matplotlib.pyplot as plt
# Visualize clusters (2D PCA for simplicity)
from sklearn.decomposition import PCA


### Preprocessing the dataset
Here I explore the dataset, handling any null values or errors within the dataset, look for any correlations between variables and formatting the dataset for modelling later on.

In [3]:
df = pd.read_csv("10kmovies.csv")

In [4]:
df.head()

Unnamed: 0,id,title,release_date,genres,original_language,vote_average,vote_count,popularity,overview,budget,production_companies,revenue,runtime,tagline
0,758323,The Pope's Exorcist,2023-04-05,"['Horror', 'Mystery', 'Thriller']",English,7.4,619,5089.969,"Father Gabriele Amorth, Chief Exorcist of the ...",18000000,"['Screen Gems', '2.0 Entertainment', 'Jesus & ...",65675816,103,Inspired by the actual files of Father Gabriel...
1,640146,Ant-Man and the Wasp: Quantumania,2023-02-15,"['Action', 'Adventure', 'Science Fiction']",English,6.6,2294,4665.438,Super-Hero partners Scott Lang and Hope van Dy...,200000000,"['Marvel Studios', 'Kevin Feige Productions']",464566092,125,Witness the beginning of a new dynasty.
2,502356,The Super Mario Bros. Movie,2023-04-05,"['Animation', 'Adventure', 'Family', 'Fantasy'...",English,7.5,1861,3935.55,"While working underground to fix a water main,...",100000000,"['Universal Pictures', 'Illumination', 'Ninten...",1121048165,92,
3,868759,Ghosted,2023-04-18,"['Action', 'Comedy', 'Romance']",English,7.2,652,2791.532,Salt-of-the-earth Cole falls head over heels f...,0,"['Skydance Media', 'Apple Studios']",0,120,Finding that special someone can be a real adv...
4,594767,Shazam! Fury of the Gods,2023-03-15,"['Action', 'Comedy', 'Fantasy', 'Adventure']",English,6.8,1510,2702.593,"Billy Batson and his foster siblings, who tran...",125000000,"['New Line Cinema', 'The Safran Company', 'DC ...",133437105,130,Oh. My. Gods.


In [5]:
df.shape

(10000, 14)

In [6]:
df.isnull().sum() #tagline,overview and release date have null values. Not interested in this so will drop them.

id                         0
title                      0
release_date              21
genres                     0
original_language          0
vote_average               0
vote_count                 0
popularity                 0
overview                  77
budget                     0
production_companies       0
revenue                    0
runtime                    0
tagline                 2759
dtype: int64

In [7]:
df.drop(columns = ['release_date','overview','tagline'], inplace = True) # Dropping the columns will null values.

In [8]:
df.isnull().sum()

id                      0
title                   0
genres                  0
original_language       0
vote_average            0
vote_count              0
popularity              0
budget                  0
production_companies    0
revenue                 0
runtime                 0
dtype: int64

In [9]:
df.head()

Unnamed: 0,id,title,genres,original_language,vote_average,vote_count,popularity,budget,production_companies,revenue,runtime
0,758323,The Pope's Exorcist,"['Horror', 'Mystery', 'Thriller']",English,7.4,619,5089.969,18000000,"['Screen Gems', '2.0 Entertainment', 'Jesus & ...",65675816,103
1,640146,Ant-Man and the Wasp: Quantumania,"['Action', 'Adventure', 'Science Fiction']",English,6.6,2294,4665.438,200000000,"['Marvel Studios', 'Kevin Feige Productions']",464566092,125
2,502356,The Super Mario Bros. Movie,"['Animation', 'Adventure', 'Family', 'Fantasy'...",English,7.5,1861,3935.55,100000000,"['Universal Pictures', 'Illumination', 'Ninten...",1121048165,92
3,868759,Ghosted,"['Action', 'Comedy', 'Romance']",English,7.2,652,2791.532,0,"['Skydance Media', 'Apple Studios']",0,120
4,594767,Shazam! Fury of the Gods,"['Action', 'Comedy', 'Fantasy', 'Adventure']",English,6.8,1510,2702.593,125000000,"['New Line Cinema', 'The Safran Company', 'DC ...",133437105,130


In [10]:
df.shape

(10000, 11)

In [13]:
df.corr()

  df.corr()


Unnamed: 0,id,vote_average,vote_count,popularity,budget,revenue,runtime
id,1.0,-0.241569,-0.259859,0.102688,-0.243411,-0.207591,-0.256838
vote_average,-0.241569,1.0,0.253543,0.040162,0.074849,0.149643,0.38844
vote_count,-0.259859,0.253543,1.0,0.069693,0.600121,0.753206,0.288462
popularity,0.102688,0.040162,0.069693,1.0,0.143257,0.148195,0.038973
budget,-0.243411,0.074849,0.600121,0.143257,1.0,0.735239,0.282498
revenue,-0.207591,0.149643,0.753206,0.148195,0.735239,1.0,0.253162
runtime,-0.256838,0.38844,0.288462,0.038973,0.282498,0.253162,1.0


In [14]:
df.dtypes

id                        int64
title                    object
genres                   object
original_language        object
vote_average            float64
vote_count                int64
popularity              float64
budget                    int64
production_companies     object
revenue                   int64
runtime                   int64
dtype: object

In [15]:
df.head()

Unnamed: 0,id,title,genres,original_language,vote_average,vote_count,popularity,budget,production_companies,revenue,runtime
0,758323,The Pope's Exorcist,"['Horror', 'Mystery', 'Thriller']",English,7.4,619,5089.969,18000000,"['Screen Gems', '2.0 Entertainment', 'Jesus & ...",65675816,103
1,640146,Ant-Man and the Wasp: Quantumania,"['Action', 'Adventure', 'Science Fiction']",English,6.6,2294,4665.438,200000000,"['Marvel Studios', 'Kevin Feige Productions']",464566092,125
2,502356,The Super Mario Bros. Movie,"['Animation', 'Adventure', 'Family', 'Fantasy'...",English,7.5,1861,3935.55,100000000,"['Universal Pictures', 'Illumination', 'Ninten...",1121048165,92
3,868759,Ghosted,"['Action', 'Comedy', 'Romance']",English,7.2,652,2791.532,0,"['Skydance Media', 'Apple Studios']",0,120
4,594767,Shazam! Fury of the Gods,"['Action', 'Comedy', 'Fantasy', 'Adventure']",English,6.8,1510,2702.593,125000000,"['New Line Cinema', 'The Safran Company', 'DC ...",133437105,130


In [16]:
action_movies = df[df['genres'].str.contains('Action')] # will output rows with action genre in them

In [17]:
action_movies = df[df['genres'] == 'Action']# will output rows with ONLY action genre in them. - there are none.

In [18]:
action_movies

Unnamed: 0,id,title,genres,original_language,vote_average,vote_count,popularity,budget,production_companies,revenue,runtime


In [19]:
# Convert string representations to actual lists
df['genres'] = df['genres'].apply(literal_eval)


In [20]:
df.head()

Unnamed: 0,id,title,genres,original_language,vote_average,vote_count,popularity,budget,production_companies,revenue,runtime
0,758323,The Pope's Exorcist,"[Horror, Mystery, Thriller]",English,7.4,619,5089.969,18000000,"['Screen Gems', '2.0 Entertainment', 'Jesus & ...",65675816,103
1,640146,Ant-Man and the Wasp: Quantumania,"[Action, Adventure, Science Fiction]",English,6.6,2294,4665.438,200000000,"['Marvel Studios', 'Kevin Feige Productions']",464566092,125
2,502356,The Super Mario Bros. Movie,"[Animation, Adventure, Family, Fantasy, Comedy]",English,7.5,1861,3935.55,100000000,"['Universal Pictures', 'Illumination', 'Ninten...",1121048165,92
3,868759,Ghosted,"[Action, Comedy, Romance]",English,7.2,652,2791.532,0,"['Skydance Media', 'Apple Studios']",0,120
4,594767,Shazam! Fury of the Gods,"[Action, Comedy, Fantasy, Adventure]",English,6.8,1510,2702.593,125000000,"['New Line Cinema', 'The Safran Company', 'DC ...",133437105,130


In [21]:
# Flatten the lists and convert to Series
flattened_series = pd.Series(chain.from_iterable(df['genres']))

# Get unique genres
unique_genres = flattened_series.unique()

print(unique_genres)

['Horror' 'Mystery' 'Thriller' 'Action' 'Adventure' 'Science Fiction'
 'Animation' 'Family' 'Fantasy' 'Comedy' 'Romance' 'Drama' 'History' 'War'
 'Crime' 'Music' 'Western' 'TV Movie' 'Documentary']


In [22]:
len(unique_genres) #This will be my k cluster number

19

### Basic Movie Recommender

So the essential idea of my movie recommender is to generate a list of movies to the user based on their genre of interest.

In [23]:
#Create a user prompt.
print (f"Here are the list of genres. {unique_genres}")
goi = [] #genres of interest
str = ""
while True:
    user_genre = input("Please enter a genre of interest (or 'exit' to quit): ").capitalize().strip() # strip in case a user will enter white spaces by accident.

    if user_genre == 'Exit':
        print("Exiting the program.")

        break
    elif user_genre in unique_genres:
        print(f"Great choice! {user_genre} is a popular genre.")
        str += user_genre
        goi.append(user_genre)
    else:
        print("Sorry, that's not a valid genre. Please choose from the following genres:")
        print(', '.join(unique_genres))

Here are the list of genres. ['Horror' 'Mystery' 'Thriller' 'Action' 'Adventure' 'Science Fiction'
 'Animation' 'Family' 'Fantasy' 'Comedy' 'Romance' 'Drama' 'History' 'War'
 'Crime' 'Music' 'Western' 'TV Movie' 'Documentary']
Great choice! Action is a popular genre.
Great choice! Romance is a popular genre.
Great choice! Horror is a popular genre.
Exiting the program.


In [24]:
# Convert list genres to string for TF-IDF vectorization later on but also to
df['genres_str'] = df['genres'].apply(', '.join)

In [25]:
goi #Our genres of interest from the user.

['Action', 'Romance', 'Horror']

In [26]:
# df.groupby('vote_average')['title','genres','vote_average'].head(100).sort_values('vote_average',ascending=False) ## checking how to use sort_values.

In [86]:

# Create a boolean mask using str.contains for each genre
genre_masks = [df['genres_str'].str.contains(genre) for genre in goi]

# Combine the masks using logical OR (|)
combined_mask = pd.concat(genre_masks, axis=1).any(axis=1)

# Apply the combined mask to filter the DataFrame
filtered_df = df[combined_mask]
#
filter10 = filtered_df.head(10).sort_values('vote_average',ascending=False)

filter10['title'].reset_index(drop=True, inplace=True)
print(filter10['title'])

0          Guardians of the Galaxy Volume 3
1                  Avatar: The Way of Water
2                       The Pope's Exorcist
3                                 Scream VI
4                                 Creed III
5    The Last Kingdom: Seven Kings Must Die
6                                   Ghosted
7                  Shazam! Fury of the Gods
8         Ant-Man and the Wasp: Quantumania
9                         Peter Pan & Wendy
Name: title, dtype: object


### Applying Clustering
Now that the basic idea of the movie recommender is created. How can we make a better movie recommender? What if we wanted to find movies similar to other movies rather than by genre? What if we were to include all the features we have such as vote_average, popularity, profit (revenue - budget), runtime, and production companies with genre this time?

##### Revamping the dataframe

In [104]:
df1 = df.copy()

In [105]:
df1['genres'] = df1['genres'].apply(', '.join) # converting the lists of genres into strings.
df1['production_companies'] = df1['production_companies'].apply(literal_eval) #Converting the column of list of strings (json like)into actual (python) lists
df1['production_companies'] = df1['production_companies'].apply(', '.join) #converting the lists into strings

In [106]:
df1.head() #

Unnamed: 0,id,title,genres,original_language,vote_average,vote_count,popularity,budget,production_companies,revenue,runtime,genres_str
0,758323,The Pope's Exorcist,"Horror, Mystery, Thriller",English,7.4,619,5089.969,18000000,"Screen Gems, 2.0 Entertainment, Jesus & Mary, ...",65675816,103,"Horror, Mystery, Thriller"
1,640146,Ant-Man and the Wasp: Quantumania,"Action, Adventure, Science Fiction",English,6.6,2294,4665.438,200000000,"Marvel Studios, Kevin Feige Productions",464566092,125,"Action, Adventure, Science Fiction"
2,502356,The Super Mario Bros. Movie,"Animation, Adventure, Family, Fantasy, Comedy",English,7.5,1861,3935.55,100000000,"Universal Pictures, Illumination, Nintendo",1121048165,92,"Animation, Adventure, Family, Fantasy, Comedy"
3,868759,Ghosted,"Action, Comedy, Romance",English,7.2,652,2791.532,0,"Skydance Media, Apple Studios",0,120,"Action, Comedy, Romance"
4,594767,Shazam! Fury of the Gods,"Action, Comedy, Fantasy, Adventure",English,6.8,1510,2702.593,125000000,"New Line Cinema, The Safran Company, DC Films,...",133437105,130,"Action, Comedy, Fantasy, Adventure"


In [107]:
#making a net profit feature
df1['net_profit'] = df1['revenue'] - df1['budget']

In [108]:
df1.head()

Unnamed: 0,id,title,genres,original_language,vote_average,vote_count,popularity,budget,production_companies,revenue,runtime,genres_str,net_profit
0,758323,The Pope's Exorcist,"Horror, Mystery, Thriller",English,7.4,619,5089.969,18000000,"Screen Gems, 2.0 Entertainment, Jesus & Mary, ...",65675816,103,"Horror, Mystery, Thriller",47675816
1,640146,Ant-Man and the Wasp: Quantumania,"Action, Adventure, Science Fiction",English,6.6,2294,4665.438,200000000,"Marvel Studios, Kevin Feige Productions",464566092,125,"Action, Adventure, Science Fiction",264566092
2,502356,The Super Mario Bros. Movie,"Animation, Adventure, Family, Fantasy, Comedy",English,7.5,1861,3935.55,100000000,"Universal Pictures, Illumination, Nintendo",1121048165,92,"Animation, Adventure, Family, Fantasy, Comedy",1021048165
3,868759,Ghosted,"Action, Comedy, Romance",English,7.2,652,2791.532,0,"Skydance Media, Apple Studios",0,120,"Action, Comedy, Romance",0
4,594767,Shazam! Fury of the Gods,"Action, Comedy, Fantasy, Adventure",English,6.8,1510,2702.593,125000000,"New Line Cinema, The Safran Company, DC Films,...",133437105,130,"Action, Comedy, Fantasy, Adventure",8437105


In [109]:
#removing unnecessary columns
df1.drop(columns = ['genres_str', 'budget','revenue'], inplace = True)


In [110]:
df1['tags'] = df1['genres'] + df1['original_language'] + df1['production_companies']

In [111]:
df1.head()

Unnamed: 0,id,title,genres,original_language,vote_average,vote_count,popularity,production_companies,runtime,net_profit,tags
0,758323,The Pope's Exorcist,"Horror, Mystery, Thriller",English,7.4,619,5089.969,"Screen Gems, 2.0 Entertainment, Jesus & Mary, ...",103,47675816,"Horror, Mystery, ThrillerEnglishScreen Gems, 2..."
1,640146,Ant-Man and the Wasp: Quantumania,"Action, Adventure, Science Fiction",English,6.6,2294,4665.438,"Marvel Studios, Kevin Feige Productions",125,264566092,"Action, Adventure, Science FictionEnglishMarve..."
2,502356,The Super Mario Bros. Movie,"Animation, Adventure, Family, Fantasy, Comedy",English,7.5,1861,3935.55,"Universal Pictures, Illumination, Nintendo",92,1021048165,"Animation, Adventure, Family, Fantasy, ComedyE..."
3,868759,Ghosted,"Action, Comedy, Romance",English,7.2,652,2791.532,"Skydance Media, Apple Studios",120,0,"Action, Comedy, RomanceEnglishSkydance Media, ..."
4,594767,Shazam! Fury of the Gods,"Action, Comedy, Fantasy, Adventure",English,6.8,1510,2702.593,"New Line Cinema, The Safran Company, DC Films,...",130,8437105,"Action, Comedy, Fantasy, AdventureEnglishNew L..."


In [112]:
import re

def separate_words_by_capitals(input_string):
    # Use regular expression to split words by two consecutive capital letters
    words = re.findall('[A-Z]{2}(?=[a-z])|[A-Z][a-z]*', input_string)
    return ', '.join(words)

In [113]:
df1['tags'] = df1['tags'].apply(separate_words_by_capitals)

In [114]:
df1.head()

Unnamed: 0,id,title,genres,original_language,vote_average,vote_count,popularity,production_companies,runtime,net_profit,tags
0,758323,The Pope's Exorcist,"Horror, Mystery, Thriller",English,7.4,619,5089.969,"Screen Gems, 2.0 Entertainment, Jesus & Mary, ...",103,47675816,"Horror, Mystery, Thriller, English, Screen, Ge..."
1,640146,Ant-Man and the Wasp: Quantumania,"Action, Adventure, Science Fiction",English,6.6,2294,4665.438,"Marvel Studios, Kevin Feige Productions",125,264566092,"Action, Adventure, Science, Fiction, English, ..."
2,502356,The Super Mario Bros. Movie,"Animation, Adventure, Family, Fantasy, Comedy",English,7.5,1861,3935.55,"Universal Pictures, Illumination, Nintendo",92,1021048165,"Animation, Adventure, Family, Fantasy, Comedy,..."
3,868759,Ghosted,"Action, Comedy, Romance",English,7.2,652,2791.532,"Skydance Media, Apple Studios",120,0,"Action, Comedy, Romance, English, Skydance, Me..."
4,594767,Shazam! Fury of the Gods,"Action, Comedy, Fantasy, Adventure",English,6.8,1510,2702.593,"New Line Cinema, The Safran Company, DC Films,...",130,8437105,"Action, Comedy, Fantasy, Adventure, English, N..."


In [115]:
df1.drop(columns=['genres','production_companies','original_language'], inplace=True)

In [116]:
df1.head()

Unnamed: 0,id,title,vote_average,vote_count,popularity,runtime,net_profit,tags
0,758323,The Pope's Exorcist,7.4,619,5089.969,103,47675816,"Horror, Mystery, Thriller, English, Screen, Ge..."
1,640146,Ant-Man and the Wasp: Quantumania,6.6,2294,4665.438,125,264566092,"Action, Adventure, Science, Fiction, English, ..."
2,502356,The Super Mario Bros. Movie,7.5,1861,3935.55,92,1021048165,"Animation, Adventure, Family, Fantasy, Comedy,..."
3,868759,Ghosted,7.2,652,2791.532,120,0,"Action, Comedy, Romance, English, Skydance, Me..."
4,594767,Shazam! Fury of the Gods,6.8,1510,2702.593,130,8437105,"Action, Comedy, Fantasy, Adventure, English, N..."


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [126]:

from sklearn.feature_extraction.text import CountVectorizer


In [127]:
cv=CountVectorizer(max_features=5000,stop_words='english')
vector=cv.fit_transform(df1['tags']).toarray()
vector.shape

(10000, 5000)

In [128]:
similarity=cosine_similarity(vector)
similarity

array([[1.        , 0.19069252, 0.09534626, ..., 0.42640143, 0.        ,
        0.10050378],
       [0.19069252, 1.        , 0.2       , ..., 0.1118034 , 0.        ,
        0.10540926],
       [0.09534626, 0.2       , 1.        , ..., 0.1118034 , 0.        ,
        0.10540926],
       ...,
       [0.42640143, 0.1118034 , 0.1118034 , ..., 1.        , 0.        ,
        0.11785113],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.19245009],
       [0.10050378, 0.10540926, 0.10540926, ..., 0.11785113, 0.19245009,
        1.        ]])

In [134]:
def recommend(movie):
    index=df1[df1['title']==movie].index[0]
    distances=sorted(list(enumerate(similarity[index])),reverse=True,key=lambda x:x[1])
    for i in distances[1:10]:
        print(df1.iloc[i[0]].title)


In [135]:
recommend('Aladdin')

Christopher Robin
Darby O'Gill and the Little People
Tuck Everlasting
Togo
Maleficent
Mighty Joe Young
Beauty and the Beast
Oz the Great and Powerful
Olaf's Frozen Adventure


In [117]:
from sklearn.preprocessing import StandardScaler

In [122]:
# Select features for clustering
selected_features = ['vote_average', 'vote_count', 'popularity', 'runtime', 'net_profit']
data_subset = df1[selected_features]

# Normalize features
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_subset)

# Choose the number of clusters (K)
num_clusters = 5

# Apply K-means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
df1['cluster'] = kmeans.fit_predict(data_scaled)

# Get recommendations for a given movie
def recommend_movies(movie_title):
    cluster = df1[df1['title'] == movie_title]['cluster'].iloc[0]
    recommended_movies = df1[df1['cluster'] == cluster]
    recommended_movies = recommended_movies.sort_values(by='popularity', ascending=False)
    return recommended_movies[['title', 'popularity']]

# User inputs a movie title
user_input_movie = "Finding Nemo"

# Get recommendations for the user input movie
recommendations = recommend_movies(user_input_movie)
print(recommendations)



                               title  popularity
15      Puss in Boots: The Last Wish    1025.915
24    Black Panther: Wakanda Forever     620.645
37    Guardians of the Galaxy Vol. 2     492.950
55            Thor: Love and Thunder     297.321
64                 Top Gun: Maverick     277.473
...                              ...         ...
5250                          Ne Zha      16.684
4047                           Moana      15.305
5569                   The Boss Baby      15.231
5971                         Jumanji      12.360
7196         The Secret Life of Pets      11.553

[394 rows x 2 columns]


In [123]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# Load and preprocess your dataset
# ...

# Text vectorization using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tags_tfidf = tfidf_vectorizer.fit_transform(df1['tags'])

# Select numerical features for clustering
selected_features = ['vote_average', 'vote_count', 'popularity', 'runtime', 'net_profit']
numerical_features = df1[selected_features]

# Concatenate numerical features with TF-IDF tags
combined_features = hstack((numerical_features, tags_tfidf))

# Normalize combined features
scaler = StandardScaler()
combined_features_scaled = scaler.fit_transform(combined_features.toarray())

# Choose the number of clusters (K)
num_clusters = 5

# Apply K-means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
df1['cluster'] = kmeans.fit_predict(combined_features_scaled)

# Get recommendations for a given movie
def recommend_movies(movie_title):
    cluster = df1[df1['title'] == movie_title]['cluster'].iloc[0]
    recommended_movies = df1[df1['cluster'] == cluster]
    recommended_movies = recommended_movies.sort_values(by='popularity', ascending=False)
    return recommended_movies[['title', 'popularity']]

# User inputs a movie title
user_input_movie = "Wolf Creek 3"

# Get recommendations for the user input movie
recommendations = recommend_movies(user_input_movie)
print(recommendations)



                                  title  popularity
0                   The Pope's Exorcist    5089.969
1     Ant-Man and the Wasp: Quantumania    4665.438
2           The Super Mario Bros. Movie    3935.550
3                               Ghosted    2791.532
4              Shazam! Fury of the Gods    2702.593
...                                 ...         ...
9170                       Wolf Creek 3       7.462
9217                   Behind the Trees       7.438
9283                      Autumn Sonata       7.411
9369                 Top Secret Pursuit       7.367
9664                         Elena 1944       7.219

[9989 rows x 2 columns]
