# Movie Recommender with Python

#### DF Capstone Project
##### Author: Richard V

### Importing Libraries

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from ast import literal_eval
from itertools import chain
import matplotlib.pyplot as plt
# Visualize clusters (2D PCA for simplicity)
from sklearn.decomposition import PCA


### Preprocessing the dataset
Here I explore the dataset, handling any null values or errors within the dataset, look for any correlations between variables and formatting the dataset for modelling later on.

In [2]:
df = pd.read_csv("10kmovies.csv")

In [3]:
df.head()

Unnamed: 0,id,title,release_date,genres,original_language,vote_average,vote_count,popularity,overview,budget,production_companies,revenue,runtime,tagline
0,758323,The Pope's Exorcist,2023-04-05,"['Horror', 'Mystery', 'Thriller']",English,7.4,619,5089.969,"Father Gabriele Amorth, Chief Exorcist of the ...",18000000,"['Screen Gems', '2.0 Entertainment', 'Jesus & ...",65675816,103,Inspired by the actual files of Father Gabriel...
1,640146,Ant-Man and the Wasp: Quantumania,2023-02-15,"['Action', 'Adventure', 'Science Fiction']",English,6.6,2294,4665.438,Super-Hero partners Scott Lang and Hope van Dy...,200000000,"['Marvel Studios', 'Kevin Feige Productions']",464566092,125,Witness the beginning of a new dynasty.
2,502356,The Super Mario Bros. Movie,2023-04-05,"['Animation', 'Adventure', 'Family', 'Fantasy'...",English,7.5,1861,3935.55,"While working underground to fix a water main,...",100000000,"['Universal Pictures', 'Illumination', 'Ninten...",1121048165,92,
3,868759,Ghosted,2023-04-18,"['Action', 'Comedy', 'Romance']",English,7.2,652,2791.532,Salt-of-the-earth Cole falls head over heels f...,0,"['Skydance Media', 'Apple Studios']",0,120,Finding that special someone can be a real adv...
4,594767,Shazam! Fury of the Gods,2023-03-15,"['Action', 'Comedy', 'Fantasy', 'Adventure']",English,6.8,1510,2702.593,"Billy Batson and his foster siblings, who tran...",125000000,"['New Line Cinema', 'The Safran Company', 'DC ...",133437105,130,Oh. My. Gods.


In [4]:
df.shape

(10000, 14)

In [5]:
df.isnull().sum() #tagline,overview and release date have null values. Not interested in this so will drop them.

id                         0
title                      0
release_date              21
genres                     0
original_language          0
vote_average               0
vote_count                 0
popularity                 0
overview                  77
budget                     0
production_companies       0
revenue                    0
runtime                    0
tagline                 2759
dtype: int64

In [6]:
df.drop(columns = ['release_date','overview','tagline'], inplace = True) # Dropping the columns will null values.

In [7]:
df.isnull().sum()

id                      0
title                   0
genres                  0
original_language       0
vote_average            0
vote_count              0
popularity              0
budget                  0
production_companies    0
revenue                 0
runtime                 0
dtype: int64

In [8]:
df.head()

Unnamed: 0,id,title,genres,original_language,vote_average,vote_count,popularity,budget,production_companies,revenue,runtime
0,758323,The Pope's Exorcist,"['Horror', 'Mystery', 'Thriller']",English,7.4,619,5089.969,18000000,"['Screen Gems', '2.0 Entertainment', 'Jesus & ...",65675816,103
1,640146,Ant-Man and the Wasp: Quantumania,"['Action', 'Adventure', 'Science Fiction']",English,6.6,2294,4665.438,200000000,"['Marvel Studios', 'Kevin Feige Productions']",464566092,125
2,502356,The Super Mario Bros. Movie,"['Animation', 'Adventure', 'Family', 'Fantasy'...",English,7.5,1861,3935.55,100000000,"['Universal Pictures', 'Illumination', 'Ninten...",1121048165,92
3,868759,Ghosted,"['Action', 'Comedy', 'Romance']",English,7.2,652,2791.532,0,"['Skydance Media', 'Apple Studios']",0,120
4,594767,Shazam! Fury of the Gods,"['Action', 'Comedy', 'Fantasy', 'Adventure']",English,6.8,1510,2702.593,125000000,"['New Line Cinema', 'The Safran Company', 'DC ...",133437105,130


In [9]:
df.shape

(10000, 11)

In [10]:
df.genres.unique()

array(["['Horror', 'Mystery', 'Thriller']",
       "['Action', 'Adventure', 'Science Fiction']",
       "['Animation', 'Adventure', 'Family', 'Fantasy', 'Comedy']", ...,
       "['Thriller', 'Drama', 'Music']",
       "['Fantasy', 'Animation', 'Action', 'Adventure', 'Science Fiction', 'Drama', 'Romance']",
       "['TV Movie', 'Fantasy', 'Animation', 'Action', 'Thriller', 'Science Fiction', 'Horror']"],
      dtype=object)

In [11]:
df.head()

Unnamed: 0,id,title,genres,original_language,vote_average,vote_count,popularity,budget,production_companies,revenue,runtime
0,758323,The Pope's Exorcist,"['Horror', 'Mystery', 'Thriller']",English,7.4,619,5089.969,18000000,"['Screen Gems', '2.0 Entertainment', 'Jesus & ...",65675816,103
1,640146,Ant-Man and the Wasp: Quantumania,"['Action', 'Adventure', 'Science Fiction']",English,6.6,2294,4665.438,200000000,"['Marvel Studios', 'Kevin Feige Productions']",464566092,125
2,502356,The Super Mario Bros. Movie,"['Animation', 'Adventure', 'Family', 'Fantasy'...",English,7.5,1861,3935.55,100000000,"['Universal Pictures', 'Illumination', 'Ninten...",1121048165,92
3,868759,Ghosted,"['Action', 'Comedy', 'Romance']",English,7.2,652,2791.532,0,"['Skydance Media', 'Apple Studios']",0,120
4,594767,Shazam! Fury of the Gods,"['Action', 'Comedy', 'Fantasy', 'Adventure']",English,6.8,1510,2702.593,125000000,"['New Line Cinema', 'The Safran Company', 'DC ...",133437105,130


In [12]:
df.corr()

  df.corr()


Unnamed: 0,id,vote_average,vote_count,popularity,budget,revenue,runtime
id,1.0,-0.241569,-0.259859,0.102688,-0.243411,-0.207591,-0.256838
vote_average,-0.241569,1.0,0.253543,0.040162,0.074849,0.149643,0.38844
vote_count,-0.259859,0.253543,1.0,0.069693,0.600121,0.753206,0.288462
popularity,0.102688,0.040162,0.069693,1.0,0.143257,0.148195,0.038973
budget,-0.243411,0.074849,0.600121,0.143257,1.0,0.735239,0.282498
revenue,-0.207591,0.149643,0.753206,0.148195,0.735239,1.0,0.253162
runtime,-0.256838,0.38844,0.288462,0.038973,0.282498,0.253162,1.0


In [13]:
df.dtypes

id                        int64
title                    object
genres                   object
original_language        object
vote_average            float64
vote_count                int64
popularity              float64
budget                    int64
production_companies     object
revenue                   int64
runtime                   int64
dtype: object

In [14]:
df.head()

Unnamed: 0,id,title,genres,original_language,vote_average,vote_count,popularity,budget,production_companies,revenue,runtime
0,758323,The Pope's Exorcist,"['Horror', 'Mystery', 'Thriller']",English,7.4,619,5089.969,18000000,"['Screen Gems', '2.0 Entertainment', 'Jesus & ...",65675816,103
1,640146,Ant-Man and the Wasp: Quantumania,"['Action', 'Adventure', 'Science Fiction']",English,6.6,2294,4665.438,200000000,"['Marvel Studios', 'Kevin Feige Productions']",464566092,125
2,502356,The Super Mario Bros. Movie,"['Animation', 'Adventure', 'Family', 'Fantasy'...",English,7.5,1861,3935.55,100000000,"['Universal Pictures', 'Illumination', 'Ninten...",1121048165,92
3,868759,Ghosted,"['Action', 'Comedy', 'Romance']",English,7.2,652,2791.532,0,"['Skydance Media', 'Apple Studios']",0,120
4,594767,Shazam! Fury of the Gods,"['Action', 'Comedy', 'Fantasy', 'Adventure']",English,6.8,1510,2702.593,125000000,"['New Line Cinema', 'The Safran Company', 'DC ...",133437105,130


In [15]:
action_movies = df[df['genres'].str.contains('Action')] # will output rows with action genre in them

In [16]:
action_movies = df[df['genres'] == 'Action']# will output rows with ONLY action genre in them. - there are none.

In [17]:
action_movies

Unnamed: 0,id,title,genres,original_language,vote_average,vote_count,popularity,budget,production_companies,revenue,runtime


In [18]:
# Convert string representations to actual lists
df['genres'] = df['genres'].apply(literal_eval)


In [19]:
df.head()

Unnamed: 0,id,title,genres,original_language,vote_average,vote_count,popularity,budget,production_companies,revenue,runtime
0,758323,The Pope's Exorcist,"[Horror, Mystery, Thriller]",English,7.4,619,5089.969,18000000,"['Screen Gems', '2.0 Entertainment', 'Jesus & ...",65675816,103
1,640146,Ant-Man and the Wasp: Quantumania,"[Action, Adventure, Science Fiction]",English,6.6,2294,4665.438,200000000,"['Marvel Studios', 'Kevin Feige Productions']",464566092,125
2,502356,The Super Mario Bros. Movie,"[Animation, Adventure, Family, Fantasy, Comedy]",English,7.5,1861,3935.55,100000000,"['Universal Pictures', 'Illumination', 'Ninten...",1121048165,92
3,868759,Ghosted,"[Action, Comedy, Romance]",English,7.2,652,2791.532,0,"['Skydance Media', 'Apple Studios']",0,120
4,594767,Shazam! Fury of the Gods,"[Action, Comedy, Fantasy, Adventure]",English,6.8,1510,2702.593,125000000,"['New Line Cinema', 'The Safran Company', 'DC ...",133437105,130


In [20]:
# Flatten the lists and convert to Series
flattened_series = pd.Series(chain.from_iterable(df['genres']))

# Get unique genres
unique_genres = flattened_series.unique()

print(unique_genres)

['Horror' 'Mystery' 'Thriller' 'Action' 'Adventure' 'Science Fiction'
 'Animation' 'Family' 'Fantasy' 'Comedy' 'Romance' 'Drama' 'History' 'War'
 'Crime' 'Music' 'Western' 'TV Movie' 'Documentary']


In [21]:
len(unique_genres) #This will be my k cluster number

19

### Basic Movie Recommender

So the essential idea of my movie recommender is to generate a list of movies to the user based on their genre of interest.

In [56]:
#Create a user prompt.
print (f"Here are the list of genres. {unique_genres}")
goi = [] #genres of interest
str = ""
while True:
    user_genre = input("Please enter a genre of interest (or 'exit' to quit): ").capitalize().strip() # strip in case a user will enter white spaces by accident.

    if user_genre == 'Exit':
        print("Exiting the program.")

        break
    elif user_genre in unique_genres:
        print(f"Great choice! {user_genre} is a popular genre.")
        str += user_genre
        goi.append(user_genre)
    else:
        print("Sorry, that's not a valid genre. Please choose from the following genres:")
        print(', '.join(unique_genres))

Here are the list of genres. ['Horror' 'Mystery' 'Thriller' 'Action' 'Adventure' 'Science Fiction'
 'Animation' 'Family' 'Fantasy' 'Comedy' 'Romance' 'Drama' 'History' 'War'
 'Crime' 'Music' 'Western' 'TV Movie' 'Documentary']
Great choice! Action is a popular genre.
Great choice! Comedy is a popular genre.
Sorry, that's not a valid genre. Please choose from the following genres:
Horror, Mystery, Thriller, Action, Adventure, Science Fiction, Animation, Family, Fantasy, Comedy, Romance, Drama, History, War, Crime, Music, Western, TV Movie, Documentary
Great choice! Crime is a popular genre.
Exiting the program.


In [57]:
# Convert list genres to string for TF-IDF vectorization later on but also to
df['genres_str'] = df['genres'].apply(', '.join)

In [58]:
goi #Our genres of interest from the user.

['Action', 'Comedy', 'Crime']

In [72]:
# df.groupby('vote_average')['title','genres','vote_average'].head(100).sort_values('vote_average',ascending=False) ## checking how to use sort_values.

In [71]:

# Create a boolean mask using str.contains for each genre
genre_masks = [df['genres_str'].str.contains(genre) for genre in goi]

# Combine the masks using logical OR (|)
combined_mask = pd.concat(genre_masks, axis=1).any(axis=1)

# Apply the combined mask to filter the DataFrame
filtered_df = df[combined_mask]

filtered_df.head(10).sort_values('vote_average',ascending=False)

Unnamed: 0,id,title,genres,original_language,vote_average,vote_count,popularity,budget,production_companies,revenue,runtime,genres_str
6,447365,Guardians of the Galaxy Volume 3,"[Science Fiction, Adventure, Action]",English,8.3,683,2520.308,250000000,"['Marvel Studios', 'Kevin Feige Productions']",289312702,150,"Science Fiction, Adventure, Action"
5,76600,Avatar: The Way of Water,"[Science Fiction, Adventure, Action]",English,7.7,7853,2280.912,460000000,"['20th Century Studios', 'Lightstorm Entertain...",2319331580,192,"Science Fiction, Adventure, Action"
2,502356,The Super Mario Bros. Movie,"[Animation, Adventure, Family, Fantasy, Comedy]",English,7.5,1861,3935.55,100000000,"['Universal Pictures', 'Illumination', 'Ninten...",1121048165,92,"Animation, Adventure, Family, Fantasy, Comedy"
9,493529,Dungeons & Dragons: Honor Among Thieves,"[Adventure, Fantasy, Comedy]",English,7.5,964,1655.052,151000000,"['Entertainment One', 'Paramount', 'Allspark P...",202484920,134,"Adventure, Fantasy, Comedy"
8,677179,Creed III,"[Drama, Action]",English,7.3,1298,1894.044,75000000,"['Metro-Goldwyn-Mayer', 'Proximity Media', 'Ba...",269000000,116,"Drama, Action"
10,948713,The Last Kingdom: Seven Kings Must Die,"[Action, Adventure, History, Drama, War]",English,7.3,317,1436.725,0,['Carnival Films'],0,111,"Action, Adventure, History, Drama, War"
3,868759,Ghosted,"[Action, Comedy, Romance]",English,7.2,652,2791.532,0,"['Skydance Media', 'Apple Studios']",0,120,"Action, Comedy, Romance"
4,594767,Shazam! Fury of the Gods,"[Action, Comedy, Fantasy, Adventure]",English,6.8,1510,2702.593,125000000,"['New Line Cinema', 'The Safran Company', 'DC ...",133437105,130,"Action, Comedy, Fantasy, Adventure"
1,640146,Ant-Man and the Wasp: Quantumania,"[Action, Adventure, Science Fiction]",English,6.6,2294,4665.438,200000000,"['Marvel Studios', 'Kevin Feige Productions']",464566092,125,"Action, Adventure, Science Fiction"
11,420808,Peter Pan & Wendy,"[Family, Fantasy, Action, Adventure]",English,5.9,277,1358.468,0,"['Walt Disney Pictures', 'Whitaker Entertainme...",0,106,"Family, Fantasy, Action, Adventure"


### Applying Clustering
Now that the basic idea of the movie recommender is created. How can we make a better movie recommender?

In [45]:
# Get movie titles of recommended movies with interest
recommended_titles_with_interest = df[df['genres_str'].str.contains(goi)]['title']

# Print recommended movie titles
print("Recommended movies based on your interest:")
print(recommended_titles_with_interest.tolist())

TypeError: unhashable type: 'list'

In [35]:
user_genres

['Action', 'Comedy']

In [36]:
# Convert list genres to string for TF-IDF vectorization
user_genres = user_genres.apply(', '.join)

AttributeError: 'list' object has no attribute 'apply'

In [37]:
df[df['genres_str'].str.contains(user_genres)]

KeyError: 'genres_str'

In [41]:


# Create a regular expression pattern for any of the specified genres
genre_pattern = '|'.join(user_genres)

# Filter movies based on user's genres
filtered_movies = df[df['genres'].str.contains(genre_pattern, case=False)]

if filtered_movies.empty:
    print(f"No movies found in the specified genres.")
else:
    print(f"Here are the movie titles in the specified genres:")
    for title in filtered_movies['title']:
        print(title)

KeyError: "None of [Float64Index([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,\n              ...\n              nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],\n             dtype='float64', length=10000)] are in the [columns]"

In [38]:
df.head()

Unnamed: 0,id,title,genres,original_language,vote_average,vote_count,popularity,budget,production_companies,revenue,runtime
0,758323,The Pope's Exorcist,"[Horror, Mystery, Thriller]",English,7.4,619,5089.969,18000000,"['Screen Gems', '2.0 Entertainment', 'Jesus & ...",65675816,103
1,640146,Ant-Man and the Wasp: Quantumania,"[Action, Adventure, Science Fiction]",English,6.6,2294,4665.438,200000000,"['Marvel Studios', 'Kevin Feige Productions']",464566092,125
2,502356,The Super Mario Bros. Movie,"[Animation, Adventure, Family, Fantasy, Comedy]",English,7.5,1861,3935.55,100000000,"['Universal Pictures', 'Illumination', 'Ninten...",1121048165,92
3,868759,Ghosted,"[Action, Comedy, Romance]",English,7.2,652,2791.532,0,"['Skydance Media', 'Apple Studios']",0,120
4,594767,Shazam! Fury of the Gods,"[Action, Comedy, Fantasy, Adventure]",English,6.8,1510,2702.593,125000000,"['New Line Cinema', 'The Safran Company', 'DC ...",133437105,130


In [None]:
# Convert genres to binary encoding
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df['genres'])

# Choose the number of clusters
num_clusters = 19

# Apply K-Means clustering
kmeans = KMeans(n_clusters=num_clusters)
df['cluster'] = kmeans.fit_predict(genre_matrix)

# Print the results
print(df)



pca = PCA(n_components=2)
pca_result = pca.fit_transform(genre_matrix)
df['pca_1'] = pca_result[:, 0]
df['pca_2'] = pca_result[:, 1]

plt.scatter(df['pca_1'], df['pca_2'], c=df['cluster'], cmap='viridis')
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.title('Genre Clusters')
plt.show()

In [None]:
# Determine the optimal number of clusters using the elbow method
wcss = []
for i in range(1, 25):  # Trying up to 10 clusters
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(genre_matrix)
    wcss.append(kmeans.inertia_)

# Plot the elbow method graph
plt.plot(range(1, 25), wcss)
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

In [None]:
kmeans.inertia_

In [None]:
!pip install scikit-surprise

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# List to store silhouette scores
silhouette_scores = []

# Determine silhouette scores for different numbers of clusters
for n_clusters in range(2, 25):  # Trying 2 to 10 clusters
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
    cluster_labels = kmeans.fit_predict(genre_matrix)
    silhouette_avg = silhouette_score(genre_matrix, cluster_labels)
    silhouette_scores.append(silhouette_avg)

# Plot silhouette scores
plt.plot(range(2, 25), silhouette_scores)
plt.title('Silhouette Method')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.show()

In [None]:
# Get user's input
user_interest = input("Enter your interest in a movie genre: ")

# Find movies with similar genres using the recommender model
similar_movies = []
for movie_id, genres in zip(df['movie_id'], df['genres']):
    if user_interest in genres:
        similar_movies.append(movie_id)

if not similar_movies:
    print("No movies found matching your interest.")
else:
    print("Recommended movies based on your interest:")
    for movie_id in similar_movies:
        movie_title = df[df['movie_id'] == movie_id]['title'].values[0]
        print(movie_title)

In [None]:
df.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# User input for genre of interest
user_interest = input("Enter your interest in a movie genre: ")

# Convert list genres to string for TF-IDF vectorization
df['genres_str'] = df['genres'].apply(', '.join)

# Vectorize genres using TF-IDF
vectorizer = TfidfVectorizer()
genre_matrix = vectorizer.fit_transform(df['genres_str'])

# Determine optimal number of clusters using silhouette score
silhouette_scores = []
for n_clusters in range(2, 11):  # Trying 2 to 10 clusters
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    cluster_labels = kmeans.fit_predict(genre_matrix)
    silhouette_avg = silhouette_score(genre_matrix, cluster_labels)
    silhouette_scores.append(silhouette_avg)

# Choose the optimal number of clusters
optimal_num_clusters = silhouette_scores.index(max(silhouette_scores)) + 2

# Perform K-Means clustering with optimal number of clusters
kmeans = KMeans(n_clusters=optimal_num_clusters, random_state=0)
cluster_labels = kmeans.fit_predict(genre_matrix)

# Get movies in the same cluster as the user's interest
user_cluster = kmeans.predict(vectorizer.transform([user_interest]))
recommended_movies = df[df['title'] != user_interest][cluster_labels == user_cluster[0]]

# Print recommended movies
print("Recommended movies based on your interest:")
print(recommended_movies['title'].tolist())

In [None]:
# Count the number of recommended movies that contain the user's interest
recommended_with_interest = sum(any(user_interest in genre for genre in genres) for genres in recommended_movies['genres'])

# Print recommended movies and the count
print("Recommended movies based on your interest:")
print(recommended_movies['title'].tolist())
print(f"Number of recommended movies with your interest: {recommended_with_interest}")


In [None]:
recommended_with_interest

In [None]:
len(recommended_movies)

In [None]:
# Get movie titles of recommended movies with interest
recommended_titles_with_interest = recommended_movies[recommended_movies['genres_str'].str.contains(user_interest)]['title']

# Print recommended movie titles
print("Recommended movies based on your interest:")
print(recommended_titles_with_interest.tolist())

In [None]:
len(recommended_titles_with_interest)

In [None]:
genres

In [None]:
recommended_titles_with_interest

In [None]:
recommended_movies[recommended_movies['genres_str'].str.contains(user_interest)]