## Import Dependencies


In [1]:
import pandas as pd
import numpy as np
# warnings
import warnings
warnings.filterwarnings('ignore')
# sklearn
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

## Data Imports

In [2]:
df_netflix = pd.read_csv('/content/drive/MyDrive/RecommendationSystem/Netflix/titles.csv')
df_amazon = pd.read_csv('/content/drive/MyDrive/RecommendationSystem/Amazon/titles.csv')
df_hbo = pd.read_csv('/content/drive/MyDrive/RecommendationSystem/HBO/titles.csv')

In [3]:
df = pd.concat([df_netflix, df_amazon, df_hbo], axis=0)
df.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,51,['documentation'],['US'],1.0,,,,0.6,
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",['US'],,tt0075314,8.2,808582.0,40.965,8.179
2,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],,tt0068473,7.7,107673.0,10.01,7.3
3,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],,tt0071853,8.2,534486.0,15.461,7.811
4,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"['war', 'action']","['GB', 'US']",,tt0061578,7.7,72662.0,20.398,7.6


In [4]:
df.columns

Index(['id', 'title', 'type', 'description', 'release_year',
       'age_certification', 'runtime', 'genres', 'production_countries',
       'seasons', 'imdb_id', 'imdb_score', 'imdb_votes', 'tmdb_popularity',
       'tmdb_score'],
      dtype='object')

## Data Cleaning and Preprocessing

In [5]:
df_movies = df.drop_duplicates()
df_movies.duplicated().sum()

0

In [7]:
# Drop unnecessary columns
df_movies.drop(['description', 'age_certification'], axis=1, inplace=True)

In [8]:
df_movies

Unnamed: 0,id,title,type,release_year,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts300399,Five Came Back: The Reference Films,SHOW,1945,51,['documentation'],['US'],1.0,,,,0.600,
1,tm84618,Taxi Driver,MOVIE,1976,114,"['drama', 'crime']",['US'],,tt0075314,8.2,808582.0,40.965,8.179
2,tm154986,Deliverance,MOVIE,1972,109,"['drama', 'action', 'thriller', 'european']",['US'],,tt0068473,7.7,107673.0,10.010,7.300
3,tm127384,Monty Python and the Holy Grail,MOVIE,1975,91,"['fantasy', 'action', 'comedy']",['GB'],,tt0071853,8.2,534486.0,15.461,7.811
4,tm120801,The Dirty Dozen,MOVIE,1967,150,"['war', 'action']","['GB', 'US']",,tt0061578,7.7,72662.0,20.398,7.600
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3289,tm1082718,Romeo Santos: Utopia Live from MetLife Stadium,MOVIE,2021,103,"['romance', 'music']",['PR'],,,,,8.425,8.100
3290,tm1067128,Algo Azul,MOVIE,2021,90,['comedy'],['PA'],,tt9257620,5.9,50.0,1.400,2.000
3291,tm1121489,Entre Nos: What She Said,MOVIE,2021,28,['comedy'],[],,tt15532762,,,,
3292,tm1121486,Entre Nos: The Winners 2,MOVIE,2021,28,['comedy'],[],,tt15532736,,,,


In [15]:
# 1. Remove unwanted characters from the 'production_countries' column
# The .str.replace() method is used to remove '[' and ']' characters, and any single quotes
# The 'regex=True' flag allows the .str.replace() method to interpret the patterns as regular expressions.
# Note: Square brackets [ ] are special characters in regex, so they are not part of character set and needs escaping.
df_movies['production_countries'] = df_movies['production_countries'].str.replace(r"\[", '', regex=True).str.replace(r"'", '', regex=True).str.replace(r"\]", '', regex=True)

# 2. Extract the first country from the cleaned 'production_countries' column
# The .str.split(',') splits the string into a list using commas as the delimiter, then .str[0] selects the first element.
# This creates a new column 'lead_prod_country' that represents the primary production country of each movie
df_movies['lead_prod_country'] = df_movies['production_countries'].str.split(',').str[0]

# 3. Calculate the number of countries involved in the production of each movie
# The .str.split(',') splits the 'production_countries' string by commas, and .str.len() counts the number of elements in the resulting list.
# This new column 'prod_countries_cnt' stores the count of production countries for each movie, providing additional data insights
df_movies['prod_countries_cnt'] = df_movies['production_countries'].str.split(',').str.len()

# 4. Replace any empty values in the 'lead_prod_country' column with NaN (Not a Number)
# This step uses the .replace() method to convert any empty strings ('') to np.nan (missing values)
# Handling missing data with NaN is important for accurate data analysis and prevents errors in downstream processing
df_movies['lead_prod_country'] = df_movies['lead_prod_country'].replace('', np.nan)

In [16]:
df_movies['lead_prod_country']

0        US
1        US
2        US
3        GB
4        GB
       ... 
3289     PR
3290     PA
3291    NaN
3292    NaN
3293     US
Name: lead_prod_country, Length: 18980, dtype: object

## Working with genres

In [17]:
df_movies['genres']

0                                 ['documentation']
1                                ['drama', 'crime']
2       ['drama', 'action', 'thriller', 'european']
3                   ['fantasy', 'action', 'comedy']
4                                 ['war', 'action']
                           ...                     
3289                           ['romance', 'music']
3290                                     ['comedy']
3291                                     ['comedy']
3292                                     ['comedy']
3293                              ['documentation']
Name: genres, Length: 18980, dtype: object

In [18]:
# 1. Remove unwanted characters from the 'genres' column
# The .str.replace() method is used to remove '[' and ']' characters, and any single quotes from the 'genres' column
# This cleans the 'genres' data by removing extraneous characters, making it easier to analyze and manipulate
# Note: Square brackets [ ] are special characters in regex, so they need escaping with a backslash (\).
df_movies['genres'] = df_movies['genres'].str.replace(r"\[", '', regex=True).str.replace(r"'", '', regex=True).str.replace(r"\]", '', regex=True)

# 2. Extract the first genre from the cleaned 'genres' column
# The .str.split(',') splits the 'genres' string by commas, and .str[0] selects the first element of the resulting list
# This creates a new column 'main_genre' that represents the primary genre of each movie
df_movies['main_genre'] = df_movies['genres'].str.split(',').str[0]

# . Replace any empty values in the 'main_genre' column with NaN (Not a Number)
# This step uses the .replace() method to convert any empty strings ('') to np.nan, indicating missing data
# Handling missing data with NaN is important for accurate data analysis and prevents errors in downstream processing
df_movies['main_genre'] = df_movies['main_genre'].replace('', np.nan)

In [19]:
df_movies['main_genre']

0       documentation
1               drama
2               drama
3             fantasy
4                 war
            ...      
3289          romance
3290           comedy
3291           comedy
3292           comedy
3293    documentation
Name: main_genre, Length: 18980, dtype: object

In [20]:
df_movies.drop(['genres', 'production_countries'], axis=1, inplace=True)

In [22]:
df_movies.columns

Index(['id', 'title', 'type', 'release_year', 'runtime', 'seasons', 'imdb_id',
       'imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score',
       'lead_prod_country', 'prod_countries_cnt', 'main_genre'],
      dtype='object')

## Drop Missing Values

In [23]:
df_movies.shape

(18980, 14)

In [24]:
df_movies.isnull().sum()

id                        0
title                     1
type                      0
release_year              0
runtime                   0
seasons               14772
imdb_id                1394
imdb_score             1873
imdb_votes             1910
tmdb_popularity         670
tmdb_score             2656
lead_prod_country      1160
prod_countries_cnt        0
main_genre              321
dtype: int64

In [25]:
# Drop rows with any missing values to clean the dataset
df_movies.dropna(inplace=True)

# Set the 'title' column as the DataFrame index
df_movies.set_index('title', inplace=True)

# Drop the 'id' and 'imdb_id' columns as they are not needed for further analysis
df_movies.drop(['id', 'imdb_id'], axis=1, inplace=True)

In [28]:
df_movies.shape

(3294, 11)

## Encoding Categorical Features

In [29]:
# Create dummy variables for categorical columns
dummies = pd.get_dummies(df_movies[['type', 'lead_prod_country', 'main_genre']], drop_first=True)

# Concatenate the dummy variables with the original DataFrame
df_movies_dum = pd.concat([df_movies, dummies], axis=1)

# Drop the original categorical columns after creating dummy variables
df_movies_dum.drop(['type', 'lead_prod_country', 'main_genre'], axis=1, inplace=True)

## Scaling (Min-MaxScaler)

In [30]:
# Apply MinMax Scaler
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df_movies_dum)
df_scaled = pd.DataFrame(df_scaled, columns=df_movies_dum.columns)

# Display scaled DataFrame
df_scaled

Unnamed: 0,release_year,runtime,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,prod_countries_cnt,lead_prod_country_AR,lead_prod_country_AT,...,main_genre_history,main_genre_horror,main_genre_music,main_genre_reality,main_genre_romance,main_genre_scifi,main_genre_sport,main_genre_thriller,main_genre_war,main_genre_western
0,0.397727,0.168539,0.058824,0.9125,0.037009,0.007913,0.815870,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.625000,0.134831,0.156863,0.9250,0.155671,0.058490,0.815326,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.545455,0.286517,0.058824,0.6750,0.017194,0.022579,0.728261,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.568182,0.056180,0.450980,0.6250,0.002570,0.018954,0.619565,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.625000,0.129213,0.078431,0.7000,0.017658,0.008919,0.782609,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3289,0.988636,0.146067,0.000000,0.5000,0.000028,0.002064,0.456522,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3290,0.988636,0.258427,0.000000,0.5500,0.000027,0.002077,1.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3291,0.988636,0.185393,0.000000,0.5750,0.000017,0.000377,0.021739,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3292,0.988636,0.191011,0.019608,0.3125,0.000067,0.001158,0.510870,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


## DBSCAN

In [31]:
# Define the range of epsilon (eps) and minimum samples (min_samples) parameters for DBSCAN
eps_array = [0.2, 0.5, 1]  # List of different epsilon values (the maximum distance between two samples for one to be considered as in the neighborhood of the other)
min_samples_array = [5, 10, 30]  # List of different min_samples values (the number of samples in a neighborhood for a point to be considered as a core point)

# Iterate over each combination of eps and min_samples
for eps in eps_array:
    for min_samples in min_samples_array:
        clusterer = DBSCAN(eps=eps, min_samples=min_samples).fit(df_scaled)
        cluster_labels = clusterer.labels_

        if len(set(cluster_labels)) == 1:
            continue
        silhouette_avg = silhouette_score(df_scaled, cluster_labels)

        print('For eps =', eps,
              "For min samples =", min_samples,
              "Count Cluster =", len(set(cluster_labels)),
              "The average silhouette score is :", silhouette_avg)

For eps = 0.2 For min samples = 5 Count Cluster = 75 The average silhouette score is : 0.4378840737098286
For eps = 0.2 For min samples = 10 Count Cluster = 37 The average silhouette score is : 0.36601440046646755
For eps = 0.2 For min samples = 30 Count Cluster = 17 The average silhouette score is : 0.23106054247198202
For eps = 0.5 For min samples = 5 Count Cluster = 91 The average silhouette score is : 0.6019560501740351
For eps = 0.5 For min samples = 10 Count Cluster = 56 The average silhouette score is : 0.5303679432698052
For eps = 0.5 For min samples = 30 Count Cluster = 21 The average silhouette score is : 0.36228604161700484
For eps = 1 For min samples = 5 Count Cluster = 93 The average silhouette score is : 0.6091664186394289
For eps = 1 For min samples = 10 Count Cluster = 57 The average silhouette score is : 0.5362809971937993
For eps = 1 For min samples = 30 Count Cluster = 22 The average silhouette score is : 0.37121300388037515


## DBSCAN with best hyperparameters

In [32]:
dbscan_model = DBSCAN(eps=1, min_samples=5).fit(df_scaled)

print("Clusters ", len(set(dbscan_model.labels_)))
print("Score :", silhouette_score(df_scaled, dbscan_model.labels_))

Clusters  93
Score : 0.6091664186394289


## Save Cluster for recommendations

In [33]:
df_movies['dbscan_clusters'] = dbscan_model.labels_

In [34]:
df_movies

Unnamed: 0_level_0,type,release_year,runtime,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,lead_prod_country,prod_countries_cnt,main_genre,dbscan_clusters
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Monty Python's Flying Circus,SHOW,1969,30,4.0,8.8,73424.0,17.617,8.306,GB,1,comedy,0
Seinfeld,SHOW,1989,24,9.0,8.9,308824.0,130.213,8.301,US,1,comedy,1
Knight Rider,SHOW,1982,51,4.0,6.9,34115.0,50.267,7.500,US,1,scifi,2
Thomas & Friends,SHOW,1984,10,24.0,6.5,5104.0,42.196,6.500,GB,1,animation,3
Saved by the Bell,SHOW,1989,23,5.0,7.1,35034.0,19.855,8.000,US,1,family,4
...,...,...,...,...,...,...,...,...,...,...,...,...
Level Playing Field,SHOW,2021,26,1.0,5.5,60.0,4.595,5.000,US,1,documentation,35
Os Ausentes,SHOW,2021,46,1.0,5.9,59.0,4.624,10.000,BR,1,action,-1
Through Our Eyes,SHOW,2021,33,1.0,6.1,38.0,0.840,1.000,US,1,documentation,35
Sweet Life: Los Angeles,SHOW,2021,34,2.0,4.0,137.0,2.579,5.500,US,1,reality,5


## Movie Recommendation Function

In [42]:
import random

def recommend_movie(movie_name: str):
    movie_name = movie_name.lower()
    df_movies['name'] = df_movies.index.str.lower()

    movie = df_movies[df_movies['name'].str.contains(movie_name, na=False)]

    if not movie.empty:
        cluster = movie['dbscan_clusters'].values[0]
        cluster_movies = df_movies[df_movies['dbscan_clusters']==cluster]

        if len(cluster_movies) >= 5:
            recommended_movies = random.sample(list(cluster_movies.index), 5)
        else:
            recommended_movies = list(cluster_movies.index)

        print("---We can recommend you these movies---")
        for m in recommended_movies:
            print(m)
    else:
        print("Movie not found in database!")

In [44]:
input_movie = input("Enter movie name: ")
print("\n\n")
recommend_movie(input_movie)

Enter movie name: hacker



---We can recommend you these movies---
The Last Word
You Are Wanted
Generation War
Babylon Berlin
Criminal: Germany


## Save dataset

In [45]:
df_movies.to_csv("clustered_movies.csv", index=False)