In [1]:
import pandas as pd
import numpy as np

# warnings 
import warnings
warnings.filterwarnings('ignore')


# sklearn
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Step 1: Data imports

In [2]:
df_netflix = pd.read_csv('netflix/titles.csv')
df_amazon =  pd.read_csv('amazonprimezip/titles.csv')
df_hbo =  pd.read_csv('hbomaxzip/titles.csv')

In [3]:
df = pd.concat([df_netflix, df_amazon, df_hbo], axis=0)

In [4]:
df.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,51,['documentation'],['US'],1.0,,,,0.6,
1,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",['US'],,tt0075314,8.2,808582.0,40.965,8.179
2,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],,tt0068473,7.7,107673.0,10.01,7.3
3,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...",1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],,tt0071853,8.2,534486.0,15.461,7.811
4,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,1967,,150,"['war', 'action']","['GB', 'US']",,tt0061578,7.7,72662.0,20.398,7.6


# Step 3: Data Cleaning and Preprocessing

In [5]:
df_movies = df.drop_duplicates()

In [6]:
# Drop unnecessary columns
df_movies.drop(['description', 'age_certification'], axis=1, inplace=True)

##### working with production_countries column

In [7]:
df['production_countries']

0             ['US']
1             ['US']
2             ['US']
3             ['GB']
4       ['GB', 'US']
            ...     
3289          ['PR']
3290          ['PA']
3291              []
3292              []
3293          ['US']
Name: production_countries, Length: 19015, dtype: object

In [8]:

# 1. Remove unwanted characters from the 'production_countries' column
df_movies['production_countries'] = df_movies['production_countries'].str.replace(r"\[", '', regex=True).str.replace(r"'", '', regex=True).str.replace(r"\]", '', regex=True)

# 2. Extract the first country from the cleaned 'production_countries' column
df_movies['lead_prod_country'] = df_movies['production_countries'].str.split(',').str[0]

# 3. Calculate the number of countries involved in the production of each movie
df_movies['prod_countries_cnt'] = df_movies['production_countries'].str.split(',').str.len()

# 4. Replace any empty values in the 'lead_prod_country' column with NaN (Not a Number)
df_movies['lead_prod_country'] = df_movies['lead_prod_country'].replace('', np.nan)


In [9]:
df_movies['lead_prod_country'].value_counts()

lead_prod_country
US    9277
IN    1683
GB    1307
CA     655
JP     596
      ... 
NA       1
NP       1
GT       1
KG       1
RW       1
Name: count, Length: 125, dtype: int64

##### Working with genres

In [10]:
df_movies['genres']

0                                 ['documentation']
1                                ['drama', 'crime']
2       ['drama', 'action', 'thriller', 'european']
3                   ['fantasy', 'action', 'comedy']
4                                 ['war', 'action']
                           ...                     
3289                           ['romance', 'music']
3290                                     ['comedy']
3291                                     ['comedy']
3292                                     ['comedy']
3293                              ['documentation']
Name: genres, Length: 18980, dtype: object

In [11]:
# 1. Remove unwanted characters from the 'genres' column
df_movies['genres'] = df_movies['genres'].str.replace(r"\[", '', regex=True).str.replace(r"'", '', regex=True).str.replace(r"\]", '', regex=True)

# 2. Extract the first genre from the cleaned 'genres' column
df_movies['main_genre'] = df_movies['genres'].str.split(',').str[0]

# . Replace any empty values in the 'main_genre' column with NaN (Not a Number)
df_movies['main_genre'] = df_movies['main_genre'].replace('', np.nan)

In [12]:
df_movies['main_genre'].value_counts()

main_genre
drama            4786
comedy           3513
documentation    1985
thriller         1737
action           1012
romance           862
horror            792
scifi             753
crime             648
animation         646
western           465
fantasy           447
reality           325
family            270
music             185
war               111
history            88
sport              26
european            8
Name: count, dtype: int64

In [13]:
#  Drop unnecessary columns 'genres' and 'production_countries' from the DataFrame
df_movies.drop(['genres', 'production_countries'], axis=1, inplace=True)

### drop missing values

In [14]:
df_movies.shape

(18980, 14)

In [15]:
df_movies.isnull().sum()

id                        0
title                     1
type                      0
release_year              0
runtime                   0
seasons               14772
imdb_id                1394
imdb_score             1873
imdb_votes             1910
tmdb_popularity         670
tmdb_score             2656
lead_prod_country      1160
prod_countries_cnt        0
main_genre              321
dtype: int64

In [16]:
# drop season column
df_movies.drop(columns=['seasons'], axis=1, inplace = True)

In [17]:
df_movies.head()

Unnamed: 0,id,title,type,release_year,runtime,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,lead_prod_country,prod_countries_cnt,main_genre
0,ts300399,Five Came Back: The Reference Films,SHOW,1945,51,,,,0.6,,US,1,documentation
1,tm84618,Taxi Driver,MOVIE,1976,114,tt0075314,8.2,808582.0,40.965,8.179,US,1,drama
2,tm154986,Deliverance,MOVIE,1972,109,tt0068473,7.7,107673.0,10.01,7.3,US,1,drama
3,tm127384,Monty Python and the Holy Grail,MOVIE,1975,91,tt0071853,8.2,534486.0,15.461,7.811,GB,1,fantasy
4,tm120801,The Dirty Dozen,MOVIE,1967,150,tt0061578,7.7,72662.0,20.398,7.6,GB,2,war


In [18]:
df_movies.isnull().sum()

id                       0
title                    1
type                     0
release_year             0
runtime                  0
imdb_id               1394
imdb_score            1873
imdb_votes            1910
tmdb_popularity        670
tmdb_score            2656
lead_prod_country     1160
prod_countries_cnt       0
main_genre             321
dtype: int64

In [19]:
# Drop rows with any missing values to clean the dataset
df_movies.dropna(inplace=True)

# Set the 'title' column as the DataFrame index
df_movies.set_index('title', inplace=True)

# Drop the 'id' and 'imdb_id' columns as they are not needed for further analysis
df_movies.drop(['id', 'imdb_id'], axis=1, inplace=True)


In [20]:
df_movies.shape

(14855, 10)

# Encoding Categorical Features:

In [21]:
# Create dummy variables for categorical columns ('type', 'lead_prod_country', 'main_genre')
dummies = pd.get_dummies(df_movies[['type', 'lead_prod_country', 'main_genre']], drop_first=True)

# Concatenate the dummy variables with the original DataFrame
df_movies_dum = pd.concat([df_movies, dummies], axis=1)

# 14. Drop the original categorical columns after creating dummy variables
df_movies_dum.drop(['type', 'lead_prod_country', 'main_genre'], axis=1, inplace=True)

# Scaling (MinmaxScaler):

In [22]:
# Apply MinMaxScaler to scale the data for model training
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df_movies_dum)
df_scaled = pd.DataFrame(df_scaled, columns=df_movies_dum.columns)

# Display the scaled DataFrame

df_scaled

Unnamed: 0,release_year,runtime,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,prod_countries_cnt,type_SHOW,lead_prod_country_AF,lead_prod_country_AL,...,main_genre_history,main_genre_horror,main_genre_music,main_genre_reality,main_genre_romance,main_genre_scifi,main_genre_sport,main_genre_thriller,main_genre_war,main_genre_western
0,0.581818,0.347561,0.806818,0.316407,0.009210,0.802065,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.545455,0.332317,0.750000,0.042132,0.002250,0.706522,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.572727,0.277439,0.806818,0.209149,0.003476,0.762065,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.500000,0.457317,0.750000,0.028432,0.004586,0.739130,0.142857,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.518182,0.091463,0.875000,0.028730,0.003961,0.815870,0.000000,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14850,0.990909,0.121951,0.295455,0.000070,0.001286,0.554348,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14851,0.990909,0.323171,0.590909,0.000009,0.015724,0.554348,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14852,0.990909,0.079268,0.647727,0.000004,0.000223,1.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14853,0.990909,0.176829,0.306818,0.000086,0.000539,0.500000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<a id="5"></a> <br>
# step 4: DBSCAN 

###### run a loop to get best epsilon value and minpnts

In [23]:
# Define the range of epsilon (eps) and minimum samples (min_samples) parameters for DBSCAN
eps_array = [0.2, 0.5, 1]  # List of different epsilon values (the maximum distance between two samples for one to be considered as in the neighborhood of the other)
min_samples_array = [5, 10, 30]  # List of different min_samples values (the number of samples in a neighborhood for a point to be considered as a core point)

# Iterate over each combination of eps and min_samples
for eps in eps_array:
    for min_samples in min_samples_array:
        # Initialize and fit the DBSCAN model with the current parameters
        clusterer = DBSCAN(eps=eps, min_samples=min_samples).fit(df_scaled)
        
        # Retrieve the cluster labels from the fitted model
        cluster_labels = clusterer.labels_
        
        # Check if the algorithm found only one cluster or marked all points as noise (-1 label for noise)
        if len(set(cluster_labels)) == 1:
            continue  # Skip this combination as it does not provide meaningful clusters
        
        # Calculate the silhouette score to evaluate the quality of the clustering
        silhouette_avg = silhouette_score(df_scaled, cluster_labels)
        
        # Print the current parameters, number of clusters, and the silhouette score
        print("For eps =", eps,
              "For min_samples =", min_samples,
              "Count clusters =", len(set(cluster_labels)),
              "The average silhouette_score is :", silhouette_avg)


For eps = 0.2 For min_samples = 5 Count clusters = 228 The average silhouette_score is : 0.5279735696015463
For eps = 0.2 For min_samples = 10 Count clusters = 129 The average silhouette_score is : 0.449427531631811
For eps = 0.2 For min_samples = 30 Count clusters = 49 The average silhouette_score is : 0.3096912856472519
For eps = 0.5 For min_samples = 5 Count clusters = 297 The average silhouette_score is : 0.6158428604489057
For eps = 0.5 For min_samples = 10 Count clusters = 171 The average silhouette_score is : 0.5581954472036961
For eps = 0.5 For min_samples = 30 Count clusters = 69 The average silhouette_score is : 0.4395908978429388
For eps = 1 For min_samples = 5 Count clusters = 303 The average silhouette_score is : 0.6197731362825976
For eps = 1 For min_samples = 10 Count clusters = 176 The average silhouette_score is : 0.5617750734157442
For eps = 1 For min_samples = 30 Count clusters = 72 The average silhouette_score is : 0.4458811460892883


# DBSCAN With Best Hypterparameters (eps=1, minpnts=5)

In [24]:
dbscan_model = DBSCAN(eps=1, min_samples=5).fit(df_scaled)
print("For eps =", 1,
      "For min_samples =", 5,
      "Count clusters =", len(set(dbscan_model.labels_)),
      "The average silhouette_score is :", silhouette_score(df_scaled, dbscan_model.labels_))

For eps = 1 For min_samples = 5 Count clusters = 303 The average silhouette_score is : 0.6197731362825976


##### save clusters for recommendations 

In [25]:
df_movies['dbscan_clusters'] = dbscan_model.labels_

In [26]:
df_movies['dbscan_clusters'].value_counts()

dbscan_clusters
 0      1397
 10     1288
-1      1072
 5       756
 95      662
        ... 
 190       5
 270       5
 232       5
 272       5
 189       5
Name: count, Length: 303, dtype: int64

<a id="6"></a> <br>
# Step 5: Movie Recommendation Function

#### Our data is ready to use the clustering results to try and recommend a movie by the name of the one you like

In [27]:
import random

def recommend_movie(movie_name: str):
    # Convert the input movie name to lowercase for case-insensitive matching
    movie_name = movie_name.lower()

    # Create a new column 'name' with lowercase movie names for comparison
    df_movies['name'] = df_movies.index.str.lower()

    # Find the movie that matches the input name
    movie = df_movies[df_movies['name'].str.contains(movie_name, na=False)]

    if not movie.empty:
        # Get the cluster label of the input movie
        cluster = movie['dbscan_clusters'].values[0]

        # Get all movies in the same cluster
        cluster_movies = df_movies[df_movies['dbscan_clusters'] == cluster]

        # If there are more than 5 movies in the cluster, randomly select 5
        if len(cluster_movies) >= 5:
            recommended_movies = random.sample(list(cluster_movies.index), 5)
        else:
            # If fewer than 5, return all the movies in the cluster
            recommended_movies = list(cluster_movies.index)

        # Print the recommended movies
        print('--- We can recommend you these movies ---')
        for m in recommended_movies:
            print(m)
    else:
        print('Movie not found in the database.')


In [28]:
s = input('Input movie name: ')

print("\n\n")
recommend_movie(s)




--- We can recommend you these movies ---
Tales from the Loop
Spawn
Black Summer
Mortal Kombat: Conquest
Trollhunters: Tales of Arcadia


In [29]:
s = input('Input movie name: ')

print("\n\n")
recommend_movie(s)




--- We can recommend you these movies ---
The Originals
Preacher
DMZ
HAPPY!
Mortal Kombat: Conquest


In [30]:
s = input('Input movie name: ')

print("\n\n")
recommend_movie(s)




--- We can recommend you these movies ---
Goosebumps
The Plot Against America
The 100
DOTA: Dragon's Blood
Eureka


# Streamlit App (so save df_movies dataset)

In [32]:
df_movies.to_csv("clustered_movies.csv", index=False)