# First import all necessary libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

# Import 3 datasets

In [6]:
df_netflix = pd.read_csv('titles-3.csv')
df_amazon = pd.read_csv('titles-2.csv')
df_hbo = pd.read_csv('titles.csv')

In [8]:
df_amazon.shape

(9871, 15)

In [10]:
df_hbo.shape

(3294, 15)

In [12]:
df_netflix.shape

(5850, 15)

# Concate these 3 datasets

In [16]:
df = pd.concat([df_amazon, df_hbo, df_netflix], axis=0)

In [18]:
df.shape

(19015, 15)

# Data cleaning and Preprocessing

In [23]:
# Drop the duplicate records
df_movies = df.drop_duplicates()
df_movies.duplicated().sum()

0

#### The re no duplicate values left

In [28]:
# Drop unnecessary columns
df_movies.drop(['description', 'age_certification'], axis=1, inplace=True)

In [32]:
df_movies.head(2)

Unnamed: 0,id,title,type,release_year,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts20945,The Three Stooges,SHOW,1934,19,"['comedy', 'family', 'animation', 'action', 'f...",['US'],26.0,tt0850645,8.6,1092.0,15.424,7.6
1,tm19248,The General,MOVIE,1926,78,"['action', 'drama', 'war', 'western', 'comedy'...",['US'],,tt0017925,8.2,89766.0,8.647,8.0


# Work with "production_countries" column

In [36]:
df_movies['production_countries'] = df_movies['production_countries'].str.replace(r"\[|\]|\'", '', regex=True)

In [38]:
df_movies['lead_production_countries'] = df_movies['production_countries'].str.split(',').str[0]

In [40]:
df_movies['production_countries_lenth'] = df_movies['production_countries'].str.split(',').str.len()

In [42]:
df_movies['lead_production_countries'] = df_movies['lead_production_countries'].replace('', np.nan)

# Work with 'genre' column

In [47]:
df_movies['genres'] = df_movies['genres'].str.replace(r"\[|\]|\'", '', regex=True)

In [49]:
df_movies['main_genres'] = df_movies['genres'].str.split(',').str[0]

In [51]:
df_movies['main_genres'] = df_movies['main_genres'].replace('', np.nan)

In [55]:
# Drop 'genres', 'production_countries' columns

df_movies.drop(['genres', 'production_countries'], axis=1, inplace=True)

# Drop missing values

In [60]:
df_movies.shape

(18980, 14)

In [62]:
df_movies.isnull().sum()

id                                0
title                             1
type                              0
release_year                      0
runtime                           0
seasons                       14772
imdb_id                        1394
imdb_score                     1873
imdb_votes                     1910
tmdb_popularity                 670
tmdb_score                     2656
lead_production_countries      1160
production_countries_lenth        0
main_genres                     321
dtype: int64

In [64]:
# Drop rows with any missing values to clean the dataset
df_movies.dropna(inplace=True)

# set the 'title' column as the DataFrame index
df_movies.set_index('title', inplace=True)

# Drop 'id' and 'imdb_id' columns as they are not needed for further analysis
df_movies.drop(['id', 'imdb_id'], axis=1, inplace=True)

In [68]:
df_movies.head(2)

Unnamed: 0_level_0,type,release_year,runtime,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,lead_production_countries,production_countries_lenth,main_genres
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
The Three Stooges,SHOW,1934,19,26.0,8.6,1092.0,15.424,7.6,US,1,comedy
What's My Line?,SHOW,1950,30,18.0,8.6,1563.0,87.392,6.9,US,1,reality


# Encoding categorical features

In [73]:
# Create dummy variables for categorical columns
dummies = pd.get_dummies(df_movies[['type', 'lead_production_countries', 'main_genres']], drop_first=True).astype('int')

# Concatenate the dummy variables with the original DataFrame
df_movies_dum = pd.concat([df_movies, dummies], axis=1)

# Drop the orifinal categorical columns after creating dummy variables
df_movies_dum.drop(['type', 'lead_production_countries', 'main_genres'], axis=1, inplace=True)

# Scaling (MinMaxScaler)

In [78]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(df_movies_dum)
df_scaled = pd.DataFrame(df_scaled, columns=df_movies_dum.columns)

In [79]:
df_scaled.head(2)

Unnamed: 0,release_year,runtime,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,production_countries_lenth,lead_production_countries_AR,lead_production_countries_AT,...,main_genres_history,main_genres_horror,main_genres_music,main_genres_reality,main_genres_romance,main_genres_scifi,main_genres_sport,main_genres_thriller,main_genres_war,main_genres_western
0,0.0,0.106742,0.490196,0.8875,0.000548,0.006928,0.73913,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.181818,0.168539,0.333333,0.8875,0.000785,0.039256,0.663043,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


# DBSCAN
### Run a loop to get best epsilon value and minpnts

In [87]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

In [91]:
# Define the range of epsilon and minimum samples (min_samples) parameter for DBSCAN
eps_array = [0.2, 0.5, 1]
min_samples_array = [5, 10, 30]

for eps in eps_array: 
    for min_sample in min_samples_array:
        cluster = DBSCAN(eps=eps, min_samples=min_sample)
        cluster.fit(df_scaled)
        cluster_label = cluster.labels_

        if len(set(cluster_label)) == 1:
            continue
        silhouette_avg = silhouette_score(df_scaled, cluster_label)

        print(f"eps: {eps}\nmin_sample: {min_sample}\nCount Cluster: {len(set(cluster_label))}\nAverage silhouette score: {silhouette_avg}")

        print('*'*15)

eps: 0.2
min_sample: 5
Count Cluster: 75
Average silhouette score: 0.4378840737098286
***************
eps: 0.2
min_sample: 10
Count Cluster: 37
Average silhouette score: 0.36601440046646755
***************
eps: 0.2
min_sample: 30
Count Cluster: 17
Average silhouette score: 0.23106054247198202
***************
eps: 0.5
min_sample: 5
Count Cluster: 91
Average silhouette score: 0.6019560501740349
***************
eps: 0.5
min_sample: 10
Count Cluster: 56
Average silhouette score: 0.5303679432698051
***************
eps: 0.5
min_sample: 30
Count Cluster: 21
Average silhouette score: 0.36228604161700484
***************
eps: 1
min_sample: 5
Count Cluster: 93
Average silhouette score: 0.6091664186394288
***************
eps: 1
min_sample: 10
Count Cluster: 57
Average silhouette score: 0.5362809971937993
***************
eps: 1
min_sample: 30
Count Cluster: 22
Average silhouette score: 0.37121300388037504
***************


# Train the model 
# DBSCAN with eps=1 and min_samples=5 as hyperparameter tuning

In [97]:
dbscan_model = DBSCAN(eps=1, min_samples=5)
dbscan_model.fit(df_scaled)

# Save the clusters for recommendations

In [106]:
df_movies['dbscan_clusters'] = dbscan_model.labels_

In [112]:
df_movies.head()

Unnamed: 0_level_0,type,release_year,runtime,seasons,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,lead_production_countries,production_countries_lenth,main_genres,dbscan_clusters
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
The Three Stooges,SHOW,1934,19,26.0,8.6,1092.0,15.424,7.6,US,1,comedy,0
What's My Line?,SHOW,1950,30,18.0,8.6,1563.0,87.392,6.9,US,1,reality,1
I Love Lucy,SHOW,1951,30,9.0,8.5,25944.0,17.088,8.1,US,1,comedy,0
Mister Rogers' Neighborhood,SHOW,1968,29,31.0,8.7,8675.0,8.747,4.7,US,1,fantasy,2
Lupin the Third,SHOW,1971,23,6.0,7.9,2116.0,45.829,8.0,JP,1,scifi,3


# Movie Recommendation Function
### Our data is ready to use the clustering results to try and recommend a movie by the name of the one you like

In [119]:
import random

def recommend_movie(movie_name: str):
    movie_name = movie_name.lower()
    df_movies['name'] = df_movies.index.str.lower()

    movie = df_movies[df_movies['name'].str.contains(movie_name, na=False)]

    if not movie.empty:
        cluster = movie['dbscan_clusters'].values[0]
        cluster_movies = df_movies[df_movies['dbscan_clusters']==cluster]

        if len(cluster_movies) >= 5:
            recommmended_movies = random.sample(list(cluster_movies.index), 5)
        else:
            recommmended_movies = list(cluster_movies.index)

        print(f"We can recommend you these movies")
        for movie in recommmended_movies:
            print(movie)

    else:
        print("Movie not found")

In [125]:
input_movie = input('Enter movie name: ')
print('\n\n')
recommend_movie(input_movie)

Enter movie name:  I Love Lucy





We can recommend you these movies
That Damn Michael Che
The Game
2 Dope Queens
Regular Show
Reno 911!
