In [1]:
import json
import time

import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="whitegrid")

pd.set_option('display.max_rows', 100)

In [2]:
import pickle
import glob

base_path = os.path.dirname(os.getcwd())
master_data_path = os.path.abspath(os.path.join(base_path,'master-data'))
data_path = os.path.abspath(os.path.join(base_path,'data'))
images_path = os.path.abspath(os.path.join(base_path,'dm-final-report', 'images'))   
# tsv_files = glob.glob(os.path.join(master_data_path,"*.tsv.gz"))

# for file in tsv_files:
#     print(file)
#     pickle.dump(pd.read_table(file,sep="\t",low_memory=False, na_values=["\\N","nan"]),
#                 open(file[:-7]+".sav","wb"))

In [15]:
title_basics_crew_principals_ratings_merged_df = pd.read_pickle(os.path.join(data_path,"title.basics.crew.principals.ratings.cleaned.sav"))
title_principals_df = pd.read_pickle(os.path.join(data_path,"title.principals.cleaned.sav"))
name_basics_original_df = pd.read_pickle(os.path.join(master_data_path,"name.basics.sav"))

In [None]:
title_basics_crew_principals_ratings_merged_df

In [None]:
principals = title_principals_df.merge(name_basics_original_df[['nconst','primaryName', 'primaryProfession']], on='nconst', how='inner')
principals = principals.dropna(subset=['primaryProfession'])
principals["actor"] = principals["primaryProfession"].apply(lambda x: "actor" in x)
principals["actress"] = principals["primaryProfession"].apply(lambda x: "actress" in x)

actors_main_df = principals[principals["actor"] == True][['tconst', 'nconst']]
actors_main_df = actors_main_df.rename(columns={'nconst':'actor'})
actresses_main_df = principals[principals["actress"] == True][['tconst', 'nconst']]
actresses_main_df = actresses_main_df.rename(columns={'nconst':'actress'})

my idea is to for each movie, we check if it has prominent actors, actresses, directors, writers, and give these movies a score 

In [None]:
# actor score

actors_df = title_basics_crew_principals_ratings_merged_df[['tconst', 'averageRating', 'numVotes']].merge(actors_main_df, on='tconst', how='inner').copy()
mean_scores = actors_df.groupby('actor').agg(mean_rating=('averageRating', 'mean'),
                                         mean_votes=('numVotes', 'mean')).reset_index()
# Apply Min-Max normalization
mean_scores['normalized_rating'] = (mean_scores['mean_rating'] - mean_scores['mean_rating'].min()) / (mean_scores['mean_rating'].max() - mean_scores['mean_rating'].min())
mean_scores['normalized_votes'] = (mean_scores['mean_votes'] - mean_scores['mean_votes'].min()) / (mean_scores['mean_votes'].max() - mean_scores['mean_votes'].min())
# Combine scores with equal weights for rating and votes
mean_scores['actor_score'] = 0.5 * mean_scores['normalized_rating'] + 0.5 * mean_scores['normalized_votes']
mean_scores = mean_scores[['actor', 'actor_score']].sort_values(by='actor_score', ascending=False).reset_index(drop=True)
actor_score = mean_scores.copy()
actor_score

In [None]:
# actress score

actresses_df = title_basics_crew_principals_ratings_merged_df[['tconst', 'averageRating', 'numVotes']].merge(actresses_main_df, on='tconst', how='inner').copy()
mean_scores = actresses_df.groupby('actress').agg(mean_rating=('averageRating', 'mean'),
                                         mean_votes=('numVotes', 'mean')).reset_index()
# Apply Min-Max normalization
mean_scores['normalized_rating'] = (mean_scores['mean_rating'] - mean_scores['mean_rating'].min()) / (mean_scores['mean_rating'].max() - mean_scores['mean_rating'].min())
mean_scores['normalized_votes'] = (mean_scores['mean_votes'] - mean_scores['mean_votes'].min()) / (mean_scores['mean_votes'].max() - mean_scores['mean_votes'].min())
# Combine scores with equal weights for rating and votes
mean_scores['actress_score'] = 0.5 * mean_scores['normalized_rating'] + 0.5 * mean_scores['normalized_votes']
mean_scores = mean_scores[['actress', 'actress_score']].sort_values(by='actress_score', ascending=False).reset_index(drop=True)
actress_score = mean_scores.copy()
actress_score

In [None]:
# director score

directors_df = title_basics_crew_principals_ratings_merged_df[['tconst','directors', 'averageRating', 'numVotes']].copy()
directors_df['directors'] = directors_df['directors'].str.split(',')
directors_df = directors_df.explode('directors')
directors_df = directors_df.rename(columns={'directors':'director'})
mean_scores = directors_df.groupby('director').agg(mean_rating=('averageRating', 'mean'),
                                         mean_votes=('numVotes', 'mean')).reset_index()
# Apply Min-Max normalization
mean_scores['normalized_rating'] = (mean_scores['mean_rating'] - mean_scores['mean_rating'].min()) / (mean_scores['mean_rating'].max() - mean_scores['mean_rating'].min())
mean_scores['normalized_votes'] = (mean_scores['mean_votes'] - mean_scores['mean_votes'].min()) / (mean_scores['mean_votes'].max() - mean_scores['mean_votes'].min())
# Combine scores with equal weights for rating and votes
mean_scores['director_score'] = 0.5 * mean_scores['normalized_rating'] + 0.5 * mean_scores['normalized_votes']
mean_scores = mean_scores[['director', 'director_score']].sort_values(by='director_score', ascending=False).reset_index(drop=True)
director_score = mean_scores.copy()
director_score

In [None]:
# writer score

writers_df = title_basics_crew_principals_ratings_merged_df[['tconst','writers', 'averageRating', 'numVotes']].copy()
writers_df['writers'] = writers_df['writers'].str.split(',')
writers_df = writers_df.explode('writers')
writers_df = writers_df.rename(columns={'writers':'writer'})
mean_scores = writers_df.groupby('writer').agg(mean_rating=('averageRating', 'mean'),
                                         mean_votes=('numVotes', 'mean')).reset_index()
# Apply Min-Max normalization
mean_scores['normalized_rating'] = (mean_scores['mean_rating'] - mean_scores['mean_rating'].min()) / (mean_scores['mean_rating'].max() - mean_scores['mean_rating'].min())
mean_scores['normalized_votes'] = (mean_scores['mean_votes'] - mean_scores['mean_votes'].min()) / (mean_scores['mean_votes'].max() - mean_scores['mean_votes'].min())
# Combine scores with equal weights for rating and votes
mean_scores['writer_score'] = 0.5 * mean_scores['normalized_rating'] + 0.5 * mean_scores['normalized_votes']
mean_scores = mean_scores[['writer', 'writer_score']].sort_values(by='writer_score', ascending=False).reset_index(drop=True)
writer_score = mean_scores.copy()
writer_score

In [None]:
# getting the highest score for each movie based on actor score

df = actors_df.merge(actor_score, on='actor', how='inner')
highest_actor_scores = df.groupby('tconst')['actor_score'].max().reset_index()
df_with_highest_actor_score = df.merge(highest_actor_scores, on=['tconst', 'actor_score'], how='inner')

In [None]:
# getting the highest score for each movie based on actress score

df = actresses_df.merge(actress_score, on='actress', how='inner')
highest_actress_scores = df.groupby('tconst')['actress_score'].max().reset_index()
df_with_highest_actress_score = df.merge(highest_actress_scores, on=['tconst', 'actress_score'], how='inner')

In [None]:
# getting the highest score for each movie based on director score

df = directors_df.merge(director_score, on='director', how='inner')
highest_director_scores = df.groupby('tconst')['director_score'].max().reset_index()
df_with_highest_director_score = df.merge(highest_director_scores, on=['tconst', 'director_score'], how='inner')

In [None]:
# getting the highest score for each movie based on writer score

df = writers_df.merge(writer_score, on='writer', how='inner')
highest_writer_scores = df.groupby('tconst')['writer_score'].max().reset_index()
df_with_highest_writer_score = df.merge(highest_writer_scores, on=['tconst', 'writer_score'], how='inner')

In [None]:
# combining all scores

df_read_for_kmeans = title_basics_crew_principals_ratings_merged_df[['tconst', 'isAdult', 'startYear', 'runtimeMinutes', 'genres', 'averageRating', 'numVotes']].copy()
df_read_for_kmeans = df_read_for_kmeans.merge(df_with_highest_actor_score[['tconst', 'actor_score']], on='tconst', how='inner')
df_read_for_kmeans = df_read_for_kmeans.merge(df_with_highest_actress_score[['tconst', 'actress_score']], on='tconst', how='inner')
df_read_for_kmeans = df_read_for_kmeans.merge(df_with_highest_director_score[['tconst', 'director_score']], on='tconst', how='inner')
df_read_for_kmeans = df_read_for_kmeans.merge(df_with_highest_writer_score[['tconst', 'writer_score']], on='tconst', how='inner')
df_read_for_kmeans = df_read_for_kmeans.drop_duplicates(subset=['tconst'])
df_read_for_kmeans

# KMeans Clustering

In [3]:
import os
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from sklearn.decomposition import PCA

In [4]:
# read df_read_for_kmeans

df_read_for_kmeans = pd.read_pickle(os.path.join(data_path,"df_read_for_kmeans.sav"))
df_read_for_kmeans

Unnamed: 0,tconst,isAdult,startYear,runtimeMinutes,genres,averageRating,numVotes,actor_score,actress_score,director_score,writer_score
0,tt0000009,False,1894,45,Romance,5.3,209,0.281154,0.238934,0.238965,0.238943
1,tt0000574,False,1906,70,"Action,Adventure,Biography",6.0,876,0.277972,0.277972,0.278103,0.278007
21,tt0000591,False,1907,90,Drama,5.5,23,0.266678,0.250004,0.250007,0.233341
23,tt0000941,False,1909,45,Drama,4.6,28,0.250931,0.200005,0.212971,0.276668
25,tt0001184,False,1910,58,"Adventure,Drama",3.8,22,0.216671,0.155559,0.212971,0.216674
...,...,...,...,...,...,...,...,...,...,...,...
634466,tt9916190,False,2020,95,"Action,Adventure,Thriller",3.6,255,0.283365,0.144500,0.144538,0.172270
634467,tt9916270,False,2020,84,Thriller,5.8,1475,0.292735,0.282253,0.233734,0.267053
634469,tt9916362,False,2020,92,"Drama,History",6.4,5754,0.291351,0.306038,0.276346,0.301511
634470,tt9916538,False,2019,123,Drama,8.6,7,0.343074,0.337452,0.307609,0.347330


In [5]:
df = df_read_for_kmeans.copy()

# Convert 'isAdult' from boolean to int
df['isAdult'] = df['isAdult'].astype(int)

# Extract genres and one-hot encode them
genres = df['genres'].str.get_dummies(sep=',')

# Normalize the numerical columns
scaler = StandardScaler()
numerical_features = ['isAdult', 'startYear', 'runtimeMinutes', 'averageRating', 'numVotes', 'actor_score', 'actress_score', 'director_score', 'writer_score']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Join the one-hot encoded genres back with the dataframe
df = df.join(genres).drop('genres', axis=1)

In [None]:
df = df.sample(n=50000, random_state=42, replace=False)
# save

df.to_pickle(os.path.join(data_path,"df_read_for_kmeans_sampled.sav"))

In [6]:
import os
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import plotly.express as px

base_path = os.path.dirname(os.getcwd())
data_path = os.path.abspath(os.path.join(base_path,'data'))   

df = pd.read_pickle(os.path.join(data_path,"df_read_for_kmeans_sampled.sav"))

numerical_features = ['isAdult', 'startYear', 'runtimeMinutes', 'averageRating', 'numVotes', 'actor_score', 'actress_score', 'director_score', 'writer_score']
genres = pd.read_pickle(os.path.join(data_path,"genres.sav"))

# Set a range for potential number of clusters you want to test
range_n_clusters = list(range(3, 6))

# Empty list to store the silhouette scores for each number of clusters
silhouette_scores = []

# Loop over the range to test each potential number of clusters
for n_clusters in range_n_clusters:
    print(f'Fitting model with {n_clusters} clusters')
    clusterer = KMeans(n_clusters=n_clusters, random_state=10, n_init=10)
    cluster_labels = clusterer.fit_predict(df[numerical_features + genres.columns.tolist()])
    print(f'Finished fitting model with {n_clusters} clusters')
    
    # Calculate silhouette score and append to list
    print(f'Calculating silhouette score for {n_clusters} clusters')
    silhouette_avg = silhouette_score(df[numerical_features + genres.columns.tolist()], cluster_labels)
    silhouette_scores.append(silhouette_avg)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

# Plot the silhouette scores using Plotly
fig = px.line(x=range_n_clusters, y=silhouette_scores, title='Silhouette Score for Each Number of Clusters', labels={'x':'Number of Clusters', 'y':'Silhouette Score'})
fig.update_layout(plot_bgcolor='white',  # Set the plot background to white
    paper_bgcolor='white',  # Set the overall background to white
    font_color='black',  # Ensure that the font color is black
    title_font_size=20,  # Increase title font size
    font=dict(size=14),  # Increase general font size for axis titles, tick labels, etc.
    width=1200,  # Adjust figure width
    height=800,  # Adjust figure height
    )  # Adjust margins)

fig.update_xaxes(tick0=0, dtick=1)

fig.write_image("silhouette_score2.png")

Fitting model with 3 clusters
Finished fitting model with 3 clusters
Calculating silhouette score for 3 clusters
For n_clusters = 3 The average silhouette_score is : 0.2224807617542949
Fitting model with 4 clusters
Finished fitting model with 4 clusters
Calculating silhouette score for 4 clusters
For n_clusters = 4 The average silhouette_score is : 0.18679198804488972
Fitting model with 5 clusters
Finished fitting model with 5 clusters
Calculating silhouette score for 5 clusters
For n_clusters = 5 The average silhouette_score is : 0.17513324992899906


In [7]:
import os
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import plotly.express as px

base_path = os.path.dirname(os.getcwd())
data_path = os.path.abspath(os.path.join(base_path,'data'))   

df = df.sample(n=50000, random_state=42, replace=False)

numerical_features = ['isAdult', 'startYear', 'runtimeMinutes', 'averageRating', 'numVotes', 'actor_score', 'actress_score', 'director_score', 'writer_score']
genres = pd.read_pickle(os.path.join(data_path,"genres.sav"))

# Set a range for potential number of clusters you want to test
range_n_clusters = list(range(10, 21))

# Empty list to store the silhouette scores for each number of clusters
silhouette_scores = []
n_clusters = 3

# Loop over the range to test each potential number of clusters

print(f'Fitting model with {n_clusters} clusters')
clusterer = KMeans(n_clusters=n_clusters, random_state=10, n_init=10)
df['cluster'] = clusterer.fit_predict(df[numerical_features + genres.columns.tolist()])
print(f'Finished fitting model with {n_clusters} clusters')

# Calculate silhouette score and append to list
print(f'Calculating silhouette score for {n_clusters} clusters')
silhouette_avg = silhouette_score(df[numerical_features + genres.columns.tolist()], df['cluster'])
silhouette_scores.append(silhouette_avg)
print("For n_clusters =", n_clusters,
      "The average silhouette_score is :", silhouette_avg)

Fitting model with 3 clusters
Finished fitting model with 3 clusters
Calculating silhouette score for 3 clusters
For n_clusters = 3 The average silhouette_score is : 0.22250704955597794


In [8]:
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(df[numerical_features + genres.columns.tolist()])
df['pca_x'] = reduced_features[:,0]
df['pca_y'] = reduced_features[:,1]

In [9]:
fig = px.scatter(df, x='pca_x', y='pca_y', color='cluster', title='K-means Clustering with 10 Clusters')
fig.update_layout(plot_bgcolor='white',  # Set the plot background to white
    paper_bgcolor='white',  # Set the overall background to white
    font_color='black',  # Ensure that the font color is black
    title_font_size=20,  # Increase title font size
    font=dict(size=14),  # Increase general font size for axis titles, tick labels, etc.
    width=1200,  # Adjust figure width
    height=800,  # Adjust figure height
    )  # Adjust margins)
fig.show()