In [3]:
import json
import time

import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="whitegrid")

pd.set_option('display.max_rows', 100)

In [None]:
import pickle
import glob

base_path = os.path.dirname(os.getcwd())
master_data_path = os.path.abspath(os.path.join(base_path,'master-data'))
data_path = os.path.abspath(os.path.join(base_path,'data'))
images_path = os.path.abspath(os.path.join(base_path,'dm-final-report', 'images'))   

In [4]:
title_basics_crew_principals_ratings_merged_df = pd.read_pickle(os.path.join(data_path,"title.basics.crew.principals.ratings.cleaned.sav"))

In [None]:

title_principals_df = pd.read_pickle(os.path.join(data_path,"title.principals.cleaned.sav"))
name_basics_original_df = pd.read_pickle(os.path.join(master_data_path,"name.basics.sav"))

# SOM Clusters

In [None]:
som_clustered = pd.read_pickle(os.path.join(data_path,"som_clustered.sav"))

def get_cluster_number(cluster):
    return int(cluster.split("-")[0]) * 10 + int(cluster.split("-")[1])

som_clustered["cluster_number"] = som_clustered["cluster"].apply(get_cluster_number)
som_clustered

In [None]:
import pandas as pd
import plotly.express as px


df = som_clustered
df['cluster_number'] = df['cluster_number'].astype(str)




# Calculate counts for each 'cluster_number'
counts = df['cluster'].value_counts().reset_index()

counts.columns = ['cluster', 'count']

# Sort the dataframe by 'count' in descending order
counts_sorted = counts.sort_values('count', ascending=False)

# Create the bar chart
fig = px.bar(counts_sorted, x='cluster', y='count', title='Movie Count per Cluster Number')
fig.update_layout(
    paper_bgcolor='white',  # Set the overall background to white
    font_color='black',  # Ensure that the font color is black
    title_font_size=20,  # Increase title font size
    width=1000,  # Increase figure width
    height=800,
    font=dict(size=14),  # Increase general font size for axis titles, tick labels, etc.
    yaxis=dict(type='log')
)
fig.show()



In [None]:
df_read_for_kmeans = pd.read_pickle(os.path.join(data_path,"df_read_for_kmeans.sav"))
df_read_for_kmeans

In [None]:
som_clustered

In [None]:
# for cluster number 19

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [None]:
cluster_number = "19"

cluster_df = som_clustered[som_clustered["cluster_number"] == cluster_number]

merged_df = pd.merge(cluster_df, df_read_for_kmeans, on="tconst", how="inner")

df = merged_df.copy()

genres_expanded = df['genres'].str.get_dummies(sep=',')
df = pd.concat([df, genres_expanded], axis=1).drop('genres', axis=1)

# Select numerical columns (excluding 'tconst' which is an identifier)
numerical_cols = ['isAdult', 'startYear', 'runtimeMinutes', 'averageRating', 'numVotes',
                  'actor_score', 'actress_score', 'director_score', 'writer_score'] + list(genres_expanded.columns)

# Normalize these columns
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])


# Compute the cosine similarity matrix
cosine_sim_matrix = cosine_similarity(df[numerical_cols])

# Convert to DataFrame for better usability, setting the index and columns as movie IDs
cosine_sim_df = pd.DataFrame(cosine_sim_matrix, index=df['tconst'], columns=df['tconst'])


In [None]:
# Function to find most similar movies
def get_similar_movies(movie_id, top_n=5):
    # Ensure the movie ID is in the index to avoid errors
    if movie_id not in cosine_sim_df.index:
        return f"No data available for movie ID {movie_id}"

    # Get the similarity scores for a given movie with all movies
    sim_scores = cosine_sim_df.loc[movie_id]

    # Sort the movies based on the similarity scores
    sim_scores = sim_scores.sort_values(ascending=False)

    # Get the scores of the top-n most similar movies
    # Skip the first one since it will be the movie itself with a score of 1
    top_sim_scores = sim_scores.iloc[1:top_n+1]

    # Return the top similar movies and their scores
    sim_df = pd.DataFrame({'tconst': top_sim_scores.index, 'Similarity Score': top_sim_scores.values})
    similar_movies = pd.merge(sim_df, title_basics_crew_principals_ratings_merged_df, on='tconst', how='inner')[['tconst', 'primaryTitle', 'startYear']]
    return similar_movies

# Example usage:
similar_movies = get_similar_movies('tt0140683')
similar_movies

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import os
import pickle

def create_cosine_similarity_matrices(som_clustered, df_ready_for_kmeans, clusters):
    for cluster in clusters:
        print(f"Creating cosine similarity matrix for cluster {cluster}...")
        cluster_df = som_clustered[som_clustered["cluster"] == cluster]
        merged_df = pd.merge(cluster_df, df_ready_for_kmeans, on="tconst", how="inner")
        df = merged_df.copy()
        
        # One-hot encoding genres
        genres_expanded = df['genres'].str.get_dummies(sep=',')
        df = pd.concat([df, genres_expanded], axis=1).drop('genres', axis=1)
        
        # Select numerical columns for similarity computation
        numerical_cols = ['isAdult', 'startYear', 'runtimeMinutes', 'averageRating', 'numVotes',
                          'actor_score', 'actress_score', 'director_score', 'writer_score'] + list(genres_expanded.columns)
        
        # Normalize these columns
        scaler = StandardScaler()
        df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
        
        # Compute the cosine similarity matrix
        cosine_sim_matrix = cosine_similarity(df[numerical_cols])
        
        # Convert to DataFrame for better usability, setting the index and columns as movie IDs
        cosine_sim_df = pd.DataFrame(cosine_sim_matrix, index=df['tconst'], columns=df['tconst'])
        cosine_sim_df.to_pickle(os.path.join(data_path, "cosine_sim_data", f"cosine_sim_df_cluster_{cluster}.sav"))
        print(f"Saved cosine similarity matrix for cluster {cluster}.")


def load_all_cosine_sim_dfs(clusters):
    all_cosine_sim_dfs = {}
    for cluster in clusters:
        file_path = os.path.join(data_path, "cosine_sim_data", f"cosine_sim_df_cluster_{cluster}.sav")
        if os.path.exists(file_path):
            all_cosine_sim_dfs[cluster] = pd.read_pickle(file_path)
            
    # dump the pkl
    with open(os.path.join(data_path, "cosine_sim_data", "all_cosine_sim_dfs.pkl"), "wb") as f:
        pickle.dump(all_cosine_sim_dfs, f)
        
    # read the pkl
    with open(os.path.join(data_path, "cosine_sim_data", "all_cosine_sim_dfs.pkl"), "rb") as f:
        all_cosine_sim_dfs = pickle.load(f)
    return all_cosine_sim_dfs

def find_similar_movies(tconst,som_clustered, all_cosine_sim_dfs):
    # Determine the cluster of the movie
    if tconst in som_clustered['tconst'].values:
        cluster = som_clustered.loc[som_clustered['tconst'] == tconst, 'cluster'].iloc[0]
        cosine_sim_df = all_cosine_sim_dfs[cluster]
        
        # Get the top 5 similar movies
        sim_scores = cosine_sim_df.loc[tconst].sort_values(ascending=False)[1:6]
        title_basics_crew_principals_ratings_merged_df = pd.read_pickle(os.path.join(data_path,"title.basics.crew.principals.ratings.cleaned.sav"))
        similar_movies = title_basics_crew_principals_ratings_merged_df[title_basics_crew_principals_ratings_merged_df['tconst'].isin(sim_scores.index)]
        similar_movies = similar_movies.assign(Similarity=sim_scores.values)
        return similar_movies[['tconst', 'primaryTitle', 'startYear', 'Similarity']]
    else:
        return "Movie ID not found."

# Example usage
som_clustered = pd.read_pickle(os.path.join(data_path,"som_clustered.sav"))
df_ready_for_kmeans = pd.read_pickle(os.path.join(data_path,"df_read_for_kmeans.sav"))

clusters = som_clustered['cluster'].unique()

# create_cosine_similarity_matrices(som_clustered, df_ready_for_kmeans, clusters)
all_cosine_sim_dfs = load_all_cosine_sim_dfs(clusters)
similar_movies = find_similar_movies('tt1375666', som_clustered, all_cosine_sim_dfs)
similar_movies

In [5]:
som_clustered.merge(title_basics_crew_principals_ratings_merged_df, on="tconst", how="inner")[['tconst', 'primaryTitle', 'startYear', 'cluster', 'averageRating', 'numVotes']].sort_values(by="numVotes", ascending=False)

Unnamed: 0,tconst,primaryTitle,startYear,cluster,averageRating,numVotes
45617,tt1375666,Inception,2010,0-6,8.8,2537886
38365,tt0816692,Interstellar,2014,2-9,8.7,2082145
55317,tt0133093,The Matrix,1999,0-6,8.7,2042794
32782,tt0167260,The Lord of the Rings: The Return of the King,2003,2-9,9.0,1969825
15,tt0114369,Se7en,1995,3-9,8.6,1789345
...,...,...,...,...,...,...
94542,tt0100110,Le marché du couple,1990,9-9,6.0,5
56739,tt2437612,The Hobby Stop,2012,5-5,8.8,5
659,tt15510976,A Filha Do Governador 2,2016,9-3,9.6,5
52732,tt8391544,Ticket,2019,6-6,6.6,5


In [8]:
import pickle
import pandas as pd
import os

base_path = os.path.dirname(os.getcwd())
data_path = os.path.abspath(os.path.join(base_path,'data'))


som_clustered = pd.read_pickle(os.path.join(data_path,"som_clustered.sav"))
df_ready_for_kmeans = pd.read_pickle(os.path.join(data_path,"df_read_for_kmeans.sav"))

clusters = som_clustered['cluster'].unique()


def load_all_cosine_sim_dfs():
    with open(os.path.join(data_path, "cosine_sim_data", "all_cosine_sim_dfs.pkl"), "rb") as f:
        all_cosine_sim_dfs = pickle.load(f)
        
    return all_cosine_sim_dfs

def find_similar_movies(tconst,som_clustered, all_cosine_sim_dfs):
    # Determine the cluster of the movie
    if tconst in som_clustered['tconst'].values:
        cluster = som_clustered.loc[som_clustered['tconst'] == tconst, 'cluster'].iloc[0]
        cosine_sim_df = all_cosine_sim_dfs[cluster]
        
        # Get the top 5 similar movies
        sim_scores = cosine_sim_df.loc[tconst].sort_values(ascending=False)[1:6]
        title_basics_crew_principals_ratings_merged_df = pd.read_pickle(os.path.join(data_path,"title.basics.crew.principals.ratings.cleaned.sav"))
        similar_movies = title_basics_crew_principals_ratings_merged_df[title_basics_crew_principals_ratings_merged_df['tconst'].isin(sim_scores.index)]
        similar_movies = similar_movies.assign(Similarity=sim_scores.values)
        return similar_movies[['tconst', 'primaryTitle', 'startYear', 'Similarity']].reset_index(drop=True)
    else:
        return "Movie ID not found."

all_cosine_sim_dfs = load_all_cosine_sim_dfs()
similar_movies = find_similar_movies('tt0102926', som_clustered, all_cosine_sim_dfs)
print(similar_movies.to_latex(index=False))

\begin{tabular}{llrr}
\toprule
tconst & primaryTitle & startYear & Similarity \\
\midrule
tt0375679 & Crash & 2004 & 0.978776 \\
tt0407887 & The Departed & 2006 & 0.975251 \\
tt0477348 & No Country for Old Men & 2007 & 0.920194 \\
tt0765443 & Eastern Promises & 2007 & 0.917364 \\
tt7286456 & Joker & 2019 & 0.887485 \\
\bottomrule
\end{tabular}

