# CineSimile: DM Project - Movie Recommendation System

In [None]:
import json
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
# Use seaborn's white-grid theme for matplotlib/seaborn figures.
sns.set_theme(style="whitegrid")

# Let pandas render up to 100 rows before it truncates a displayed frame.
pd.set_option('display.max_rows', 100)

In [None]:
import pickle
import glob

# Every project directory is resolved against the parent of the current
# working directory, so the notebook must be started from a sub-folder of the
# project root.
base_path = os.path.dirname(os.getcwd())
master_data_path, data_path, images_path = (
    os.path.abspath(os.path.join(base_path, *parts))
    for parts in (('master-data',), ('data',), ('dm-final-report', 'images'))
)

# Disabled one-off conversion, kept for reference: reads every *.tsv.gz file in
# master_data_path (treating "\N" and "nan" as missing) and pickles it next to
# the source with the ".tsv.gz" suffix replaced by ".sav".
# tsv_files = glob.glob(os.path.join(master_data_path,"*.tsv.gz"))

# for file in tsv_files:
#     print(file)
#     pickle.dump(pd.read_table(file,sep="\t",low_memory=False, na_values=["\\N","nan"]),
#                 open(file[:-7]+".sav","wb"))

## Merging the Datasets

Loading the datasets

- **Title Basics Dataset**: Contains the basic information about the movies.
- **Title Crew Dataset**: Contains the directors and writers of the movies.
- **Title Principals Dataset**: Contains the cast members of the movies.
- **Title Ratings Dataset**: Contains the ratings of the movies.
- **Name Basics Dataset**: Contains information about the people who are associated with the movies.

In [None]:
def _load_pickle(directory, filename):
    """Read one pickled DataFrame stored as `filename` inside `directory`."""
    return pd.read_pickle(os.path.join(directory, filename))


# The cleaned tables are read from data_path; the ratings table is read from
# the pickle in master_data_path.
title_basics_df = _load_pickle(data_path, "title.basics.cleaned.sav")
title_crew_df = _load_pickle(data_path, "title.crew.cleaned.sav")
title_principals_df = _load_pickle(data_path, "title.principals.cleaned.sav")
title_ratings_df = _load_pickle(master_data_path, "title.ratings.sav")
name_basics_df = _load_pickle(data_path, "name.basics.cleaned.sav")

In [None]:
# Column names and (rows, columns) shape of the title-basics table.
title_basics_df.columns, title_basics_df.shape

In [None]:
# Column names and (rows, columns) shape of the title-crew table.
title_crew_df.columns, title_crew_df.shape

In [None]:
# Column names and (rows, columns) shape of the title-principals table.
title_principals_df.columns, title_principals_df.shape

In [None]:
# Column names and (rows, columns) shape of the title-ratings table.
title_ratings_df.columns, title_ratings_df.shape

In [None]:
# Column names and (rows, columns) shape of the name-basics table.
name_basics_df.columns, name_basics_df.shape

In [None]:
# Inner-join the title tables on tconst, one table at a time. Each intermediate
# result keeps its own name so it can be inspected separately.
title_basics_crew_merged_df = title_basics_df.merge(
    title_crew_df, on="tconst", how="inner"
)
title_basics_crew_principals_merged_df = title_basics_crew_merged_df.merge(
    title_principals_df, on="tconst", how="inner"
)
title_basics_crew_principals_ratings_merged_df = title_basics_crew_principals_merged_df.merge(
    title_ratings_df, on="tconst", how="inner"
)

# Persist the fully merged table so later cells can reload it from data_path.
title_basics_crew_principals_ratings_merged_df.to_pickle(
    os.path.join(data_path, "title.basics.crew.principals.ratings.cleaned.sav")
)

## Reading the merged dataset

In [None]:
# Reload the merged table from its pickle in data_path and display it.
title_basics_crew_principals_ratings_merged_df = pd.read_pickle(os.path.join(data_path,"title.basics.crew.principals.ratings.cleaned.sav"))
title_basics_crew_principals_ratings_merged_df

In [None]:
# Disabled one-off step: saved the (tconst, nconst) columns of the merged table
# as "title.principals.cleaned.v2.sav" in data_path.
# NOTE(review): a later cell reads "title.principals.cleaned.v2.sav", and this
# commented-out code is the only writer of that file visible in the notebook.
# Confirm the file exists before relying on a fresh Restart & Run All.
# cleaned_principals_df = title_basics_crew_principals_ratings_merged_df[["tconst", "nconst"]].copy()
# cleaned_principals_df.to_pickle(os.path.join(data_path,"title.principals.cleaned.v2.sav"))

In [None]:
# Disabled one-off step: dropped the nconst column, removed the resulting
# duplicate rows and overwrote the merged pickle with the de-duplicated table.
# NOTE(review): while this stays disabled, re-running the merge cell writes a
# table that still contains nconst (presumably one row per title/principal
# pair). Confirm which version the counts and charts below are meant to use.
# title_basics_crew_principals_ratings_merged_df = title_basics_crew_principals_ratings_merged_df.drop(columns=["nconst"])
# title_basics_crew_principals_ratings_merged_df = title_basics_crew_principals_ratings_merged_df.drop_duplicates()
# title_basics_crew_principals_ratings_merged_df.to_pickle(os.path.join(data_path,"title.basics.crew.principals.ratings.cleaned.sav"))

In [None]:
import plotly.express as px

# Box plot of runtime per title type, drawn on a logarithmic y axis.
runtime_by_type_df = title_basics_crew_principals_ratings_merged_df[["titleType", "runtimeMinutes"]]
fig = px.box(
    runtime_by_type_df,
    x='titleType',
    y='runtimeMinutes',
    log_y=True,
    title='Distribution of Movie Runtime Minutes',
    category_orders={"titleType": ["movie"]},
)

# Presentation only: white canvas, black 14pt text, fixed 800x600 figure.
runtime_box_layout = {
    'xaxis_title': '',
    'yaxis_title': 'Runtime Minutes (Log Scale)',
    'title_font_size': 20,
    'font': dict(size=14, color='black'),
    'width': 800,
    'height': 600,
    'margin': dict(l=40, r=40, t=60, b=40),
    'plot_bgcolor': 'white',
    'paper_bgcolor': 'white',
}
fig.update_layout(**runtime_box_layout)

fig.show()


In [None]:
# Display the merged table for inspection.
title_basics_crew_principals_ratings_merged_df

In [None]:
# Keep only rows whose runtime lies in the closed interval [30, 300] minutes,
# renumber the rows from zero and overwrite the merged pickle with the result.
runtime_in_range = title_basics_crew_principals_ratings_merged_df["runtimeMinutes"].between(30, 300)
title_basics_crew_principals_ratings_merged_df = (
    title_basics_crew_principals_ratings_merged_df
    .loc[runtime_in_range]
    .reset_index(drop=True)
)
title_basics_crew_principals_ratings_merged_df.to_pickle(
    os.path.join(data_path, "title.basics.crew.principals.ratings.cleaned.sav")
)

In [None]:
# Number of distinct tconst values in the merged table.
title_basics_crew_principals_ratings_merged_df[["tconst"]].nunique()

In [None]:
# Reload the merged table from its pickle in data_path and display it.
title_basics_crew_principals_ratings_merged_df = pd.read_pickle(os.path.join(data_path,"title.basics.crew.principals.ratings.cleaned.sav"))
title_basics_crew_principals_ratings_merged_df

In [None]:
# Summary statistics of the runtimeMinutes column.
title_basics_crew_principals_ratings_merged_df["runtimeMinutes"].describe(include="all")

In [None]:
import plotly.express as px

# Single box plot of every runtime in the merged table (linear y axis).
fig = px.box(
    data_frame=title_basics_crew_principals_ratings_merged_df,
    y='runtimeMinutes',
    labels={'runtimeMinutes': 'Runtime Minutes'},
    title='Distribution of Movie Runtime Minutes',
)

# Presentation only: white canvas, black 14pt text, fixed 800x600 figure.
fig.update_layout(
    width=800,
    height=600,
    margin=dict(l=40, r=40, t=60, b=40),
    title_font_size=20,
    font=dict(size=14, color='black'),
    paper_bgcolor='white',
    plot_bgcolor='white',
)

fig.show()


In [None]:
# Display the merged table for inspection.
title_basics_crew_principals_ratings_merged_df

In [None]:
# Number of rows carrying each isAdult value.
title_basics_crew_principals_ratings_merged_df["isAdult"].value_counts()

In [None]:
import plotly.express as px

# Share of rows per isAdult value, expressed in percent of all counted rows.
counts = title_basics_crew_principals_ratings_merged_df['isAdult'].value_counts()
percentages = counts.mul(100).div(counts.sum())

# Two-column frame (isAdult, Percentage) in the shape px.pie expects.
percentages_df = percentages.rename_axis('isAdult').reset_index(name='Percentage')

fig = px.pie(
    percentages_df,
    names='isAdult',
    values='Percentage',
    title='Percentage of Movies by Adult Content',
)

# Presentation only: white paper, black 14pt text, fixed 1000x800 figure.
fig.update_layout(
    width=1000,
    height=800,
    title_font_size=20,
    font=dict(size=14, color='black'),
    paper_bgcolor='white',
)

fig.show()


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Genre strings of every row that has one.
temp = title_basics_crew_principals_ratings_merged_df['genres'].dropna()

# Token pattern accepts word characters and hyphens, so a hyphenated genre
# name stays a single token.
vec = CountVectorizer(analyzer='word', token_pattern='(?u)\\b[\\w-]+\\b')
vec.fit(temp)

# Sparse title-by-genre count matrix and the genre vocabulary behind its columns.
bag_of_genres = vec.transform(temp)
unique_genres = vec.get_feature_names_out()
np.array(unique_genres)

In [None]:
import plotly.express as px
import pandas as pd

# Dense count table: rows follow temp's index, one column per genre token.
genres = pd.DataFrame(bag_of_genres.todense(), index=temp.index, columns=unique_genres)

# Occurrences per genre as a percentage of the number of rows, largest first.
genre_totals = genres.sum().sort_values(ascending=False)
sorted_genres_perc = 100 * genre_totals / genres.shape[0]
df_sorted_genres_perc = sorted_genres_perc.rename_axis('Genre').reset_index(name='Percentage of Films')

# Horizontal bars on a logarithmic x axis.
fig = px.bar(
    df_sorted_genres_perc,
    x='Percentage of Films',
    y='Genre',
    orientation='h',
    log_x=True,
    height=800,
    title='Percentage of Films by Genre',
    labels={'Percentage of Films': 'Percentage of Films (%) (Log Scale)'},
)

# Kept as the cell's last expression: the returned figure is what gets displayed.
fig.update_layout(
    width=800,
    margin=dict(l=40, r=40, t=60, b=40),
    title_font_size=20,
    font=dict(size=14, color='black'),
    xaxis_title='Percentage of Films (%) (Log Scale)',
    yaxis_title='Genre',
    paper_bgcolor='white',
    plot_bgcolor='white',
)

In [None]:
# Display the merged table for inspection.
title_basics_crew_principals_ratings_merged_df

In [None]:
import plotly.graph_objs as go
import pandas as pd

# Number of rows per start year.
movies_per_year = title_basics_crew_principals_ratings_merged_df.groupby('startYear').size()

# Flatten to a two-column frame (startYear, Count) for plotting.
movies_per_year_df = movies_per_year.reset_index()
movies_per_year_df.columns = ['startYear', 'Count']

fig = go.Figure(data=go.Scatter(x=movies_per_year_df['startYear'], y=movies_per_year_df['Count'],
                                mode='markers+lines',
                                marker=dict(size=5)))

fig.update_layout(title='Count of Movies Per Year',
                  xaxis=dict(
                      title="Year",
                      tickmode='linear',
                      dtick=5,
                      # startYear is a plain number, not a date, so the ticks use
                      # a number format ("d" = integer) instead of the date
                      # pattern "%Y", which only applies to date axes.
                      tickformat="d",
                      # Start five years before the first year; the right edge is fixed at 2025.
                      range=[movies_per_year_df['startYear'].min() - 5, 2025]
                  ),
                  yaxis_title='Count of Movies',
                  paper_bgcolor='white',
                  plot_bgcolor='white',
                  font=dict(color='black'),
                  width=1000,
                  height=600)

fig.show()


In [None]:
import plotly.graph_objs as go
import pandas as pd

# Sum of numVotes over all rows of each start year.
votes_per_year = title_basics_crew_principals_ratings_merged_df.groupby('startYear')['numVotes'].sum()

# Flatten to a two-column frame (startYear, TotalVotes) for plotting.
votes_per_year_df = votes_per_year.reset_index()
votes_per_year_df.columns = ['startYear', 'TotalVotes']

fig = go.Figure(data=go.Scatter(x=votes_per_year_df['startYear'], y=votes_per_year_df['TotalVotes'],
                                mode='markers+lines',
                                marker=dict(size=5)))

fig.update_layout(title='Total Number of Votes Per Year',
                  xaxis=dict(
                      title="Year",
                      tickmode='linear',
                      dtick=5,
                      # startYear is a plain number, not a date, so the ticks use
                      # a number format ("d" = integer) instead of the date
                      # pattern "%Y", which only applies to date axes.
                      tickformat="d",
                      # Start five years before the first year; the right edge is fixed at 2025.
                      range=[votes_per_year_df['startYear'].min()-5, 2025]
                  ),
                  yaxis_title='Total Votes',
                  paper_bgcolor='white',
                  plot_bgcolor='white',
                  font=dict(color='black'),
                  width=1000,
                  height=600)

fig.show()


In [None]:
# Total votes per start year, largest total first.
title_basics_crew_principals_ratings_merged_df.groupby('startYear')['numVotes'].sum().reset_index().sort_values(by="numVotes", ascending=False)

In [None]:
# Rows with startYear 2013, ordered from most to fewest votes.
title_basics_crew_principals_ratings_merged_df[title_basics_crew_principals_ratings_merged_df["startYear"] == 2013].sort_values(by="numVotes", ascending=False)

In [None]:
# Rows with startYear 2012, ordered from most to fewest votes.
title_basics_crew_principals_ratings_merged_df[title_basics_crew_principals_ratings_merged_df["startYear"] == 2012].sort_values(by="numVotes", ascending=False)

In [None]:
import plotly.graph_objs as go
import pandas as pd

# Unweighted mean of averageRating over all rows of each start year.
average_rating_per_year = title_basics_crew_principals_ratings_merged_df.groupby('startYear')['averageRating'].mean()

# Flatten to a two-column frame (startYear, AverageRating) for plotting.
average_rating_per_year_df = average_rating_per_year.reset_index()
average_rating_per_year_df.columns = ['startYear', 'AverageRating']

fig = go.Figure(data=go.Scatter(x=average_rating_per_year_df['startYear'], y=average_rating_per_year_df['AverageRating'],
                                mode='markers+lines',
                                marker=dict(size=5)))

fig.update_layout(title='Average Movie Rating Per Year',
                  xaxis=dict(
                      title="Year",
                      tickmode='linear',
                      dtick=5,
                      # startYear is a plain number, not a date, so the ticks use
                      # a number format ("d" = integer) instead of the date
                      # pattern "%Y", which only applies to date axes.
                      tickformat="d",
                      # Start five years before the first year; the right edge is fixed at 2025.
                      range=[average_rating_per_year_df['startYear'].min() - 5, 2025]
                  ),
                  yaxis_title='Average Rating',
                  paper_bgcolor='white',
                  plot_bgcolor='white',
                  font=dict(color='black'),
                  width=1000,
                  height=600)

fig.show()


In [None]:
# Mean rating per start year, highest mean first.
title_basics_crew_principals_ratings_merged_df.groupby('startYear')['averageRating'].mean().reset_index().sort_values(by="averageRating", ascending=False)

In [None]:
# Rows with startYear 1909, best-rated first.
title_basics_crew_principals_ratings_merged_df[title_basics_crew_principals_ratings_merged_df["startYear"] == 1909].sort_values(by="averageRating", ascending=False)

In [None]:
# Column names of the merged table.
title_basics_crew_principals_ratings_merged_df.columns

In [None]:
import pandas as pd
import plotly.express as px

# Turn the comma-separated genre string into a list, then give every
# (row, genre) pair its own row.
df = title_basics_crew_principals_ratings_merged_df.assign(
    genres=title_basics_crew_principals_ratings_merged_df['genres'].str.split(',')
)
df_exploded = df.explode('genres')

# Decade = start year rounded down to a multiple of ten.
df_exploded = df_exploded.assign(decade=df_exploded['startYear'] // 10 * 10)

# Row count for every (decade, genre) combination.
genre_decade_distribution = (
    df_exploded
    .groupby(['decade', 'genres'])
    .size()
    .reset_index(name='count')
)

fig = px.line(
    genre_decade_distribution,
    x='decade',
    y='count',
    color='genres',
    line_group='genres',
    markers=True,
    title='Movie Distribution by Genre and Decade',
    labels={'decade': 'Decade', 'count': 'Number of Movies', 'genres': 'Genre'},
)

# One tick per decade present in the data, labelled like "1990s".
decades = genre_decade_distribution['decade'].unique()
fig.update_layout(
    xaxis=dict(
        title='Decade',
        tickmode='array',
        tickvals=decades,
        ticktext=[f"{int(d)}s" for d in decades],
    ),
    yaxis=dict(title='Number of Movies', type='linear'),
    font=dict(color='black', size=15),
    paper_bgcolor='white',
    plot_bgcolor='white',
    width=1200,
    height=1200,
)

fig.show()


In [None]:
import plotly.express as px

# Coerce runtimes to numbers (unparseable values become NaN and are dropped),
# then split the comma-separated genre string into a list.
graph_df = (
    title_basics_crew_principals_ratings_merged_df
    .assign(runtimeMinutes=lambda d: pd.to_numeric(d['runtimeMinutes'], errors='coerce'))
    .dropna(subset=['runtimeMinutes'])
    .assign(genres=lambda d: d['genres'].str.split(','))
)

# One row per (row, genre) pair, keeping only the two plotted columns.
df_exploded = graph_df[["genres", "runtimeMinutes"]].explode('genres')

fig = px.box(
    df_exploded,
    x='genres',
    y='runtimeMinutes',
    notched=True,
    title='Runtime Durations by Genre',
)

# Genres ordered by descending total; y ticks every 10 minutes.
fig.update_layout(
    xaxis=dict(title='Genre', categoryorder='total descending'),
    yaxis=dict(title='Runtime Minutes', dtick=10),
    font=dict(color='black', size=14),
    paper_bgcolor='white',
    plot_bgcolor='white',
    width=1200,
    height=800,
)

fig.show()


In [None]:
import pandas as pd
import plotly.express as px

# One row per (row, genre) pair with its isAdult flag.
df = title_basics_crew_principals_ratings_merged_df[["genres", "isAdult"]].copy()
df['genres'] = df['genres'].str.split(',')
df_exploded = df.explode('genres')

# Row count for every (genre, isAdult) combination that occurs.
genre_adult_counts = df_exploded.groupby(['genres', 'isAdult']).size().reset_index(name='count')

# Label each count by the truthiness of its flag, then pivot on that label.
# Selecting the two columns by name (instead of renaming whatever the pivot
# produced by position) keeps 'non_adult' and 'adult' correct even when one of
# the two flag values never occurs; absent combinations count as 0.
genre_adult_pivot = (
    genre_adult_counts
    .assign(content=np.where(genre_adult_counts['isAdult'].astype(bool), 'adult', 'non_adult'))
    .pivot(index='genres', columns='content', values='count')
    .reindex(columns=['non_adult', 'adult'])
    .fillna(0)
    .rename_axis(columns=None)
    .reset_index()
)

# A genre without any non-adult row has no defined ratio: mask the zero
# denominator so the result is NaN rather than an infinite value.
# A ratio of exactly 0 (no adult rows) has no position on the log axis below.
non_adult_counts = genre_adult_pivot['non_adult']
genre_adult_pivot['adult_to_non_adult_ratio'] = (
    genre_adult_pivot['adult'] / non_adult_counts.where(non_adult_counts > 0)
)

fig = px.bar(genre_adult_pivot, x='genres', y='adult_to_non_adult_ratio',
             title='Ratio of Adult to Non-Adult Content by Movie Genre')

fig.update_layout(
    xaxis_title='Genre',
    yaxis_title='Adult to Non-Adult Ratio',
    xaxis={'categoryorder':'total descending'},
    paper_bgcolor='white',
    plot_bgcolor='white',
    font=dict(color='black', size=14),
    yaxis=dict(type="log"),
    width=1200,
    height=800
)

fig.show()


In [None]:
import pandas as pd
import plotly.graph_objs as go

# One row per (row, genre) pair.
df = title_basics_crew_principals_ratings_merged_df.assign(
    genres=title_basics_crew_principals_ratings_merged_df['genres'].str.split(',')
)
df_exploded = df.explode('genres')

# Per genre: total votes and mean rating, ordered by total votes (largest first).
genre_stats = (
    df_exploded
    .groupby('genres')
    .agg(numVotes=('numVotes', 'sum'), averageRating=('averageRating', 'mean'))
    .reset_index()
    .sort_values(by='numVotes', ascending=False)
)


In [None]:
# Bar chart of total votes per genre, in the row order of genre_stats, on a
# logarithmic y axis.
votes_bars = go.Bar(x=genre_stats['genres'], y=genre_stats['numVotes'], marker_color='blue')
fig_votes = go.Figure(data=[votes_bars])
fig_votes.update_layout(
    title='Total Number of Votes by Genre',
    xaxis_title='Genre',
    yaxis=dict(title='Total Votes (Log Scale)', type='log'),
    font=dict(color='black', size=15),
    paper_bgcolor='white',
    plot_bgcolor='white',
    width=1200,
    height=800,
)
fig_votes.show()


In [None]:
# Bar chart of the mean rating per genre, best-rated genre first.
import plotly.graph_objs as go

# NOTE: this rebinds genre_stats in rating order, so any cell that reads
# genre_stats after this one sees the rating-sorted rows.
genre_stats = genre_stats.sort_values(by='averageRating', ascending=False)

rating_bars = go.Bar(x=genre_stats['genres'], y=genre_stats['averageRating'], marker_color='green')
fig_ratings = go.Figure(data=[rating_bars])
fig_ratings.update_layout(
    title='Average Rating by Genre',
    xaxis_title='Genre',
    yaxis=dict(
        title='Average Rating',
        range=[0, 9],
        dtick=1,
        tickmode='array',
        tickvals=list(range(1, 9)),
        gridcolor='grey',
    ),
    font=dict(color='black', size=15),
    paper_bgcolor='white',
    plot_bgcolor='white',
    width=1200,
    height=800,
)

fig_ratings.show()


In [None]:
import pandas as pd

# Reload the cleaned people table from data_path.
name_basics_df = pd.read_pickle(os.path.join(data_path,"name.basics.cleaned.sav"))

# One row per (row, director id) pair: the directors column holds
# comma-separated ids, which are split and exploded.
df_movies = title_basics_crew_principals_ratings_merged_df[["directors", "numVotes", "averageRating"]].copy()
df_movies['directors'] = df_movies['directors'].str.split(',')
df_exploded = df_movies.explode('directors')

# Attach the display name of each director id; ids without a match keep NaN
# and are left out by the groupby below.
df_directors = name_basics_df[['nconst', 'primaryName']].drop_duplicates()
df_exploded = df_exploded.merge(df_directors, left_on='directors', right_on='nconst', how='left')

# Total votes per director. Grouping on nconst together with primaryName keeps
# different people who share the same name apart; grouping on the name alone
# would add their votes together.
director_popularity = (
    df_exploded
    .groupby(['nconst', 'primaryName'], as_index=False)['numVotes']
    .sum()
)

# The 50 directors with the most votes, largest first.
top_directors = director_popularity.sort_values(by='numVotes', ascending=False).head(50)


In [None]:
import plotly.graph_objs as go

# Bar chart of the top directors' total votes, tallest bar first.
director_bars = go.Bar(x=top_directors['primaryName'], y=top_directors['numVotes'])
fig = go.Figure(data=[director_bars])
fig.update_layout(
    title='Top 50 Directors by Movie Popularity (Total Votes)',
    xaxis=dict(title='Director', categoryorder='total descending'),
    yaxis_title='Total Votes',
    font=dict(color='black', size=15),
    paper_bgcolor='white',
    plot_bgcolor='white',
    width=1200,
    height=800,
)
fig.show()


In [None]:
# Keep only the title id and vote count of each row, then display the result.
df_movies = title_basics_crew_principals_ratings_merged_df[["tconst", "numVotes"]]
df_movies

In [None]:
# Load the "v2" principals pickle from data_path and summarise it.
# NOTE(review): the only code in the visible notebook that writes this file is
# commented out (presumably it holds tconst/nconst pairs) — confirm the file
# exists before a fresh Restart & Run All.
cleaned_principals_df = pd.read_pickle(os.path.join(data_path,"title.principals.cleaned.v2.sav"))
cleaned_principals_df.describe()

In [None]:
# Display the people table for inspection.
name_basics_df

In [None]:
# Attach principal ids to each title's vote count (inner join on tconst) ...
movie_principal = pd.merge(df_movies, cleaned_principals_df, on="tconst", how="inner")

# ... then attach each principal's display name (inner join on nconst).
principal_names = name_basics_df[["nconst", "primaryName"]]
movie_principal_basic = pd.merge(movie_principal, principal_names, on="nconst", how="inner")
movie_principal_basic

In [None]:
import pandas as pd
import plotly.express as px

# Total votes per person. Grouping on nconst together with primaryName keeps
# different people who share the same name apart; grouping on the name alone
# would add their votes together.
name_votes = (
    movie_principal_basic
    .groupby(['nconst', 'primaryName'], as_index=False)['numVotes']
    .sum()
)

# The 50 people with the most votes, largest first.
top_50_votes = name_votes.sort_values(by='numVotes', ascending=False).head(50)

fig = px.bar(top_50_votes, x='primaryName', y='numVotes', title='Top 50 Names with Highest Votes')

# Presentation only: slanted name labels, white canvas, fixed 1200x800 figure.
fig.update_layout(
    xaxis_title='Name',
    yaxis_title='Total Votes',
    xaxis_tickangle=-45,
    paper_bgcolor='white',
    plot_bgcolor='white',
    font=dict(color='black', size=15),
    width=1200,
    height=800
)

fig.show()


In [None]:
# Display the merged table for inspection.
title_basics_crew_principals_ratings_merged_df

In [None]:
# Reload the cleaned principals table from data_path and read the name.basics
# pickle stored under master_data_path.
title_principals_df = pd.read_pickle(os.path.join(data_path,"title.principals.cleaned.sav"))
name_basics_original_df = pd.read_pickle(os.path.join(master_data_path,"name.basics.sav"))

In [None]:
# Display the principals table for inspection.
title_principals_df

In [None]:
# Keep only the id, name and profession columns, as an independent copy.
name_basics_original_df = name_basics_original_df[["nconst", "primaryName", "primaryProfession"]].copy()

In [None]:
# Display the trimmed people table for inspection.
name_basics_original_df

In [None]:
# Rating and vote count of every title, joined to its principals (on tconst)
# and then to each principal's name and professions (on nconst). Both joins
# are inner joins, so unmatched rows are dropped.
movies_df = (
    title_basics_crew_principals_ratings_merged_df[["tconst", "averageRating", "numVotes"]]
    .merge(title_principals_df, on="tconst", how="inner")
    .merge(name_basics_original_df, on="nconst", how="inner")
)
movies_df

In [None]:
# Rows of movies_df whose primaryName is "Hans Zimmer".
movies_df[movies_df["primaryName"] == "Hans Zimmer"]

In [None]:
import pandas as pd
import plotly.express as px


def _top_people_by_votes(people_df, flag_column, n=10):
    """Return the n people with the most summed votes among flagged rows.

    Only rows where `flag_column` is True are used. Rows are grouped per person
    (nconst together with primaryName, so that different people who share a
    name are not merged), aggregating the mean averageRating and the total
    numVotes. The result is ordered by total votes, largest first, and indexed
    by primaryName.
    """
    return (
        people_df[people_df[flag_column]]
        .groupby(['nconst', 'primaryName'])
        .agg({'averageRating': 'mean', 'numVotes': 'sum'})
        .sort_values(by='numVotes', ascending=False)
        .head(n)
        .reset_index(level='nconst', drop=True)
    )


# Built as a single chain so that no column is ever assigned into a filtered
# slice of movies_df (which triggers pandas' SettingWithCopyWarning).
# NOTE(review): "Hans Zimmer" is excluded by name and the reason is not recorded
# in this notebook — document it or replace it with a rule.
df = (
    movies_df.loc[
        movies_df['primaryName'] != "Hans Zimmer",
        ['nconst', 'averageRating', 'numVotes', 'primaryName', 'primaryProfession'],
    ]
    .assign(
        primaryProfession=lambda d: d['primaryProfession'].fillna(''),
        averageRating=lambda d: pd.to_numeric(d['averageRating'], errors='coerce'),
        numVotes=lambda d: pd.to_numeric(d['numVotes'], errors='coerce'),
    )
    # Rows whose rating or vote count could not be parsed are dropped.
    .dropna(subset=['averageRating', 'numVotes'])
    # Flags are substring matches on the comma-separated profession string.
    .assign(
        is_actor=lambda d: d['primaryProfession'].str.contains('actor', na=False),
        is_actress=lambda d: d['primaryProfession'].str.contains('actress', na=False),
    )
)

actors_df = _top_people_by_votes(df, 'is_actor')
actresses_df = _top_people_by_votes(df, 'is_actress')

# Scatter of mean rating against total votes, one labelled point per actor.
fig_actors = px.scatter(actors_df, x='averageRating', y='numVotes', text=actors_df.index,
                        title='Top 10 Actors by Average Rating and Total Votes',
                        labels={'averageRating': 'Average Rating', 'numVotes': 'Total Votes'})
fig_actors.update_traces(textposition='top center')
fig_actors.update_layout(
    paper_bgcolor='white',
    plot_bgcolor='white',
    font=dict(color='black', size=15),
    xaxis_tickangle=-45,
    width=1500,
    height=1500,
)
fig_actors.show()

In [None]:
# Scatter of mean rating against total votes, one labelled point per actress.
fig_actresses = px.scatter(
    actresses_df,
    x='averageRating',
    y='numVotes',
    text=actresses_df.index,
    labels={'averageRating': 'Average Rating', 'numVotes': 'Total Votes'},
    title='Top 10 Actresses by Average Rating and Total Votes',
)

# Name labels sit above their points.
fig_actresses.update_traces(textposition='top center')
fig_actresses.update_layout(
    width=1500,
    height=1500,
    xaxis_tickangle=-45,
    font=dict(color='black', size=15),
    paper_bgcolor='white',
    plot_bgcolor='white',
)
fig_actresses.show()

In [None]:
import pandas as pd
import plotly.express as px


def _profession_list(value):
    """Split a comma-separated profession string; anything that is not a string gives []."""
    if isinstance(value, str):
        return value.split(',')
    return []


# One row per (credit, profession) pair.
df = movies_df[['averageRating', 'primaryProfession']].copy()
df['primaryProfession'] = df['primaryProfession'].apply(_profession_list)
df_exploded = df.explode('primaryProfession')

# Restrict to credits on titles rated at or above the threshold.
high_rating_threshold = 7.5
is_high_rated = df_exploded['averageRating'] >= high_rating_threshold
high_rating_df = df_exploded[is_high_rated]

# Mean rating of those titles per profession, highest mean first.
profession_rating = (
    high_rating_df
    .groupby('primaryProfession')['averageRating']
    .mean()
    .reset_index()
)
profession_rating_sorted = profession_rating.sort_values(by='averageRating', ascending=False)

# Only the 20 top-ranked professions are drawn.
fig = px.bar(
    profession_rating_sorted.head(20),
    x='primaryProfession',
    y='averageRating',
    title='Top Professions Associated with High-Rating Titles',
)
fig.update_layout(
    width=1200,
    height=800,
    xaxis_title='Profession',
    yaxis_title='Average Rating of High-Rating Titles',
    font=dict(color='black', size=14),
    paper_bgcolor='white',
    plot_bgcolor='white',
)

fig.show()


In [None]:
# Rows whose genre string contains "TV". na=False makes rows with a missing
# genre count as non-matching, so the mask is purely boolean; without it a
# single missing genre makes the row selection raise.
title_basics_crew_principals_ratings_merged_df[title_basics_crew_principals_ratings_merged_df["genres"].str.contains("TV", na=False)]

In [None]:
import pandas as pd
import plotly.express as px

# Working copy with a decade column, numeric rating/votes (unparseable values
# become NaN) and the genre string split into a list.
merged_subset = title_basics_crew_principals_ratings_merged_df[
    ['startYear', 'averageRating', 'numVotes', 'genres']
]
df = merged_subset.assign(
    decade=lambda d: (d['startYear'] // 10) * 10,
    averageRating=lambda d: pd.to_numeric(d['averageRating'], errors='coerce'),
    numVotes=lambda d: pd.to_numeric(d['numVotes'], errors='coerce'),
    genres=lambda d: d['genres'].str.split(','),
)

# One row per (row, genre) pair.
df_exploded = df.explode('genres')

# Plot a reproducible 10% sample (fixed seed) rather than every row.
df_sampled = df_exploded.sample(frac=0.1, random_state=1)

# One facet per genre, four facets per row.
fig = px.scatter(
    df_sampled,
    x='averageRating',
    y='numVotes',
    color='genres',
    facet_col='genres',
    facet_col_wrap=4,
    labels={'averageRating': 'Average Rating', 'numVotes': 'Number of Votes'},
    title='Correlation between Ratings and Votes Across Genres',
)
fig.update_layout(
    width=1200,
    height=1200,
    font=dict(color='black', size=14),
    paper_bgcolor='white',
    plot_bgcolor='white',
)

fig.show()


In [None]:
import pandas as pd
import plotly.express as px

# Reproducible 10% sample (fixed seed) of the exploded per-genre rows that an
# earlier cell left in df_exploded.
df_sampled = df_exploded.sample(frac=0.1, random_state=1)

# Facets are laid out in ascending decade order, four per row.
decades_in_order = sorted(df_sampled['decade'].unique())
fig = px.scatter(
    df_sampled,
    x='averageRating',
    y='numVotes',
    color='genres',
    facet_col='decade',
    facet_col_wrap=4,
    category_orders={"decade": decades_in_order},
    labels={'averageRating': 'Average Rating', 'numVotes': 'Number of Votes'},
    title='Correlation between Ratings and Votes Across Decades',
)
fig.update_layout(
    width=1200,
    height=1200,
    font=dict(color='black', size=14),
    paper_bgcolor='white',
    plot_bgcolor='white',
)

fig.show()


In [None]:
import pandas as pd
import plotly.express as px

# Decade = start year rounded down to a multiple of ten.
df = title_basics_crew_principals_ratings_merged_df[['startYear', 'averageRating']].copy()
df['decade'] = (df['startYear'] // 10) * 10

# Unweighted mean rating of all rows in each decade.
decade_rating = df.groupby('decade')['averageRating'].mean().reset_index()

fig = px.line(decade_rating, x='decade', y='averageRating',
              title='Average Movie Ratings Across Decades',
              labels={'decade': 'Decade', 'averageRating': 'Average Rating'})

# All x-axis settings live in one dict. The original passed both
# xaxis_tickformat='d' and xaxis=dict(tickformat="%Y"), two conflicting values
# for the same property. The decade values are plain numbers, so the integer
# number format "d" is the one kept; "%Y" is a date pattern.
fig.update_layout(
    xaxis=dict(
        title="Decade",
        tickmode='linear',
        dtick=10,
        tickformat="d",
        # Start five years before the first start year; the right edge is fixed at 2025.
        range=[df['startYear'].min() - 5, 2025],
    ),
    yaxis_title='Average Rating',
    paper_bgcolor='white',
    plot_bgcolor='white',
    font=dict(color='black', size=15),
    width=1200,
    height=800
)

fig.show()


In [None]:
import pandas as pd
import plotly.express as px

# Numeric runtime and rating (rows where either cannot be parsed are dropped),
# with the genre string split into a list.
df = (
    title_basics_crew_principals_ratings_merged_df[['runtimeMinutes', 'averageRating', 'genres']]
    .assign(
        runtimeMinutes=lambda d: pd.to_numeric(d['runtimeMinutes'], errors='coerce'),
        averageRating=lambda d: pd.to_numeric(d['averageRating'], errors='coerce'),
    )
    .dropna(subset=['runtimeMinutes', 'averageRating'])
    .assign(genres=lambda d: d['genres'].str.split(','))
)

# One row per (row, genre) pair, limited to runtimes below 301 minutes.
df_exploded = df.explode('genres')
df_filtered = df_exploded.loc[df_exploded['runtimeMinutes'] < 301]

# One facet per genre, four facets per row.
fig = px.scatter(
    df_filtered,
    x='runtimeMinutes',
    y='averageRating',
    color='genres',
    facet_col='genres',
    facet_col_wrap=4,
    hover_data=['genres'],
    labels={'runtimeMinutes': 'Runtime (minutes)', 'averageRating': 'Average Rating'},
    title='Runtime vs. Average Rating by Genre',
)
fig.update_layout(
    width=1500,
    height=1500,
    xaxis_title='Runtime in Minutes',
    yaxis_title='Average Rating',
    font=dict(color='black', size=15),
    paper_bgcolor='white',
    plot_bgcolor='white',
)


def _strip_facet_prefix(annotation):
    """Keep only the text after the last "=" of a facet label."""
    return annotation.update(text=annotation.text.split("=")[-1])


fig.for_each_annotation(_strip_facet_prefix)

# Give every facet its own axis range and its own tick labels.
fig.update_xaxes(matches=None, showticklabels=True)
fig.update_yaxes(matches=None, showticklabels=True)

fig.show()

In [None]:
import plotly.figure_factory as ff

# Pairwise correlation of the three numeric features.
features = ['averageRating', 'runtimeMinutes', 'numVotes']
correlation_matrix = title_basics_crew_principals_ratings_merged_df[features].corr()

colors = 'Viridis'

# Heatmap whose cells are annotated with the coefficient rounded to two decimals.
cell_labels = correlation_matrix.round(2).astype(str).to_numpy()
fig = ff.create_annotated_heatmap(
    z=correlation_matrix.to_numpy(),
    x=correlation_matrix.columns.tolist(),
    y=correlation_matrix.index.tolist(),
    annotation_text=cell_labels,
    colorscale=colors,
    showscale=True,
)

# Titles, feature names as tick labels, and presentation settings in one call.
tick_positions = list(range(len(features)))
fig.update_layout(
    title_text='Correlation Matrix',
    title_x=0.5,
    title_font_size=20,
    xaxis=dict(title='Features', tickmode='array', tickvals=tick_positions, ticktext=features),
    yaxis=dict(title='Features', tickmode='array', tickvals=tick_positions, ticktext=features),
    width=800,
    height=800,
    margin=dict(l=40, r=40, t=200, b=40),
    font=dict(color='black', size=15),
    paper_bgcolor='white',
    plot_bgcolor='white',
)

fig.show()

In [None]:
import pandas as pd
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# One 0/1 indicator column per genre, joined with each row's rating.
genres_df = title_basics_crew_principals_ratings_merged_df['genres'].str.get_dummies(sep=',')
genre_ratings_df = genres_df.join(title_basics_crew_principals_ratings_merged_df['averageRating'])


def _anova_for_genre(genre):
    """One-way ANOVA of averageRating: rows that have `genre` versus rows that do not."""
    ratings_by_flag = genre_ratings_df.groupby(genre)['averageRating']
    return stats.f_oneway(ratings_by_flag.get_group(1), ratings_by_flag.get_group(0))


# F statistic and p-value for every genre, one table row per genre.
anova_results = {genre: _anova_for_genre(genre) for genre in genres_df.columns}
anova_df = pd.DataFrame(anova_results, index=['F-value', 'p-value']).T

print(anova_df)

# Tukey HSD comparison of ratings between the Drama and non-Drama rows.
tukey_results = pairwise_tukeyhsd(
    endog=genre_ratings_df['averageRating'],
    groups=genre_ratings_df['Drama'],
)
print()
print(tukey_results)


In [42]:


import pandas as pd
from scipy.stats import chi2_contingency

# One row per (row, genre) pair with its isAdult flag.
df = title_basics_crew_principals_ratings_merged_df[['genres', 'isAdult']].copy()
df['genres'] = df['genres'].str.split(',')
df_exploded = df.explode('genres')

# Genre x isAdult table of row counts.
contingency_table = pd.crosstab(index=df_exploded['genres'], columns=df_exploded['isAdult'])

# Chi-square test of independence between genre and the adult flag.
# NOTE(review): after the explode a row with several genres contributes several
# observations, and the recorded output shows expected counts far below 1 for
# some genres — check that the test's assumptions hold before citing the p-value.
test_result = chi2_contingency(contingency_table)
chi2, p_value, dof, expected = test_result

# Expected counts are relabelled with the table's genres and flag values.
expected_df = pd.DataFrame(
    expected,
    index=contingency_table.index,
    columns=contingency_table.columns,
)

print("Chi-square statistic:", chi2)
print("P-value:", p_value)
print("Degrees of freedom:", dof)
print("\nExpected counts:\n", expected_df)

Chi-square statistic: 244755.2042471774
P-value: 0.0
Degrees of freedom: 26

Expected counts:
 isAdult              False        True 
genres                                 
Action        26320.750461   232.249539
Adult          2127.229710    18.770290
Adventure     16494.455906   145.544094
Animation      4448.745078    39.254922
Biography      7479.006599    65.993401
Comedy        66247.444537   584.555463
Crime         23989.322559   211.677441
Documentary   28095.093975   247.906025
Drama        120352.035418  1061.964582
Family        10036.440267    88.559733
Fantasy        8112.417496    71.582504
Film-Noir       873.294210     7.705790
Game-Show         1.982507     0.017493
History        7044.837628    62.162372
Horror        19971.772692   176.227308
Music          5441.980945    48.019055
Musical        5593.642709    49.357291
Mystery       11147.635283    98.364717
News            455.976545     4.023455
Reality-TV       24.781334     0.218666
Romance       29732.64452