# 02 - Exploratory Data Analysis

This notebook explores trends and patterns in the cleaned TMDB dataset.


# Load Data 

In [37]:
import ast
import pickle
from collections import Counter
from itertools import combinations

import nbformat
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

In [38]:
# Read the cleaned movie table produced earlier in the project
df = pd.read_csv(filepath_or_buffer='../data/cleaned_tmdb_movies.csv')

# Apply seaborn's "whitegrid" theme.
# NOTE(review): every figure in the visible cells is built with Plotly, so this
# matplotlib/seaborn theme probably has no visible effect here — confirm.
sns.set(style="whitegrid")


# Top Genres

In [47]:
# Step 1: Flatten the per-movie genre lists into one list of genre names.
# Each `genre_names` cell is text that parses to a list; use ast.literal_eval
# (Python literals only) rather than eval, which would execute any code found
# in the CSV.
all_genres = [
    genre
    for genre_list in df['genre_names'].apply(ast.literal_eval)
    for genre in genre_list
]
genre_counts = Counter(all_genres)
genre_df = (
    pd.DataFrame(genre_counts.items(), columns=['Genre', 'Count'])
    .sort_values('Count', ascending=False)
)

# Step 2: Bar chart of the ten most frequent genres
fig_top_genres = px.bar(genre_df.head(10), x='Genre', y='Count', title="Top 10 Genres")
fig_top_genres.show()


In [42]:
# Serialize the genre bar chart to disk with pickle
with open("../img/fig_top_genres.pkl", mode="wb") as pkl_file:
    pickle.dump(fig_top_genres, pkl_file)

# Top Actors

In [46]:
# NOTE(review): this cell sits under the "Top Actors" heading and its figure is
# named fig_top_actors, but it recomputes the *genre* counts and plots them
# with the title "Top 10 Genres". It looks like a copy-paste placeholder of the
# Top Genres cell. The actor/cast column name is not visible in this notebook —
# TODO: confirm the column and replace the genre logic with actor counts.

# Flatten the per-movie genre lists. ast.literal_eval parses the stored text as
# a Python literal only, unlike eval, which would execute any code in the CSV.
all_genres = [
    genre
    for genre_list in df['genre_names'].apply(ast.literal_eval)
    for genre in genre_list
]
genre_counts = Counter(all_genres)
genre_df = (
    pd.DataFrame(genre_counts.items(), columns=['Genre', 'Count'])
    .sort_values('Count', ascending=False)
)

fig_top_actors = px.bar(genre_df.head(10), x='Genre', y='Count', title="Top 10 Genres")
fig_top_actors.show()


In [44]:
# Serialize the fig_top_actors figure to disk with pickle
with open("../img/fig_top_actors.pkl", mode="wb") as pkl_file:
    pickle.dump(fig_top_actors, pkl_file)

# Top Directors

In [78]:
# Step 1: Keep only usable director names.
# Missing values are dropped first; then entries that are blank or "Unknown"
# after trimming whitespace are removed. The kept names are not trimmed.
valid_directors = (
    df['director']
    .dropna()
    .loc[lambda names: ~names.str.strip().isin(["", "Unknown"])]
)


In [79]:
# Step 2: Tally how often each director appears and expose the result as a
# two-column frame (Director, Count)
director_df = (
    valid_directors
    .value_counts()
    .rename_axis('Director')
    .reset_index(name='Count')
)

# Step 3: Horizontal bar chart of the first ten rows of the tally
fig_top_10_directors = px.bar(
    director_df.head(10), x='Count', y='Director', orientation='h', title="Top 10 Directors",
)
# Reverse the y-axis so the first row is drawn at the top of the chart
fig_top_10_directors.update_layout(yaxis={"autorange": "reversed"})
fig_top_10_directors.show()


In [80]:
# Serialize the directors bar chart to disk with pickle
with open("../img/fig_top_10_directors.pkl", mode="wb") as pkl_file:
    pickle.dump(fig_top_10_directors, pkl_file)

# Budget vs Revenue

In [None]:
# Scatter of raw budget against raw revenue, one point per row of df.
# The movie title (column `title_x`) is shown on hover under the label "Title".
fig_budget_rev = px.scatter(
    df, x='budget', y='revenue',
    title="Budget vs Revenue",
    hover_data=['title_x'],
    labels={'title_x': 'Title'},
)

fig_budget_rev.show()

In [None]:
# Serialize the budget/revenue scatter to disk with pickle
with open("../img/fig_budget_rev.pkl", mode="wb") as pkl_file:
    pickle.dump(fig_budget_rev, pkl_file)

In [None]:

# Keep only rows with strictly positive budget AND revenue, then add
# log1p-transformed copies of both columns (log1p(x) = log(1 + x)).
# The result is a new frame; df itself is left untouched.
df_log = (
    df.loc[df['budget'].gt(0) & df['revenue'].gt(0)]
    .assign(
        log_budget=lambda frame: np.log1p(frame['budget']),
        log_revenue=lambda frame: np.log1p(frame['revenue']),
    )
)

# Same scatter as the raw budget/revenue plot, but on the transformed columns
fig_log_budget_rev = px.scatter(
    df_log, x='log_budget', y='log_revenue',
    title="Log-Scaled Budget vs Revenue",
    hover_data=['title_x'],
)

fig_log_budget_rev.show()


In [29]:
# Serialize the log-scaled scatter to disk with pickle
with open("../img/fig_log_budget_rev.pkl", mode="wb") as pkl_file:
    pickle.dump(fig_log_budget_rev, pkl_file)

# Movies per Year

In [59]:
# Derive the release year from `release_date` (adds/overwrites a column on df)
df['release_year'] = pd.to_datetime(df['release_date']).dt.year

# Number of movies per year, ordered by year
yearly_counts = df['release_year'].value_counts().sort_index()

# Line chart: year on x, movie count on y
fig_movie_per_year = px.line(
    x=yearly_counts.index, y=yearly_counts.values,
    title="Number of Movies Released per Year",
    labels={'x': 'Year', 'y': 'Movie Count'},
)

fig_movie_per_year.show()

In [60]:
# Serialize the movies-per-year line chart to disk with pickle
with open("../img/fig_movie_per_year.pkl", mode="wb") as pkl_file:
    pickle.dump(fig_movie_per_year, pkl_file)


# Choropleth Map: Movies by Country

In [12]:

# Collect the 'name' of every production country across all movies.
# Each `production_countries` cell is text that parses to a list of dicts;
# ast.literal_eval accepts Python literals only, unlike eval, which would
# execute any code found in the CSV.
all_countries = [
    country['name']
    for country_list in df['production_countries'].apply(ast.literal_eval)
    for country in country_list
]

# Count how many times each country appears
country_counts = Counter(all_countries)
country_df = pd.DataFrame(country_counts.items(), columns=['Country', 'Count'])


In [61]:
# World map shaded by how many movies list each country as a producer.
# Countries are matched by their name (locationmode="country names").
fig_movies_country = px.choropleth(
    country_df,
    locations="Country", locationmode="country names",
    color="Count", color_continuous_scale="viridis",
    hover_name="Country",
    title="Number of Movies Produced by Country",
)
fig_movies_country.show()


In [62]:
# Serialize the country choropleth to disk with pickle
with open("../img/fig_movies_country.pkl", mode="wb") as pkl_file:
    pickle.dump(fig_movies_country, pkl_file)



# Treemap: Movies by Genre

In [14]:
# Flatten every movie's genre list into one list and count each genre.
# ast.literal_eval parses the stored text as a Python literal only, unlike
# eval, which would execute any code found in the CSV.
# Note: this rebinds genre_df without sorting by count.
all_genres = [
    genre
    for genre_list in df['genre_names'].apply(ast.literal_eval)
    for genre in genre_list
]
genre_counts = Counter(all_genres)
genre_df = pd.DataFrame(genre_counts.items(), columns=['Genre', 'Count'])


In [63]:
# Treemap with one tile per genre, sized by that genre's count
fig_movies_genres = px.treemap(
    genre_df,
    path=['Genre'],
    values='Count',
    title='Distribution of Movies by Genre',
)

fig_movies_genres.show()


In [64]:
# Serialize the genre treemap to disk with pickle
with open("../img/fig_movies_genres.pkl", mode="wb") as pkl_file:
    pickle.dump(fig_movies_genres, pkl_file)

# Donut Chart: Distribution of Languages

In [16]:
# Collect the 'name' of every spoken language across all movies.
# Each `spoken_languages` cell is text that parses to a list of dicts;
# ast.literal_eval accepts Python literals only, unlike eval, which would
# execute any code found in the CSV.
all_languages = [
    lang['name']
    for language_list in df['spoken_languages'].apply(ast.literal_eval)
    for lang in language_list
]

# Count each language and order the table from most to least frequent.
# (Counter is already imported in the notebook's import cell.)
language_counts = Counter(all_languages)
language_df = (
    pd.DataFrame(language_counts.items(), columns=['Language', 'Count'])
    .sort_values('Count', ascending=False)
)


In [65]:
import plotly.express as px

# Donut chart (pie with a 0.4 hole) of the first ten rows of language_df
fig_top10_lang = px.pie(
    language_df.head(10),
    names='Language',
    values='Count',
    hole=0.4,
    title='Top 10 Spoken Languages in Movies',
)

# Draw each slice's percentage and label inside the slice
fig_top10_lang.update_traces(textinfo='percent+label', textposition='inside')
fig_top10_lang.show()


In [67]:
import pickle
# Serialize the language donut chart to disk with pickle
with open("../img/fig_languages.pkl", mode="wb") as pkl_file:
    pickle.dump(fig_top10_lang, pkl_file)


#  Sunburst Chart: Genre + Subgenre 

In [69]:
#  Create Multi-Genre Pairs (Simulated Combo)
from itertools import combinations

def genre_pairs(genre_list):
    """Return every unordered pair of genres for one movie.

    Parameters
    ----------
    genre_list : str
        Text of a Python list literal of genre names, e.g. "['Action', 'Drama']".

    Returns
    -------
    list of tuple
        One alphabetically sorted 2-tuple per combination of two genres, in the
        order produced by itertools.combinations. Empty when the movie has
        fewer than two genres.

    Raises
    ------
    ValueError, SyntaxError
        If `genre_list` is not a valid Python literal.
    """
    # literal_eval only accepts literals, so text from the CSV cannot run code
    # the way it could with eval.
    genres = ast.literal_eval(genre_list)
    # combinations() yields nothing for fewer than two items, so no explicit
    # length check is needed.
    return [tuple(sorted(pair)) for pair in combinations(genres, 2)]

# Build the list of genre pairs for each movie, then flatten to one list
pairs = df['genre_names'].apply(genre_pairs)
flat_pairs = [
    genre_pair
    for movie_pairs in pairs
    for genre_pair in movie_pairs
]

# Count each pair and split the tuple into two separate columns
pair_counts = Counter(flat_pairs)
pair_df = pd.DataFrame(pair_counts.items(), columns=['Pair', 'Count'])
pair_df[['Genre1', 'Genre2']] = pd.DataFrame(
    pair_df['Pair'].tolist(),
    index=pair_df.index,
)
# NOTE(review): pair_df is not plotted in any visible cell, and no sunburst
# figure is created under this heading — confirm whether one is missing.

# Language summary: the first `top_n` rows of language_df, plus the combined
# count of all remaining rows.
# NOTE(review): top_languages and other_count are not used in the visible cells.
top_n = 8

top_languages = language_df.head(top_n).copy()
other_count = language_df['Count'].iloc[top_n:].sum()


In [70]:
# Horizontal bar chart of the first ten rows of language_df
# (same data and title as the donut chart, different chart type)
fig_spoken_lang = px.bar(
    language_df.head(10),
    x='Count',
    y='Language',
    orientation='h',
    title='Top 10 Spoken Languages in Movies',
)

# Reverse the y-axis so the first row is drawn at the top of the chart
fig_spoken_lang.update_layout(yaxis={'autorange': 'reversed'})
fig_spoken_lang.show()


In [71]:
import pickle
# Serialize the spoken-languages bar chart to disk with pickle
with open("../img/fig_spoken_languages.pkl", mode="wb") as pkl_file:
    pickle.dump(fig_spoken_lang, pkl_file)
