<a href="https://colab.research.google.com/github/TyrelN/Data-Analysis-IMDB-Trends/blob/main/Capstone_1_Tyrel_Narciso.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Download the dataset; kagglehub returns the local directory holding the files
path = kagglehub.dataset_download("krishnanshverma/imdb-movies-dataset")

# Pick the first CSV file in the downloaded directory (same selection rule as a
# first-match loop). Fail with a clear message when there is none, instead of
# hitting a NameError on an unassigned variable further down.
csv_name = next((name for name in os.listdir(path) if name.endswith(".csv")), None)
if csv_name is None:
    raise FileNotFoundError(f"No .csv file found in downloaded dataset directory: {path}")
dataset_file = os.path.join(path, csv_name)

# Load the dataset into a Pandas DataFrame
df = pd.read_csv(dataset_file)

df

Unnamed: 0,name,year,movie_rated,run_length,genres,release_date,rating,num_raters,num_reviews
0,Inception,2010,PG-13,2h 28min,Action; Adventure; Sci-Fi;,16 July 2010 (USA),8.8,1981675,3820
1,The Lord of the Rings: The Fellowship of the Ring,2001,PG-13,2h 58min,Action; Adventure; Drama;,19 December 2001 (USA),8.8,1609165,5365
2,The Lord of the Rings: The Return of the King,2003,PG-13,3h 21min,Adventure; Drama; Fantasy;,17 December 2003 (USA),8.9,1593859,3681
3,The Dark Knight Rises,2012,PG-13,2h 44min,Action; Adventure;,20 July 2012 (USA),8.4,1470329,2979
4,The Lord of the Rings: The Two Towers,2002,PG-13,2h 59min,Adventure; Drama; Fantasy;,18 December 2002 (USA),8.7,1440188,2559
...,...,...,...,...,...,...,...,...,...
1495,Catch-22,1970,R,2h 2min,Comedy; Drama; War;,24 June 1970 (USA),7.1,21424,145
1496,The Great Raid,2005,R,2h 12min,Action; Drama; War;,12 August 2005 (USA),6.7,20965,194
1497,Saints and Soldiers,2003,PG-13,1h 30min,Action; Drama; War;,25 March 2005 (USA),6.7,19730,163
1498,Stop-Loss,2008,R,1h 52min,Drama; War;,28 March 2008 (USA),6.4,19456,102


In [None]:
#regex import
import re

# Step 1: keep only the text before the first "(" (e.g. drops "(USA)") and trim whitespace
raw_release_dates = df['release_date'].astype(str)
df['release_date_clean'] = raw_release_dates.str.split('(').str[0].str.strip()

# Step 2: parse "<day> <month name> <year>"; values that do not match become NaT
df['release_date_dt'] = pd.to_datetime(
    df['release_date_clean'],
    format='%d %B %Y',
    errors='coerce',
)

def convert_run_length_to_minutes(run_length):
    """Convert a runtime string such as ``'2h 28min'`` into total minutes.

    Parameters
    ----------
    run_length : str or None
        Runtime text containing an hours part (``'<digits>h'``), a minutes
        part (``'<digits>min'``), or both. Non-string values are converted
        with ``str()`` before parsing.

    Returns
    -------
    int or None
        ``hours * 60 + minutes``. ``None`` is returned when the value is
        empty/missing, or when neither an hours nor a minutes component can
        be found, so unparseable text is reported as missing rather than as
        a zero-minute runtime.
    """
    if not run_length or pd.isnull(run_length):
        return None

    # Coerce to text so a stray numeric cell does not make re.search raise
    text = str(run_length)

    hours_match = re.search(r'(\d+)h', text)
    mins_match = re.search(r'(\d+)min', text)

    # Nothing recognisable: signal "missing" instead of silently returning 0
    if hours_match is None and mins_match is None:
        return None

    hours = int(hours_match.group(1)) if hours_match else 0
    minutes = int(mins_match.group(1)) if mins_match else 0

    return hours * 60 + minutes

# Numeric runtime derived from the textual run_length column
runtime_minutes = df['run_length'].apply(convert_run_length_to_minutes)
df['run_length_minutes'] = runtime_minutes

# Print the raw columns next to their cleaned counterparts as a sanity check
comparison_columns = ['release_date', 'release_date_clean', 'release_date_dt','run_length', 'run_length_minutes']
print(df[comparison_columns])


                release_date release_date_clean release_date_dt run_length  \
0         16 July 2010 (USA)       16 July 2010      2010-07-16   2h 28min   
1     19 December 2001 (USA)   19 December 2001      2001-12-19   2h 58min   
2     17 December 2003 (USA)   17 December 2003      2003-12-17   3h 21min   
3         20 July 2012 (USA)       20 July 2012      2012-07-20   2h 44min   
4     18 December 2002 (USA)   18 December 2002      2002-12-18   2h 59min   
...                      ...                ...             ...        ...   
1495      24 June 1970 (USA)       24 June 1970      1970-06-24    2h 2min   
1496    12 August 2005 (USA)     12 August 2005      2005-08-12   2h 12min   
1497     25 March 2005 (USA)      25 March 2005      2005-03-25   1h 30min   
1498     28 March 2008 (USA)      28 March 2008      2008-03-28   1h 52min   
1499      9 April 2004 (USA)       9 April 2004      2004-04-09   2h 17min   

      run_length_minutes  
0                    148  
1        

In [None]:
import pprint
# Summary statistics for the describable columns of df.
# Display the whole table: it only has a handful of rows, and .head() stops
# after the first five (the previously saved output ended at the 50% row),
# hiding the remaining summary rows.
stats = df.describe()
stats

Unnamed: 0,year,rating,num_raters,num_reviews,release_date_dt,run_length_minutes
count,1500.0,1500.0,1500.0,1500.0,1498,1500.0
mean,2002.632,7.4776,411674.587333,996.054667,2003-05-24 19:23:09.052069504,122.178
min,1915.0,3.5,19290.0,102.0,1915-03-21 00:00:00,66.0
25%,1998.0,7.0,191244.5,424.75,1999-03-01 00:00:00,105.0
50%,2006.0,7.6,341452.0,724.5,2007-01-19 00:00:00,119.0


In [None]:
import pandas as pd
from bokeh.palettes import Category10

# Genres are stored as one delimited string per movie, e.g.
# "Action; Adventure; Sci-Fi;". Split on the bare ';' and strip every token:
# splitting on '; ' leaves the trailing ';' attached to the last genre
# ("Sci-Fi;"), which then gets counted separately from "Sci-Fi".
df_genre_exploded = (
    df.assign(genre=df['genres'].str.split(';'))
      .explode('genre')
      .assign(genre=lambda frame: frame['genre'].str.strip())
)
# The trailing delimiter yields an empty token per movie; discard those rows
df_genre_exploded = df_genre_exploded[df_genre_exploded['genre'] != '']

# Number of movies per genre, kept for later visualization
movies_per_genre = (
    df_genre_exploded['genre']
    .value_counts()
    .rename_axis('genre')
    .reset_index(name='movie_count')
)

# Primary genre = first entry of the ';'-delimited list (the data contains no
# ',' separators, so splitting on ',' would return the whole string)
df['primary_genre'] = df['genres'].str.split(';').str[0].str.strip()

# Group by year and primary genre, count movies
genre_year_count = df.groupby(['year', 'primary_genre']).size().reset_index(name='movie_count')

# Pivot to wide format: one row per year, one column per primary genre;
# year/genre combinations with no movies become 0
pivot_table = genre_year_count.pivot(index='year', columns='primary_genre', values='movie_count').fillna(0)

# Get the top 10 genres by total movie count
top_genres = pivot_table.sum().sort_values(ascending=False).head(10).index.tolist()

# Filter the pivot table to only include these top genres
pivot_table_top = pivot_table[top_genres]

# Bokeh names needed by this cell. They are imported here because this cell
# runs before the later cell that imports the rest of the Bokeh API; without
# them `figure` and `show` are undefined on a fresh kernel.
from bokeh.io import output_notebook, show
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure

output_notebook()  # render Bokeh output inline in the notebook

genres = top_genres  # Already limited to at most 10

# One colour per plotted genre; slice so the list length always matches the
# number of stackers, even when fewer than 10 genres are available
colors = list(Category10[10][:len(genres)])

# Data source for the stacked areas: 'year' as a regular column plus one
# column per top genre
source = ColumnDataSource(pivot_table_top.reset_index())

# Create the figure
p = figure(title="Top 10 Genre Popularity Over Time (Number of Movies)",
           x_axis_label='Year',
           y_axis_label='Number of Movies Released',
           height=500, width=800)

# Create stacked areas, one layer per genre column
p.varea_stack(stackers=genres,
              x='year',
              color=colors,
              legend_label=genres,
              source=source)

# Customize the legend; clicking an entry hides that genre's area
p.legend.location = 'top_left'
p.legend.title = 'Genres'
p.legend.click_policy = 'hide'

# Show the plot
show(p)

In [None]:
# Coerce both columns to numbers; entries that cannot be parsed become NaN
for numeric_col in ('year', 'rating'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# Keep only the rows where both year and rating are present
df_year_rating = df.dropna(subset=['year', 'rating'])

# Mean rating per release year, as a flat frame with 'year' and 'rating' columns
ratings_by_year = df_year_rating.groupby('year', as_index=False)['rating'].mean()

ratings_by_year.head()

Unnamed: 0,year,rating
0,1915,6.3
1,1925,8.0
2,1928,8.1
3,1930,8.0
4,1933,7.8


In [None]:
# Coerce rating and review counts to numbers; unparseable entries become NaN
for numeric_col in ('rating', 'num_reviews'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# Rows used by the scatter plot must have a rating, a review count and a title
required_columns = ['rating', 'num_reviews', 'name']
scatter_df = df.dropna(subset=required_columns)

scatter_df.head()

Unnamed: 0,name,year,movie_rated,run_length,genres,release_date,rating,num_raters,num_reviews,release_date_clean,release_date_dt,run_length_minutes
0,Inception,2010,PG-13,2h 28min,Action; Adventure; Sci-Fi;,16 July 2010 (USA),8.8,1981675,3820,16 July 2010,2010-07-16,148
1,The Lord of the Rings: The Fellowship of the Ring,2001,PG-13,2h 58min,Action; Adventure; Drama;,19 December 2001 (USA),8.8,1609165,5365,19 December 2001,2001-12-19,178
2,The Lord of the Rings: The Return of the King,2003,PG-13,3h 21min,Adventure; Drama; Fantasy;,17 December 2003 (USA),8.9,1593859,3681,17 December 2003,2003-12-17,201
3,The Dark Knight Rises,2012,PG-13,2h 44min,Action; Adventure;,20 July 2012 (USA),8.4,1470329,2979,20 July 2012,2012-07-20,164
4,The Lord of the Rings: The Two Towers,2002,PG-13,2h 59min,Adventure; Drama; Fantasy;,18 December 2002 (USA),8.7,1440188,2559,18 December 2002,2002-12-18,179


In [None]:
from bokeh.models import NumeralTickFormatter, ColumnDataSource, HoverTool, ColorBar, LinearColorMapper, FactorRange
from bokeh.transform import dodge, jitter, factor_cmap
from bokeh.layouts import gridplot
from bokeh.io import output_notebook, show
from bokeh.plotting import figure, show
import numpy as np

# Enable inline Bokeh rendering in the notebook (Colab / Jupyter)
output_notebook()

source1 = ColumnDataSource(ratings_by_year)

# Pull the plotted columns back out of the data source for the trend fit
years = source1.data['year']
ratings = source1.data['rating']

# Least-squares straight line (polynomial of degree 1) through the yearly means
coeffs = np.polyfit(years, ratings, 1)
poly_eq = np.poly1d(coeffs)

# Fitted rating for every year present in the data
regression_ratings = poly_eq(years)

p1 = figure(
    title="Average IMDb Ratings Over Years",
    x_axis_label='Year',
    y_axis_label='Average Rating',
    width=600,
    height=350,
)

# Yearly averages: a connecting line plus a marker on each data point
p1.line(x='year', y='rating', source=source1, line_width=2, color='navy')
p1.scatter(x='year', y='rating', source=source1, size=5, color='red')

# Dashed trend line; this is the only glyph given a legend entry
p1.line(
    x=years,
    y=regression_ratings,
    line_width=2,
    color='firebrick',
    line_dash='dashed',
    legend_label='Regression Line',
)

# Axis tick format and legend behaviour (clicking an entry hides its glyph)
p1.yaxis.formatter = NumeralTickFormatter(format="0.0")
p1.legend.location = 'top_left'
p1.legend.click_policy = 'hide'

show(p1)

In [None]:
# Categorical x values and bar heights taken from the per-genre counts
genres = movies_per_genre['genre'].tolist()
counts = movies_per_genre['movie_count'].tolist()

p2 = figure(
    title="Number of Movies Per Genre",
    x_range=genres,
    y_axis_label='Number of Movies',
    width=600,
    height=350,
)

# One bar per genre
p2.vbar(x=genres, top=counts, width=0.5, color='green')

# Tilt the genre labels (angle in radians) so they stay readable
p2.xaxis.major_label_orientation = 0.9

# No vertical grid lines; bars start from zero
p2.xgrid.grid_line_color = None
p2.y_range.start = 0

show(p2)

In [None]:
source3 = ColumnDataSource(scatter_df)

p3 = figure(
    title="Ratings vs Number of Reviews",
    x_axis_label='Rating',
    y_axis_label='Number of Reviews',
    width=600,
    height=350,
    tools='pan,box_zoom,reset',
)

# Semi-transparent markers so overlapping points remain visible
p3.scatter(x='rating', y='num_reviews', source=source3, size=10, color='purple', alpha=0.6)

# Hover tool showing the movie title alongside the plotted values
hover = HoverTool(
    tooltips=[
        ("Movie", "@name"),
        ("Rating", "@rating"),
        ("Num Reviews", "@num_reviews"),
    ]
)
p3.add_tools(hover)

show(p3)

In [None]:
# Runtimes with missing values removed
runtimes = df['run_length_minutes'].dropna()

# Bin the runtimes into 20 equal-width bins: counts per bin and the 21 bin edges
hist, edges = np.histogram(runtimes, bins=20)

p_hist = figure(
    title='Distribution of Movie Runtimes',
    x_axis_label='Runtime (minutes)',
    y_axis_label='Number of Movies',
    width=600,
    height=400,
)

# One rectangle per bin, spanning consecutive edges and rising from zero to the count
left_edges = edges[:-1]
right_edges = edges[1:]
p_hist.quad(
    top=hist,
    bottom=0,
    left=left_edges,
    right=right_edges,
    fill_color='skyblue',
    line_color='white',
)

show(p_hist)

In [None]:
from bokeh.palettes import Viridis256, Category20

# Pairwise correlations between rating, release date and runtime, rounded to 2 decimals
# NOTE(review): release_date_dt is a datetime column — confirm corr() handles it as intended
corr_columns = ['rating', 'release_date_dt', 'run_length_minutes']
corr_df = df[corr_columns].corr().round(2)

# Long format for Bokeh: one row per (var1, var2) pair with its correlation
corr_df.index.name = 'var1'
corr_melt = corr_df.reset_index().melt(
    id_vars='var1',
    var_name='var2',
    value_name='correlation',
)

# Map correlation values in [-1, 1] onto the Viridis palette
mapper = LinearColorMapper(palette=Viridis256, low=-1, high=1)

# Both axes are categorical and list the same variables
axis_labels = list(corr_df.columns)
p_heat = figure(
    title="Correlation Heatmap",
    x_range=axis_labels,
    y_range=list(axis_labels),
    x_axis_location="above",
    width=400,
    height=400,
    tools="",
    toolbar_location=None,
)

# One coloured cell per variable pair
p_heat.rect(
    x="var1",
    y="var2",
    width=1,
    height=1,
    source=ColumnDataSource(corr_melt),
    fill_color={'field': 'correlation', 'transform': mapper},
    line_color=None,
)

# Print the correlation value in the centre of each cell
p_heat.text(
    x="var1",
    y="var2",
    text='correlation',
    source=ColumnDataSource(corr_melt),
    text_align="center",
    text_baseline="middle",
    text_color="white",
)

# Colour scale legend on the right-hand side
color_bar = ColorBar(color_mapper=mapper, location=(0, 0))
p_heat.add_layout(color_bar, 'right')

# Rotate the top axis labels by 45 degrees
p_heat.xaxis.major_label_orientation = np.pi / 4
show(p_heat)

In [None]:
# Year-month period of each release date
df['release_month'] = df['release_date_dt'].dt.to_period('M')

# Number of movies released in each month that appears in the data
monthly_releases = (
    df.groupby('release_month')
      .size()
      .reset_index(name='num_movies')
)

# Bokeh's datetime axis needs timestamps rather than Period values
monthly_releases['release_month'] = monthly_releases['release_month'].dt.to_timestamp()

source_time = ColumnDataSource(monthly_releases)

p_time = figure(
    title="Monthly Movie Release Trends",
    x_axis_label='Release Month',
    y_axis_label='Number of Movies Released',
    x_axis_type='datetime',
    width=700,
    height=400,
)

p_time.line(x='release_month', y='num_movies', source=source_time, line_width=3, color='teal')

# Hover tooltip: month formatted as e.g. "Jul 2010" plus the release count
hover_time = HoverTool(
    tooltips=[
        ("Month", "@release_month{%b %Y}"),
        ("Movies Released", "@num_movies"),
    ],
    formatters={'@release_month': 'datetime'},
)
p_time.add_tools(hover_time)

show(p_time)