In [34]:
import pandas as pd 

# Location of the holiday-movies CSV; adjust if the file lives elsewhere.
file_path = "holiday_movies.csv"

# Read the whole CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

# Print the column / dtype / non-null summary, then render the first five rows.
df.info()
display(df.head(n=5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2265 entries, 0 to 2264
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   tconst           2265 non-null   object 
 1   title_type       2265 non-null   object 
 2   primary_title    2265 non-null   object 
 3   original_title   2265 non-null   object 
 4   year             2265 non-null   int64  
 5   runtime_minutes  2076 non-null   float64
 6   genres           2233 non-null   object 
 7   simple_title     2265 non-null   object 
 8   average_rating   2265 non-null   float64
 9   num_votes        2265 non-null   int64  
 10  christmas        2265 non-null   bool   
 11  hanukkah         2265 non-null   bool   
 12  kwanzaa          2265 non-null   bool   
 13  holiday          2265 non-null   bool   
dtypes: bool(4), float64(2), int64(2), object(6)
memory usage: 185.9+ KB


Unnamed: 0,tconst,title_type,primary_title,original_title,year,runtime_minutes,genres,simple_title,average_rating,num_votes,christmas,hanukkah,kwanzaa,holiday
0,tt0020356,movie,Sailor's Holiday,Sailor's Holiday,1929,58.0,Comedy,sailors holiday,5.4,55,False,False,False,True
1,tt0020823,movie,The Devil's Holiday,The Devil's Holiday,1930,80.0,"Drama,Romance",the devils holiday,6.0,242,False,False,False,True
2,tt0020985,movie,Holiday,Holiday,1930,91.0,"Comedy,Drama",holiday,6.3,638,False,False,False,True
3,tt0021268,movie,Holiday of St. Jorgen,Prazdnik svyatogo Yorgena,1930,83.0,Comedy,holiday of st jorgen,7.4,256,False,False,False,True
4,tt0021377,movie,Sin Takes a Holiday,Sin Takes a Holiday,1930,81.0,"Comedy,Romance",sin takes a holiday,6.1,740,False,False,False,True


In [40]:
# One row per (movie, genre): split the comma-separated genre string into a
# list, then explode that list so every genre lands on its own row.
df_exploded = (
    df
    .assign(genres=lambda frame: frame['genres'].str.split(','))
    .explode('genres')
)

# Bucket a year into a fixed-width period label
def group_years(year, period=5):
    """Return the label of the fixed-width year bucket that contains ``year``.

    Buckets start at multiples of ``period``. The label is formatted as
    ``"<first year>-<last year>"``; for example, 1929 with the default
    period yields ``"1925-1929"``.

    Args:
        year: The year to bucket.
        period: Width of each bucket, in years (default 5).

    Returns:
        The bucket label as a string.
    """
    bucket_start = (year // period) * period
    bucket_end = bucket_start + (period - 1)
    return f"{bucket_start}-{bucket_end}"

# Label every row with its 5-year bucket; `period=5` is forwarded to group_years.
df_exploded['year_period'] = df_exploded['year'].apply(group_years, period=5)

In [None]:
import dash
from dash import dcc, html, Input, Output
import plotly.express as px


# Initialize Dash app
app = dash.Dash(__name__)

# Layout: a page title, two multi-select filters, and four graphs.
# Each graph's `id` is the Output target of one callback, and both dropdown
# ids are the Inputs shared by all of them.
app.layout = html.Div([
    html.H1("Holiday Movies Analysis"),

    # Dropdown for selecting period (options are the distinct year buckets,
    # sorted as strings)
    dcc.Dropdown(
        id='period-dropdown',
        options=[{'label': str(p), 'value': p} for p in sorted(df_exploded['year_period'].unique())],
        multi=True,
        placeholder="Select periods"
    ),

    # Dropdown for selecting genre (rows with a missing genre are excluded
    # from the option list)
    dcc.Dropdown(
        id='genre-dropdown',
        options=[{'label': g, 'value': g} for g in sorted(df_exploded['genres'].dropna().unique())],
        multi=True,
        placeholder="Select genres"
    ),

    # Scatterplot
    dcc.Graph(id='scatter-plot'),

    # Heatmap
    dcc.Graph(id='heatmap'),

    # 100% Stacked Area Chart for Number of Movies Over Time by Genre
    # (the trailing comma matters: without it this element and the next one
    # are two adjacent expressions, which is a SyntaxError)
    dcc.Graph(id='stacked-area-chart'),

    # Line chart for ratings over time by genre
    dcc.Graph(id='rating-trend'),
])

# Callbacks
@app.callback(
    Output('scatter-plot', 'figure'),
    [Input('period-dropdown', 'value'), Input('genre-dropdown', 'value')]
)
def update_scatter(selected_periods, selected_genres):
    """Rebuild the runtime-vs-rating scatter for the current dropdown choices.

    Args:
        selected_periods: Value of 'period-dropdown'. A falsy value (nothing
            selected) leaves the data unfiltered by period.
        selected_genres: Value of 'genre-dropdown'. A falsy value leaves the
            data unfiltered by genre.

    Returns:
        A Plotly Express scatter figure with one colour per genre.
    """
    movies = df_exploded
    if selected_periods:
        in_period = movies['year_period'].isin(selected_periods)
        movies = movies[in_period]
    if selected_genres:
        in_genre = movies['genres'].isin(selected_genres)
        movies = movies[in_genre]

    axis_labels = {'runtime_minutes': "Runtime (min)", 'average_rating': "Rating"}
    return px.scatter(
        movies,
        x='runtime_minutes',
        y='average_rating',
        color='genres',
        title="Runtime vs. Rating by Genre",
        labels=axis_labels,
    )

@app.callback(
    Output('heatmap', 'figure'),
    [Input('period-dropdown', 'value'), Input('genre-dropdown', 'value')]
)
def update_heatmap(selected_periods, selected_genres):
    """Rebuild the genre-by-period movie-count heatmap.

    Args:
        selected_periods: Value of 'period-dropdown'. A falsy value leaves
            the data unfiltered by period.
        selected_genres: Value of 'genre-dropdown'. A falsy value leaves the
            data unfiltered by genre.

    Returns:
        A Plotly Express image figure with genres on the rows, year periods
        on the columns, and the number of rows counted per cell.
    """
    movies = df_exploded
    if selected_periods:
        in_period = movies['year_period'].isin(selected_periods)
        movies = movies[in_period]
    if selected_genres:
        in_genre = movies['genres'].isin(selected_genres)
        movies = movies[in_genre]

    # Count titles per (period, genre) pair, in long form.
    counts = (
        movies
        .groupby(['year_period', 'genres'])
        .agg({'tconst': 'count'})
        .reset_index()
    )

    # Reshape to a genre x period grid; pairs with no rows become 0.
    grid = counts.pivot(index='genres', columns='year_period', values='tconst')
    grid = grid.fillna(0)

    return px.imshow(
        grid,
        labels={'x': "Year Period", 'y': "Genre", 'color': "Movie Count"},
        title="Holiday Movie Production Trends Over Time",
        color_continuous_scale='Viridis',
    )

@app.callback(
    Output('rating-trend', 'figure'),
    [Input('period-dropdown', 'value'), Input('genre-dropdown', 'value')]
)
def update_rating_trend(selected_periods, selected_genres):
    """Rebuild the line chart of mean rating per year period, one line per genre.

    Args:
        selected_periods: Value of 'period-dropdown'. A falsy value leaves
            the data unfiltered by period.
        selected_genres: Value of 'genre-dropdown'. A falsy value leaves the
            data unfiltered by genre.

    Returns:
        A Plotly Express line figure of the mean 'average_rating' for each
        (year period, genre) pair.
    """
    movies = df_exploded
    if selected_periods:
        in_period = movies['year_period'].isin(selected_periods)
        movies = movies[in_period]
    if selected_genres:
        in_genre = movies['genres'].isin(selected_genres)
        movies = movies[in_genre]

    # Mean rating for each (period, genre) pair, in long form.
    mean_ratings = (
        movies
        .groupby(['year_period', 'genres'])
        .agg({'average_rating': 'mean'})
        .reset_index()
    )

    return px.line(
        mean_ratings,
        x='year_period',
        y='average_rating',
        color='genres',
        title="Average Rating Over Time by Genre",
        labels={'average_rating': "Average Rating"},
    )

@app.callback(
    Output('stacked-area-chart', 'figure'),
    [Input('period-dropdown', 'value'), Input('genre-dropdown', 'value')]
)
def update_stacked_area(selected_periods, selected_genres):
    """Rebuild the stacked area chart of each genre's share per year period.

    Args:
        selected_periods: Value of 'period-dropdown'. A falsy value leaves
            the data unfiltered by period.
        selected_genres: Value of 'genre-dropdown'. A falsy value leaves the
            data unfiltered by genre.

    Returns:
        A Plotly Express area figure whose y value is each genre's percentage
        of the rows counted in that period (after filtering).
    """
    movies = df_exploded
    if selected_periods:
        in_period = movies['year_period'].isin(selected_periods)
        movies = movies[in_period]
    if selected_genres:
        in_genre = movies['genres'].isin(selected_genres)
        movies = movies[in_genre]

    # Count titles per (period, genre) pair, in long form.
    counts = (
        movies
        .groupby(['year_period', 'genres'])
        .agg({'tconst': 'count'})
        .reset_index()
    )

    # Per-row total for the row's period, aligned to `counts` via transform.
    period_totals = counts.groupby('year_period')['tconst'].transform('sum')
    counts['percentage'] = counts['tconst'] / period_totals * 100

    # `percentage` is already a share of the period total; groupnorm='percent'
    # is still passed, as in the original call.
    return px.area(
        counts,
        x='year_period',
        y='percentage',
        color='genres',
        title="Percentage of Movies by Genre Over Time (100% Stacked Area Chart)",
        labels={'percentage': "Percentage of Total Movies"},
        line_group='genres',
        groupnorm='percent',
    )

# Start the Dash server on port 8060 in debug mode, but only when this code
# is executed as the main module.
if __name__ == '__main__':
    app.run(
        port=8060,
        debug=True,
    )
