# 1. Import libraries and load the dataset



In [1]:
import os

import numpy as np
import pandas as pd


# Location of the raw holidays/events CSV. Setting the HOLIDAYS_EVENTS_CSV
# environment variable overrides it, so the notebook can run on a machine with
# a different directory layout; when the variable is unset, the original local
# path is used unchanged.
HOLIDAYS_EVENTS_CSV = os.environ.get(
    "HOLIDAYS_EVENTS_CSV",
    r'D:\Topic_13_Project\Topic_13_Retail_Store_Sales_Time_Series\data\raw\holidays_events.csv',
)

# Load the raw file into the DataFrame used by every later cell.
df_holidays_events = pd.read_csv(HOLIDAYS_EVENTS_CSV)

# 2. Display basic information about the dataset


In [2]:
# Quick structural overview of the loaded frame: sample rows, shape, schema,
# and summary statistics.
print("First 5 Rows of Data Frame:\n", df_holidays_events.head(5))
print("Data Frame Shape:\n", df_holidays_events.shape)

# DataFrame.info() writes its report straight to stdout and returns None.
# Wrapping it in print() would emit the report *before* the label and then
# print a stray "None", so the label is printed first and info() is called
# on its own.
print("Data Frame Info:")
df_holidays_events.info()

print("Data Frame Statistics:\n", df_holidays_events.describe())
    

First 5 Rows of Data Frame:
          date     type    locale locale_name                    description  \
0  2012-03-02  Holiday     Local       Manta             Fundacion de Manta   
1  2012-04-01  Holiday  Regional    Cotopaxi  Provincializacion de Cotopaxi   
2  2012-04-12  Holiday     Local      Cuenca            Fundacion de Cuenca   
3  2012-04-14  Holiday     Local    Libertad      Cantonizacion de Libertad   
4  2012-04-21  Holiday     Local    Riobamba      Cantonizacion de Riobamba   

   transferred  
0        False  
1        False  
2        False  
3        False  
4        False  
Data Frame Shape:
 (350, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   date         350 non-null    object
 1   type         350 non-null    object
 2   locale       350 non-null    object
 3   locale_name  350 non-null    object
 4   description

# 3. Missing Values Summary

In [3]:
# Null tally per column, and that tally as a share of all rows (in percent)
missing_counts = df_holidays_events.isna().sum()
missing_percentage = missing_counts / df_holidays_events.shape[0] * 100

# Put both measures side by side in one table indexed by column name
missing_df_holidays_events = missing_counts.to_frame('Missing Values').assign(
    Percentage=missing_percentage
)

print("Missing Values Summary:\n", missing_df_holidays_events)

# Narrow the table to columns that actually contain nulls, largest count first
has_missing = missing_df_holidays_events['Missing Values'] > 0
missing_df_holidays_events = (
    missing_df_holidays_events
    .loc[has_missing]
    .sort_values(by='Missing Values', ascending=False)
)

Missing Values Summary:
              Missing Values  Percentage
date                      0         0.0
type                      0         0.0
locale                    0         0.0
locale_name               0         0.0
description               0         0.0
transferred               0         0.0


# 4. Convert the `date` column to datetime

In [4]:
# Run the `date` column through pd.to_datetime and store the result back
# on the same column of the frame.
parsed_dates = pd.to_datetime(df_holidays_events['date'])
df_holidays_events['date'] = parsed_dates

# Show the column's dtype after the conversion
print(df_holidays_events['date'].dtype)

datetime64[ns]


## 5. Holiday Event Type and Scope Analysis

This section analyzes how the holiday/event records are distributed across event types and across geographic scopes (national, regional, local).

In [5]:
def _print_distribution(title, counts, percentages):
    """Print a heading followed by one line per category.

    Each line has the form "<label>: <count> events (<pct>%)".

    Parameters
    ----------
    title : str
        Heading printed above the listing.
    counts : pandas.Series
        Category counts indexed by category label, in display order.
    percentages : pandas.Series
        Percentage for each category, in the same order as ``counts``.
    """
    print(title)
    print("=" * 60)
    # The two Series share the same ordering, so pair them positionally.
    for (label, count), pct in zip(counts.items(), percentages):
        print(f"{label}: {count} events ({pct:.1f}%)")


# Row count used as the denominator for every percentage in this cell
n_rows = len(df_holidays_events)

# Event Type Distribution
type_counts = df_holidays_events['type'].value_counts()
type_pct = (type_counts / n_rows) * 100
_print_distribution("Holiday Event Type Distribution:", type_counts, type_pct)

# Geographic Scope Distribution
locale_counts = df_holidays_events['locale'].value_counts()
locale_pct = (locale_counts / n_rows) * 100
_print_distribution("\nGeographic Scope Distribution:", locale_counts, locale_pct)

# Most common event descriptions
top_events = df_holidays_events['description'].value_counts().head(5)

print("\nTop 5 Most Frequent Events:")
print("=" * 60)
for event, count in top_events.items():
    print(f"{event}: {count} occurrences")

# National vs Regional/Local split. The national share is read from the
# percentages computed above (0.0 when no row has locale 'National');
# everything else is counted as regional/local.
national_pct = locale_pct.get('National', 0.0)
regional_local_pct = 100 - national_pct

print("\n Holiday Distribution Insights:")
print(f"- National holidays: {national_pct:.1f}% (affect all stores)")
print(f"- Regional/Local holidays: {regional_local_pct:.1f}% (store-specific)")
print(f"- CRITICAL: {regional_local_pct:.1f}% of holidays require store-location matching")

Holiday Event Type Distribution:
Holiday: 221 events (63.1%)
Event: 56 events (16.0%)
Additional: 51 events (14.6%)
Transfer: 12 events (3.4%)
Bridge: 5 events (1.4%)
Work Day: 5 events (1.4%)

Geographic Scope Distribution:
National: 174 events (49.7%)
Local: 152 events (43.4%)
Regional: 24 events (6.9%)

Top 5 Most Frequent Events:
Carnaval: 10 occurrences
Fundacion de Ibarra: 7 occurrences
Fundacion de Cuenca: 7 occurrences
Cantonizacion de Libertad: 6 occurrences
Cantonizacion de Riobamba: 6 occurrences

 Holiday Distribution Insights:
- National holidays: 49.7% (affect all stores)
- Regional/Local holidays: 50.3% (store-specific)
- CRITICAL: 50.3% of holidays require store-location matching


## 6. Holiday Timing and Transfer Analysis

This section analyzes how the events are distributed over time and how often holidays are transferred to a different date.

In [6]:
# Separator line reused under every heading printed by this cell
rule = "=" * 60

# --- Temporal coverage ---------------------------------------------------
total_events = len(df_holidays_events)

# Number of rows recorded for each distinct date
events_per_date = df_holidays_events.groupby('date').size()
unique_dates = len(events_per_date)

# Span between the earliest and latest date, in whole days
first_date = df_holidays_events['date'].min()
last_date = df_holidays_events['date'].max()
date_range = (last_date - first_date).days

# Convert the span to years (365.25 days each) to get a yearly event rate
years_spanned = date_range / 365.25
avg_events_per_year = total_events / years_spanned

# Dates that carry more than one event row
dates_with_multiple = (events_per_date > 1).sum()
dates_with_multiple_pct = (dates_with_multiple / unique_dates) * 100

print(
    "Temporal Distribution Statistics:",
    rule,
    f"Total events: {total_events}",
    f"Unique dates: {unique_dates}",
    f"Date range: {date_range} days",
    f"Average events per year: {avg_events_per_year:.0f}",
    f"Dates with multiple events: {dates_with_multiple} ({dates_with_multiple_pct:.1f}%)",
    sep="\n",
)

# --- Holiday transfers ---------------------------------------------------
# Summing the `transferred` column counts the rows flagged as transferred
transferred_count = df_holidays_events['transferred'].sum()
transferred_pct = (transferred_count / total_events) * 100
non_transferred_pct = 100 - transferred_pct
non_transferred_count = total_events - transferred_count

print(
    "\nHoliday Transfer Statistics:",
    rule,
    f"Transferred holidays: {transferred_count} ({transferred_pct:.1f}%)",
    f"Non-transferred holidays: {non_transferred_count} ({non_transferred_pct:.1f}%)",
    sep="\n",
)

# --- Locations -----------------------------------------------------------
# value_counts() sorts by frequency, so the first entry is the top location
location_counts = df_holidays_events['locale_name'].value_counts()
unique_locations = len(location_counts)
top_location, top_location_count = location_counts.index[0], location_counts.iloc[0]

print(
    "\nLocation Distribution:",
    rule,
    f"Unique locations: {unique_locations}",
    f"Top location: {top_location} ({top_location_count} events)",
    sep="\n",
)

# --- Summary -------------------------------------------------------------
print(
    "\n Key Timing Insights:",
    f"- {dates_with_multiple_pct:.1f}% of dates have MULTIPLE simultaneous events",
    f"- Only {transferred_pct:.1f}% of holidays are transferred (simplified modeling)",
    f"- {avg_events_per_year:.0f} holidays/events per year on average",
    sep="\n",
)

Temporal Distribution Statistics:
Total events: 350
Unique dates: 312
Date range: 2125 days
Average events per year: 60
Dates with multiple events: 31 (9.9%)

Holiday Transfer Statistics:
Transferred holidays: 12 (3.4%)
Non-transferred holidays: 338 (96.6%)

Location Distribution:
Unique locations: 24
Top location: Ecuador (174 events)

 Key Timing Insights:
- 9.9% of dates have MULTIPLE simultaneous events
- Only 3.4% of holidays are transferred (simplified modeling)
- 60 holidays/events per year on average
