In [1]:
# Imports grouped as: standard library, third-party, then project-local
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# NOTE(review): this silences every warning category (deprecations included)
# for the whole session; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')

# Make the parent directory importable so the `src` package resolves
# (assumes the notebook runs from a folder one level below `src`'s parent).
# The membership check keeps re-runs of this cell from appending '..' again.
if '..' not in sys.path:
    sys.path.append('..')

from src.config import RAW_DATA_PATH, EDA_FIGURES_DIR
from src.utils import save_figure

print("Libraries imported successfully!")

Libraries imported successfully!


## 1.1 Load Data

In [2]:
# Read the raw CSV (location comes from src.config) and report its dimensions
df = pd.read_csv(RAW_DATA_PATH)

n_rows, n_cols = df.shape
print(f"Dataset shape: {df.shape}")
print(f"Number of rows: {n_rows:,}")
print(f"Number of columns: {n_cols}")

Dataset shape: (48204, 9)
Number of rows: 48,204
Number of columns: 9


In [3]:
# Preview the first 10 rows to check column names and value formats
df.head(10)

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918
5,,291.72,0.0,0.0,1,Clear,sky is clear,2012-10-02 14:00:00,5181
6,,293.17,0.0,0.0,1,Clear,sky is clear,2012-10-02 15:00:00,5584
7,,293.86,0.0,0.0,1,Clear,sky is clear,2012-10-02 16:00:00,6015
8,,294.14,0.0,0.0,20,Clouds,few clouds,2012-10-02 17:00:00,5791
9,,293.1,0.0,0.0,20,Clouds,few clouds,2012-10-02 18:00:00,4770


In [4]:
# Preview the last 10 rows to see where the data ends
df.tail(10)

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
48194,,283.84,0.0,0.0,75,Rain,proximity shower rain,2018-09-30 15:00:00,4302
48195,,283.84,0.0,0.0,75,Drizzle,light intensity drizzle,2018-09-30 15:00:00,4302
48196,,284.38,0.0,0.0,75,Rain,light rain,2018-09-30 16:00:00,4283
48197,,284.79,0.0,0.0,75,Clouds,broken clouds,2018-09-30 17:00:00,4132
48198,,284.2,0.25,0.0,75,Rain,light rain,2018-09-30 18:00:00,3947
48199,,283.45,0.0,0.0,75,Clouds,broken clouds,2018-09-30 19:00:00,3543
48200,,282.76,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 20:00:00,2781
48201,,282.73,0.0,0.0,90,Thunderstorm,proximity thunderstorm,2018-09-30 21:00:00,2159
48202,,282.09,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 22:00:00,1450
48203,,282.12,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 23:00:00,954


## 1.2 Basic Information

In [5]:
# Column dtypes, non-null counts and memory footprint.
# NOTE(review): in the saved output `date_time` is still `object` at this point
# and `holiday` has only 61 non-null values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48204 entries, 0 to 48203
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   holiday              61 non-null     object 
 1   temp                 48204 non-null  float64
 2   rain_1h              48204 non-null  float64
 3   snow_1h              48204 non-null  float64
 4   clouds_all           48204 non-null  int64  
 5   weather_main         48204 non-null  object 
 6   weather_description  48204 non-null  object 
 7   date_time            48204 non-null  object 
 8   traffic_volume       48204 non-null  int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 3.3+ MB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for numeric columns.
# NOTE(review): the saved output shows temp min = 0.0 and rain_1h max = 9831.3,
# which look like bad readings — confirm and handle before modelling.
df.describe()

Unnamed: 0,temp,rain_1h,snow_1h,clouds_all,traffic_volume
count,48204.0,48204.0,48204.0,48204.0,48204.0
mean,281.20587,0.334264,0.000222,49.362231,3259.818355
std,13.338232,44.789133,0.008168,39.01575,1986.86067
min,0.0,0.0,0.0,0.0,0.0
25%,272.16,0.0,0.0,1.0,1193.0
50%,282.45,0.0,0.0,64.0,3380.0
75%,291.806,0.0,0.0,90.0,4933.0
max,310.07,9831.3,0.51,100.0,7280.0


In [7]:
# Summary for object (string) columns: non-null count, number of unique values,
# the most frequent value (`top`) and how often it occurs (`freq`)
df.describe(include='object')

Unnamed: 0,holiday,weather_main,weather_description,date_time
count,61,48204,48204,48204
unique,11,11,38,40575
top,Labor Day,Clouds,sky is clear,2013-05-19 10:00:00
freq,7,15164,11665,6


## 1.3 Missing Values Analysis

In [8]:
# Per-column null counts and their share of all rows (percent, 2 decimals)
total_rows = len(df)
missing = df.isna().sum()
missing_pct = missing.div(total_rows).mul(100).round(2)

# Side-by-side table: one row per column of `df`
missing_df = pd.concat(
    [missing.rename('Missing Count'), missing_pct.rename('Missing %')],
    axis=1,
)

print("Missing Values Summary:")
print(missing_df)

Missing Values Summary:
                     Missing Count  Missing %
holiday                      48143      99.87
temp                             0       0.00
rain_1h                          0       0.00
snow_1h                          0       0.00
clouds_all                       0       0.00
weather_main                     0       0.00
weather_description              0       0.00
date_time                        0       0.00
traffic_volume                   0       0.00


## 1.4 Duplicates Analysis

In [9]:
# Two notions of "duplicate":
#   - rows identical in every column
#   - rows that merely repeat a timestamp already seen earlier
n_duplicates = df.duplicated(keep='first').sum()
n_duplicate_times = df.duplicated(subset='date_time', keep='first').sum()

print(f"Total duplicate rows: {n_duplicates}")
print(f"Duplicate timestamps: {n_duplicate_times}")

Total duplicate rows: 17
Duplicate timestamps: 7629


In [10]:
# Display a few rows whose timestamp occurs more than once, ordered by time
# so that rows sharing a timestamp appear next to each other
if n_duplicate_times > 0:
    shares_timestamp = df.duplicated(subset='date_time', keep=False)
    dup_times = df.loc[shares_timestamp].sort_values(by='date_time')
    print("\nSample duplicate timestamps:")
    display(dup_times.head(10))


Sample duplicate timestamps:


Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
178,,281.25,0.0,0.0,99,Rain,light rain,2012-10-10 07:00:00,6793
179,,281.25,0.0,0.0,99,Drizzle,light intensity drizzle,2012-10-10 07:00:00,6793
180,,280.1,0.0,0.0,99,Rain,light rain,2012-10-10 08:00:00,6283
181,,280.1,0.0,0.0,99,Drizzle,light intensity drizzle,2012-10-10 08:00:00,6283
182,,279.61,0.0,0.0,99,Rain,light rain,2012-10-10 09:00:00,5680
183,,279.61,0.0,0.0,99,Drizzle,light intensity drizzle,2012-10-10 09:00:00,5680
269,,282.43,0.0,0.0,57,Drizzle,light intensity drizzle,2012-10-14 09:00:00,2685
270,,282.43,0.0,0.0,57,Mist,mist,2012-10-14 09:00:00,2685
271,,282.43,0.0,0.0,57,Haze,haze,2012-10-14 09:00:00,2685
272,,282.33,0.0,0.0,57,Drizzle,light intensity drizzle,2012-10-14 10:00:00,3370


## 1.5 Date/Time Analysis

In [None]:
# Parse the timestamp column into datetime values (overwrites the column)
df['date_time'] = pd.to_datetime(df['date_time'])

# Earliest and latest observation, and the whole days between them
first_ts = df['date_time'].min()
last_ts = df['date_time'].max()

print(f"Date range: {first_ts} to {last_ts}")
print(f"Time span: {(last_ts - first_ts).days} days")

## 1.6 Target Variable Analysis (traffic_volume)

In [None]:
# Target distribution: histogram with mean/median markers beside a boxplot
target = df['traffic_volume']
target_mean = target.mean()
target_median = target.median()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

# Left panel: histogram plus dashed reference lines for mean and median
hist_ax.hist(target, bins=50, edgecolor='black', alpha=0.7)
hist_ax.set_xlabel('Traffic Volume')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Distribution of Traffic Volume')
hist_ax.axvline(target_mean, color='red', linestyle='--', label=f'Mean: {target_mean:.0f}')
hist_ax.axvline(target_median, color='green', linestyle='--', label=f'Median: {target_median:.0f}')
hist_ax.legend()

# Right panel: vertical boxplot of the same values
box_ax.boxplot(target, vert=True)
box_ax.set_ylabel('Traffic Volume')
box_ax.set_title('Boxplot of Traffic Volume')

plt.tight_layout()
save_figure(fig, f'{EDA_FIGURES_DIR}/target_distribution.png')
plt.show()

In [None]:
# Numeric summary (count, mean, std, min, quartiles, max) of the target column
print("Traffic Volume Statistics:")
print(df['traffic_volume'].describe())

## 1.7 Time Series Visualization

In [None]:
# Full-history line plot of the target; a thin, semi-transparent line keeps
# the dense hourly series readable
timestamps = df['date_time']
volumes = df['traffic_volume']

fig, ax = plt.subplots(figsize=(16, 5))
ax.plot(timestamps, volumes, alpha=0.7, linewidth=0.5)
ax.set(
    xlabel='Date',
    ylabel='Traffic Volume',
    title='Traffic Volume Over Time',
)

plt.tight_layout()
save_figure(fig, f'{EDA_FIGURES_DIR}/time_series.png')
plt.show()

In [None]:
# Zoom in on a sample period (one month) to make shorter-term structure visible
sample_start = '2017-06-01'
sample_end = '2017-06-30'

# A bare date string compares as midnight at the start of that day, so
# `date_time <= sample_end` would drop every reading on the final day after
# 00:00. Use an exclusive upper bound one day later so the end date is
# included in full.
end_exclusive = pd.Timestamp(sample_end) + pd.Timedelta(days=1)
in_window = (df['date_time'] >= sample_start) & (df['date_time'] < end_exclusive)
sample_df = df[in_window]

fig, ax = plt.subplots(figsize=(16, 5))
ax.plot(sample_df['date_time'], sample_df['traffic_volume'], linewidth=1)
ax.set_xlabel('Date')
ax.set_ylabel('Traffic Volume')
ax.set_title(f'Traffic Volume: {sample_start} to {sample_end}')
plt.tight_layout()
save_figure(fig, f'{EDA_FIGURES_DIR}/time_series_sample.png')
plt.show()

## 1.8 Temporal Patterns

In [None]:
# Derive calendar fields from the timestamp. These columns are added to `df`
# itself and are reused by later cells (heatmap, correlation matrix).
df['hour'] = df['date_time'].dt.hour
df['day_of_week'] = df['date_time'].dt.dayofweek  # pandas: 0 = Monday ... 6 = Sunday
df['month'] = df['date_time'].dt.month

days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

# Mean traffic volume per hour / weekday / month
hourly = df.groupby('hour')['traffic_volume'].mean()
# Reindex onto 0..6 so each bar is paired with the correct weekday label even
# if some weekday is absent from the data (its bar is then left empty instead
# of shifting the remaining labels or raising a length-mismatch error).
daily = df.groupby('day_of_week')['traffic_volume'].mean().reindex(range(len(days)))
monthly = df.groupby('month')['traffic_volume'].mean()

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# By hour
axes[0].bar(hourly.index, hourly.values, color='steelblue')
axes[0].set_xlabel('Hour of Day')
axes[0].set_ylabel('Average Traffic Volume')
axes[0].set_title('Traffic by Hour')
axes[0].set_xticks(range(24))

# By day of week
axes[1].bar(days, daily.values, color='coral')
axes[1].set_xlabel('Day of Week')
axes[1].set_ylabel('Average Traffic Volume')
axes[1].set_title('Traffic by Day of Week')

# By month
axes[2].bar(monthly.index, monthly.values, color='seagreen')
axes[2].set_xlabel('Month')
axes[2].set_ylabel('Average Traffic Volume')
axes[2].set_title('Traffic by Month')
axes[2].set_xticks(range(1, 13))

plt.tight_layout()
save_figure(fig, f'{EDA_FIGURES_DIR}/temporal_patterns.png')
plt.show()

In [None]:
# Heatmap: mean traffic volume for every (day of week, hour) combination.
# Relies on the `day_of_week` and `hour` columns already present on `df`.
pivot = df.pivot_table(values='traffic_volume', index='day_of_week', columns='hour', aggfunc='mean')

# Build row labels from the pivot's own index (0 = Mon ... 6 = Sun) instead of
# assigning a fixed 7-item list by position, so labels stay correct when a
# weekday is missing from the data.
day_names = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
row_labels = [day_names[int(day)] for day in pivot.index]

fig, ax = plt.subplots(figsize=(16, 6))
sns.heatmap(pivot, cmap='YlOrRd', annot=False, yticklabels=row_labels, ax=ax)
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Day of Week')
ax.set_title('Average Traffic Volume: Hour vs Day of Week')

plt.tight_layout()
save_figure(fig, f'{EDA_FIGURES_DIR}/hour_day_heatmap.png')
plt.show()

## 1.9 Feature Analysis

In [None]:
# One histogram per weather-related numeric column, laid out on a 2x2 grid
numerical_cols = ['temp', 'rain_1h', 'snow_1h', 'clouds_all']

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

# Pair each panel with the column it displays
for panel, col in zip(axes, numerical_cols):
    panel.hist(df[col], bins=50, edgecolor='black', alpha=0.7)
    panel.set(
        xlabel=col,
        ylabel='Frequency',
        title=f'Distribution of {col}',
    )

plt.tight_layout()
save_figure(fig, f'{EDA_FIGURES_DIR}/numerical_distributions.png')
plt.show()

In [None]:
# Categorical features: frequency of each holiday label.
# value_counts() excludes missing values by default, so rows with no holiday
# entry are not counted here.
print("Holiday values:")
print(df['holiday'].value_counts())

In [None]:
# Frequency of each top-level weather category, most common first
print("\nWeather Main values:")
print(df['weather_main'].value_counts())

In [None]:
# Mean traffic volume per weather category, highest first, as a bar chart
weather_traffic = (
    df.groupby('weather_main')['traffic_volume']
    .mean()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(12, 5))
weather_traffic.plot.bar(ax=ax, color='teal')
ax.set(
    xlabel='Weather',
    ylabel='Average Traffic Volume',
    title='Traffic by Weather Condition',
)
# Slant the category names so they do not collide
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
save_figure(fig, f'{EDA_FIGURES_DIR}/traffic_by_weather.png')
plt.show()

## 1.10 Correlation Analysis

In [None]:
# Pairwise correlations between the target, the weather measurements and the
# calendar fields previously added to `df`
corr_cols = [
    'traffic_volume',
    'temp', 'rain_1h', 'snow_1h', 'clouds_all',
    'hour', 'day_of_week', 'month',
]
corr_matrix = df.loc[:, corr_cols].corr()

# Diverging palette centred on zero so the sign of a correlation is obvious
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, ax=ax, cmap='RdBu_r', center=0, annot=True, fmt='.2f')
ax.set_title('Correlation Matrix')

plt.tight_layout()
save_figure(fig, f'{EDA_FIGURES_DIR}/correlation_matrix.png')
plt.show()

In [None]:
# Rank every other variable by the magnitude of its correlation with the
# target (sign is kept in the values; only the ordering uses |r|)
target_corr = (
    corr_matrix['traffic_volume']
    .drop(labels='traffic_volume')
    .sort_values(key=lambda s: s.abs(), ascending=False)
)
print("Correlation with traffic_volume:")
print(target_corr)

## 1.11 Summary & Key Findings

In [None]:
# Final recap of the dataset and target statistics.
#
# `hour`, `day_of_week` and `month` were added to `df` earlier in this notebook
# for plotting, so `df.shape[1]` would overstate the number of columns in the
# loaded dataset. Exclude any of those that are present when reporting it.
derived_cols = ['hour', 'day_of_week', 'month']
n_features = df.shape[1] - int(df.columns.isin(derived_cols).sum())

print("=" * 60)
print("EDA SUMMARY")
print("=" * 60)
print(f"""
1. DATASET OVERVIEW:
   - Total records: {len(df):,}
   - Features: {n_features}
   - Date range: {df['date_time'].min().date()} to {df['date_time'].max().date()}
   - Missing values: {df.isnull().sum().sum()}
   - Duplicate timestamps: {n_duplicate_times}

2. TARGET VARIABLE (traffic_volume):
   - Mean: {df['traffic_volume'].mean():,.0f}
   - Median: {df['traffic_volume'].median():,.0f}
   - Std: {df['traffic_volume'].std():,.0f}
   - Min: {df['traffic_volume'].min():,.0f}
   - Max: {df['traffic_volume'].max():,.0f}

3. KEY PATTERNS:
   - Rush hours (7-9 AM, 4-6 PM) show highest traffic
   - Weekdays have higher traffic than weekends
   - Clear seasonal patterns observed

4. NEXT STEPS:
   - Handle duplicate timestamps
   - Handle outliers in traffic_volume
   - Create temporal and lag features
""")
print("=" * 60)