In [None]:
# Import libraries: data handling (pandas, numpy), plotting (matplotlib) and warning control.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

# Silence every warning for the rest of the kernel session (no category or module filter is given).
# NOTE(review): this also hides deprecation warnings from the libraries above — consider narrowing it.
warnings.filterwarnings('ignore')

# Add the parent of the current working directory to the import path so the `src` package resolves.
# NOTE(review): assumes the notebook is run from a folder directly below the project root — confirm.
import sys
sys.path.append('..')

# Project configuration: the raw/cleaned data paths and the target/timestamp column names used below.
from src.config import RAW_DATA_PATH, CLEANED_DATA_PATH, TARGET_COLUMN, DATE_COLUMN
# Cleaning helpers, called one per section in the cells that follow.
# NOTE(review): preprocess_pipeline is imported but not called in the cells shown in this notebook.
from src.data_preprocessing import (
    load_data, convert_datetime, check_missing_values,
    handle_missing_values, handle_duplicates, handle_outliers_iqr,
    check_time_continuity, resample_hourly, preprocess_pipeline
)
# Writer used by the final cell to persist the cleaned frame.
from src.utils import save_csv

# Reached only if every import above resolved.
print("Libraries imported successfully!")

## 2.1 Load Raw Data

In [None]:
# Load the raw dataset from RAW_DATA_PATH via the project's load_data helper.
df = load_data(RAW_DATA_PATH)
# Print the starting shape so it can be compared with the shapes printed by later cells.
print(f"\nOriginal shape: {df.shape}")
# Trailing expression: the notebook renders the preview returned by head().
df.head()

## 2.2 Handle DateTime

In [None]:
# Convert datetime and sort: delegate to the project's convert_datetime helper for DATE_COLUMN.
# NOTE(review): the helper's body is not visible here; the sorting is presumed from the original note — confirm.
df = convert_datetime(df, DATE_COLUMN)
# Trailing expression: preview of the frame after conversion.
df.head()

## 2.3 Handle Duplicates

In [None]:
# Check duplicates before: count the rows whose DATE_COLUMN value is flagged by duplicated().
print(f"Duplicate timestamps before: {df[DATE_COLUMN].duplicated().sum()}")

# Remove duplicates on DATE_COLUMN through the project helper.
# keep='first' presumably retains the earliest row of each duplicate group — verify against handle_duplicates.
df = handle_duplicates(df, DATE_COLUMN, keep='first')

# Report the shape after the helper ran, for comparison with the original shape printed earlier.
print(f"Shape after removing duplicates: {df.shape}")

## 2.4 Handle Missing Values

In [None]:
# Ask the project helper for a summary of missing data in the current frame.
# A zero-length summary is reported as "nothing missing"; otherwise the summary itself is printed.
missing_df = check_missing_values(df)

if len(missing_df) == 0:
    print("No missing values found!")
else:
    print("Missing values found:")
    print(missing_df)

In [None]:
# Handle missing values if any; the call runs unconditionally, whatever the previous cell reported.
# The helper is asked to use 'interpolate' for numerical columns and 'ffill' for categorical ones.
# NOTE(review): how each strategy treats leading/trailing gaps depends on handle_missing_values — confirm.
df = handle_missing_values(df, numerical_strategy='interpolate', categorical_strategy='ffill')

## 2.5 Handle Outliers

In [None]:
# Two box plots of the target column on one figure: left panel before outlier handling, right panel after.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_before, ax_after = axes

# Left panel is drawn first, while df still holds the untreated values.
ax_before.boxplot(df[TARGET_COLUMN])
ax_before.set(title='Traffic Volume - Before Outlier Handling', ylabel='Traffic Volume')

# Run the project's IQR helper (factor 1.5, method 'clip') between the two plots,
# so the right panel reflects the frame that every later cell works with.
df = handle_outliers_iqr(df, TARGET_COLUMN, factor=1.5, method='clip')

ax_after.boxplot(df[TARGET_COLUMN])
ax_after.set(title='Traffic Volume - After Outlier Handling', ylabel='Traffic Volume')

fig.tight_layout()
plt.show()

## 2.6 Check Time Continuity

In [None]:
# Check for missing timestamps: the helper returns the frame and the number of missing timestamps.
# NOTE(review): if `freq` is forwarded to pandas, the 'H' alias is deprecated since pandas 2.2 in favour
# of 'h', and the global warning filter set at the top would hide that warning — confirm against the helper.
df, n_missing = check_time_continuity(df, DATE_COLUMN, freq='H')

# Only a message is printed here; the resampling it announces is done by the next cell.
if n_missing > 0:
    print(f"\nFound {n_missing} missing hourly timestamps. Resampling...")

## 2.7 Resample to Hourly (if needed)

In [None]:
# Resample to ensure hourly continuity; this runs unconditionally, whatever n_missing was.
# NOTE(review): whether resample_hourly fills the rows it introduces is not visible here —
# check the "Missing values" figure printed by the validation cell.
df = resample_hourly(df, DATE_COLUMN, TARGET_COLUMN)

# Shape of the frame that the remaining cells validate and save.
print(f"\nFinal shape: {df.shape}")

## 2.8 Data Validation

In [None]:
# Final validation report: shape, total missing cells, duplicate timestamps, covered date range
# and descriptive statistics of the target column, framed by 50-character rules.
# The report is informational only; nothing here stops the notebook if a figure looks wrong.
rule = "=" * 50

report_lines = [
    rule,
    "DATA VALIDATION",
    rule,
    f"Shape: {df.shape}",
    f"Missing values: {df.isnull().sum().sum()}",
    f"Duplicate timestamps: {df[DATE_COLUMN].duplicated().sum()}",
    f"Date range: {df[DATE_COLUMN].min()} to {df[DATE_COLUMN].max()}",
    f"\nTarget column ({TARGET_COLUMN}) statistics:",
    df[TARGET_COLUMN].describe(),
    rule,
]

# One print call per entry, matching the original line-by-line output.
for line in report_lines:
    print(line)

In [None]:
# View cleaned data: the trailing expression renders the result of head(10) as the cell output.
df.head(10)

## 2.9 Save Cleaned Data

In [None]:
# Save to CSV: write the cleaned frame to CLEANED_DATA_PATH through the project's save_csv helper.
# index=False is passed through to the helper — presumably so the index is not written as a column; confirm.
save_csv(df, CLEANED_DATA_PATH, index=False)

# Echo the destination so the output location is visible in the notebook.
print(f"\nCleaned data saved to: {CLEANED_DATA_PATH}")

## Summary

**Preprocessing steps completed:**
1. ✅ Converted datetime column
2. ✅ Sorted by timestamp
3. ✅ Removed duplicate timestamps
4. ✅ Handled missing values
5. ✅ Handled outliers using IQR method
6. ✅ Checked time continuity and resampled to ensure hourly continuity
7. ✅ Saved cleaned data

**Next step:** Feature Engineering (03_Feature_Engineering.ipynb)