# In [None]:
# Week 1 - Dataset Preparation

# ## Introduction
# This notebook prepares air pollution data for PM2.5 prediction.
# Raw data is stored in `data/your_dataset.csv`.
# Cleaned data will be saved as `data/clean_pollution_data.csv` for use in Week 2.

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ### Load Data
# Location of the raw CSV; the whole pipeline below is skipped when it is absent.
raw_path = 'data/your_dataset.csv'

if not os.path.exists(raw_path):
    # Nothing below can run without the raw file, so report and stop here.
    print("Raw dataset not found. Please place 'your_dataset.csv' inside the 'data/' folder.")
else:
    df = pd.read_csv(raw_path)

    # ### Quick View
    print(df.head())
    # DataFrame.info() writes its summary to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line.
    df.info()

    # ### Data Cleaning
    # Fill gaps in every numeric column with that column's mean.
    # NOTE(review): if 'pm2.5' is numeric, the target is mean-imputed too, and
    # the means come from the full dataset before the train/test split —
    # confirm this is acceptable for the Week 2 modelling step.
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

    # ### Feature Engineering
    # 'hour' and 'month' are used directly as features below; here the
    # separate date-part columns are assembled into one timestamp column
    # for future use.
    df['datetime'] = pd.to_datetime(df[['year', 'month', 'day', 'hour']])

    # ### EDA
    plt.figure(figsize=(8, 4))
    sns.histplot(df['pm2.5'])
    plt.title('PM2.5 Distribution')
    plt.show()

    # Open a fresh figure explicitly: under a non-interactive backend
    # plt.show() does not close the previous figure, and the heatmap would
    # otherwise be drawn on top of the histogram axes.
    plt.figure()
    # numeric_cols was captured before 'datetime' was added, so only the
    # original numeric columns enter the correlation matrix.
    sns.heatmap(df[numeric_cols].corr(), annot=True)
    plt.title('Feature Correlation')
    plt.show()

    # ### Train/Test Split
    from sklearn.model_selection import train_test_split

    # Features and target for the PM2.5 prediction task.
    X = df[['TEMP', 'hour', 'month']]
    y = df['pm2.5']

    # 80/20 random split; the fixed seed keeps the partition reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print("Train and test sets created.")

    # ### Save Cleaned Data
    # The full cleaned frame (including the new 'datetime' column) is saved,
    # not the individual train/test partitions.
    clean_path = 'data/clean_pollution_data.csv'
    df.to_csv(clean_path, index=False)
    print(f"Clean dataset saved as {clean_path}.")