In [2]:
# Importing Libraries
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Setting the plotting style: use seaborn's "whitegrid" style.
sns.set(style="whitegrid")

# Loading the Data
# Everything below needs `df`, so a missing file must stop the cell here.
# Swallowing the error would only resurface as a confusing NameError on the
# next statement, or silently reuse a stale `df` left over in the kernel.
try:
    df = pd.read_csv('../data/raw/city_day.csv')
except FileNotFoundError:
    print("Error: city_day.csv not found. Please run the Kaggle download command.")
    raise
else:
    print("Data loaded successfully.")

# Initial data inspection: column schema, a sample of rows, and the number of
# missing values per column.
print("--- Initial Data Info ---")
df.info()  # writes its report to stdout itself and returns None

# Each print emits the heading and the summary on separate lines.
print("\n--- First 5 Rows ---", df.head(5), sep="\n")
print("\n--- Missing Values Count ---", df.isna().sum(), sep="\n")

# Data cleaning and preprocessing
# Convert the 'Date' column to datetime values so it can serve as a time index.
df['Date'] = pd.to_datetime(df['Date'])

# Narrow the study to a single, high-impact city: Delhi. The .copy() detaches
# the subset from `df`, so later edits touch only the Delhi frame.
delhi_df = df.loc[df['City'].eq('Delhi')].copy()
print(f"\nFiltered for Delhi. Shape of new DataFrame: {delhi_df.shape}")

# Index the Delhi rows by date (the shape above is reported before this step).
delhi_df = delhi_df.set_index('Date')

# Address missing values
# Forward/backward filling copies a neighbouring row's reading into a gap, so
# the rows must be in chronological order first; nothing above guarantees the
# row order of the CSV. A stable sort keeps rows sharing a date in their
# original relative order.
delhi_df = delhi_df.sort_index(kind='mergesort')

pollutants = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene']
# Carry the last observed pollutant reading forward into later gaps.
delhi_df[pollutants] = delhi_df[pollutants].ffill()
# Back-fill whatever is still missing in any column: leading gaps with no
# earlier value, plus the remaining columns, which are not forward-filled above.
delhi_df = delhi_df.bfill()

print("\n--- Missing Values After Cleaning ---")
# A column with no observations at all cannot be filled and will still show a
# non-zero count here.
print(delhi_df.isnull().sum())

# Saving the Processed Data
processed_path = '../data/processed/delhi_aqi_cleaned.csv'
# to_csv does not create missing folders; make sure the target directory
# exists so a fresh checkout does not fail with an OSError at the last step.
Path(processed_path).parent.mkdir(parents=True, exist_ok=True)
delhi_df.to_csv(processed_path)
print(f"\nCleaned Delhi data saved to {processed_path}")

Data loaded successfully.
--- Initial Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29531 entries, 0 to 29530
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        29531 non-null  object 
 1   Date        29531 non-null  object 
 2   PM2.5       24933 non-null  float64
 3   PM10        18391 non-null  float64
 4   NO          25949 non-null  float64
 5   NO2         25946 non-null  float64
 6   NOx         25346 non-null  float64
 7   NH3         19203 non-null  float64
 8   CO          27472 non-null  float64
 9   SO2         25677 non-null  float64
 10  O3          25509 non-null  float64
 11  Benzene     23908 non-null  float64
 12  Toluene     21490 non-null  float64
 13  Xylene      11422 non-null  float64
 14  AQI         24850 non-null  float64
 15  AQI_Bucket  24850 non-null  object 
dtypes: float64(13), object(3)
memory usage: 3.6+ MB

--- First 5 Rows ---
        City        Date  