In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_absolute_error

# Load dataset
data = pd.read_csv('city_day.csv')

# Fill missing values in numeric columns only
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

# Convert Date column and extract features
data['Date'] = pd.to_datetime(data['Date'])
data['Month'] = data['Date'].dt.month
data['DayOfWeek'] = data['Date'].dt.dayofweek
data.drop('Date', axis=1, inplace=True)

# Scale pollutant data
scaler = StandardScaler()
pollutant_columns = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene']
data[pollutant_columns] = scaler.fit_transform(data[pollutant_columns])

# Split data into X and y
X = data.drop(['AQI', 'AQI_Bucket', 'City'], axis=1)
y = data['AQI']

# Custom scorer for MAE
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Initialize models
lr_model = LinearRegression()
rf_model = RandomForestRegressor(random_state=42)

# Cross-validate Linear Regression
lr_scores = cross_val_score(lr_model, X, y, cv=5, scoring=mae_scorer)
print(f"Linear Regression - Mean Absolute Error (5-Fold CV): {-lr_scores.mean():.2f}")

# Cross-validate Random Forest
rf_scores = cross_val_score(rf_model, X, y, cv=5, scoring=mae_scorer)
print(f"Random Forest - Mean Absolute Error (5-Fold CV): {-rf_scores.mean():.2f}")

Linear Regression - Mean Absolute Error (5-Fold CV): 37.04
Random Forest - Mean Absolute Error (5-Fold CV): 32.43
