In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [9]:
# Load dataset
merged_df = pd.read_csv("../data/merged_dataset.csv")



  merged_df = pd.read_csv("../data/merged_dataset.csv")


In [54]:
# CPCB AQI Calculation using all major pollutants
def calculate_aqi(row):
    breakpoints = {
        'pm2_5': [(0, 30, 0, 50), (31, 60, 51, 100), (61, 90, 101, 200), (91, 120, 201, 300), (121, 250, 301, 400)],
        'pm10': [(0, 50, 0, 50), (51, 100, 51, 100), (101, 250, 101, 200), (251, 350, 201, 300), (351, 430, 301, 400)],
        'no2': [(0, 40, 0, 50), (41, 80, 51, 100), (81, 180, 101, 200), (181, 280, 201, 300), (281, 400, 301, 400)],
        'so2': [(0, 40, 0, 50), (41, 80, 51, 100), (81, 380, 101, 200), (381, 800, 201, 300), (801, 1600, 301, 400)],
        'co': [(0, 1, 0, 50), (1.1, 2, 51, 100), (2.1, 10, 101, 200), (10.1, 17, 201, 300), (17.1, 34, 301, 400)],
        'o3': [(0, 50, 0, 50), (51, 100, 51, 100), (101, 168, 101, 200), (169, 208, 201, 300), (209, 748, 301, 400)]
    }
    
    sub_indices = []
    for pollutant, bps in breakpoints.items():
        if pollutant in row:
            value = row[pollutant]
            for c_low, c_high, i_low, i_high in bps:
                if c_low <= value <= c_high:
                    sub_index = ((i_high - i_low) / (c_high - c_low)) * (value - c_low) + i_low
                    sub_indices.append(sub_index)
                    break
    
    return max(sub_indices) if sub_indices else np.nan
# Apply AQI calculation to dataset
merged_df['AQI'] = merged_df.apply(calculate_aqi, axis=1)

In [55]:
# Feature Engineering
merged_df['rainfall'] = merged_df[[
    'JAN','FEB','MAR','APR','MAY','JUN','JUL','AUG','SEP','OCT','NOV','DEC']].sum(axis=1)
merged_df['rainfall_lag1'] = merged_df.groupby('location')['rainfall'].shift(1)
merged_df['monsoon_rain'] = merged_df[['JUN', 'JUL', 'AUG', 'SEP']].sum(axis=1)
merged_df['is_dry'] = (merged_df['JAN'] < 10).astype(int)

In [56]:
# Define features (X) and target (y)
features = ['year', 'month', 'rainfall', 'monsoon_rain', 'Latitude', 'Longitude', 
            'pm2_5', 'so2', 'no2', 'rspm', 'spm']  # Including all pollutants
X = merged_df[features]
y = merged_df['AQI']


In [57]:
# Handle missing values
imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(X)
y = y.fillna(y.median())


In [58]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardization
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [59]:
# Model Training
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [60]:
# Predictions
predictions = model.predict(X_test)

# Evaluation
r2 = r2_score(y_test, predictions)
rmse = mean_squared_error(y_test, predictions, squared=False)
print(f"R²: {r2}")
print(f"RMSE: {rmse}")

R²: 0.9983231225610074
RMSE: 1.0060836131425293




In [61]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (348593, 11)
y_train shape: (348593,)
X_test shape: (87149, 11)
y_test shape: (87149,)


In [62]:
# Feature Importance
feature_importances = pd.DataFrame({'Feature': features, 'Importance': model.feature_importances_})
print(feature_importances.sort_values(by='Importance', ascending=False))


         Feature  Importance
8            no2    0.778829
6          pm2_5    0.175652
7            so2    0.043465
9           rspm    0.000690
0           year    0.000625
10           spm    0.000314
1          month    0.000242
2       rainfall    0.000059
3   monsoon_rain    0.000052
5      Longitude    0.000038
4       Latitude    0.000034
