In [None]:
# Install required packages
!pip -q install plotly scikit-learn pandas numpy

import pandas as pd

CSV_URL = "https://raw.githubusercontent.com/SurajChouhan14/pm25-forecasting-india-model-vs-baseline/main/air-quality-india.csv"

try:
    df = pd.read_csv(CSV_URL)
    print("Loaded from GitHub raw:", CSV_URL, "| Rows:", len(df))
except Exception as e:
    print("Failed to load from GitHub raw. Error:", e)
    print("Fallback: upload the CSV or mount Drive as described below.")


# Core imports
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
import plotly.express as px
import plotly.graph_objects as go

# Version check (optional)
print("pandas:", pd.__version__)
print("numpy:", np.__version__)


Loaded from GitHub raw: https://raw.githubusercontent.com/SurajChouhan14/pm25-forecasting-india-model-vs-baseline/main/air-quality-india.csv | Rows: 36192
pandas: 2.2.2
numpy: 2.0.2


In [None]:


# Parse timestamps
df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')

# Keep valid timestamps and sort
df = df.dropna(subset=['Timestamp']).sort_values('Timestamp').reset_index(drop=True)

# Fill PM2.5 gaps (forward then backward)
df['PM2.5'] = df['PM2.5'].ffill().bfill()

# Safety: drop remaining PM2.5 nulls
df = df.dropna(subset=['PM2.5']).reset_index(drop=True)

print(df.head())
print(df.info())


            Timestamp  Year  Month  Day  Hour   PM2.5
0 2017-11-07 12:00:00  2017     11    7    12   64.51
1 2017-11-07 13:00:00  2017     11    7    13   69.95
2 2017-11-07 14:00:00  2017     11    7    14   92.79
3 2017-11-07 15:00:00  2017     11    7    15  109.66
4 2017-11-07 16:00:00  2017     11    7    16  116.50
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36192 entries, 0 to 36191
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Timestamp  36192 non-null  datetime64[ns]
 1   Year       36192 non-null  int64         
 2   Month      36192 non-null  int64         
 3   Day        36192 non-null  int64         
 4   Hour       36192 non-null  int64         
 5   PM2.5      36192 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(4)
memory usage: 1.7 MB
None


In [None]:
# Time features
df['Year'] = df['Timestamp'].dt.year
df['Month'] = df['Timestamp'].dt.month
df['Day'] = df['Timestamp'].dt.day
df['Hour'] = df['Timestamp'].dt.hour
df['Day_of_year'] = df['Timestamp'].dt.dayofyear

# Ensure order before shifts
df = df.sort_values('Timestamp').reset_index(drop=True)

# Lag features (1h, 24h)
df['PM2.5_lag1'] = df['PM2.5'].shift(1)
df['PM2.5_lag24'] = df['PM2.5'].shift(24)

# Drop rows without lags
df = df.dropna(subset=['PM2.5_lag1', 'PM2.5_lag24']).reset_index(drop=True)

df.head()


Unnamed: 0,Timestamp,Year,Month,Day,Hour,PM2.5,Day_of_year,PM2.5_lag1,PM2.5_lag24
0,2017-11-08 12:00:00,2017,11,8,12,88.05,312,84.32,64.51
1,2017-11-08 14:00:00,2017,11,8,14,116.13,312,88.05,69.95
2,2017-11-08 15:00:00,2017,11,8,15,127.02,312,116.13,92.79
3,2017-11-08 16:00:00,2017,11,8,16,133.68,312,127.02,109.66
4,2017-11-08 17:00:00,2017,11,8,17,139.93,312,133.68,116.5


In [None]:
# Predictors and target
features = ['Year', 'Month', 'Day', 'Hour', 'Day_of_year', 'PM2.5_lag1', 'PM2.5_lag24']
target = 'PM2.5'

# 80/20 time split
split_idx = int(0.8 * len(df))

# No shuffle (avoid leakage)
train, test = df.iloc[:split_idx].copy(), df.iloc[split_idx:].copy()
X_train, y_train = train[features], train[target]
X_test, y_test = test[features], test[target]

len(train), len(test)


(28934, 7234)

In [None]:
# Fit model
model = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Predict test set
y_pred = model.predict(X_test)

# Metrics
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(((y_test - y_pred) ** 2).mean())
print(f'R2: {r2:.3f} | MAE: {mae:.2f} | RMSE: {rmse:.2f}')

# Store metrics
metrics = {'r2': float(r2), 'mae': float(mae), 'rmse': float(rmse)}
metrics


R2: 0.967 | MAE: 2.22 | RMSE: 3.58


{'r2': 0.9670486782528738,
 'mae': 2.224751585107362,
 'rmse': 3.5784108596077915}

In [None]:
# Persistence baseline (t-24h)
baseline = test['PM2.5_lag24'].values

# Baseline metrics
r2_b = r2_score(y_test, baseline)
mae_b = mean_absolute_error(y_test, baseline)
rmse_b = np.sqrt(((y_test.values - baseline) ** 2).mean())

# Side-by-side
print(f'Naive-24h | R2: {r2_b:.3f} | MAE: {mae_b:.2f} | RMSE: {rmse_b:.2f}')
print(f'Model     | R2: {metrics["r2"]:.3f} | MAE: {metrics["mae"]:.2f} | RMSE: {metrics["rmse"]:.2f}')


Naive-24h | R2: 0.857 | MAE: 4.79 | RMSE: 7.46
Model     | R2: 0.967 | MAE: 2.22 | RMSE: 3.58


In [None]:
# Evaluation table
eval_df = test[['Timestamp']].copy()
eval_df['Actual'] = y_test.values
eval_df['Predicted'] = y_pred
eval_df['Error'] = eval_df['Predicted'] - eval_df['Actual']
eval_df['AbsError'] = eval_df['Error'].abs()

eval_df.head()


Unnamed: 0,Timestamp,Actual,Predicted,Error,AbsError
28934,2021-07-03 04:00:00,17.15,22.1144,4.9644,4.9644
28935,2021-07-03 05:00:00,19.54,17.2598,-2.2802,2.2802
28936,2021-07-03 06:00:00,18.69,21.261067,2.571067,2.571067
28937,2021-07-03 07:00:00,18.76,19.5959,0.8359,0.8359
28938,2021-07-03 08:00:00,19.1,19.808467,0.708467,0.708467


In [None]:
# Histogram of PM2.5
fig = px.histogram(df, x='PM2.5', nbins=60, title='PM2.5 Distribution')
fig.update_layout(xaxis_title='PM2.5', yaxis_title='Count')
fig.show()


In [None]:
# Daily mean trend
df_daily = df.set_index('Timestamp').resample('D')['PM2.5'].mean().reset_index()
fig = px.line(df_daily, x='Timestamp', y='PM2.5', title='Daily Mean PM2.5 Over Time')
fig.update_layout(xaxis_title='Date', yaxis_title='PM2.5')
fig.show()


In [None]:
# Overlay actual vs predicted
plot_df = test[['Timestamp']].copy()
plot_df['Actual'] = y_test.values
plot_df['Predicted'] = y_pred

fig = go.Figure()
fig.add_scatter(x=plot_df['Timestamp'], y=plot_df['Actual'], mode='lines', name='Actual')
fig.add_scatter(x=plot_df['Timestamp'], y=plot_df['Predicted'], mode='lines', name='Predicted')
fig.update_layout(title='Actual vs Predicted PM2.5 (Test)',
                  xaxis_title='Time', yaxis_title='PM2.5')
fig.show()


In [None]:
# Mark top 10% PM2.5
thr = eval_df['Actual'].quantile(0.9)
episode = (eval_df['Actual'] >= thr)

fig = go.Figure()
fig.add_scatter(x=eval_df['Timestamp'], y=eval_df['Actual'], mode='lines', name='Actual', line=dict(color='#1f77b4'))
fig.add_scatter(x=eval_df['Timestamp'], y=eval_df['Predicted'], mode='lines', name='Predicted', line=dict(color='#ff7f0e'))
fig.add_scatter(x=eval_df.loc[episode, 'Timestamp'], y=eval_df.loc[episode, 'Actual'],
                mode='markers', name='High Episode', marker=dict(color='crimson', size=6, opacity=0.6))
fig.update_layout(title='PM2.5: Actual vs Predicted with High Episodes',
                  xaxis_title='Time', yaxis_title='PM2.5',
                  xaxis=dict(rangeslider=dict(visible=True)))
fig.show()


In [None]:
# Residuals and 24h rolling abs error
e = eval_df.copy()
e['Residual'] = e['Predicted'] - e['Actual']
e['AbsErr_roll24'] = e['Error'].abs().rolling(24, min_periods=1).mean()

fig = go.Figure()
fig.add_scatter(x=e['Timestamp'], y=e['Residual'], mode='lines', name='Residual', line=dict(color='#2ca02c'))
fig.add_scatter(x=e['Timestamp'], y=e['AbsErr_roll24'], mode='lines', name='Abs Error (24h mean)', line=dict(color='#9467bd'))
fig.add_hline(y=0, line=dict(color='gray', dash='dash'))
fig.update_layout(title='Residuals Over Time (+ 24h Rolling Abs Error)',
                  xaxis_title='Time', yaxis_title='Residual / Abs Error')
fig.show()


In [None]:
# Median absolute error heatmap (Hour x Month)
h = test[['Month', 'Hour']].copy()
h['AbsError'] = eval_df['AbsError'].values
pivot = h.pivot_table(index='Hour', columns='Month', values='AbsError', aggfunc='median')

fig = px.imshow(pivot, aspect='auto', color_continuous_scale='YlOrRd',
                labels=dict(color='Median Abs Error'),
                title='Median Absolute Error by Hour x Month')
fig.update_xaxes(title='Month')
fig.update_yaxes(title='Hour')
fig.show()


In [None]:
# Larger PM2.5 vs error magnitude
fig = px.scatter(eval_df, x='Actual', y='AbsError', opacity=0.4, trendline='ols',
                 title='Absolute Error vs Actual PM2.5')
fig.update_layout(xaxis_title='Actual PM2.5', yaxis_title='Absolute Error')
fig.show()


In [None]:
# Compare distributions
stack = pd.DataFrame({
    'value': pd.concat([eval_df['Actual'], pd.Series(eval_df['Predicted'], name='Predicted')]),
    'type': ['Actual'] * len(eval_df) + ['Predicted'] * len(eval_df)
})

fig = px.histogram(stack, x='value', color='type', barmode='overlay', nbins=60,
                   color_discrete_map={'Actual': '#1f77b4', 'Predicted': '#ff7f0e'},
                   title='Distribution: Actual vs Predicted')
fig.update_traces(opacity=0.55)
fig.update_layout(xaxis_title='PM2.5', yaxis_title='Count')
fig.show()


In [None]:
# Daily mean calendar (test only)
d = test[['Timestamp']].copy()
d['PM_day'] = eval_df['Actual'].values
d = d.set_index('Timestamp').resample('D')['PM_day'].mean().reset_index()

# Calendar fields
d['Year'] = d['Timestamp'].dt.year
d['Month'] = d['Timestamp'].dt.month
d['Day'] = d['Timestamp'].dt.day

# Day x Month pivot
pivot = d.pivot_table(index='Day', columns='Month', values='PM_day', aggfunc='mean')

fig = px.imshow(pivot, aspect='auto', color_continuous_scale='Blues',
                labels=dict(color='Daily Mean PM2.5'),
                title='Calendar View: Daily Mean PM2.5 (by Day x Month)')
fig.update_xaxes(title='Month')
fig.update_yaxes(title='Day of Month')
fig.show()


**Conclusion**



*   Developed a time-ordered PM2.5
forecasting pipeline incorporating a 24-hour persistence baseline and a RandomForest model.

*   Evaluated performance on a held-out test window using consistent metrics (R2, MAE, RMSE) to enable a fair baseline comparison.

*  Interpreted results with the principle that any added model complexity must outperform the persistence baseline to be justified; when this condition is met, the gains are practically valuable for short-term decision-making (e.g., morning planning and advisories).

