In [None]:
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go

In [None]:
# Read the raw dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='Data.csv')
df.head(5)

In [None]:
# Work on a copy so the raw frame `df` stays untouched, and derive calendar
# features from the parsed timestamp in one chained step. Each lambda sees the
# columns assigned before it, so `timestamp` is already datetime when the
# `.dt` accessors run.
eda_df = df.copy().assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp']),
    year=lambda frame: frame['timestamp'].dt.year,
    hour=lambda frame: frame['timestamp'].dt.hour,
    day=lambda frame: frame['timestamp'].dt.date,              # calendar date, not day-of-month
    month=lambda frame: frame['timestamp'].dt.strftime('%B'),  # full month name
    weekday=lambda frame: frame['timestamp'].dt.day_name(),    # full weekday name
)


In [None]:
# Preview the frame including the derived calendar columns.
eda_df.head(5)

In [None]:
# Keep only the four comparison cities, then average temperature per city per calendar day.
df_filtered = eda_df.loc[eda_df['city'].isin(['Delhi', 'Mumbai', 'Srinagar', 'Pune'])]
daily_temp = (
    df_filtered
    .groupby(['day', 'city'])['temperature_2m']
    .mean()
    .reset_index()
)

# One line per city.
fig = px.line(
    daily_temp,
    x='day',
    y='temperature_2m',
    color='city',
    title='Daily Average Temperature by City',
)
fig.show()

# Export a static copy of the figure next to the notebook.
fig.write_image("Average_temp_city_by_day.png", width=2000, height=600, scale=2)

In [None]:
# Mean AQI for each hour of the day, pooled over every row of eda_df.
aqi_hourly = eda_df.groupby('hour')['aqi'].mean().reset_index()

fig = go.Figure()

# Single line-with-markers trace: hour on x, mean AQI on y.
fig.add_trace(
    go.Scatter(
        x=aqi_hourly['hour'],
        y=aqi_hourly['aqi'],
        mode='lines+markers',
        name='Average AQI',
        line=dict(color='royalblue', width=3),
        marker=dict(size=8, color='darkblue', line=dict(width=1, color='black')),
    )
)

fig.update_layout(
    title='Average AQI by Hour of Day',
    xaxis=dict(
        title='Hour of Day',
        tickmode='linear',  # evenly spaced ticks ...
        tick0=0,            # ... starting at hour 0 ...
        dtick=1,            # ... one per hour
    ),
    # The y-axis is deliberately left on autorange. The previous settings
    # combined rangemode='tozero' with a hard-coded range=[100, 120]: the
    # explicit range wins, so 'tozero' never applied, and any hourly mean
    # outside 100-120 was silently clipped from view. Autorange keeps the
    # zoomed-in view while always fitting the data.
    yaxis=dict(title='Average AQI'),
    plot_bgcolor='white',
    font=dict(size=14),
    showlegend=False,         # only one trace, a legend adds nothing
    template='plotly_white',
    height=600,
    width=1000,
)

fig.show()

# Export a static copy of the figure next to the notebook.
fig.write_image("Average_AQI_by_hour.png", width=1800, height=800, scale=2)


In [None]:
# Most recent row per city: order chronologically, then keep each city's last row.
latest = (
    eda_df
    .sort_values(by='timestamp')
    .drop_duplicates(subset='city', keep='last')
)

# Bubble map: marker colour and size both encode the latest AQI reading.
fig = px.scatter_mapbox(
    latest,
    lat='latitude',
    lon='longitude',
    color='aqi',
    size='aqi',
    size_max=20,
    hover_name='city',
    zoom=3,
    mapbox_style="carto-positron",
    title='Latest AQI by City',
)
fig.update_layout(height=700, width=1000)
fig.show()

In [None]:
# Columns included in the correlation matrix (weather plus air-quality measures).
weather_cols = [
    'temperature_2m',
    'wind_speed_10m',
    'cloud_cover',
    'precipitation',
    'relative_humidity_2m',
    'dew_point_2m',
    'pm2_5',
    'pm_10',
    'aqi',
]

# Annotated heatmap of the pairwise correlations, drawn on a new 10x8 figure.
plt.figure(figsize=(10, 8))
sns.heatmap(
    eda_df[weather_cols].corr(),
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
)
plt.title("Correlation Matrix of Weather and Air Quality Variables")
plt.show()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Restrict to readings from 2024 onwards.
filtered_df = eda_df[eda_df['timestamp'] >= '2024-01-01']

# Shared city order for every panel: highest median AQI first.
city_order = filtered_df.groupby('city')['aqi'].median().sort_values(ascending=False).index.tolist()

# One entry per panel: (column, trace name, y-axis label, box colour).
# Each panel carries its own y-axis label; previously only the first panel was
# labelled, with the ambiguous text "µg/m³ or Index".
panels = [
    ('pm2_5', 'PM2.5', 'PM2.5 (µg/m³)', 'indianred'),
    ('pm_10', 'PM10', 'PM10 (µg/m³)', 'royalblue'),
    ('aqi', 'AQI', 'AQI', 'seagreen'),
]

fig = make_subplots(
    rows=1, cols=len(panels),
    subplot_titles=[f"{trace_name} by City" for _, trace_name, _, _ in panels],
    shared_yaxes=False,  # the three metrics are on different scales
)

for col_idx, (column, trace_name, y_label, colour) in enumerate(panels, start=1):
    fig.add_trace(
        go.Box(
            x=filtered_df['city'],
            y=filtered_df[column],
            name=trace_name,
            marker_color=colour,
            boxpoints='outliers',
            showlegend=False,
        ),
        row=1, col=col_idx
    )
    # Same city ordering on every panel so the boxes line up across metrics.
    fig.update_xaxes(categoryorder='array', categoryarray=city_order, tickangle=-45, row=1, col=col_idx)
    fig.update_yaxes(title_text=y_label, row=1, col=col_idx)

fig.update_layout(
    title_text='Air Quality Metrics Distribution by City',
    height=600,
    width=2200,
    template='plotly_white',
    margin=dict(t=80, b=100),  # extra bottom margin for the rotated city labels
    font=dict(size=13)
)

fig.show()

# Export a static copy of the figure next to the notebook.
fig.write_image("boxplots_air_quality.png", width=1800, height=800, scale=2)

In [None]:
# Mean AQI for each weekday name.
avg_aqi_per_weekday = eda_df.groupby('weekday')['aqi'].mean().reset_index()

# groupby returns the weekday names alphabetically. Converting to an ordered
# categorical does not move any rows by itself, so sort afterwards to put the
# frame in calendar order (Monday first).
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
avg_aqi_per_weekday['weekday'] = pd.Categorical(avg_aqi_per_weekday['weekday'], categories=weekday_order, ordered=True)
avg_aqi_per_weekday = avg_aqi_per_weekday.sort_values('weekday').reset_index(drop=True)

fig = px.bar(
    avg_aqi_per_weekday,
    x='weekday',
    y='aqi',
    title='Average AQI per Weekday',
    # Keys must match the column names exactly; the previous 'AQI' key never
    # matched the 'aqi' column and was ignored.
    labels={'weekday': 'Weekday', 'aqi': 'Average AQI'},
    color='weekday',
    # Pin bar and legend order explicitly instead of relying on row order.
    category_orders={'weekday': weekday_order},
    color_discrete_sequence=px.colors.qualitative.Set1
)

fig.update_layout(
    height=600,
    width=800,
    xaxis_title='Weekday',
    yaxis_title='Average AQI',
    xaxis_tickangle=-45,
    font=dict(size=14)
)

fig.show()

In [None]:
# Mean AQI per (year, month), so multi-year data keeps one value per year.
avg_aqi_per_month = eda_df.groupby(['year', 'month'])['aqi'].mean().reset_index()

# Put the rows in calendar order via an ordered categorical on the month name.
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August',
               'September', 'October', 'November', 'December']
avg_aqi_per_month['month'] = pd.Categorical(avg_aqi_per_month['month'], categories=month_order, ordered=True)
avg_aqi_per_month = avg_aqi_per_month.sort_values(['year', 'month']).reset_index(drop=True)

# Year as text so it is treated as a discrete category (one colour per year)
# rather than a continuous colour scale.
avg_aqi_per_month['year'] = avg_aqi_per_month['year'].astype(str)

# There is one row per (year, month). Colouring by month stacked the rows that
# share a month, so each bar showed the SUM of the yearly averages. Colouring
# by year with grouped bars shows every yearly average at its true height.
fig = px.bar(
    avg_aqi_per_month,
    x='month',
    y='aqi',
    color='year',
    barmode='group',
    title='Average AQI per Month',
    # Keys must match the column names exactly ('aqi', not 'AQI').
    labels={'month': 'Month', 'aqi': 'Average AQI', 'year': 'Year'},
    category_orders={'month': month_order},
    color_discrete_sequence=px.colors.qualitative.Set2
)

fig.update_layout(
    height=600,
    width=1000,
    yaxis_title='Average AQI',
    font=dict(size=14),
    # Keep the axis in calendar order regardless of which months appear first.
    xaxis=dict(title='Month', tickangle=-45, categoryorder='array', categoryarray=month_order),
)

fig.show()

# Export a static copy of the figure next to the notebook.
fig.write_image("Average_AQI_by_Month.png", width=1800, height=800, scale=2)

In [None]:
# Pairwise scatter/histogram grid on a random subset to keep plotting fast.
# The sample size is capped at the number of available rows (sample(1000)
# raises ValueError on a smaller frame) and seeded so the figure is the same
# on every run.
sample_df = eda_df[['pm2_5', 'pm_10', 'aqi', 'temperature_2m']].sample(
    n=min(1000, len(eda_df)),
    random_state=42,
)
sns.pairplot(sample_df)
plt.suptitle('Pairwise Relationships', y=1.02)  # y > 1 lifts the title above the grid
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import numpy as np

In [None]:
# Preview the raw frame (without the EDA-only columns) before building features.
df.head(5)

In [None]:
# Features: every column except the target, the timestamp, the city name and
# the coordinates. Target: the AQI column.
X = df.drop(['aqi', 'timestamp', 'city', 'latitude', 'longitude'], axis=1)
y = df.loc[:, 'aqi']

In [None]:
# Preview the feature matrix.
X.head(5)

In [None]:
# Preview the target values.
y.head(5)

In [None]:
# Standardise the features. The scaler's statistics are learned from every row of X.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [None]:
# 80 / 10 / 10 train / validation / test split.
#
# The split is made on the unscaled features, and the scaler is then (re)fitted
# on the training rows only. Fitting it on all rows before splitting lets
# validation/test statistics leak into preprocessing. The seeds and sizes are
# unchanged, so the same rows land in each split as before.
# NOTE(review): rows are shuffled at random; if they are time-ordered readings,
# a chronological split may give a more honest error estimate - confirm.
X_train_raw, X_data_raw, y_train, y_data = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)
X_val_raw, X_test_raw, y_val, y_test = train_test_split(X_data_raw, y_data, test_size=0.5, random_state=42, shuffle=True)

scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_data = scaler.transform(X_data_raw)  # combined hold-out (validation + test), kept for compatibility
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)

## Model

### XGBoost

XGBoost is an efficient, scalable gradient-boosting algorithm that performs well on large datasets. It is particularly well suited to regression tasks and handles both sparse and dense data.

#### Advantages:
- Extremely efficient and works well with large datasets.
- Provides built-in regularization to avoid overfitting.
- Often produces state-of-the-art results for regression tasks.

#### Disadvantages:
- It is more computationally intensive than simpler models.
- Hyperparameter tuning can be complex.

#### Use Case:
- For competitive machine learning problems or where prediction accuracy is critical.

In [None]:
# Baseline gradient-boosted regressor with hand-picked settings, trained on the training split.
model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
)
model.fit(X_train, y_train)

In [None]:
# Baseline model predictions for the train, validation and test splits.
y_train_pred, y_val_pred, y_test_pred = (
    model.predict(features) for features in (X_train, X_val, X_test)
)

In [None]:
# Mean squared error for each split (train, validation, test).
mse_train, mse_val, mse_test = (
    mean_squared_error(actual, predicted)
    for actual, predicted in (
        (y_train, y_train_pred),
        (y_val, y_val_pred),
        (y_test, y_test_pred),
    )
)

# RMSE is the square root of the matching MSE.
rmse_train, rmse_val, rmse_test = (
    np.sqrt(mse) for mse in (mse_train, mse_val, mse_test)
)

In [None]:
# Coefficient of determination for each split (train, validation, test).
r2_train, r2_val, r2_test = (
    r2_score(actual, predicted)
    for actual, predicted in (
        (y_train, y_train_pred),
        (y_val, y_val_pred),
        (y_test, y_test_pred),
    )
)

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Scores to plot, in the same order as the split labels.
labels = ['Train', 'Validation', 'Test']
rmse_scores = [rmse_train, rmse_val, rmse_test]
r2_scores = [r2_train, r2_val, r2_test]

# Bar annotations, rounded to two decimals.
rmse_scores_text = [f"{score:.2f}" for score in rmse_scores]
r2_scores_text = [f"{score:.2f}" for score in r2_scores]

# Axis limits are derived from the scores themselves:
# - RMSE: 20% headroom above the tallest bar. A fixed "+ 100" flattens the
#   bars whenever the RMSE is much smaller than 100.
# - R²: top of 1.1 so labels drawn above bars close to 1 are not clipped, and
#   a bottom that drops below 0 when a split scores a negative R².
rmse_axis_max = max(rmse_scores) * 1.2 or 1  # fall back to 1 if every RMSE is 0
r2_axis_min = min(0, min(r2_scores) * 1.2)

# Two side-by-side panels with independent y-axes.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Root Mean Squared Error (RMSE)", "R² Score"),
    shared_yaxes=False
)

# Left panel: RMSE per split.
fig.add_trace(
    go.Bar(
        x=labels,
        y=rmse_scores,
        name="RMSE",
        marker=dict(color='skyblue'),
        text=rmse_scores_text,
        textposition='outside',
    ),
    row=1, col=1
)

# Right panel: R² per split.
fig.add_trace(
    go.Bar(
        x=labels,
        y=r2_scores,
        name="R² Score",
        marker=dict(color='lightgreen'),
        text=r2_scores_text,
        textposition='outside',
    ),
    row=1, col=2
)

fig.update_layout(
    title="Model Performance Comparison",
    title_x=0.5,
    title_y=0.95,
    title_xanchor='center',
    title_yanchor='top',
    showlegend=False,
    height=600,
    width=1000,
    xaxis=dict(title="Dataset"),
    xaxis2=dict(title="Dataset"),  # label the right-hand panel as well
    yaxis=dict(title="RMSE", range=[0, rmse_axis_max]),
    yaxis2=dict(title="R² Score", range=[r2_axis_min, 1.1]),
    template="plotly_white",
    margin=dict(l=40, r=40, t=100, b=40),  # larger top margin leaves room for the title
)

fig.show()

# Export a static copy of the figure next to the notebook.
fig.write_image("Model_Results.png", width=1800, height=800, scale=2)

## Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# Estimator whose hyperparameters are searched; only the objective is fixed.
xgb_model = XGBRegressor(objective='reg:squarederror')

# Search space: 3 values for each of 5 parameters = 243 combinations,
# i.e. 729 fits with 3-fold cross-validation.
param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[3, 6, 9],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.7, 0.8, 1],
    colsample_bytree=[0.7, 0.8, 1],
)

# Exhaustive search scored by negated MSE (higher is better), run on all cores.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Winning parameter combination.
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# Model exposed by the search for the winning combination; no extra fit call is made here.
best_model = grid_search.best_estimator_

In [None]:
# Repeat the winning parameters in a cell of their own, below the verbose search log.
print("Best Hyperparameters: " + str(best_params))

In [None]:
# Tuned-model predictions for the train, validation and test splits.
y_train_pred, y_val_pred, y_test_pred = map(
    best_model.predict, (X_train, X_val, X_test)
)

In [None]:
# Mean squared error of the current predictions on each split.
mse_train = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
mse_val = mean_squared_error(y_true=y_val, y_pred=y_val_pred)
mse_test = mean_squared_error(y_true=y_test, y_pred=y_test_pred)

# Root of each MSE gives the RMSE.
rmse_train, rmse_val, rmse_test = map(np.sqrt, (mse_train, mse_val, mse_test))

In [None]:
# R² of the current predictions on each split.
r2_train = r2_score(y_true=y_train, y_pred=y_train_pred)
r2_val = r2_score(y_true=y_val, y_pred=y_val_pred)
r2_test = r2_score(y_true=y_test, y_pred=y_test_pred)

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Scores to plot, in the same order as the split labels.
labels = ['Train', 'Validation', 'Test']
rmse_scores = [rmse_train, rmse_val, rmse_test]
r2_scores = [r2_train, r2_val, r2_test]

# Bar annotations, rounded to two decimals.
rmse_scores_text = [f"{score:.2f}" for score in rmse_scores]
r2_scores_text = [f"{score:.2f}" for score in r2_scores]

# Axis limits are derived from the scores themselves:
# - RMSE: 20% headroom above the tallest bar. A fixed "+ 100" flattens the
#   bars whenever the RMSE is much smaller than 100.
# - R²: top of 1.1 so labels drawn above bars close to 1 are not clipped, and
#   a bottom that drops below 0 when a split scores a negative R².
rmse_axis_max = max(rmse_scores) * 1.2 or 1  # fall back to 1 if every RMSE is 0
r2_axis_min = min(0, min(r2_scores) * 1.2)

# Two side-by-side panels with independent y-axes.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Root Mean Squared Error (RMSE)", "R² Score"),
    shared_yaxes=False
)

# Left panel: RMSE per split.
fig.add_trace(
    go.Bar(
        x=labels,
        y=rmse_scores,
        name="RMSE",
        marker=dict(color='skyblue'),
        text=rmse_scores_text,
        textposition='outside',
    ),
    row=1, col=1
)

# Right panel: R² per split.
fig.add_trace(
    go.Bar(
        x=labels,
        y=r2_scores,
        name="R² Score",
        marker=dict(color='lightgreen'),
        text=r2_scores_text,
        textposition='outside',
    ),
    row=1, col=2
)

fig.update_layout(
    title="Model Performance Comparison after Hyper Parameter Tuning",
    title_x=0.5,
    title_y=0.95,
    title_xanchor='center',
    title_yanchor='top',
    showlegend=False,
    height=600,
    width=1000,
    xaxis=dict(title="Dataset"),
    xaxis2=dict(title="Dataset"),  # label the right-hand panel as well
    yaxis=dict(title="RMSE", range=[0, rmse_axis_max]),
    yaxis2=dict(title="R² Score", range=[r2_axis_min, 1.1]),
    template="plotly_white",
    margin=dict(l=40, r=40, t=100, b=40),  # larger top margin leaves room for the title
)

fig.show()

# Export a static copy of the figure next to the notebook.
fig.write_image("Model_Results_HyperTuned.png", width=1800, height=800, scale=2)

In [None]:
import joblib

# Persist the tuned model and the feature scaler side by side so both can be
# reloaded together later.
joblib.dump(value=best_model, filename='xgboost_model.pkl')
joblib.dump(value=scaler, filename='scaler.pkl')