# Extreme Gradient Boosting 
(for PM 2.5)

In [3]:
!pip install xgboost



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
!conda install -c conda-forge xgboost -y


Channels:
 - conda-forge
 - defaults
Platform: osx-arm64
Collecting package metadata (repodata.json): | Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x10a5663c0>: Failed to resolve 'repo.anaconda.com' ([Errno 8] nodename nor servname provided, or not known)")': /pkgs/main/osx-arm64/repodata.json.zst

Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x10a5c03b0>: Failed to resolve 'repo.anaconda.com' ([Errno 8] nodename nor servname provided, or not known)")': /pkgs/r/osx-arm64/repodata.json.zst

Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x10a566330>: Failed to resolve 'repo.anaconda.com' ([Errno 8] nodename

In [5]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv("/Users/shrutikute/Downloads/updated_air_quality_dataset.csv")

#  Map county names to city names (using "County Name" column)
county_to_city = {
    "Cook": "Chicago",
    "New York": "New York",
    "Los Angeles": "Los Angeles"
}

df["City"] = df["County Name"].map(county_to_city)  # Create a 'City' column

# Define features and target variable
features = ['temperature_2m (°C)', 'relative_humidity_2m (%)', 
            'precipitation (mm)', 'wind_speed_100m (km/h)', 'Day', 'Hour']
target = 'PM2.5'

# Drop rows with missing target values
df = df.dropna(subset=[target])

#  Loop through each city and train a separate model
for city in ["New York", "Chicago", "Los Angeles"]:
    print(f"\n Training Model for {city} ")
    
    # Filter data for the specific city
    city_df = df[df["City"] == city]
    
    # Check if there's enough data
    if city_df.shape[0] < 50:  # Adjust threshold as needed
        print(f" Not enough data for {city}. Skipping...")
        continue
    
    # Extract features and target
    X = city_df[features]
    y = city_df[target]

    # Standardizing features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
# Splits Data into Training (80%) and Testing (20%)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Train XGBoost model with optimized hyperparameters
    xgb_regressor = xgb.XGBRegressor(
        objective='reg:squarederror', 
        n_estimators=500, 
        learning_rate=0.05, 
        max_depth=8, 
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )

    xgb_regressor.fit(X_train, y_train)

    # Make predictions
    y_pred = xgb_regressor.predict(X_test)

    # Evaluate model performance
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f" {city} Model Performance:")
    print(f"   • Mean Absolute Error (MAE): {mae:.4f}")
    print(f"   • Mean Squared Error (MSE): {mse:.4f}")
    print(f"   • R-squared Score (R²): {r2:.4f}")



 Training Model for New York 
 New York Model Performance:
   • Mean Absolute Error (MAE): 2.5263
   • Mean Squared Error (MSE): 15.3329
   • R-squared Score (R²): 0.5273

 Training Model for Chicago 
 Chicago Model Performance:
   • Mean Absolute Error (MAE): 2.5957
   • Mean Squared Error (MSE): 15.1180
   • R-squared Score (R²): 0.5433

 Training Model for Los Angeles 
 Los Angeles Model Performance:
   • Mean Absolute Error (MAE): 6.4970
   • Mean Squared Error (MSE): 128.3980
   • R-squared Score (R²): 0.6084


In [6]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import warnings

# Suppress warnings
warnings.filterwarnings("ignore")

# Load dataset
df = pd.read_csv("/Users/shrutikute/Downloads/updated_air_quality_dataset.csv")

#  Fix column names to remove whitespace
df.columns = df.columns.str.replace(" ", "_")

#  Map County Names to City Names
county_to_city = {
    "Cook": "Chicago",
    "New York": "New York",
    "Los Angeles": "Los Angeles"
}
df["City"] = df["County_Name"].map(county_to_city)

#  Feature Engineering - Convert 'Day' to Cyclic Encoding
df['Day_sin'] = np.sin(2 * np.pi * df['Day'] / 7)
df['Day_cos'] = np.cos(2 * np.pi * df['Day'] / 7)

#  Feature Engineering - Rolling Average for PM2.5 (3-hour trend)
df["PM2.5_Rolling_Avg"] = df["PM2.5"].rolling(window=3, min_periods=1).mean()

# Define updated features
features = [
    'temperature_2m_(°C)', 'relative_humidity_2m_(%)', 
    'precipitation_(mm)', 'wind_speed_100m_(km/h)',
    'Day_sin', 'Day_cos', 'PM2.5_Rolling_Avg', 'Hour'
]
target = 'PM2.5'

# Drop rows with missing values in features
df = df.dropna(subset=features)

#  Train a model for each city
for city in ["New York", "Chicago", "Los Angeles"]:
    print(f"\n Training Model for {city} ")
    
    # Filter data for the specific city
    city_df = df[df["City"] == city]
    
    # Check if there's enough data
    if city_df.shape[0] < 50:
        print(f" Not enough data for {city}. Skipping...")
        continue
    
    # Extract features and target
    X = city_df[features]
    y = city_df[target]

    # Standardizing features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Splits Data into Training (80%) and Testing (20%)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # Define parameter grid for Grid Search
    param_grid = {
        'n_estimators': [300, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [6, 8],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'reg_alpha': [0.01, 0.1],  # L1 Regularization
        'reg_lambda': [1, 10, 50]  # L2 Regularization
    }

    # Initialize XGBoost model
    xgb_regressor = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

    # Perform Grid Search CV
    tuner = GridSearchCV(
        xgb_regressor, param_grid, scoring='r2', cv=3, verbose=0, n_jobs=-1
    )

    tuner.fit(X_train, y_train)

    # Best parameters
    best_params = tuner.best_params_
    print(f" Best Hyperparameters for {city}: {best_params}")

    # Train model with best parameters found through grid search
    xgb_best = xgb.XGBRegressor(**best_params, objective='reg:squarederror', random_state=42)
    xgb_best.fit(X_train, y_train)

    # Make predictions
    y_pred = xgb_best.predict(X_test)

    # Evaluate model performance
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f" {city} Model Performance:")
    print(f"   • Mean Absolute Error (MAE): {mae:.4f}")
    print(f"   • Mean Squared Error (MSE): {mse:.4f}")
    print(f"   • R-squared Score (R²): {r2:.4f}")



 Training Model for New York 
 Best Hyperparameters for New York: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 300, 'reg_alpha': 0.1, 'reg_lambda': 50, 'subsample': 0.8}
 New York Model Performance:
   • Mean Absolute Error (MAE): 1.5079
   • Mean Squared Error (MSE): 7.5891
   • R-squared Score (R²): 0.7660

 Training Model for Chicago 
 Best Hyperparameters for Chicago: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 500, 'reg_alpha': 0.01, 'reg_lambda': 10, 'subsample': 0.8}
 Chicago Model Performance:
   • Mean Absolute Error (MAE): 1.6409
   • Mean Squared Error (MSE): 7.0835
   • R-squared Score (R²): 0.7860

 Training Model for Los Angeles 
 Best Hyperparameters for Los Angeles: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 500, 'reg_alpha': 0.1, 'reg_lambda': 50, 'subsample': 1.0}
 Los Angeles Model Performance:
   • Mean Absolute Error (MAE): 2.6565
   • Mean Squared Error 