In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Ridge, LinearRegression, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb


In [17]:
X_train = pd.read_csv('../dataset_preparation/train_test_data/x_train.csv')
X_test = pd.read_csv('../dataset_preparation/train_test_data/x_test.csv')
y_train = pd.read_csv("../dataset_preparation/train_test_data/y_train.csv")
y_test = pd.read_csv("../dataset_preparation/train_test_data/y_test.csv")

In [18]:
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

In [19]:
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

In [20]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_poly)
X_test_scaled = scaler.transform(X_test_poly)

In [21]:
feature_names = [name.replace(' ', '_') for name in poly.get_feature_names_out()]
feature_names

['co',
 'no2',
 'o3',
 'pm10',
 'pm25',
 'so2',
 'hour',
 'city_name_Ahmedabad',
 'city_name_Bengaluru',
 'city_name_Chennai',
 'city_name_Delhi',
 'city_name_Gajuwaka',
 'city_name_Hyderabad',
 'city_name_Kolkata',
 'city_name_Mumbai',
 'city_name_Pune',
 'co^2',
 'co_no2',
 'co_o3',
 'co_pm10',
 'co_pm25',
 'co_so2',
 'co_hour',
 'co_city_name_Ahmedabad',
 'co_city_name_Bengaluru',
 'co_city_name_Chennai',
 'co_city_name_Delhi',
 'co_city_name_Gajuwaka',
 'co_city_name_Hyderabad',
 'co_city_name_Kolkata',
 'co_city_name_Mumbai',
 'co_city_name_Pune',
 'no2^2',
 'no2_o3',
 'no2_pm10',
 'no2_pm25',
 'no2_so2',
 'no2_hour',
 'no2_city_name_Ahmedabad',
 'no2_city_name_Bengaluru',
 'no2_city_name_Chennai',
 'no2_city_name_Delhi',
 'no2_city_name_Gajuwaka',
 'no2_city_name_Hyderabad',
 'no2_city_name_Kolkata',
 'no2_city_name_Mumbai',
 'no2_city_name_Pune',
 'o3^2',
 'o3_pm10',
 'o3_pm25',
 'o3_so2',
 'o3_hour',
 'o3_city_name_Ahmedabad',
 'o3_city_name_Bengaluru',
 'o3_city_name_Chennai',

In [22]:
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=poly.get_feature_names_out())
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=poly.get_feature_names_out())

In [23]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso Regression": Lasso(alpha=0.1),
    "Ridge Regression": Ridge(alpha=1.0),
    "ElasticNet": ElasticNet(alpha=0.1, l1_ratio=0.5),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "AdaBoost": AdaBoostRegressor(n_estimators=100, random_state=42),
    "Support Vector Regressor (SVR)": SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1),
    "K-Nearest Neighbors (KNN)": KNeighborsRegressor(n_neighbors=5),
    "XGBoost": xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42),
   
}

In [24]:
results = {}
for name, model in models.items():
    model.fit(X_train_scaled_df, y_train)
    y_pred = model.predict(X_test_scaled_df)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    cv_scores = cross_val_score(model, X_train_scaled_df, y_train, cv=5, scoring='r2')
    cv_r2_mean = np.mean(cv_scores)
    
    results[name] = {
        "MSE": mse,
        "RMSE": rmse,
        "R2": r2,
        "Cross-Validated R2": cv_r2_mean
    }

In [27]:
for model, metrics in results.items():
    print(f"{model} Results:")
    print(f"Mean Squared Error (MSE): {metrics['MSE']:.2f}")
    print(f"Root Mean Squared Error (RMSE): {metrics['RMSE']:.2f}")
    print(f"R-squared (R2): {metrics['R2']:.2f}")
    print(f"Cross-Validated R2 Score: {metrics['Cross-Validated R2']:.2f}\n")

Linear Regression Results:
Mean Squared Error (MSE): 3923.37
Root Mean Squared Error (RMSE): 62.64
R-squared (R2): -0.22
Cross-Validated R2 Score: -0.85

Lasso Regression Results:
Mean Squared Error (MSE): 4675.48
Root Mean Squared Error (RMSE): 68.38
R-squared (R2): -0.46
Cross-Validated R2 Score: 0.92

Ridge Regression Results:
Mean Squared Error (MSE): 3303.62
Root Mean Squared Error (RMSE): 57.48
R-squared (R2): -0.03
Cross-Validated R2 Score: 0.88

ElasticNet Results:
Mean Squared Error (MSE): 1019.80
Root Mean Squared Error (RMSE): 31.93
R-squared (R2): 0.68
Cross-Validated R2 Score: 0.86

Decision Tree Results:
Mean Squared Error (MSE): 257.78
Root Mean Squared Error (RMSE): 16.06
R-squared (R2): 0.92
Cross-Validated R2 Score: 0.82

Random Forest Results:
Mean Squared Error (MSE): 455.27
Root Mean Squared Error (RMSE): 21.34
R-squared (R2): 0.86
Cross-Validated R2 Score: 0.84

Gradient Boosting Results:
Mean Squared Error (MSE): 215.69
Root Mean Squared Error (RMSE): 14.69
R-squ

In [28]:
from sklearn.model_selection import GridSearchCV

gb_params = {
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [3, 5, 7]
}
gb_grid = GridSearchCV(GradientBoostingRegressor(random_state=42), gb_params, cv=5, scoring="r2", n_jobs=-1)
gb_grid.fit(X_train_scaled_df, y_train)
best_gb = gb_grid.best_estimator_

In [29]:
rf_params = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10]
}
rf_grid = GridSearchCV(RandomForestRegressor(random_state=42), rf_params, cv=5, scoring="r2", n_jobs=-1)
rf_grid.fit(X_train_scaled_df, y_train)
best_rf = rf_grid.best_estimator_