In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
train=pd.read_csv('E:\\aqi_prediction\\artifact\\train.csv')
test=pd.read_csv('E:\\aqi_prediction\\artifact\\test.csv')

In [8]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

#train data
X_train=train.drop('AQI', axis=1)
y_train=train['AQI']

numerical_features=X_train.select_dtypes(include=[np.number]).columns
categorical_features=X_train.select_dtypes(include=['object']).columns
numerical_pipeline=Pipeline(steps=[('scaler', StandardScaler())])
categorical_pipeline=Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers=[('num', numerical_pipeline, numerical_features),('cat', categorical_pipeline, categorical_features)])
X_train_transformed = preprocessor.fit_transform(X_train)

#test data
X_test=test.drop('AQI', axis=1)
y_test=test['AQI']

numerical_features=X_test.select_dtypes(include=[np.number]).columns
categorical_features=X_test.select_dtypes(include=['object']).columns
numerical_pipeline=Pipeline(steps=[('scaler', StandardScaler())])
categorical_pipeline=Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers=[('num', numerical_pipeline, numerical_features),('cat', categorical_pipeline, categorical_features)])
X_test_transformed = preprocessor.fit_transform(X_test)


In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score

model_lr = LinearRegression()
model_rf = RandomForestRegressor()
model_gb = GradientBoostingRegressor()
model_ada = AdaBoostRegressor()
model_dt = DecisionTreeRegressor()
model_xgb = XGBRegressor(verbosity=0)
model_cat = CatBoostRegressor(verbose=0)

models = [
    ("Linear Regression", model_lr),
    ("Random Forest", model_rf),
    ("Gradient Boosting", model_gb),
    ("AdaBoost", model_ada),
    ("Decision Tree", model_dt),
    ("XGBoost", model_xgb),
    ("CatBoost", model_cat)
]

for name, model in models:
    print(f"\nModel: {name}")
    model.fit(X_train_transformed, y_train)
    y_pred = model.predict(X_test_transformed)
    r2 = r2_score(y_test, y_pred)
    print(f"R-squared: {r2}")



Model: Linear Regression
R-squared: 0.7046066399397025

Model: Random Forest
R-squared: 0.8577960964115999

Model: Gradient Boosting
R-squared: 0.8858498933401276

Model: AdaBoost
R-squared: 0.2382585301027178

Model: Decision Tree
R-squared: 0.668062233887743

Model: XGBoost
R-squared: 0.8957027694575089

Model: CatBoost
R-squared: 0.8981586494207845


In [12]:
models = [
    ("CatBoost", model_cat),
    ("XGBoost", model_xgb),
    ("Gradient Boosting", model_gb)
]

In [15]:
param_grid_cat = {
    'iterations': [100, 200, 300],
    'depth': [4, 6, 8, 9],
    'learning_rate': [0.01, 0.05, 0.1],
}
param_grid_xgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 8],
    'learning_rate': [0.01, 0.05, 0.1]
}
param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7, 8]
}
param_grid=[param_grid_cat, param_grid_xgb, param_grid_gb]

In [16]:
from sklearn.model_selection import GridSearchCV

for (name, model),param_grid in zip(models, param_grid):
    grid = GridSearchCV(model, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1)
    grid.fit(X_train_transformed, y_train)
    print("Best Params:", grid.best_params_)
    print("Best CV R² Score:", grid.best_score_)
    y_pred = grid.predict(X_test_transformed)
    r2 = r2_score(y_test, y_pred)
    print(f"Test R-squared: {r2}")
    print("\n")

Best Params: {'depth': 9, 'iterations': 300, 'learning_rate': 0.1}
Best CV R² Score: 0.8880096472702872
Test R-squared: 0.9061955487544032


Best Params: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
Best CV R² Score: 0.8825033403789607
Test R-squared: 0.8981361565667281


Best Params: {'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 300}
Best CV R² Score: 0.880298953400998
Test R-squared: 0.8825837397896195




In [None]:
param_grid_cat = {
    'depth': [6, 8, 9, 10, 11],
    'iterations': [300, 310, 320, 330, 340],
    'learning_rate': [0.05, 0.1, 0.15],
    'l2_leaf_reg': [1, 3, 5, 7],
    'bagging_temperature': [0, 0.5, 1],
    'border_count': [32, 64, 128]
}
grid = GridSearchCV(model_cat, param_grid=param_grid_cat, cv=5, scoring='r2', n_jobs=-1)
grid.fit(X_train_transformed, y_train)
print("Best Params:", grid.best_params_)
print("Best CV R² Score:", grid.best_score_)
y_pred = grid.predict(X_test_transformed)
r2 = r2_score(y_test, y_pred)
print(f"Test R-squared: {r2}")
print("\n")