In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,root_mean_squared_error
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
import os
import pickle
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the merged training table from its path relative to the notebook's working directory.
train_csv_path = r"data/Merged_Train_Data.csv"
df = pd.read_csv(train_csv_path)

In [3]:
# Preview the first rows of the freshly loaded table.
df.head()

Unnamed: 0.1,Unnamed: 0,year,month,day,hour,O3_forecast,NO2_forecast,T_forecast,q_forecast,u_forecast,v_forecast,w_forecast,NO2_satellite,HCHO_satellite,ratio_satellite,O3_target,NO2_target,location
0,0,2022.0,7.0,28.0,0.0,73.35,57.54,22.99,7.68,-3.46,1.28,1.02,,,,5.03,6.75,Satyawati College
1,1,2022.0,7.0,28.0,1.0,82.77,57.25,23.9,7.7,-1.35,0.29,0.99,,,,5.08,6.07,Satyawati College
2,2,2022.0,7.0,28.0,2.0,92.19,56.97,23.89,7.72,0.76,-0.7,0.96,,,,6.95,3.38,Satyawati College
3,3,2022.0,7.0,28.0,3.0,101.62,56.68,23.92,7.74,2.87,-1.69,0.93,,,,5.8,4.85,Satyawati College
4,4,2022.0,7.0,28.0,4.0,113.51,64.06,25.55,7.81,2.45,-1.15,0.66,,,,7.22,7.72,Satyawati College


In [4]:
# List the column names available in the loaded table.
df.columns

Index(['Unnamed: 0', 'year', 'month', 'day', 'hour', 'O3_forecast',
       'NO2_forecast', 'T_forecast', 'q_forecast', 'u_forecast', 'v_forecast',
       'w_forecast', 'NO2_satellite', 'HCHO_satellite', 'ratio_satellite',
       'O3_target', 'NO2_target', 'location'],
      dtype='object')

In [5]:
# Remove the leftover "Unnamed: 0" column and the three satellite columns
# (the satellite values are NaN in the previewed rows).
# errors="ignore" skips columns that are already gone, so re-running this cell
# after it has executed once no longer raises a KeyError.
df = df.drop(
    columns=["Unnamed: 0","NO2_satellite","HCHO_satellite","ratio_satellite"],
    errors="ignore",
)
df.head()

Unnamed: 0,year,month,day,hour,O3_forecast,NO2_forecast,T_forecast,q_forecast,u_forecast,v_forecast,w_forecast,O3_target,NO2_target,location
0,2022.0,7.0,28.0,0.0,73.35,57.54,22.99,7.68,-3.46,1.28,1.02,5.03,6.75,Satyawati College
1,2022.0,7.0,28.0,1.0,82.77,57.25,23.9,7.7,-1.35,0.29,0.99,5.08,6.07,Satyawati College
2,2022.0,7.0,28.0,2.0,92.19,56.97,23.89,7.72,0.76,-0.7,0.96,6.95,3.38,Satyawati College
3,2022.0,7.0,28.0,3.0,101.62,56.68,23.92,7.74,2.87,-1.69,0.93,5.8,4.85,Satyawati College
4,2022.0,7.0,28.0,4.0,113.51,64.06,25.55,7.81,2.45,-1.15,0.66,7.22,7.72,Satyawati College


In [6]:
# The two target columns are predicted jointly; every remaining column is a feature.
target_columns = ["O3_target","NO2_target"]
y = df[target_columns]
X = df.drop(columns=target_columns)

In [7]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the rows look like hourly readings per location, so a shuffled
# split may place neighbouring hours in both sets — confirm this is intended.
split_parts = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [8]:
# Column groups are chosen by dtype: object columns are one-hot encoded,
# every other column is standardised.
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

num_transformer = StandardScaler()
# handle_unknown="ignore" encodes a category that was not present during fit
# as an all-zero row instead of raising. The fitted preprocessor is pickled
# for later reuse, so it should not fail on a value it has never seen.
cat_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", cat_transformer, cat_features),
        ("StandardScaler", num_transformer, num_features),
    ]
)

In [9]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformer to the test split so no test rows take part in the fit.
# NOTE: both names are overwritten with the transformed features, so this cell
# cannot be re-run on its own — re-run the train/test split cell first.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [10]:
def evaluate_model(true,predicted):
    """Score predictions against ground truth with four regression metrics.

    Args:
        true: Ground-truth target values.
        predicted: Predictions to compare against ``true``.

    Returns:
        Tuple ``(mae, mse, rmse, r2)``, in that order.

    Note:
        RMSE is obtained from ``root_mean_squared_error`` directly, not by
        taking the square root of the returned ``mse``.
    """
    metric_functions = (
        mean_absolute_error,
        mean_squared_error,
        root_mean_squared_error,
        r2_score,
    )
    # Apply each metric in the fixed order callers unpack.
    return tuple(metric(true, predicted) for metric in metric_functions)

In [11]:
# Baseline comparison: fit each candidate with near-default settings and collect
# its test-set metrics in the *_list variables used to build the summary table.
# Every estimator that has a random component gets a fixed random_state so that
# a fresh "Restart & Run All" reproduces the same numbers.
models = {
    "Linear Regression": LinearRegression(n_jobs=-1),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "KNN": KNeighborsRegressor(n_jobs=-1),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_jobs=-1, random_state=42),
    "Adaboost": AdaBoostRegressor(random_state=42),
    "XgBoost": XGBRegressor(n_jobs=-1, random_state=42)
}

model_list = []
mae_list = []
mse_list = []
rmse_list = []
r2_list = []

for name, model in models.items():
    # These two are wrapped so that one regressor is fitted per target column.
    if name in ["Adaboost","XgBoost"]:
        model = MultiOutputRegressor(model)

    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_mae, train_mse, train_rmse, train_r2 = evaluate_model(y_train, y_train_pred)
    test_mae, test_mse, test_rmse, test_r2 = evaluate_model(y_test, y_test_pred)

    print(name)
    model_list.append(name)

    print("Model performance for Training set")
    print("- Mean Absolute Error: {:.4f}".format(train_mae))
    print("- Mean Squared Error: {:.4f}".format(train_mse))
    print("- Root Mean Squared Error: {:.4f}".format(train_rmse))
    print("- R2 Score: {:.4f}".format(train_r2))

    print("-" * 35)

    print("Model performance for Test set")
    print("- Mean Absolute Error: {:.4f}".format(test_mae))
    print("- Mean Squared Error: {:.4f}".format(test_mse))
    print("- Root Mean Squared Error: {:.4f}".format(test_rmse))
    print("- R2 Score: {:.4f}".format(test_r2))

    # Only the test-set metrics go into the summary table.
    mae_list.append(test_mae)
    mse_list.append(test_mse)
    rmse_list.append(test_rmse)
    r2_list.append(test_r2)

    print("=" * 35)
    print("\n")

Linear Regression
Model performance for Training set
- Mean Absolute Error: 21.3926
- Mean Squared Error: 837.1066
- Root Mean Squared Error: 28.7888
- R2 Score: 0.1287
-----------------------------------
Model performance for Test set
- Mean Absolute Error: 21.2940
- Mean Squared Error: 830.5020
- Root Mean Squared Error: 28.6627
- R2 Score: 0.1247


Ridge
Model performance for Training set
- Mean Absolute Error: 21.3926
- Mean Squared Error: 837.1066
- Root Mean Squared Error: 28.7888
- R2 Score: 0.1287
-----------------------------------
Model performance for Test set
- Mean Absolute Error: 21.2941
- Mean Squared Error: 830.5017
- Root Mean Squared Error: 28.6627
- R2 Score: 0.1247


Lasso
Model performance for Training set
- Mean Absolute Error: 21.9083
- Mean Squared Error: 865.8431
- Root Mean Squared Error: 29.2904
- R2 Score: 0.0977
-----------------------------------
Model performance for Test set
- Mean Absolute Error: 21.7980
- Mean Squared Error: 856.9451
- Root Mean Square

In [12]:
# One row per baseline model, holding the test-set metrics collected in the loop above.
metric_columns = ["Model","MAE","MSE","RMSE","R2"]
metric_rows = list(zip(model_list,mae_list,mse_list,rmse_list,r2_list))
results = pd.DataFrame(metric_rows,columns=metric_columns)
results

Unnamed: 0,Model,MAE,MSE,RMSE,R2
0,Linear Regression,21.294045,830.501951,28.662723,0.124675
1,Ridge,21.294056,830.501731,28.662719,0.124675
2,Lasso,21.798048,856.945131,29.124099,0.095955
3,KNN,9.195868,224.17133,14.899986,0.763295
4,Decision Tree,8.144178,243.935502,15.618374,0.734292
5,Random Forest,6.71222,124.450002,11.155714,0.864297
6,Adaboost,23.311138,786.557021,28.02409,0.153285
7,XgBoost,10.107437,221.928635,14.872963,0.76249


In [13]:
# Base estimators for the hyper-parameter search. They run single-threaded
# (n_jobs=1) and share a fixed seed.
rf = RandomForestRegressor(n_jobs=1,random_state=42)
xgb = XGBRegressor(tree_method="hist",n_jobs=1,random_state=42)
xgb_multi = MultiOutputRegressor(xgb)

# Candidate values for the random forest.
rf_params = dict(
    n_estimators=[200,300,400],
    max_depth=[None,10,20,30],
    min_samples_split=[2,5,10],
    min_samples_leaf=[1,2,4],
    max_features=["sqrt","log2",None],
)

# Candidate values for XGBoost. The "estimator__" prefix routes each setting
# through the MultiOutputRegressor wrapper to the wrapped XGBRegressor.
xgb_params = {
    "estimator__n_estimators": [200,400,600],
    "estimator__max_depth": [4,6,8],
    "estimator__learning_rate": [0.03,0.05,0.1],
    "estimator__subsample": [0.8,1.0],
    "estimator__colsample_bytree": [0.8,1.0],
}

In [14]:
# (display name, estimator, search space) triples consumed by the search loop.
search_names = ["Random Forest","XGBoost"]
search_estimators = [rf,xgb_multi]
search_spaces = [rf_params,xgb_params]
randomcv_models = list(zip(search_names,search_estimators,search_spaces))

In [15]:
# Randomised hyper-parameter search: 20 sampled candidates per model, 3-fold CV,
# scored by R2. Only the best parameter set of each model is kept.
model_param = {}
for name, model, params in randomcv_models:
    # Named `search` rather than `random` so the variable does not shadow the
    # name of the standard-library `random` module in the notebook namespace.
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=20,
        cv=3,
        verbose=2,
        n_jobs=-1,
        scoring="r2",
        random_state=42
    )
    search.fit(X_train,y_train)
    model_param[name] = search.best_params_

for model_name, best_params in model_param.items():
    print(f"---------------- Best Params for {model_name} -------------------")
    print(best_params)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Fitting 3 folds for each of 20 candidates, totalling 60 fits
---------------- Best Params for Random Forest -------------------
{'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 30}
---------------- Best Params for XGBoost -------------------
{'estimator__subsample': 0.8, 'estimator__n_estimators': 600, 'estimator__max_depth': 8, 'estimator__learning_rate': 0.1, 'estimator__colsample_bytree': 1.0}


In [None]:
# Refit the two tuned models, report train/test metrics, and pickle each model
# together with the fitted preprocessor into the artifacts directory.
save_path = "../artifacts"
os.makedirs(save_path,exist_ok=True)


def _print_scores(heading, scores):
    """Print one metrics section; `scores` is the (mae, mse, rmse, r2) tuple from evaluate_model."""
    mae, mse, rmse, r2 = scores
    print(heading)
    print("- Mean Squared Error: {:.4f}".format(mse))
    print("- Root Mean Squared Error: {:.4f}".format(rmse))
    print("- Mean Absolute Error: {:.4f}".format(mae))
    print("- R2 Score: {:.4f}".format(r2))


# Hyper-parameters below are the best values printed by the randomised search.
tuned_forest = RandomForestRegressor(
    n_estimators=400,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features=None,
    max_depth=30,
    n_jobs=-1,
    random_state=42
)

tuned_xgb = MultiOutputRegressor(
    XGBRegressor(
        n_estimators=600,
        max_depth=8,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=1.0,
        tree_method="hist",
        n_jobs=-1,
        random_state=42
    )
)

models = {
    "Random Forest": tuned_forest,
    "XGBoost": tuned_xgb
}

for name,model in models.items():

    model.fit(X_train,y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_scores = evaluate_model(y_train,y_train_pred)
    test_scores = evaluate_model(y_test,y_test_pred)

    print(name)
    _print_scores('Model performance for Training set', train_scores)
    print()
    _print_scores('Model performance for Test set', test_scores)
    print("="*50)
    print()

    # Spaces in the display name become underscores in the file name.
    file_name = f"{name.replace(' ','_')}_V1.pkl"
    file_path = os.path.join(save_path,file_name)

    with open(file_path,"wb") as model_file:
        pickle.dump(model,model_file)

    print(f"{name} saved at {file_path}")
    print("--------------------------------------------------\n")


# The fitted preprocessor is stored next to the models.
preprocessor_path = os.path.join(save_path,"preprocessor_V1.pkl")
with open(preprocessor_path,"wb") as preprocessor_file:
    pickle.dump(preprocessor,preprocessor_file)

print("Preprocessor saved successfully.")

Random Forest
Model performance for Training set
- Mean Squared Error: 30.5753
- Root Mean Squared Error: 5.5294
- Mean Absolute Error: 3.3953
- R2 Score: 0.9673

Model performance for Test set
- Mean Squared Error: 130.7327
- Root Mean Squared Error: 11.4338
- Mean Absolute Error: 6.9983
- R2 Score: 0.8574

Random Forest saved at ../artifacts\Random_Forest.pkl
--------------------------------------------------

XGBoost
Model performance for Training set
- Mean Squared Error: 57.1196
- Root Mean Squared Error: 7.5388
- Mean Absolute Error: 5.1976
- R2 Score: 0.9400

Model performance for Test set
- Mean Squared Error: 130.4319
- Root Mean Squared Error: 11.3970
- Mean Absolute Error: 7.5430
- R2 Score: 0.8607

XGBoost saved at ../artifacts\XGBoost.pkl
--------------------------------------------------

Preprocessor saved successfully.
