# Training, testing and evaluating models for data


In [82]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
from datetime import datetime
import pandas as pd
import numpy as np
from taxipred.utils.constants import get_clean_data
from functions import split_features_target

df = get_clean_data()
df.head()

Unnamed: 0,Trip_Distance_km,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Time_of_Day_Afternoon,Day_of_Week_Weekday,Traffic_Conditions_High,Weather_Rain,Weather_Snow,Trip_Price
0,19.35,3.56,0.8,0.32,53.82,False,True,False,False,False,36.2624
1,47.59,3.502989,0.62,0.43,40.57,True,True,True,False,False,53.6163
2,36.87,2.7,1.21,0.15,37.27,False,False,True,False,False,52.9032
3,30.33,3.48,0.51,0.15,116.81,False,True,False,False,False,36.4698
4,27.070547,2.93,0.63,0.32,22.64,False,True,True,False,False,15.618


In [83]:
df_numeric, df_categorical, df_target = split_features_target(df)

### Splitting the data into target and features

In [84]:
# merging features into one dataframe
df_features = pd.concat([df_numeric, df_categorical], axis=1)

X, y = df_features, df_target
X.head()

Unnamed: 0,Trip_Distance_km,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes
0,19.35,3.56,0.8,0.32,53.82
1,47.59,3.502989,0.62,0.43,40.57
2,36.87,2.7,1.21,0.15,37.27
3,30.33,3.48,0.51,0.15,116.81
4,27.070547,2.93,0.63,0.32,22.64


### train|test split

In [85]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()


X_train.shape, X_test.shape

((800, 5), (200, 5))

In [86]:
from sklearn.base import BaseEstimator


# model = Pipeline(
#     steps=[
#         ("preprocessor", StandardScaler()),
#         ("classifier", RandomForestRegressor()),
#     ]
# )
# model.get_params()

models = {"linear": {"model": LinearRegression(), "scale": True},
          "knn": {"model": KNeighborsRegressor(), "scale": True},
          "svr": {"model": SVR(), "scale": True},
          "rnd": {"model": RandomForestRegressor(), "scale": False},
          "mlp": {"model": MLPRegressor(), "scale": True},
          "ridge": {"model": Ridge(), "scale": True},
          "lasso":{"model": Lasso(), "scale": True},
          "xgb": {"model": XGBRegressor(), "scale": True}
}
print(models.keys())

dict_keys(['linear', 'knn', 'svr', 'rnd', 'mlp', 'ridge', 'lasso', 'xgb'])


### method for prediction models 

https://www.geeksforgeeks.org/machine-learning/hyperparameter-tuning-in-linear-regression/

In [87]:

# creating a dict for different estimators
models = {"linear": {"model": LinearRegression(), "scale": True},
          "knn": {"model": KNeighborsRegressor(), "scale": True},
          "svr": {"model": SVR(), "scale": True},
          "rnd": {"model": RandomForestRegressor(), "scale": False},
          "mlp": {"model": MLPRegressor(), "scale": True},
          "ridge": {"model": Ridge(), "scale": True},
          "lasso":{"model": Lasso(), "scale": True},
          "xgb": {"model": XGBRegressor(), "scale": True}
}

# method for tuning/validating different estimators
def train_evaluate(X_train, y_train, X_test, y_test, model=models): 
    
    results = []
    best_model = None
    best_score = -float("inf")
    
    for name, model in models.items():
        steps = []
        if model["scale"]:
            steps.append(("scaler", StandardScaler()))
        steps.append(("model", model["model"]))
        
        # ensure y_train is 1D-array
        # if hasattr(y_train, "values") and y_train.ndim == 2:
        #     y_train = y_train.values.ravel()
    
        #get/set params
        pipeline = Pipeline(steps)
        grid = GridSearchCV(pipeline, pipeline.get_params(), cv=3, scoring="r2", n_jobs=-1)
        grid.fit(X_train, y_train)
    
        score = grid.best_score_
        results.append({
            "Model": name,
            "Best R2": score,
            "Best Params": grid.best_params_
        })

        if score > best_score:
            best_score = score
            best_model = grid.best_estimator_
            best_name = name
            
        pipeline.set_params(grid.best_params_)
    

    print(f"Best model: ({best_name})")   

    # train & predict model
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    

    # evaluate
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # visualize results
    metrics = {
        "MAE": round(mae, 2),
        "MSE": round(mse, 2),
        "RMSE": round(rmse, 2),
        "R2": round(r2, 2)
    }
    # Export model
    # joblib.dump(model, model_path)
    #return model, metrics
    return model, metrics, pd.DataFrame(results).sort_values(by="Best R2", ascending=False), best_model

### tuning the model for with cross-validation for omptimal hyperparameters

In [88]:
# def tune_hyperparams():
#     # Define the parameter grid for GridSearchCV
#     param_grid = {
#         'n_estimators': [50, 100, 150],  
#         'max_depth': [None, 10, 20], 
#         'min_samples_split': [2, 5, 10],  
#         'min_samples_leaf': [1, 2, 4], 
#         'max_features': ['sqrt', 'log2', None]
# }
#     # Initialize the Random Forest model
#     rf_model = RandomForestRegressor(random_state=42)
#     # Set up GridSearchCV
#     grid_search = GridSearchCV(estimator=rf_model, 
#                             param_grid=param_grid, 
#                             cv=5, 
#                             n_jobs=-1, 
#                             scoring='accuracy')
#     # Fit GridSearchCV to the training data
#     grid_search.fit(X_train, y_train)
#     # Get the best model from GridSearchCV and make predictions
#     best_rf_model = grid_search.best_estimator_
#     y_pred_gs = best_rf_model.predict(X_test)
#     y_pred_prob_gs = best_rf_model.predict_proba(X_test)[:, 1]  # For ROC-AUC
#     # Calculate accuracy and ROC-AUC for the best model
#     accuracy = accuracy_score(y_test, y_pred_gs)
#     roc_auc = roc_auc_score(y_test, y_pred_prob_gs)
#     # Cross-validation for accuracy and ROC-AUC
#     accuracy_scores = grid_search.cv_results_['mean_test_score']
#     mean_accuracy = np.mean(accuracy_scores)
#     std_accuracy = np.std(accuracy_scores)
#     roc_auc_scores = cross_val_score(best_rf_model, X_train, y_train, cv=5, scoring=make_scorer(roc_auc_score))
#     mean_roc_auc = np.mean(roc_auc_scores)
#     std_roc_auc = np.std(roc_auc_scores)
#     # Display results
#     print(f"Best Hyperparameters from Grid Search: {grid_search.best_params_}")
#     print(f"Cross-validation Accuracy: {mean_accuracy:.4f} ± {std_accuracy:.4f}")
#     print(f"Cross-validation ROC-AUC: {mean_roc_auc:.4f} ± {std_roc_auc:.4f}")
#     print(f"Test Accuracy: {accuracy:.4f}")
#     print(f"Test ROC-AUC: {roc_auc:.4f}")
#     # https://www.blog.trainindata.com/random-forest-with-grid-search/

In [89]:
metric_results = []

for model in models:
    model, metrics = train_evaluate(X_train, y_train, X_test, y_test, model=model)
    metric_results.append({
        "Model": model,
        "MAE": metrics["MAE"],
        "MSE": metrics["MSE"],
        "RMSE": metrics["RMSE"],
        "R2": metrics["R2"]
    })

results_df = pd.DataFrame(metric_results).sort_values(by="R2", ascending=False)
print(results_df)

TypeError: Parameter grid for parameter 'memory' needs to be a list or a numpy array, but got None (of type NoneType) instead. Single values need to be wrapped in a list with one element.

### exporting data using joblib

In [None]:
# model, metrics = train_evaluate(X_train, y_train, X_test, y_test, model_type="random_forest", model_path="random_forest.joblib")
# loaded_model = joblib.load("random_forest.joblib")
# loaded_model