# Training, testing and evaluating regression models for taxi trip price prediction


In [49]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import pandas as pd
import numpy as np
from taxipred.utils.constants import get_clean_data
from functions import split_features_target

# Load the dataset returned by the project's get_clean_data() helper and preview the first rows.
df = get_clean_data()
df.head()

Unnamed: 0,Trip_Distance_km,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Time_of_Day_Afternoon,Day_of_Week_Weekday,Traffic_Conditions_High,Weather_Rain,Weather_Snow,Trip_Price
0,19.35,3.56,0.8,0.32,53.82,False,True,False,False,False,36.2624
1,47.59,3.502989,0.62,0.43,40.57,True,True,True,False,False,53.6163
2,36.87,2.7,1.21,0.15,37.27,False,False,True,False,False,52.9032
3,30.33,3.48,0.51,0.15,116.81,False,True,False,False,False,36.4698
4,27.070547,2.93,0.63,0.32,22.64,False,True,True,False,False,15.618


In [50]:
# Split the frame into three parts; judging by the variable names these are the numeric
# features, the categorical features and the target (helper lives in functions.py — confirm there).
df_numeric, df_categorical, df_target = split_features_target(df)

### Splitting the data into target and features

In [51]:
# Combine the numeric and categorical feature frames side by side (column-wise)
df_features = pd.concat([df_numeric, df_categorical], axis="columns")

# Conventional names: X for the feature matrix, y for the target
X = df_features
y = df_target
X.head()

Unnamed: 0,Trip_Distance_km,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes
0,19.35,3.56,0.8,0.32,53.82
1,47.59,3.502989,0.62,0.43,40.57
2,36.87,2.7,1.21,0.15,37.27
3,30.33,3.48,0.51,0.15,116.81
4,27.070547,2.93,0.63,0.32,22.64


### train|test split

In [52]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state=42 keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Show the shapes of the resulting train/test feature matrices.
X_train.shape, X_test.shape

((800, 5), (200, 5))

### method for prediction models 

In [None]:
# Identifiers passed as `model_type` to train_evaluate, in the order they are run
models = [
    "linear",  # LinearRegression
    "knn",     # KNeighborsRegressor
    "svr",     # SVR
    "rnd",     # RandomForestRegressor (handled by train_evaluate's fallback branch)
    "mlp",     # MLPRegressor
    "ridge",   # Ridge
    "lasso",   # Lasso
    "xgb",     # XGBRegressor
]

# method for running prediction models
def train_evaluate(X_train, y_train, X_test, y_test, model_type=""):
    """Fit one regression model and score it on the held-out data.

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_test, y_test
        Held-out features and target used only for scoring.
    model_type : str, optional
        One of "linear", "knn", "svr", "mlp", "xgb", "ridge" or "lasso".
        Any other value (including "rnd" and the default "") selects a
        RandomForestRegressor.

    Returns
    -------
    tuple
        ``(model, metrics)`` where ``model`` is the fitted estimator and
        ``metrics`` is a dict with the keys "MAE", "MSE", "RMSE" and "R2",
        each rounded to two decimals.

    Notes
    -----
    Every explicitly listed model is trained on standardized features. The
    scaler is fitted inside this function and is not returned, so the
    returned model expects input scaled the same way.
    """
    # Factories rather than instances, so only the requested estimator is built.
    scaled_estimators = {
        "linear": LinearRegression,
        "knn": lambda: KNeighborsRegressor(n_neighbors=5),
        "svr": SVR,
        "mlp": lambda: MLPRegressor(
            hidden_layer_sizes=(100,),
            activation='relu',
            solver='adam',
            max_iter=5000,
            random_state=42,
        ),
        "xgb": lambda: XGBRegressor(
            n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42
        ),
        "ridge": Ridge,
        "lasso": Lasso,
    }

    build_estimator = scaled_estimators.get(model_type)
    if build_estimator is None:
        # Fallback for "rnd", the default "" and any other unlisted name.
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        needs_scaling = False
    else:
        model = build_estimator()
        needs_scaling = True

    # Flatten a 2-D pandas target into the 1-D array the estimators expect.
    target_is_2d = hasattr(y_train, "values") and y_train.ndim == 2
    if target_is_2d:
        y_train = y_train.values.ravel()

    if needs_scaling:
        # Fit the scaler on the training data only, then reuse it for the
        # test data so no test-set statistics leak into training.
        # (MinMaxScaler is an alternative worth comparing.)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    # Train, then predict on the held-out features.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Score the predictions; RMSE is derived from MSE.
    mse = mean_squared_error(y_test, y_pred)
    raw_scores = {
        "MAE": mean_absolute_error(y_test, y_pred),
        "MSE": mse,
        "RMSE": np.sqrt(mse),
        "R2": r2_score(y_test, y_pred),
    }
    metrics = {name: round(score, 2) for name, score in raw_scores.items()}

    # Persisting the model to disk (joblib) is currently disabled.
    return model, metrics

In [54]:
# Evaluate every model on the same split and collect one table row per model
metric_names = ("MAE", "MSE", "RMSE", "R2")
results = []

for model_type in models:
    model, metrics = train_evaluate(
        X_train, y_train, X_test, y_test, model_type=model_type
    )
    row = {"Model": model_type}
    row.update((name, metrics[name]) for name in metric_names)
    results.append(row)

# Best R2 first
results_df = pd.DataFrame(results).sort_values(by="R2", ascending=False)
print(results_df)

    Model   MAE     MSE   RMSE    R2
7     xgb  4.87   51.02   7.14  0.92
4     mlp  4.14   47.13   6.86  0.92
3     rnd  5.53   59.82   7.73  0.90
1     knn  6.53   79.22   8.90  0.87
5   ridge  6.86   88.75   9.42  0.85
0  linear  6.86   88.76   9.42  0.85
6   lasso  7.35   95.19   9.76  0.84
2     svr  6.98  113.77  10.67  0.81


### exporting data using joblib

In [55]:
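# NOTE: export is disabled. train_evaluate currently has no active `model_path` parameter,
# so the call below would need that parameter restored before it can be re-enabled.
# model, metrics = train_evaluate(X_train, y_train, X_test, y_test, model_type="random_forest", model_path="random_forest.joblib")
# loaded_model = joblib.load("random_forest.joblib")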
# loaded_model