# Training, testing and evaluating models for taxi trip price prediction


In [None]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

from taxipred.utils.constants import get_clean_data
from Cleaning_functions import split_features_target
from model_functions import cv_test_benchmark

# Load the dataset returned by the project's get_clean_data() helper and
# preview its first rows as the cell output.
df = get_clean_data()
df.head()

Unnamed: 0,Trip_Distance_km,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Time_of_Day_Afternoon,Day_of_Week_Weekday,Traffic_Conditions_High,Weather_Rain,Weather_Snow,Trip_Price
0,19.35,3.56,0.8,0.32,53.82,False,True,False,False,False,36.2624
1,47.59,3.502989,0.62,0.43,40.57,True,True,True,False,False,53.6163
2,36.87,2.7,1.21,0.15,37.27,False,False,True,False,False,52.9032
3,30.33,3.48,0.51,0.15,116.81,False,True,False,False,False,36.4698
4,27.070547,2.93,0.63,0.32,22.64,False,True,True,False,False,15.618


In [2]:
# Split the frame into three parts with the project helper. Judging by the
# variable names these are numeric features, categorical features and the
# target — TODO confirm against Cleaning_functions.split_features_target.
df_numeric, df_categorical, df_target = split_features_target(df)

### Splitting the data into target and features

In [3]:
# Put the numeric and categorical feature frames side by side (column-wise)
# so the models receive a single design matrix.
# NOTE(review): the saved output of this cell lists only the numeric columns —
# confirm that df_categorical is not empty after split_features_target.
df_features = pd.concat((df_numeric, df_categorical), axis="columns")

# Conventional scikit-learn aliases: X = features, y = target.
X = df_features
y = df_target
X.head()

Unnamed: 0,Trip_Distance_km,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes
0,19.35,3.56,0.8,0.32,53.82
1,47.59,3.502989,0.62,0.43,40.57
2,36.87,2.7,1.21,0.15,37.27
3,30.33,3.48,0.51,0.15,116.81
4,27.070547,2.93,0.63,0.32,22.64


### train|test split

In [4]:
# Hold out a fixed fraction of the rows for the final test score; the fixed
# seed keeps the split reproducible across kernel restarts.
# (train_test_split is imported in the import cell at the top of the notebook.)
TEST_SIZE = 0.2
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Flatten the targets to 1-D NumPy arrays before handing them to the
# estimators (df_target is presumably a single-column frame — TODO confirm).
y_train = y_train.to_numpy().ravel()
y_test = y_test.to_numpy().ravel()

# Show the resulting feature-matrix shapes as the cell output.
X_train.shape, X_test.shape

((800, 5), (200, 5))

### Candidate models for price prediction

Reference on hyperparameter tuning for linear regression: https://www.geeksforgeeks.org/machine-learning/hyperparameter-tuning-in-linear-regression/

In [None]:

# Registry of the estimators to benchmark, one row per candidate:
#   name  - short label identifying the model
#   model - unfitted estimator instance
#   scale - flag passed on to cv_test_benchmark; presumably controls whether a
#           StandardScaler is put in front of the model — TODO confirm in
#           model_functions
#
# The MLP is switched off by default for time/resource reasons (116.95 in the
# train_time_sec column of the saved results, against under 3 for every other
# model). Set INCLUDE_MLP = True to benchmark it again.
INCLUDE_MLP = False

model_specs = [
    {"name": "linear", "model": LinearRegression(), "scale": True},
    {"name": "ridge", "model": Ridge(), "scale": True},
    {"name": "lasso", "model": Lasso(), "scale": True},
    {"name": "knn", "model": KNeighborsRegressor(), "scale": True},
    {"name": "svr", "model": SVR(), "scale": True},
]

if INCLUDE_MLP:
    # Kept between svr and xgb so row order matches the original listing.
    model_specs.append(
        {"name": "mlp", "model": MLPRegressor(max_iter=5000, random_state=42), "scale": True}
    )

model_specs += [
    {"name": "xgb", "model": XGBRegressor(random_state=42), "scale": True},
    {"name": "rf", "model": RandomForestRegressor(random_state=42), "scale": False},
]

df_models = pd.DataFrame(model_specs)

### optimize and compare models 

In [7]:
# Run the project's benchmark helper over every row of df_models using the
# train/test split from above. Judging by the result columns shown further
# down (cv_R2, test_R2, best_params, best_estimator) it performs a
# cross-validated parameter search per model and scores the winner on the
# test set — TODO confirm in model_functions.cv_test_benchmark.
df_results = cv_test_benchmark(X_train, y_train, X_test, y_test, df_models)

In [8]:

# Rank the benchmarked models from highest to lowest test_R2 and display
# the resulting table.
df_results = df_results.sort_values("test_R2", ascending=False)
df_results

Unnamed: 0,model,cv_R2,test_R2,delta_cv_test,test_MAE,test_MSE,test_RMSE,train_time_sec,best_params,best_estimator
4,svr,0.866,0.93,0.064,3.118,42.724,6.536,0.13,"{'model__C': 10, 'model__kernel': 'rbf'}","(StandardScaler(), SVR(C=10))"
5,mlp,0.892,0.919,0.027,4.543,49.25,7.018,116.95,"{'model__activation': 'relu', 'model__hidden_l...","(StandardScaler(), MLPRegressor(hidden_layer_s..."
6,xgb,0.87,0.91,0.04,4.966,54.853,7.406,0.9,"{'model__learning_rate': 0.05, 'model__max_dep...","(StandardScaler(), XGBRegressor(base_score=Non..."
7,rf,0.863,0.904,0.041,5.447,58.02,7.617,1.96,"{'model__max_depth': 20, 'model__n_estimators'...","((DecisionTreeRegressor(max_depth=20, max_feat..."
3,knn,0.81,0.859,0.05,6.839,85.43,9.243,0.07,{'model__n_neighbors': 7},"(StandardScaler(), KNeighborsRegressor(n_neigh..."
2,lasso,0.835,0.854,0.019,6.866,88.773,9.422,0.03,{'model__alpha': 0.01},"(StandardScaler(), Lasso(alpha=0.01))"
0,linear,0.835,0.854,0.019,6.863,88.756,9.421,2.39,{'model__fit_intercept': True},"(StandardScaler(), LinearRegression())"
1,ridge,0.835,0.854,0.019,6.865,88.746,9.421,1.82,{'model__alpha': 1.0},"(StandardScaler(), Ridge())"


###  result 

Opted for XGBoost even though SVR and MLP scored higher on the test set: SVR was set aside because of its complexity, and MLP because it is slow and resource-demanding. SVR might be revisited depending on the time left until the deadline.

In [10]:
# Pick the XGBoost entry out of the benchmark table and keep the estimator
# stored in its best_estimator column.
is_xgb = df_results["model"] == "xgb"
xgb_row = df_results[is_xgb].iloc[0]
xgb_model = xgb_row["best_estimator"]

### Training the model on the full training set before export

In [None]:
# Fit the selected estimator on the complete training split; the fitted
# estimator returned by fit() is shown as the cell output.
# NOTE(review): if cv_test_benchmark already refits best_estimator on all of
# X_train, this call only repeats that fit — confirm.
xgb_model.fit(X_train, y_train)  # TODO: consider moving this into a helper function

0,1,2
,steps,"[('scaler', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


### Exporting the model with joblib to the models folder

In [12]:
# Export the fitted model with joblib.
# The target directory is created first so the dump does not fail on a fresh
# checkout where models/ does not exist yet.
# (Path is imported in the import cell at the top of the notebook.)
MODEL_PATH = Path("models/xgb_model.joblib")
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

# joblib.dump returns the list of file names it wrote, shown as cell output.
joblib.dump(xgb_model, MODEL_PATH)

['models/xgb_model.joblib']