# Training, testing and evaluating models for data


In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, make_scorer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import pandas as pd
import numpy as np
from taxipred.utils.constants import get_clean_data
from Cleaning_functions import split_features_target
from model_functions import hyper_optimize, train_evaluate, tune_model

df = get_clean_data()
df.head()

Unnamed: 0,Trip_Distance_km,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Time_of_Day_Afternoon,Day_of_Week_Weekday,Traffic_Conditions_High,Weather_Rain,Weather_Snow,Trip_Price
0,19.35,3.56,0.8,0.32,53.82,False,True,False,False,False,36.2624
1,47.59,3.502989,0.62,0.43,40.57,True,True,True,False,False,53.6163
2,36.87,2.7,1.21,0.15,37.27,False,False,True,False,False,52.9032
3,30.33,3.48,0.51,0.15,116.81,False,True,False,False,False,36.4698
4,27.070547,2.93,0.63,0.32,22.64,False,True,True,False,False,15.618


In [19]:
df_numeric, df_categorical, df_target = split_features_target(df)

### Splitting the data into target and features

In [20]:
# merging features into one dataframe
df_features = pd.concat([df_numeric, df_categorical], axis=1)

X, y = df_features, df_target
X.head()

Unnamed: 0,Trip_Distance_km,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes
0,19.35,3.56,0.8,0.32,53.82
1,47.59,3.502989,0.62,0.43,40.57
2,36.87,2.7,1.21,0.15,37.27
3,30.33,3.48,0.51,0.15,116.81
4,27.070547,2.93,0.63,0.32,22.64


### train|test split

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()


X_train.shape, X_test.shape

((800, 5), (200, 5))

### method for prediction models 

https://www.geeksforgeeks.org/machine-learning/hyperparameter-tuning-in-linear-regression/

In [None]:

# creating a dataframe for different estimators to evaluate
df_models = pd.DataFrame([
    {"name": "linear", "model": LinearRegression(), "scale": True},
    {"name": "ridge", "model": Ridge(), "scale": True},
    {"name": "lasso", "model": Lasso(), "scale": True},
    {"name": "knn", "model": KNeighborsRegressor(), "scale": True},
    {"name": "svr", "model": SVR(), "scale": True},
    {"name": "mlp", "model": MLPRegressor(max_iter=5000, random_state=42), "scale": True},
    {"name": "xgb", "model": XGBRegressor(random_state=42), "scale": True},
    {"name": "rf", "model": RandomForestRegressor(random_state=42), "scale": False},
])

df_models.head()

Unnamed: 0,name,model,scale
0,linear,LinearRegression(),True
1,ridge,Ridge(),True
2,lasso,Lasso(),True
3,knn,KNeighborsRegressor(),True
4,svr,SVR(),True


In [23]:
df_results, best_model, best_name= hyper_optimize(X_train, y_train, df_models)

In [24]:

df_results = df_results.sort_values(by="best r2", ascending=False)
df_results    

Unnamed: 0,model,best r2,best params
5,mlp,0.891001,"{'model__activation': 'relu', 'model__hidden_l..."
6,xgb,0.869875,"{'model__learning_rate': 0.05, 'model__max_dep..."
4,svr,0.866089,"{'model__C': 10, 'model__kernel': 'rbf'}"
7,rf,0.86328,"{'model__max_depth': 20, 'model__n_estimators'..."
1,ridge,0.834864,{'model__alpha': 1.0}
2,lasso,0.834859,{'model__alpha': 0.01}
0,linear,0.834858,{'model__fit_intercept': True}
3,knn,0.809608,{'model__n_neighbors': 7}


In [None]:
# train/eval
#model, metrics = train_evaluate(X_train, y_train, X_test, y_test, best_model)

#print(metrics)


### exporting data using joblib

In [27]:
# model, metrics = train_evaluate(X_train, y_train, X_test, y_test, model_type="random_forest", model_path="random_forest.joblib")
# loaded_model = joblib.load("random_forest.joblib")
# loaded_model