###### Uppgift 2 - ML modell
###### Gå igenom data science workflow, med att testa olika modeller och evaluera.
######  Välj därefter en modell och träna på all data. Dvs.  Ta fram en ML modell för att prediktera taxipriser
###### Exportera därefter datan med hjälp av joblib


In [None]:
import pandas as pd


In [None]:
# Load the cleaned taxi training data from the working directory and
# preview the first rows.
df = pd.read_csv("taxi_cleaned_training_data.csv")
df.head()

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes).
df.info()

# Linear Regression model with Scikit learn steps


0. Divide into X & y
- X = the features the model learns from
- y = the target the model predicts

In [None]:
# Target column: the trip price the models should predict.
y = df["Trip_Price"]

# Feature matrix: every remaining column.
X = df.drop(columns="Trip_Price")
X.head(5)

In [None]:
# Preview the first five target values.
y.head(5)

1. Train | test split


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33 % of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

### Dummy Encoding
- Categorical feature: `Time_of_Day` is a string column, so it is represented as binary (0/1) dummy columns

In [None]:
# Encode the training split first: its columns define the feature layout
# that every later step (scaling, fitting, predicting) relies on.
# dtype=int makes only the generated dummy columns 0/1 integers. The previous
# frame-wide .astype(int) also cast the existing numeric features and
# truncated any float-valued column to whole numbers.
X_train = pd.get_dummies(X_train, drop_first=True, dtype=int)

# Encode the test split WITHOUT drop_first and project it onto the training
# columns. Dropping the first category independently on each split can drop
# a different baseline when a category is missing from the test rows.
# reindex removes the baseline/unseen dummies and adds any training dummy
# that is absent from the test split as an all-zero column.
X_test = pd.get_dummies(X_test, dtype=int).reindex(
    columns=X_train.columns, fill_value=0
)

2. Scale dataset

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training split only; fitting on the
# test split as well would leak information about the evaluation data.
scaler = MinMaxScaler().fit(X_train)

# Apply the training-derived scaling to both splits.
scaled_X_train, scaled_X_test = (
    scaler.transform(split) for split in (X_train, X_test)
)

scaled_X_train.shape, scaled_X_test.shape

In [None]:
# Smallest and largest value in the scaled training matrix.
scaled_X_train.min(), scaled_X_train.max()

In [None]:
# X_test was scaled with the min and max parameters learned from X_train.
# If you get exactly 0 and 1 here, the scaler was probably fit on X_test,
# which would leak data.
scaled_X_test.min(), scaled_X_test.max()

# Check algorithms

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the min-max scaled training features; fit()
# returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(scaled_X_train, y_train)

# Show the learned weights (one per feature column) and the bias term.
print(f"Parameters: {model.coef_}")
print(f"Intercept parameter: {model.intercept_}")

3. Predict on test data

In [None]:
# Predict on the scaled hold-out features with the model fitted above.
y_pred = model.predict(scaled_X_test)
y_pred

4. Evaluation

In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Squared-error metrics first; RMSE is the square root of MSE and is
# therefore expressed in the same unit as the target.
mse_lin_reg = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse_lin_reg = np.sqrt(mse_lin_reg)

# Mean absolute error of the same predictions.
mae_lin_reg = mean_absolute_error(y_true=y_test, y_pred=y_pred)

(mae_lin_reg, mse_lin_reg, rmse_lin_reg)

# Random forest model with Scikit learn steps


### 1. train|test split 

- Done above

### 2. SKIP the scaling

- Tree-based algorithms generally do not need feature scaling
- Random forest is built from decision trees, and decision trees split each feature on a threshold
- It does no harm if the data has already been scaled

### 3. Training 

In [None]:
from sklearn.ensemble import RandomForestRegressor

# A random forest is randomised (bootstrap samples and feature sub-sampling),
# so without a fixed seed its metrics change on every run and the comparison
# with the other models is not reproducible. The seed matches the one used
# for the train/test split.
# Trees split on thresholds, so the unscaled features are used here.
clf = RandomForestRegressor(random_state=42)
clf.fit(X_train, y_train)

### 4. Predict on test data

In [None]:
# Predict on the unscaled hold-out features; this rebinds y_pred, replacing
# the predictions of the previous model.
y_pred = clf.predict(X_test)
y_pred

In [None]:
# True hold-out targets, for eyeballing against the predictions above.
y_test.values

### 5. Evaluate

In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# y_pred holds the random forest predictions at this point.
mse_rand_forest = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse_rand_forest = np.sqrt(mse_rand_forest)
mae_rand_forest = mean_absolute_error(y_true=y_test, y_pred=y_pred)

(mae_rand_forest, mse_rand_forest, rmse_rand_forest)

# KNN model with Scikit learn steps

### 1. train|test split

- Done


### 2. scale dataset

- Scaling is very important for KNN, because it compares samples by distance

### 3. KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Keep it simple: five neighbours. KNN is trained on the min-max scaled
# features.
# NOTE: this rebinds `model`, replacing the estimator stored there before.
model = KNeighborsRegressor(n_neighbors=5)
model.fit(X=scaled_X_train, y=y_train)

### 4. Predict on test data 

In [None]:
# Predict on the scaled hold-out features; this rebinds y_pred, replacing
# the predictions of the previous model.
y_pred = model.predict(scaled_X_test)
y_pred

In [None]:
# True hold-out targets, for eyeballing against the predictions above.
y_test.values

### 5. Evaluation

In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# y_pred holds the KNN predictions at this point.
mse_knn = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse_knn = np.sqrt(mse_knn)
mae_knn = mean_absolute_error(y_true=y_test, y_pred=y_pred)

(mae_knn, mse_knn, rmse_knn)

## Theory conclusion: which model to choose - compare RMSE


- RMSE has the same unit as y
- Trip duration in minutes -> RMSE in minutes
- Passenger count -> RMSE in passengers
- RMSE = primary metric - penalises big errors
- MAE = secondary metric - typical errors
- MSE = used for mathematical optimization

In [None]:
# Side-by-side MAE / RMSE for the three candidates.
# NOTE: this rebinds `model` from the last fitted estimator to a plain dict.
model = {
    "Linear Regression": {"MAE": mae_lin_reg, "RMSE": rmse_lin_reg},
    "Random Forest": {"MAE": mae_rand_forest, "RMSE": rmse_rand_forest},
    "KNN": {"MAE": mae_knn, "RMSE": rmse_knn},
}

model





### Conclusion: According to the metrics, Linear Regression is the best choice because its RMSE is the lowest

In [None]:
import matplotlib.pyplot as plt

# Residuals of the chosen model (Linear Regression). `y_pred` was last
# assigned in the KNN section, so using it here would plot the KNN residuals.
# The linear model is deterministic, so refitting it on the same scaled
# training data reproduces the predictions that were evaluated earlier.
lin_reg_pred = (
    LinearRegression().fit(scaled_X_train, y_train).predict(scaled_X_test)
)
residuals = y_test - lin_reg_pred

# Residual vs. true price; the red line marks a perfect prediction.
plt.scatter(y_test, residuals)
plt.axhline(0, color="red")
plt.xlabel("True Price")
plt.ylabel("Residual")
plt.show()  # was `plt.show` without parentheses, which never calls it

# Training the chosen model (Linear Regression) for production

In [None]:
# Production model: train on ALL rows, with preprocessing that matches what
# is exported. Previously the model was fit on the unscaled training split
# only, while the exported scaler and column list came from different frames.

# Encode the complete feature set; these columns are the exported feature
# list. dtype=int gives 0/1 dummies and leaves numeric features untouched.
X_features = pd.get_dummies(X, drop_first=True, dtype=int)

# Refit the scaler on all rows. Evaluation is finished at this point, so
# rebinding `scaler` no longer risks leaking test data into a score.
scaler = MinMaxScaler().fit(X_features)

# Fit on the scaled features so that scaler.transform(...) followed by
# choosen_model.predict(...) is the correct inference path.
choosen_model = LinearRegression()
choosen_model.fit(scaler.transform(X_features), y)

### Joblib = save and load trained models efficiently

- used by the scikit-learn project
- persists models such as LinearRegression, RandomForest and KNN
- created 2 different joblib files

In [None]:
import joblib

# Persist the estimator, the scaler and the encoded column names together
# in one xz-compressed file.
artifact = {
    "model": choosen_model,
    "scaler": scaler,
    "features": X_features.columns.tolist(),
}
joblib.dump(artifact, "LM_model.joblib", compress=("xz", 3), protocol=5)

