In [None]:
from pprint import pprint

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Labelled trips used for training and evaluation.
TRAIN_CSV_PATH = "../src/taxipred/data/taxi_trip_pricing_train.csv"

df = pd.read_csv(TRAIN_CSV_PATH)
df.head()




Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032
2,30.33,Evening,Weekday,4.0,Low,Unknown,3.48,0.51,0.15,116.81,36.4698
3,8.64,Afternoon,Weekend,2.0,Medium,Clear,2.55,1.71,0.48,89.33,60.2028
4,41.79,Night,Weekend,3.0,High,Clear,4.6,1.77,0.11,86.95,88.1328


In [None]:
# Columns left out of the feature matrix: the target itself plus the trip
# attributes this model does not use.
EXCLUDED_COLUMNS = [
    "Trip_Price",
    "Traffic_Conditions",
    "Weather",
    "Base_Fare",
    "Per_Km_Rate",
    "Per_Minute_Rate",
    "Trip_Duration_Minutes",
]

y = df["Trip_Price"]

# One-hot encode the remaining categorical columns; drop_first removes one
# level per category so the dummy columns are not perfectly collinear.
X = pd.get_dummies(df.drop(columns=EXCLUDED_COLUMNS), drop_first=True)

# Verify (instead of eyeballing a printout) that every feature is numeric or boolean.
non_numeric = X.select_dtypes(exclude=["number", "bool"]).columns.tolist()
assert not non_numeric, f"Non-numeric feature columns remain: {non_numeric}"

X.head(5)

   Trip_Distance_km  Passenger_Count  Time_of_Day_Evening  \
0             19.35              3.0                False   
1             36.87              1.0                 True   
2             30.33              4.0                 True   
3              8.64              2.0                False   
4             41.79              3.0                False   

   Time_of_Day_Morning  Time_of_Day_Night  Time_of_Day_Unknown  \
0                 True              False                False   
1                False              False                False   
2                False              False                False   
3                False              False                False   
4                False               True                False   

   Day_of_Week_Weekday  Day_of_Week_Weekend  
0                 True                False  
1                False                 True  
2                 True                False  
3                False                 True  
4     

Unnamed: 0,Trip_Distance_km,Passenger_Count,Time_of_Day_Evening,Time_of_Day_Morning,Time_of_Day_Night,Time_of_Day_Unknown,Day_of_Week_Weekday,Day_of_Week_Weekend
0,19.35,3.0,False,True,False,False,True,False
1,36.87,1.0,True,False,False,False,False,True
2,30.33,4.0,True,False,False,False,True,False
3,8.64,2.0,False,False,False,False,False,True
4,41.79,3.0,False,False,True,False,False,True


In [41]:
# Peek at the target series (head() shows five rows by default).
y.head()

0    36.2624
1    52.9032
2    36.4698
3    60.2028
4    88.1328
Name: Trip_Price, dtype: float64

In [42]:
# Hold out 33 % of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [43]:
# Learn each feature's min/max from the training split only, then apply that
# same mapping to both splits so no test-set statistics reach the scaler.
scaler = MinMaxScaler().fit(X_train)

scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

# Report the value range of each scaled split.
print(f"{scaled_X_train.min():.2f} ≤ scaled_X_train ≤ {scaled_X_train.max():.2f}")
print(f"{scaled_X_test.min():.2f} ≤ scaled_X_test ≤ {scaled_X_test.max():.2f}")

0.00 ≤ scaled_X_train ≤ 1.00
-0.00 ≤ scaled_X_test ≤ 1.00


In [44]:
# Baseline: ordinary least squares on the scaled training features.
model = LinearRegression().fit(scaled_X_train, y_train)

# One coefficient per feature column, followed by the intercept.
print(f"Parameters: {model.coef_}")
print(f"Intercept parameter: {model.intercept_}")

Parameters: [56.8735735   5.68978889 -2.14876236  0.51192925 -0.20049127 -7.0724598
  0.34696203  0.59186057]
Intercept parameter: 21.935466746161353


In [45]:
# Spot-check one prediction: the first test row, kept 2-D (1 x n_features)
# because predict() expects a matrix, alongside its true price.
test_sample_features = scaled_X_test[:1]
test_sample_target = y_test.iloc[0]

print(f"Scaled features {test_sample_features}, label {test_sample_target}")
print(f"Prediction: {model.predict(test_sample_features)[0]:.2f}")

Scaled features [[0.51090984 1.         0.         0.         0.         1.
  1.         0.        ]], label 54.62179999999999
Prediction: 49.96


In [None]:
# Evaluate the linear baseline on the held-out split.
y_pred = model.predict(scaled_X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5  # root of the MSE
r2 = r2_score(y_test, y_pred)

print(
    f"MAE: {mae:.2f}",
    f"RMSE: {rmse:.2f}",
    f"R²: {r2:.2f}",
    sep="\n",
)

MAE: 14.09
RMSE: 17.11
R²: 0.45


## Trying random forest to see if it will be a better model

In [52]:
# Random forest trained on the same min-max-scaled split as the linear
# baseline, so the two sets of metrics can be compared directly.
forest_model = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,         # no depth cap: growth is limited only by the two min_samples_* settings
    min_samples_split=4,    # a node needs at least 4 samples before it may be split
    min_samples_leaf=5,     # every leaf must keep at least 5 samples
    max_features=1.0,       # consider all features at every split
    random_state=42,        # reproducible bootstrap samples
    n_jobs=-1,              # use all available CPU cores
)

forest_model.fit(scaled_X_train, y_train)

y_pred_rf = forest_model.predict(scaled_X_test)

# Same three metrics as for the linear model.
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = mean_squared_error(y_test, y_pred_rf) ** 0.5
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest Results:")
print(f"MAE: {mae_rf:.2f}")
print(f"RMSE: {rmse_rf:.2f}")
print(f"R²: {r2_rf:.2f}")

Random Forest Results:
MAE: 13.80
RMSE: 17.06
R²: 0.45


In [70]:
# Refit the forest on every labelled row, using the unscaled feature frame.
# NOTE(review): forest_model is fitted again on the full data in a later cell,
# so in a top-to-bottom run this fit is superseded; confirm this cell is still needed.
forest_model.fit(X=X, y=y)

0,1,2
,n_estimators,500
,criterion,'squared_error'
,max_depth,
,min_samples_split,4
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


## Trying a model without using scaled data

In [69]:
# Re-create the split with the same seed and test size as the scaled
# experiments, so the evaluation rows are identical.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# NOTE: besides skipping the scaler, this forest uses different
# hyper-parameters than forest_model (min_samples_split, min_samples_leaf,
# max_features), so the metric gap is not caused by scaling alone.
# The variable keeps its original spelling because other cells may reference it.
unsclaed_forest_model = RandomForestRegressor(
    n_estimators=500,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=3,
    max_features="sqrt",
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

y_pred_unscaled = unsclaed_forest_model.predict(X_test)

mae_unscaled = mean_absolute_error(y_test, y_pred_unscaled)
rmse_unscaled = mean_squared_error(y_test, y_pred_unscaled) ** 0.5  # root of the MSE
r2_unscaled = r2_score(y_test, y_pred_unscaled)

print(
    "Random Forest Results (Unscaled Data):",
    f"MAE: {mae_unscaled:.2f}",
    f"RMSE: {rmse_unscaled:.2f}",
    f"R²: {r2_unscaled:.2f}",
    sep="\n",
)



Random Forest Results (Unscaled Data):
MAE: 14.37
RMSE: 17.59
R²: 0.42


## Training my chosen model on all data


In [48]:
# Final training pass on every labelled row.
#
# The forest is fitted on the raw feature frame X, not on the min-max-scaled
# array: the prediction cell feeds forest_model an unscaled frame and the
# scaler is never saved next to the model, so a forest trained on scaled
# values would receive inputs on the wrong scale. Tree splits depend only on
# the ordering of each feature's values, so skipping the scaling does not
# weaken the model.
scaler = MinMaxScaler()
scaler.fit(X)

# Kept so any cell that still references scaled_X keeps working; the forest
# itself no longer uses it.
scaled_X = scaler.transform(X)

forest_model.fit(X, y)


0,1,2
,n_estimators,500
,criterion,'squared_error'
,max_depth,
,min_samples_split,4
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
# Persist the fitted forest (joblib is imported in the notebook's import cell).
# NOTE(review): a later cell dumps the same estimator to
# ../src/taxipred/models/taxi_price_predictor.joblib; confirm both artifacts are needed.
joblib.dump(forest_model, "models/taxi_price_model.joblib")

In [49]:
#import joblib

#joblib.dump(forest_model, "taxi_price_model.pkl")

#joblib.dump(scaler, "scaler.pkl")

## Testing my model on the rows where our target is null

In [73]:
# Rows whose target is missing; the model will fill in their price.
predict_df = pd.read_csv("../src/taxipred/data/taxi_trip_pricing_predict.csv")

# Remove the same columns that were removed from the training frame.
columns_to_remove = [
    "Trip_Price",
    "Traffic_Conditions",
    "Weather",
    "Base_Fare",
    "Per_Km_Rate",
    "Per_Minute_Rate",
    "Trip_Duration_Minutes",
]
predict_df_cleaned = predict_df.drop(columns=columns_to_remove)

predict_df_cleaned.head()

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count
0,30.45,Morning,Weekday,3.0
1,4.19,Morning,Weekday,1.0
2,38.78,Morning,Weekday,3.0
3,16.58,Evening,Unknown,2.0
4,10.14,Afternoon,Weekday,4.0


In [76]:
# One-hot encode the prediction rows, then force exactly the training feature
# layout (same columns, same order as X).
#
# Encoding this frame independently with drop_first=True is fragile: the
# dropped baseline is whichever category sorts first in *this* file, so a
# category missing here would shift the baseline or leave a column out.
# Encoding every level and reindexing against X.columns keeps the training
# baselines, adds any absent dummy column as all-False, and discards levels
# the model never saw during training.
predict_X = (
    pd.get_dummies(predict_df_cleaned)
    .reindex(columns=X.columns, fill_value=False)
)
predict_X.head()

Unnamed: 0,Trip_Distance_km,Passenger_Count,Time_of_Day_Evening,Time_of_Day_Morning,Time_of_Day_Night,Time_of_Day_Unknown,Day_of_Week_Weekday,Day_of_Week_Weekend
0,30.45,3.0,False,True,False,False,True,False
1,4.19,1.0,False,True,False,False,True,False
2,38.78,3.0,False,True,False,False,True,False
3,16.58,2.0,True,False,False,False,False,False
4,10.14,4.0,False,False,False,False,True,False


Unnamed: 0,Trip_Distance_km,Passenger_Count,Time_of_Day_Evening,Time_of_Day_Morning,Time_of_Day_Night,Time_of_Day_Unknown,Day_of_Week_Weekday,Day_of_Week_Weekend
0,30.45,3.0,False,True,False,False,True,False
1,4.19,1.0,False,True,False,False,True,False
2,38.78,3.0,False,True,False,False,True,False
3,16.58,2.0,True,False,False,False,False,False
4,10.14,4.0,False,False,False,False,True,False


## Predicting trip prices and adding them as a column in my original dataframe

In [89]:
# Price every unlabelled row and store the result, rounded to whole units,
# as a new column on the original prediction frame.
# NOTE(review): predict_X is passed unscaled; verify this matches the
# representation forest_model was last fitted on.
predicted_prices = forest_model.predict(predict_X)
predict_df["Predicted_Trip_Price"] = predicted_prices.round()

predict_df

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price,Predicted_Trip_Price,Predicted_Trip_Price_in_SEK
0,30.45,Morning,Weekday,3.0,High,Clear,2.77,1.78,0.34,110.33,,61.0,61
1,4.19,Morning,Weekday,1.0,Low,Clear,4.07,1.89,0.19,69.06,,28.0,28
2,38.78,Morning,Weekday,3.0,Medium,Clear,3.08,1.62,0.15,90.14,,78.0,78
3,16.58,Evening,Unknown,2.0,Low,Clear,4.46,1.23,0.16,62.79,,39.0,39
4,10.14,Afternoon,Weekday,4.0,Low,Rain,4.5,1.62,0.38,25.41,,28.0,28
5,46.9,Evening,Weekday,1.0,High,Clear,4.14,1.33,0.13,66.38,,83.0,83
6,33.51,Afternoon,Weekend,3.0,Low,Clear,2.31,1.32,0.4,62.87,,55.0,55
7,32.95,Morning,Weekday,1.0,Medium,Clear,3.36,1.56,0.19,81.37,,63.0,63
8,35.9,Unknown,Weekend,2.0,High,Clear,4.03,1.79,0.13,107.25,,62.0,62
9,44.71,Night,Weekday,1.0,Low,Rain,2.5,1.94,0.47,45.94,,63.0,63


## Exporting my model to my models folder

In [91]:
# Export the fitted forest into the package's models folder
# (joblib is imported in the notebook's import cell).
joblib.dump(forest_model, "../src/taxipred/models/taxi_price_predictor.joblib")

['../src/taxipred/models/taxi_price_predictor.joblib']