In [41]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder , OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from all_func import *
import pickle 
import seaborn as sns
import matplotlib.pyplot as plt

In [42]:
df = pd.read_csv("taxi_trip_pricing.csv")

In [43]:
with open("preprocessor.pkl",'rb') as file:
    preprocessor = pickle.load(file)

In [44]:
preprocessor

### MODEL SELECTION

In [45]:
x = df.drop(columns=['Trip_Price'])

In [46]:
x

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.80,0.32,53.82
1,47.59,Afternoon,Weekday,1.0,High,Clear,,0.62,0.43,40.57
2,36.87,Evening,Weekend,1.0,High,Clear,2.70,1.21,0.15,37.27
3,30.33,Evening,Weekday,4.0,Low,,3.48,0.51,0.15,116.81
4,,Evening,Weekday,3.0,High,Clear,2.93,0.63,0.32,22.64
...,...,...,...,...,...,...,...,...,...,...
995,5.49,Afternoon,Weekend,4.0,Medium,Clear,2.39,0.62,0.49,58.39
996,45.95,Night,Weekday,4.0,Medium,Clear,3.12,0.61,,61.96
997,7.70,Morning,Weekday,3.0,Low,Rain,2.08,1.78,,54.18
998,47.56,Morning,Weekday,1.0,Low,Clear,2.67,0.82,0.17,114.94


In [47]:
y = df['Trip_Price']
y

0      36.2624
1          NaN
2      52.9032
3      36.4698
4      15.6180
        ...   
995    34.4049
996    62.1295
997    33.1236
998    61.2090
999    45.4437
Name: Trip_Price, Length: 1000, dtype: float64

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
x_train, x_test , y_train , y_test = train_test_split(x, y, test_size=0.2,random_state=30)

In [50]:
x_train

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes
795,43.73,Morning,Weekday,4.0,Low,Clear,4.64,1.97,0.44,108.76
341,24.20,Afternoon,Weekday,1.0,Low,Snow,2.43,0.75,0.28,103.17
217,37.58,Afternoon,Weekday,4.0,Low,Snow,2.91,1.16,0.47,84.52
856,2.79,Evening,Weekday,1.0,Low,,3.62,1.98,0.21,101.08
604,6.91,Morning,Weekday,3.0,Low,Clear,2.38,0.89,0.42,20.57
...,...,...,...,...,...,...,...,...,...,...
500,35.21,Afternoon,Weekend,4.0,Medium,Clear,3.94,0.56,0.39,110.06
813,,Morning,Weekday,3.0,High,Snow,2.81,1.04,0.34,
941,28.15,Afternoon,Weekday,2.0,High,Clear,2.06,0.61,0.18,17.02
421,39.69,Afternoon,Weekday,3.0,Medium,Rain,4.90,0.57,0.38,


In [51]:
x_test

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes
923,40.19,Afternoon,Weekday,1.0,Medium,Clear,2.79,1.12,0.10,51.19
921,24.88,Morning,Weekend,2.0,High,Clear,3.40,1.54,0.20,68.14
516,19.45,Afternoon,Weekday,1.0,High,Clear,2.40,0.86,0.29,16.93
87,,,Weekday,1.0,Medium,Clear,3.41,0.57,0.39,92.72
879,11.88,Afternoon,Weekday,2.0,Medium,Clear,4.67,0.63,0.27,39.27
...,...,...,...,...,...,...,...,...,...,...
711,30.19,Morning,Weekday,2.0,Medium,,2.37,0.76,0.40,113.33
517,40.72,Evening,Weekday,2.0,High,Clear,3.64,0.62,0.35,98.49
984,36.68,Morning,Weekday,4.0,Low,Clear,2.58,1.11,0.39,108.41
886,32.98,Morning,Weekday,3.0,High,Clear,3.67,0.64,0.49,109.94


In [52]:
y_train

795    138.6425
341     49.4676
217     86.2272
856     30.3710
604     17.1693
         ...   
500     66.5810
813     66.8568
941     22.2951
421     31.0915
805     76.4647
Name: Trip_Price, Length: 800, dtype: float64

In [53]:
y_test

923    52.9218
921    55.3432
516    24.0367
87     57.9476
879    22.7573
        ...   
711    70.6464
517    63.3579
984    85.5747
886    78.6478
961    39.4771
Name: Trip_Price, Length: 200, dtype: float64

In [54]:
x_train.isnull().sum()

Trip_Distance_km         39
Time_of_Day              43
Day_of_Week              43
Passenger_Count          44
Traffic_Conditions       42
Weather                  42
Base_Fare                36
Per_Km_Rate              40
Per_Minute_Rate          45
Trip_Duration_Minutes    41
dtype: int64

In [55]:
x_test.isnull().sum()

Trip_Distance_km         11
Time_of_Day               7
Day_of_Week               7
Passenger_Count           6
Traffic_Conditions        8
Weather                   8
Base_Fare                14
Per_Km_Rate              10
Per_Minute_Rate           5
Trip_Duration_Minutes     9
dtype: int64

In [56]:
y_train.isnull().sum()

np.int64(40)

In [57]:
y_test.isnull().sum()

np.int64(9)

### USING SIMPLE IMPUTER SEPERATELY TO HANDLE NULL VALUES

In [58]:
num_null=["Trip_Distance_km", "Passenger_Count", "Base_Fare", "Per_Km_Rate", "Per_Minute_Rate", "Trip_Duration_Minutes"]

cate_null = ["Time_of_Day","Day_of_Week","Traffic_Conditions","Weather"]

In [59]:
num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="most_frequent")

### USED SIMPLE IMPUTER FOR X_TRAIN 

In [60]:
x_train[["Time_of_Day","Day_of_Week","Traffic_Conditions","Weather"]] = cat_imputer.fit_transform(x_train[["Time_of_Day","Day_of_Week","Traffic_Conditions","Weather"]])
x_train[["Trip_Distance_km", "Passenger_Count", "Base_Fare", "Per_Km_Rate", "Per_Minute_Rate", "Trip_Duration_Minutes"]] = num_imputer.fit_transform(x_train[["Trip_Distance_km", "Passenger_Count", "Base_Fare", "Per_Km_Rate", "Per_Minute_Rate", "Trip_Duration_Minutes"]])


In [61]:
x_train.isnull().sum()

Trip_Distance_km         0
Time_of_Day              0
Day_of_Week              0
Passenger_Count          0
Traffic_Conditions       0
Weather                  0
Base_Fare                0
Per_Km_Rate              0
Per_Minute_Rate          0
Trip_Duration_Minutes    0
dtype: int64

In [62]:
x_test[["Time_of_Day","Day_of_Week","Traffic_Conditions","Weather"]] = cat_imputer.transform(x_test[["Time_of_Day","Day_of_Week","Traffic_Conditions","Weather"]])
x_test[["Trip_Distance_km", "Passenger_Count", "Base_Fare", "Per_Km_Rate", "Per_Minute_Rate", "Trip_Duration_Minutes"]] = num_imputer.transform(x_test[["Trip_Distance_km", "Passenger_Count", "Base_Fare", "Per_Km_Rate", "Per_Minute_Rate", "Trip_Duration_Minutes"]])

In [63]:
x_test.isnull().sum()

Trip_Distance_km         0
Time_of_Day              0
Day_of_Week              0
Passenger_Count          0
Traffic_Conditions       0
Weather                  0
Base_Fare                0
Per_Km_Rate              0
Per_Minute_Rate          0
Trip_Duration_Minutes    0
dtype: int64

In [64]:
y_train_array = np.array(y_train).reshape(-1, 1)
y_train_imputer = SimpleImputer(strategy="mean")
y_train = y_train_imputer.fit_transform(y_train_array).ravel()



In [65]:
print(type(y_train))

<class 'numpy.ndarray'>


In [66]:
y_train_series = pd.Series(y_train, name="Trip_Price")
print(type(y_train_series))

<class 'pandas.core.series.Series'>


In [67]:
y_test = np.array(y_test).reshape(-1,1)
y_test_imputer = SimpleImputer(strategy="mean")
y_test = y_test_imputer.fit_transform(y_test).ravel()


In [68]:
y_test_series = pd.Series(y_test, name="Trip_Price")


### PREPROCESSING THE X TRIAN / Y TRAIN 

In [69]:
x_train_processed = preprocessor.fit_transform(x_train)

In [70]:
x_test_processed = preprocessor.transform(x_test)

### MODEL CREATION

In [71]:
from sklearn.linear_model import LinearRegression

In [72]:
Lr = LinearRegression()

In [73]:
Lr.fit(x_train_processed, y_train_series)

In [74]:
y_predict=Lr.predict(x_test_processed)

In [75]:
y_test

array([ 52.9218    ,  55.3432    ,  24.0367    ,  57.9476    ,
        22.7573    ,  49.36      ,  99.8038    ,  26.9202    ,
        51.031     ,  54.24268422,  45.8975    ,  45.9582    ,
        14.892     ,  10.2366    , 102.4302    ,  35.0854    ,
       101.9914    , 123.928     ,  84.873     ,  38.035     ,
        62.1295    ,  73.5818    ,  14.6127    ,  38.3875    ,
        54.6348    ,  49.852     ,  24.6973    ,  41.8347    ,
        34.4316    ,  61.871     ,  41.544     ,  29.4228    ,
        53.16      ,  25.7527    ,   9.8718    ,  33.5331    ,
        81.9816    ,  36.4698    ,  54.24268422,  60.7394    ,
        25.5304    ,  61.9923    ,  25.1968    ,  94.2463    ,
        54.6715    ,  66.2817    , 116.4206    ,  59.7613    ,
        22.3206    ,  58.6974    ,   9.9494    ,  36.4212    ,
        65.084     ,  69.9148    ,  43.9909    ,  89.0764    ,
        44.5307    ,  57.5336    ,  24.4491    ,  36.2624    ,
        91.8308    ,  26.2988    ,  49.5082    ,  31.41

In [76]:
y_predict

array([ 63.5357928 ,  59.5947194 ,  20.4853086 ,  54.64220102,
         8.44297152,  50.85295955, 109.1001739 ,  18.99791422,
        78.34974695,  63.65681449,  62.33196711,  56.28019172,
        38.65990611,  -2.47006403, 101.18816635,  38.74183676,
       107.87263856, 121.42867023,  73.9815097 ,  47.29389531,
        73.26124214,  77.19505545,   0.22286449,  34.87116257,
        72.184741  ,  45.90627976,  18.21424557,  56.79241946,
        58.06113553,  69.53824699,  58.49130657,  26.57188287,
        53.73529026,  17.65922408,  10.23153807,  35.85249364,
        85.83185255,  53.05141959,  76.31476369,  55.7157844 ,
        13.75449424,  68.37293622,  18.00843357,  92.88428723,
        84.47786707,  71.6024927 , 109.58310072,  75.96401117,
        12.34436607,  65.64337105,  -3.89004995,  34.72461055,
        80.07129522,  91.79314745,  42.88826521,  76.28229185,
        50.87013392,  66.41464167,  25.34838466,  32.88419174,
        75.6771162 ,  13.16019146,  45.89366617,  28.91

In [77]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error,root_mean_squared_error

In [78]:
mean_squared_error(y_test,y_predict)

168.9096540937914

In [79]:
r2_score(y_test,y_predict)

0.8052335492378389