## Feature engineering
- Dummy/one-hot encoding doubled the amount of data and almost doubled the number of columns - not good.
- Error margins were significant when combining dummy variables with linear regression.
- Instead, do some feature engineering on the categorical data.

## Domain awareness
- Create new, smarter features from the raw categorical ones.
- Target is Trip_Price; candidate features:


1. Time - rush hour or not (IsRushHour 1/0)? Weekend vs weekday?
2. Time - time of day (4 values), weighted differently
3. Weather - binary/numeric (IsRain = 1 when it is raining, otherwise 0)
4. Traffic - binary IsTraffic. EDA indicates that only high traffic has a real bearing on trip price

In [60]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [61]:
# Read the raw taxi trip pricing dataset and preview its first five rows
df = pd.read_csv(filepath_or_buffer="../src/taxipred/data/taxi_trip_pricing.csv")
df.head(n=5)

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,47.59,Afternoon,Weekday,1.0,High,Clear,,0.62,0.43,40.57,
2,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032
3,30.33,Evening,Weekday,4.0,Low,,3.48,0.51,0.15,116.81,36.4698
4,,Evening,Weekday,3.0,High,Clear,2.93,0.63,0.32,22.64,15.618


In [62]:
# Summarize column dtypes and non-null counts to see how much data is missing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       950 non-null    float64
 1   Time_of_Day            950 non-null    object 
 2   Day_of_Week            950 non-null    object 
 3   Passenger_Count        950 non-null    float64
 4   Traffic_Conditions     950 non-null    object 
 5   Weather                950 non-null    object 
 6   Base_Fare              950 non-null    float64
 7   Per_Km_Rate            950 non-null    float64
 8   Per_Minute_Rate        950 non-null    float64
 9   Trip_Duration_Minutes  950 non-null    float64
 10  Trip_Price             951 non-null    float64
dtypes: float64(7), object(4)
memory usage: 86.1+ KB


In [63]:
# Trip_Price is the target, so rows without it have to be dropped first.
# The info() summary shows 951 non-null values out of 1000 entries,
# i.e. 49 rows are missing Trip_Price - confirm that count here.
df["Trip_Price"].isna().sum()

np.int64(49)

In [64]:
# Remove every row that has no Trip_Price value; all other columns are kept as-is
df_trip_price = df.dropna(axis="index", subset=["Trip_Price"])
df_trip_price

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.80,0.32,53.82,36.2624
2,36.87,Evening,Weekend,1.0,High,Clear,2.70,1.21,0.15,37.27,52.9032
3,30.33,Evening,Weekday,4.0,Low,,3.48,0.51,0.15,116.81,36.4698
4,,Evening,Weekday,3.0,High,Clear,2.93,0.63,0.32,22.64,15.6180
5,8.64,Afternoon,Weekend,2.0,Medium,Clear,2.55,1.71,0.48,89.33,60.2028
...,...,...,...,...,...,...,...,...,...,...,...
995,5.49,Afternoon,Weekend,4.0,Medium,Clear,2.39,0.62,0.49,58.39,34.4049
996,45.95,Night,Weekday,4.0,Medium,Clear,3.12,0.61,,61.96,62.1295
997,7.70,Morning,Weekday,3.0,Low,Rain,2.08,1.78,,54.18,33.1236
998,47.56,Morning,Weekday,1.0,Low,Clear,2.67,0.82,0.17,114.94,61.2090


In [65]:
# Sanity check: number of rows still missing Trip_Price after the drop
df_trip_price["Trip_Price"].isna().sum()

np.int64(0)

In [66]:
# Re-inspect dtypes and non-null counts per column for the frame without missing targets
df_trip_price.info()

<class 'pandas.core.frame.DataFrame'>
Index: 951 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       901 non-null    float64
 1   Time_of_Day            902 non-null    object 
 2   Day_of_Week            905 non-null    object 
 3   Passenger_Count        903 non-null    float64
 4   Traffic_Conditions     901 non-null    object 
 5   Weather                905 non-null    object 
 6   Base_Fare              907 non-null    float64
 7   Per_Km_Rate            907 non-null    float64
 8   Per_Minute_Rate        902 non-null    float64
 9   Trip_Duration_Minutes  905 non-null    float64
 10  Trip_Price             951 non-null    float64
dtypes: float64(7), object(4)
memory usage: 89.2+ KB


In [67]:
# Decision: out of 11 columns, a row with 3 or more missing values gets dropped.
# Show the rows of the raw frame that reach that threshold.
df.loc[df.isna().sum(axis="columns") >= 3]

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
19,15.27,Morning,,,Low,Clear,3.93,0.73,0.12,,27.3543
107,38.02,Evening,,4.0,,Clear,,1.31,0.35,33.73,66.2817
125,21.93,Afternoon,,3.0,Low,,4.53,0.76,,74.52,48.024
137,,,,3.0,Low,Clear,4.52,1.38,0.35,57.56,73.587
173,12.1,Morning,Weekend,,Low,Clear,,0.59,0.31,,19.2643
177,19.95,Morning,Weekday,3.0,Low,,2.5,0.88,0.49,,
262,7.86,Afternoon,,3.0,Medium,Rain,4.82,0.75,0.26,,
344,,Evening,,,Low,Clear,4.62,1.86,0.43,7.19,45.8975
583,17.11,Afternoon,Weekday,,Medium,,2.77,1.23,,91.82,36.6701
631,44.87,Afternoon,,4.0,Low,Clear,4.27,,0.34,90.81,


In [68]:
# Repeat the missing-value check on the frame that already excludes rows without Trip_Price.
# Only 9 rows have 3 or more missing values, so these are the rows to drop.
df_trip_price.loc[df_trip_price.isna().sum(axis="columns") >= 3]

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
19,15.27,Morning,,,Low,Clear,3.93,0.73,0.12,,27.3543
107,38.02,Evening,,4.0,,Clear,,1.31,0.35,33.73,66.2817
125,21.93,Afternoon,,3.0,Low,,4.53,0.76,,74.52,48.024
137,,,,3.0,Low,Clear,4.52,1.38,0.35,57.56,73.587
173,12.1,Morning,Weekend,,Low,Clear,,0.59,0.31,,19.2643
344,,Evening,,,Low,Clear,4.62,1.86,0.43,7.19,45.8975
583,17.11,Afternoon,Weekday,,Medium,,2.77,1.23,,91.82,36.6701
796,,Evening,Weekday,,,Clear,3.05,0.74,0.35,105.71,74.5843
966,36.23,Morning,Weekend,4.0,,Rain,3.61,,0.27,,97.1094


In [71]:
# Keep only rows with fewer than 3 missing values (i.e. drop rows with >= 3).
# .copy() makes df_clean an independent frame rather than a slice of
# df_trip_price, so assigning new engineered feature columns to it later
# does not raise SettingWithCopyWarning.
df_clean = df_trip_price[df_trip_price.isnull().sum(axis=1) < 3].copy()
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 942 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       895 non-null    float64
 1   Time_of_Day            894 non-null    object 
 2   Day_of_Week            901 non-null    object 
 3   Passenger_Count        899 non-null    float64
 4   Traffic_Conditions     895 non-null    object 
 5   Weather                898 non-null    object 
 6   Base_Fare              900 non-null    float64
 7   Per_Km_Rate            899 non-null    float64
 8   Per_Minute_Rate        895 non-null    float64
 9   Trip_Duration_Minutes  899 non-null    float64
 10  Trip_Price             942 non-null    float64
dtypes: float64(7), object(4)
memory usage: 88.3+ KB
