# Cleaning data

### load and import

In [30]:
import pandas as pd
from Cleaning_functions import split_features_target, fillna_mean_median, handle_outliers
from taxipred.utils.constants import get_taxi_data
# Load the taxi dataset through the project helper; the cells below split and
# filter this frame (it is expected to contain a "Trip_Price" column).
df = get_taxi_data()

### Split data into target and numeric / categorical features

In [31]:
# Unpack the six pieces returned by the project helper, one name per line so
# the order of the returned values is easy to read.
(
    X,
    df_numeric,
    df_boolean,
    df_categorical,
    df_target,
    y,
) = split_features_target(df)

# Preview the first five rows of the numeric features.
df_numeric.head(5)

Unnamed: 0,Trip_Distance_km,Passenger_Count,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes
0,19.35,3.0,3.56,0.8,0.32,53.82
1,47.59,1.0,,0.62,0.43,40.57
2,36.87,1.0,2.7,1.21,0.15,37.27
3,30.33,4.0,3.48,0.51,0.15,116.81
4,,3.0,2.93,0.63,0.32,22.64


### Setting aside rows with NaN in Trip_Price for use as prediction model input

In [32]:
# Rows whose label is missing cannot be used for training; keep their features
# (without the empty label column) in a separate frame.
target_is_missing = df["Trip_Price"].isna()
df_missing_label = df.loc[target_is_missing].drop(columns="Trip_Price")
df_missing_label.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49 entries, 1 to 991
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       49 non-null     float64
 1   Time_of_Day            48 non-null     object 
 2   Day_of_Week            45 non-null     object 
 3   Passenger_Count        47 non-null     float64
 4   Traffic_Conditions     49 non-null     object 
 5   Weather                45 non-null     object 
 6   Base_Fare              43 non-null     float64
 7   Per_Km_Rate            43 non-null     float64
 8   Per_Minute_Rate        48 non-null     float64
 9   Trip_Duration_Minutes  45 non-null     float64
dtypes: float64(6), object(4)
memory usage: 4.2+ KB


### Clipping outliers in target

In [33]:
# Run the project outlier helper on the target frame.
# NOTE(review): the result name suggests the helper clips values rather than
# dropping rows — confirm against Cleaning_functions.handle_outliers.
df_clipped_target = df_target.pipe(handle_outliers)
df_clipped_target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Trip_Price  951 non-null    float64
dtypes: float64(1)
memory usage: 7.9 KB


### Filling nulls with median or mean depending on value


In [34]:
# Impute missing numeric values with the project helper (mean or median,
# chosen inside the helper).
df_filled_numeric = df_numeric.pipe(fillna_mean_median)

### Handle outliers in the numeric columns (missing values were filled in the previous step)

In [35]:
# Apply the same outlier helper used for the target to the imputed numeric features.
df_clipped_numeric = df_filled_numeric.pipe(handle_outliers)

### Handling missing values in categorical columns using mode

In [36]:
def _fill_missing_with_mode(column):
    """Return `column` with its missing entries replaced by its first mode."""
    most_common = column.mode()[0]
    return column.fillna(most_common)


# Impute each categorical column independently with its own most frequent value.
df_categorical_mode = df_categorical.apply(_fill_missing_with_mode)

### Dummy encoding. Creating dummy columns and values for categorical features

In [37]:
# One-hot encode the imputed categorical columns: one indicator column per
# category level, named "<column>_<level>".
dummy_categoric_features = pd.get_dummies(data=df_categorical_mode)
dummy_categoric_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   Time_of_Day_Afternoon      1000 non-null   bool 
 1   Time_of_Day_Evening        1000 non-null   bool 
 2   Time_of_Day_Morning        1000 non-null   bool 
 3   Time_of_Day_Night          1000 non-null   bool 
 4   Day_of_Week_Weekday        1000 non-null   bool 
 5   Day_of_Week_Weekend        1000 non-null   bool 
 6   Traffic_Conditions_High    1000 non-null   bool 
 7   Traffic_Conditions_Low     1000 non-null   bool 
 8   Traffic_Conditions_Medium  1000 non-null   bool 
 9   Weather_Clear              1000 non-null   bool 
 10  Weather_Rain               1000 non-null   bool 
 11  Weather_Snow               1000 non-null   bool 
dtypes: bool(12)
memory usage: 11.8 KB


### Merging processed features

In [38]:
# Place the processed numeric features and the dummy columns side by side
# (concatenation along columns aligns rows on the index).
feature_frames = [df_clipped_numeric, dummy_categoric_features]
df_clean = pd.concat(feature_frames, axis="columns")

# Summary statistics, one row per feature.
df_clean.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Trip_Distance_km,1000.0,26.153327,15.521566,1.23,13.1075,26.995,37.7825,74.795
Passenger_Count,1000.0,2.453,1.079331,1.0,2.0,2.0,3.0,4.0
Base_Fare,1000.0,3.502989,0.848107,2.01,2.77,3.502989,4.2025,5.0
Per_Km_Rate,1000.0,1.233316,0.418922,0.5,0.87,1.233316,1.58,2.0
Per_Minute_Rate,1000.0,0.292916,0.112662,0.1,0.1975,0.292916,0.3825,0.5
Trip_Duration_Minutes,1000.0,62.118116,31.339413,5.01,37.1075,62.118116,87.775,119.84


#### Temporarily join target to clean features

In [39]:
# `df_filled_target` is not defined anywhere in this notebook, so on a fresh
# kernel (Restart & Run All) this cell raised NameError. Build it here from the
# outlier-handled target so the notebook runs top to bottom.
# NOTE(review): assumes the missing cell imputed target NaNs with the same
# helper used for the numeric features (the recorded output further down shows
# Trip_Price with 1000 non-null values) — confirm this is the intended imputation.
df_filled_target = fillna_mean_median(df_clipped_target)

# Attach the target as the last column so correlations against it can be computed.
df_with_target = pd.concat([df_clean, df_filled_target], axis=1)

### correlations with cleaned data

In [40]:
# Pairwise correlations over the numeric (and boolean dummy) columns.
correlation_matrix = df_with_target.corr(numeric_only=True)

# Keep only the column of correlations against the target, strongest positive first.
target_correlations = correlation_matrix[["Trip_Price"]]
cleaned_correlations = target_correlations.sort_values(by="Trip_Price", ascending=False)
print(cleaned_correlations)

                           Trip_Price
Trip_Price                   1.000000
Trip_Distance_km             0.717884
Per_Km_Rate                  0.381859
Trip_Duration_Minutes        0.318678
Per_Minute_Rate              0.226293
Traffic_Conditions_High      0.059150
Base_Fare                    0.046055
Weather_Rain                 0.026548
Passenger_Count              0.020573
Day_of_Week_Weekday          0.020459
Weather_Snow                 0.019309
Time_of_Day_Evening          0.005622
Time_of_Day_Afternoon        0.003808
Time_of_Day_Morning         -0.004737
Time_of_Day_Night           -0.006912
Traffic_Conditions_Low      -0.007922
Day_of_Week_Weekend         -0.020459
Weather_Clear               -0.034544
Traffic_Conditions_Medium   -0.039092


### Dropping the dummy columns that correlate negatively with Trip_Price

In [41]:
# Dummy columns to discard before export (the ones with a negative
# correlation to Trip_Price in the table above).
columns_to_drop = [
    "Time_of_Day_Night",
    "Time_of_Day_Morning",
    "Day_of_Week_Weekend",
    "Traffic_Conditions_Low",
    "Weather_Clear",
    "Traffic_Conditions_Medium",
]
df_cleaned = df_with_target.drop(columns=columns_to_drop)

df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Trip_Distance_km         1000 non-null   float64
 1   Passenger_Count          1000 non-null   float64
 2   Base_Fare                1000 non-null   float64
 3   Per_Km_Rate              1000 non-null   float64
 4   Per_Minute_Rate          1000 non-null   float64
 5   Trip_Duration_Minutes    1000 non-null   float64
 6   Time_of_Day_Afternoon    1000 non-null   bool   
 7   Time_of_Day_Evening      1000 non-null   bool   
 8   Day_of_Week_Weekday      1000 non-null   bool   
 9   Traffic_Conditions_High  1000 non-null   bool   
 10  Weather_Rain             1000 non-null   bool   
 11  Weather_Snow             1000 non-null   bool   
 12  Trip_Price               1000 non-null   float64
dtypes: bool(6), float64(7)
memory usage: 60.7 KB


# Exporting data to csv file

In [42]:
# Destination is relative to the notebook's working directory.
export_path = "../src/taxipred/data/cleaned_data.csv"

# Write the cleaned frame without the index so the CSV holds only feature and target columns.
df_cleaned.to_csv(path_or_buf=export_path, index=False)