# Cleaning data

### Imports and data loading

In [27]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from functions import split_features_target, fillna_mean_median, handle_outliers
from taxipred.utils.constants import get_taxi_data
# Load the taxi dataset through the project helper; the feature/target split
# in the next cell starts from this frame.
df = get_taxi_data()

### Split data into the target and the numeric / categorical features

In [28]:
# Separate the raw frame into numeric features, categorical features and the target.
df_numeric, df_categorical, df_target = split_features_target(df)
# Preview the numeric features; missing values are still present at this stage.
df_numeric.head()

Unnamed: 0,Trip_Distance_km,Passenger_Count,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes
0,19.35,3.0,3.56,0.8,0.32,53.82
1,47.59,1.0,,0.62,0.43,40.57
2,36.87,1.0,2.7,1.21,0.15,37.27
3,30.33,4.0,3.48,0.51,0.15,116.81
4,,3.0,2.93,0.63,0.32,22.64


### Filling nulls in the numeric columns with the mean or the median, depending on each column's values


In [29]:
# Impute the missing numeric values; the mean-vs-median choice is made inside
# the project helper (per its name) and is not visible from this notebook.
df_cleaned_numeric = fillna_mean_median(df_numeric)
# Verify the result: every column should report the full row count as non-null.
df_cleaned_numeric.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       1000 non-null   float64
 1   Passenger_Count        1000 non-null   float64
 2   Base_Fare              1000 non-null   float64
 3   Per_Km_Rate            1000 non-null   float64
 4   Per_Minute_Rate        1000 non-null   float64
 5   Trip_Duration_Minutes  1000 non-null   float64
dtypes: float64(6)
memory usage: 47.0 KB


### Handle outliers in the numeric columns (missing values were already filled in the previous step)

In [30]:
# Treat outliers in the already-imputed numeric features; the rule that is
# applied lives in the project helper and is not visible from this notebook.
df_cleaned_numeric_outliers = handle_outliers(df_cleaned_numeric)
# Transposed summary (one row per feature) to inspect the min/max of each column.
df_cleaned_numeric_outliers.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Trip_Distance_km,1000.0,27.070547,19.400775,1.23,13.1075,26.995,37.7825,146.067047
Passenger_Count,1000.0,2.453,1.079331,1.0,2.0,2.0,3.0,4.0
Base_Fare,1000.0,3.502989,0.848107,2.01,2.77,3.502989,4.2025,5.0
Per_Km_Rate,1000.0,1.233316,0.418922,0.5,0.87,1.233316,1.58,2.0
Per_Minute_Rate,1000.0,0.292916,0.112662,0.1,0.1975,0.292916,0.3825,0.5
Trip_Duration_Minutes,1000.0,62.118116,31.339413,5.01,37.1075,62.118116,87.775,119.84


### Handling missing values in categorical columns using mode

In [31]:
# Inspect the non-null count of each categorical column before imputing.
df_categorical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Time_of_Day         950 non-null    object
 1   Day_of_Week         950 non-null    object
 2   Traffic_Conditions  950 non-null    object
 3   Weather             950 non-null    object
dtypes: object(4)
memory usage: 31.4+ KB


In [32]:
# Impute every categorical column with its most frequent value. mode() can
# return several values when there is a tie; the first one is used.
df_categorical_mode = df_categorical.apply(
    lambda column: column.fillna(column.mode()[0])
)
# Verify the result: every column should report the full row count as non-null.
df_categorical_mode.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Time_of_Day         1000 non-null   object
 1   Day_of_Week         1000 non-null   object
 2   Traffic_Conditions  1000 non-null   object
 3   Weather             1000 non-null   object
dtypes: object(4)
memory usage: 31.4+ KB


### Creating dummy columns and values for categorical features

In [33]:
# One-hot encode the imputed categorical features. get_dummies is called with
# its defaults, so every category level gets its own column (no level is dropped).
dummy_categoric_features = pd.get_dummies(df_categorical_mode)
# Inspect the generated dummy columns and their dtype.
dummy_categoric_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   Time_of_Day_Afternoon      1000 non-null   bool 
 1   Time_of_Day_Evening        1000 non-null   bool 
 2   Time_of_Day_Morning        1000 non-null   bool 
 3   Time_of_Day_Night          1000 non-null   bool 
 4   Day_of_Week_Weekday        1000 non-null   bool 
 5   Day_of_Week_Weekend        1000 non-null   bool 
 6   Traffic_Conditions_High    1000 non-null   bool 
 7   Traffic_Conditions_Low     1000 non-null   bool 
 8   Traffic_Conditions_Medium  1000 non-null   bool 
 9   Weather_Clear              1000 non-null   bool 
 10  Weather_Rain               1000 non-null   bool 
 11  Weather_Snow               1000 non-null   bool 
dtypes: bool(12)
memory usage: 11.8 KB


### Merging processed features

In [34]:
# pd.concat(axis=1) aligns rows on the index and silently pads any mismatch
# with NaN, so fail fast if the two feature frames do not share the same index.
assert df_cleaned_numeric_outliers.index.equals(dummy_categoric_features.index), (
    "numeric and categorical features must share the same index before merging"
)

# Place the cleaned numeric features and the dummy columns side by side.
df_clean = pd.concat([df_cleaned_numeric_outliers, dummy_categoric_features], axis=1)

# Verify the merged frame: column count and non-null counts.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Trip_Distance_km           1000 non-null   float64
 1   Passenger_Count            1000 non-null   float64
 2   Base_Fare                  1000 non-null   float64
 3   Per_Km_Rate                1000 non-null   float64
 4   Per_Minute_Rate            1000 non-null   float64
 5   Trip_Duration_Minutes      1000 non-null   float64
 6   Time_of_Day_Afternoon      1000 non-null   bool   
 7   Time_of_Day_Evening        1000 non-null   bool   
 8   Time_of_Day_Morning        1000 non-null   bool   
 9   Time_of_Day_Night          1000 non-null   bool   
 10  Day_of_Week_Weekday        1000 non-null   bool   
 11  Day_of_Week_Weekend        1000 non-null   bool   
 12  Traffic_Conditions_High    1000 non-null   bool   
 13  Traffic_Conditions_Low     1000 non-null   bool  

#### Temporarily join the target to the cleaned features

In [35]:
# Attach the target column-wise so correlations against it can be computed;
# df_clean itself is left without the target.
df_with_target = pd.concat([df_clean, df_target], axis=1)

### Correlations with the target (Trip_Price) on the cleaned data

In [36]:
# Correlation of every numeric/boolean column with Trip_Price, kept as a
# one-column DataFrame and ordered from the strongest positive correlation down.
cleaned_correlations = (
    df_with_target
    .corr(numeric_only=True)[["Trip_Price"]]
    .sort_values(by="Trip_Price", ascending=False)
)
# Bare last expression instead of print(): the notebook renders the frame
# with its rich table representation.
cleaned_correlations

                           Trip_Price
Trip_Price                   1.000000
Trip_Distance_km             0.841410
Per_Km_Rate                  0.263630
Trip_Duration_Minutes        0.219294
Per_Minute_Rate              0.134093
Traffic_Conditions_High      0.095082
Weather_Rain                 0.038793
Base_Fare                    0.035158
Day_of_Week_Weekday          0.030137
Time_of_Day_Afternoon        0.020465
Weather_Snow                 0.005764
Time_of_Day_Night           -0.004531
Time_of_Day_Evening         -0.005943
Passenger_Count             -0.013172
Time_of_Day_Morning         -0.014219
Day_of_Week_Weekend         -0.030137
Traffic_Conditions_Low      -0.030416
Weather_Clear               -0.038988
Traffic_Conditions_Medium   -0.044407


### Dropping features with low correlation to the target

In [37]:
# Remove the features that correlate only weakly with Trip_Price. The names
# are grouped by the original feature they belong to; the order of the list
# has no effect on the resulting frame.
df_dropped = df_clean.drop(
    columns=[
        # numeric features
        "Base_Fare",
        "Passenger_Count",
        # Time_of_Day dummies
        "Time_of_Day_Morning",
        "Time_of_Day_Afternoon",
        "Time_of_Day_Evening",
        "Time_of_Day_Night",
        # Day_of_Week dummies
        "Day_of_Week_Weekday",
        "Day_of_Week_Weekend",
        # Traffic_Conditions dummies
        "Traffic_Conditions_Low",
        "Traffic_Conditions_Medium",
        # Weather dummies
        "Weather_Clear",
        "Weather_Rain",
        "Weather_Snow",
    ]
)