# Cleaning data

### Import libraries and load the data

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from functions import split_features_target, fillna_mean_median, handle_outliers
from taxipred.utils.constants import get_taxi_data
# Load the raw data through the project's get_taxi_data helper.
# NOTE(review): presumably returns a pandas DataFrame (it is passed straight
# to split_features_target in the next cell) — confirm against the helper.
df = get_taxi_data()

### Split data into target, numeric features, and categorical features

In [2]:
# Split the loaded frame into three parts: numeric features, categorical
# features, and the target. Only the numeric part is cleaned in the cells below.
df_numeric, df_categorical, df_target = split_features_target(df)
# Preview the numeric features; missing values are still present at this stage.
df_numeric.head()

Unnamed: 0,Trip_Distance_km,Passenger_Count,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes
0,19.35,3.0,3.56,0.8,0.32,53.82
1,47.59,1.0,,0.62,0.43,40.57
2,36.87,1.0,2.7,1.21,0.15,37.27
3,30.33,4.0,3.48,0.51,0.15,116.81
4,,3.0,2.93,0.63,0.32,22.64


### Fill nulls with the mean or median, depending on the values in each column


In [3]:
# Impute the missing numeric values with the project helper.
# NOTE(review): confirm fillna_mean_median returns a new frame rather than
# mutating df_numeric in place, so re-running the preview cell stays valid.
df_cleaned_numeric = fillna_mean_median(df_numeric)
# .info() reports non-null counts per column, which verifies the imputation.
df_cleaned_numeric.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       1000 non-null   float64
 1   Passenger_Count        1000 non-null   float64
 2   Base_Fare              1000 non-null   float64
 3   Per_Km_Rate            1000 non-null   float64
 4   Per_Minute_Rate        1000 non-null   float64
 5   Trip_Duration_Minutes  1000 non-null   float64
dtypes: float64(6)
memory usage: 47.0 KB


### Handle outliers in the numeric columns (missing values were already filled above)

In [4]:
# Apply the project's outlier handling to the null-free numeric features.
# NOTE(review): the summary output still shows a Trip_Distance_km max far
# above its upper quartile — confirm handle_outliers behaves as intended.
df_cleaned_numeric_outliers = handle_outliers(df_cleaned_numeric)
# Transpose the summary so each feature is a row and each statistic a column.
df_cleaned_numeric_outliers.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Trip_Distance_km,1000.0,27.070547,19.400775,1.23,13.1075,26.995,37.7825,146.067047
Passenger_Count,1000.0,2.453,1.079331,1.0,2.0,2.0,3.0,4.0
Base_Fare,1000.0,3.502989,0.848107,2.01,2.77,3.502989,4.2025,5.0
Per_Km_Rate,1000.0,1.233316,0.418922,0.5,0.87,1.233316,1.58,2.0
Per_Minute_Rate,1000.0,0.292916,0.112662,0.1,0.1975,0.292916,0.3825,0.5
Trip_Duration_Minutes,1000.0,62.118116,31.339413,5.01,37.1075,62.118116,87.775,119.84
