# Cleaning data for rows with missing Trip_Price

In [62]:
import pandas as pd
from Cleaning_functions import split_features_target, fillna_mean_median, handle_outliers
from taxipred.utils.constants import get_taxi_data
# Load the taxi data through the project's get_taxi_data helper.
df = get_taxi_data()

### Create a DataFrame with the rows where Trip_Price is NaN, to be used as prediction model input

In [63]:
# Keep only the rows whose Trip_Price is missing; these rows are the
# input for the prediction model.
df = df.loc[df["Trip_Price"].isna()]
df.head()

Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
1,47.59,Afternoon,Weekday,1.0,High,Clear,,0.62,0.43,40.57,
8,30.45,Morning,Weekday,3.0,High,Clear,2.77,1.78,0.34,110.33,
11,48.53,Night,Weekday,3.0,Low,Clear,4.78,,0.5,79.94,
32,4.19,Morning,Weekday,1.0,Low,Clear,4.07,1.89,0.19,69.06,
94,38.78,Morning,Weekday,3.0,Medium,Clear,3.08,1.62,0.15,90.14,


### Splitting the missing-label DataFrame, handling NaN in categorical and numeric columns, and merging back into df

In [64]:
# Split the frame into its parts with the project helper.
# NOTE(review): the exact contents of each returned part are defined in
# Cleaning_functions.split_features_target — confirm there.
(
    X,
    df_numeric,
    dfl_boolean,
    df_categorical,
    df_target,
) = split_features_target(df)


In [65]:

# Fill NaN in the numeric columns with either the mean or the median.
# NOTE(review): the per-column mean-vs-median rule lives inside
# fillna_mean_median (Cleaning_functions) — confirm it there.
df_numeric = fillna_mean_median(df_numeric)


In [66]:

# Fill missing values in each categorical column with that column's mode.
# The result is assigned back to df_categorical because the next cell
# one-hot encodes df_categorical; assigning to a differently spelled name
# would leave the NaN values in place and the imputation would be lost.
# A column that is entirely NaN has no mode, so it is returned unchanged
# instead of raising on the empty mode() result.
df_categorical = df_categorical.apply(
    lambda col: col.fillna(col.mode().iloc[0]) if col.notna().any() else col
)


In [67]:

# One-hot encode the categorical columns into indicator (dummy) columns
df_dummies = pd.get_dummies(data=df_categorical)
df_dummies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49 entries, 1 to 991
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   Time_of_Day_Afternoon      49 non-null     bool 
 1   Time_of_Day_Evening        49 non-null     bool 
 2   Time_of_Day_Morning        49 non-null     bool 
 3   Time_of_Day_Night          49 non-null     bool 
 4   Day_of_Week_Weekday        49 non-null     bool 
 5   Day_of_Week_Weekend        49 non-null     bool 
 6   Traffic_Conditions_High    49 non-null     bool 
 7   Traffic_Conditions_Low     49 non-null     bool 
 8   Traffic_Conditions_Medium  49 non-null     bool 
 9   Weather_Clear              49 non-null     bool 
 10  Weather_Rain               49 non-null     bool 
 11  Weather_Snow               49 non-null     bool 
dtypes: bool(12)
memory usage: 980.0 bytes


In [68]:

# Join the dummy-encoded categorical columns and the numeric columns side by side
df = pd.concat(objs=[df_dummies, df_numeric], axis="columns")
df.info()



<class 'pandas.core.frame.DataFrame'>
Index: 49 entries, 1 to 991
Data columns (total 18 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Time_of_Day_Afternoon      49 non-null     bool   
 1   Time_of_Day_Evening        49 non-null     bool   
 2   Time_of_Day_Morning        49 non-null     bool   
 3   Time_of_Day_Night          49 non-null     bool   
 4   Day_of_Week_Weekday        49 non-null     bool   
 5   Day_of_Week_Weekend        49 non-null     bool   
 6   Traffic_Conditions_High    49 non-null     bool   
 7   Traffic_Conditions_Low     49 non-null     bool   
 8   Traffic_Conditions_Medium  49 non-null     bool   
 9   Weather_Clear              49 non-null     bool   
 10  Weather_Rain               49 non-null     bool   
 11  Weather_Snow               49 non-null     bool   
 12  Trip_Distance_km           49 non-null     float64
 13  Passenger_Count            49 non-null     float64
 14  

In [69]:
### Dropping the same features that were dropped from the cleaned data

In [70]:
# Remove the listed dummy/numeric columns; the remaining columns are shown
# by the info() call below.
df_cleaned = df.drop(
    columns=[
        "Time_of_Day_Night",
        "Time_of_Day_Morning",
        "Time_of_Day_Evening",
        "Day_of_Week_Weekend",
        "Traffic_Conditions_Low",
        "Weather_Clear",
        "Passenger_Count",
        "Traffic_Conditions_Medium",
    ]
)
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 49 entries, 1 to 991
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Time_of_Day_Afternoon    49 non-null     bool   
 1   Day_of_Week_Weekday      49 non-null     bool   
 2   Traffic_Conditions_High  49 non-null     bool   
 3   Weather_Rain             49 non-null     bool   
 4   Weather_Snow             49 non-null     bool   
 5   Trip_Distance_km         49 non-null     float64
 6   Base_Fare                49 non-null     float64
 7   Per_Km_Rate              49 non-null     float64
 8   Per_Minute_Rate          49 non-null     float64
 9   Trip_Duration_Minutes    49 non-null     float64
dtypes: bool(5), float64(5)
memory usage: 2.5 KB


Exporting the cleaned missing-label rows (df_cleaned) to a JSON file

In [71]:
# Destination file; the relative path is resolved against the notebook's
# current working directory.
export_path = "../src/taxipred/data/missing_label.json"

# orient="records" writes a JSON array with one object per row; the
# DataFrame index is not included in the output.
df_cleaned.to_json(path_or_buf=export_path, orient="records", lines=False)