## 3b, Feature Engineering
- Builds the dataset in which the category 'Unknown' of each categorical column is merged into its closest known category.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Load df_filled.csv into df_integrate and print the frame's column / dtype summary.
df_integrate = pd.read_csv(filepath_or_buffer="df_filled.csv")
df_integrate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 875 entries, 0 to 874
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       875 non-null    float64
 1   Time_of_Day            875 non-null    object 
 2   Day_of_Week            875 non-null    object 
 3   Passenger_Count        875 non-null    float64
 4   Traffic_Conditions     875 non-null    object 
 5   Weather                875 non-null    object 
 6   Base_Fare              875 non-null    float64
 7   Per_Km_Rate            875 non-null    float64
 8   Per_Minute_Rate        875 non-null    float64
 9   Trip_Duration_Minutes  875 non-null    float64
 10  Trip_Price             875 non-null    float64
dtypes: float64(7), object(4)
memory usage: 75.3+ KB


In [15]:
# Frequency of each Weather value in the loaded frame (missing values are not counted).
df_integrate["Weather"].value_counts(dropna=True)

Weather
Clear      581
Rain       201
Snow        52
Unknown     41
Name: count, dtype: int64

In [8]:
# Step 1: fold 'Unknown' into an existing category with Series.replace().
# For each of the four categorical columns (Weather, Day_of_Week, Time_of_Day,
# Traffic_Conditions) the target category is picked by hand as the one whose
# behaviour 'Unknown' resembles most closely.
# Work on a copy so df_integrate itself is left untouched.
df_model_integrate_unknown = df_integrate.copy()

# Traffic_Conditions: 'Unknown' -> 'High'
df_model_integrate_unknown["Traffic_Conditions"] = (
    df_model_integrate_unknown["Traffic_Conditions"].replace({"Unknown": "High"})
)

In [9]:
# Frequency of each Traffic_Conditions value after the replacement (missing values are not counted).
df_model_integrate_unknown["Traffic_Conditions"].value_counts(dropna=True)

Traffic_Conditions
Low       344
Medium    329
High      202
Name: count, dtype: int64

In [10]:
# Time_of_Day: 'Unknown' -> 'Night'
df_model_integrate_unknown["Time_of_Day"] = (
    df_model_integrate_unknown["Time_of_Day"].replace(to_replace="Unknown", value="Night")
)

In [11]:
# Day_of_Week: 'Unknown' -> 'Weekday'
df_model_integrate_unknown["Day_of_Week"] = (
    df_model_integrate_unknown["Day_of_Week"].replace({"Unknown": "Weekday"})
)

In [12]:
# Weather: 'Unknown' -> 'Rain' (Rain was chosen for simplicity).
# Order matters: this replacement has to run BEFORE the smart features are created.
df_model_integrate_unknown["Weather"] = (
    df_model_integrate_unknown["Weather"].replace(to_replace="Unknown", value="Rain")
)

In [16]:
# Sanity check: 'Unknown' should be gone and Rain should now count 201 + 41 = 242 rows.
df_model_integrate_unknown["Weather"].value_counts(dropna=True)

Weather
Clear    581
Rain     242
Snow      52
Name: count, dtype: int64

In [14]:
# Print the column / dtype / non-null summary of the frame after the replacements.
df_model_integrate_unknown.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 875 entries, 0 to 874
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       875 non-null    float64
 1   Time_of_Day            875 non-null    object 
 2   Day_of_Week            875 non-null    object 
 3   Passenger_Count        875 non-null    float64
 4   Traffic_Conditions     875 non-null    object 
 5   Weather                875 non-null    object 
 6   Base_Fare              875 non-null    float64
 7   Per_Km_Rate            875 non-null    float64
 8   Per_Minute_Rate        875 non-null    float64
 9   Trip_Duration_Minutes  875 non-null    float64
 10  Trip_Price             875 non-null    float64
dtypes: float64(7), object(4)
memory usage: 75.3+ KB


In [17]:
# Smart features, built the same way as for df_model_keep_unknown.
# IsBusinessHour is 1 only when all four conditions hold, else 0:
#   clear weather, a weekday, morning or afternoon, and low or medium traffic.
# NOTE(review): the flag also depends on weather and traffic, so it is narrower
# than the name suggests - confirm this is intended.
df_model_integrate_unknown["IsBusinessHour"] = (
    df_model_integrate_unknown["Weather"].eq("Clear")
    & df_model_integrate_unknown["Day_of_Week"].eq("Weekday")
    & df_model_integrate_unknown["Time_of_Day"].isin(["Morning", "Afternoon"])
    & df_model_integrate_unknown["Traffic_Conditions"].isin(["Low", "Medium"])
).astype(int)

In [None]:
# Smart features for weather: rainfall / snowfall indicator flags (1 = yes, 0 = no).
# 'Clear' is the implied baseline (IsRain == 0 and IsSnow == 0).
# The flags are written to df_model_integrate_unknown so that they are part of the
# frame that is later stripped of its raw categorical columns and exported.
# There is no IsWeatherUnknown flag in this dataset: 'Unknown' was merged into
# 'Rain' earlier in the notebook, so such a column would be constant 0.
assert not df_model_integrate_unknown["Weather"].eq("Unknown").any(), \
    "Weather still contains 'Unknown'; run the replace() cell first"
df_model_integrate_unknown["IsRain"] = (df_model_integrate_unknown["Weather"] == "Rain").astype(int)
df_model_integrate_unknown["IsSnow"] = (df_model_integrate_unknown["Weather"] == "Snow").astype(int)

In [None]:
# Smart feature to flag weekends: IsWeekend = 1 when Day_of_Week is 'Weekend', 0 otherwise.
# The flag is written to df_model_integrate_unknown so that it is part of the frame
# that is later stripped of its raw categorical columns and exported.
# There is no IsDayUnknown flag in this dataset: 'Unknown' was merged into
# 'Weekday' earlier in the notebook, so such a column would be constant 0.
assert not df_model_integrate_unknown["Day_of_Week"].eq("Unknown").any(), \
    "Day_of_Week still contains 'Unknown'; run the replace() cell first"
df_model_integrate_unknown["IsWeekend"] = (df_model_integrate_unknown["Day_of_Week"] == "Weekend").astype(int)

In [None]:
# Ordinal encoding of Traffic_Conditions to prep the dataset for model training:
#   Low = 0, Medium = 1, High = 2.
# 'Unknown' is not a level here because it was merged into 'High' earlier in the
# notebook. The column is written to df_model_integrate_unknown so that it is part
# of the frame that is later stripped of its raw categorical columns and exported.
df_model_integrate_unknown["Traffic_Conditions_Num"] = pd.Categorical(
    df_model_integrate_unknown["Traffic_Conditions"],
    categories=["Low", "Medium", "High"],
    ordered=True
).codes

# pd.Categorical encodes any value outside `categories` (or missing) as -1;
# fail loudly instead of letting that slip into the training data.
assert df_model_integrate_unknown["Traffic_Conditions_Num"].ge(0).all(), \
    "Traffic_Conditions contains a value outside Low/Medium/High"

In [None]:
# Ordinal encoding of Time_of_Day to prep the dataset for model training:
#   Morning = 0, Afternoon = 1, Evening = 2, Night = 3.
# 'Unknown' is not a level here because it was merged into 'Night' earlier in the
# notebook. The column is written to df_model_integrate_unknown so that it is part
# of the frame that is later stripped of its raw categorical columns and exported.
df_model_integrate_unknown["Time_of_Day_Num"] = pd.Categorical(
    df_model_integrate_unknown["Time_of_Day"],
    categories=["Morning", "Afternoon", "Evening", "Night"],
    ordered=True
).codes

# pd.Categorical encodes any value outside `categories` (or missing) as -1;
# fail loudly instead of letting that slip into the training data.
assert df_model_integrate_unknown["Time_of_Day_Num"].ge(0).all(), \
    "Time_of_Day contains a value outside Morning/Afternoon/Evening/Night"

In [None]:
# Remove the four raw categorical columns from the frame.
df_model_integrate_unknown = df_model_integrate_unknown.drop(
    labels=["Time_of_Day", "Day_of_Week", "Traffic_Conditions", "Weather"],
    axis="columns",
)

In [None]:
# export dataset to csv (currently disabled: uncomment the line below to write the file)

#df_model_integrate_unknown.to_csv("df_model_integrate_unknown.csv", index = False)