In [None]:
import os
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from datetime import datetime
from workalendar.usa import Illinois

# Root folders for the raw input files and the cleaned outputs.
# Both are relative paths, so they resolve against the notebook's working directory.
RAW_DATA_DIR = "../../data/raw data"
PROCESSED_DATA_DIR = "../../data/processed data"

# Sub-folder (under both data roots) to process. The cleaning cell also keys
# its cutoff date off this value: "2020" -> 2020-01-06, anything else -> 2019-01-07.
SUB_FOLDER = "2020"

# Illinois calendar from workalendar; used later to flag non-working days.
cal = Illinois()

## Taxi Data

In [None]:
# Location of the raw trip file for the selected SUB_FOLDER.
raw_taxi_csv = os.path.join(RAW_DATA_DIR, SUB_FOLDER, "chicago_taxi_trips.csv")
taxi_trips = pd.read_csv(raw_taxi_csv)

# Baseline row count before any filtering is applied.
print("Original record count:", len(taxi_trips))

Original record count: 5131859


### Data Cleaning

In [None]:
# Cleaning steps, in order:
#   1. drop trips with a missing value in any required column,
#   2. keep only trips that start and end on the same calendar date,
#   3. keep only trips starting within 30 days either side of the cutoff date.
#
# Each filtered frame is copied explicitly before new columns are assigned to
# it. Assigning columns onto a boolean-filtered slice triggers pandas'
# SettingWithCopyWarning (pandas < 3.0) and leaves it unclear which object is
# being written to.
required_columns = [
    'trip_start_timestamp',
    'trip_end_timestamp',
    'trip_seconds',
    'trip_miles',
    'pickup_community_area',
    'dropoff_community_area',
    'fare',
    'pickup_census_tract',
    'dropoff_census_tract',
]
taxi_trips = taxi_trips.dropna(subset=required_columns).copy()

# Parse the timestamps and derive the calendar date of each end of the trip.
taxi_trips["trip_start_timestamp"] = pd.to_datetime(taxi_trips["trip_start_timestamp"])
taxi_trips["trip_end_timestamp"] = pd.to_datetime(taxi_trips["trip_end_timestamp"])
taxi_trips["trip_start_date"] = taxi_trips["trip_start_timestamp"].dt.date
taxi_trips["trip_end_date"] = taxi_trips["trip_end_timestamp"].dt.date

# Discard trips that cross midnight.
starts_and_ends_same_day = taxi_trips["trip_start_date"] == taxi_trips["trip_end_date"]
taxi_trips = taxi_trips[starts_and_ends_same_day].copy()

# Cutoff date depends on the dataset being processed. Any SUB_FOLDER other
# than "2020" falls back to the 2019 cutoff.
cutoff_date = "2020-01-06" if SUB_FOLDER == "2020" else "2019-01-07"
taxi_trips["cutoff"] = pd.to_datetime(cutoff_date)

# Convert the start date back to datetime64 so it can be subtracted from the cutoff.
taxi_trips["trip_start_date"] = pd.to_datetime(taxi_trips["trip_start_date"])
taxi_trips["days_from_cutoff"] = (taxi_trips["trip_start_date"] - taxi_trips["cutoff"]).dt.days

# Keep a symmetric window around the cutoff; between() is inclusive on both ends.
within_window = taxi_trips["days_from_cutoff"].between(-30, 30)
taxi_trips = taxi_trips[within_window].copy()

# Note: this count reflects all three filters above (missing values, same-day,
# date window), not only the NaN drop.
print("Nan dropped record count:", len(taxi_trips))

Nan dropped record count: 1621195


### Data Manipulation

In [None]:
# Derive per-trip features:
#   nonworkday               1 if the start date is not an Illinois working day, else 0
#   trip_start_time_of_day   start time as a datetime.time
#   trip_start_time_of_day_f start time as fractional hours (hour + minute/60 + second/3600)
#   trip_during_peak         1 if the start time falls in a peak window, else 0
trip_dates = taxi_trips["trip_start_date"]
start_ts = taxi_trips["trip_start_timestamp"]

# Ask the calendar once per distinct date rather than once per row, then map
# the answer back onto every trip. The values are the same as a row-wise
# apply, with far fewer Python-level calendar calls.
nonworkday_by_date = {
    day: int(not cal.is_working_day(day)) for day in trip_dates.drop_duplicates()
}
nonworkday = trip_dates.map(nonworkday_by_date).astype(int)

# Vectorized form of hour + minute/60 + second/3600.
time_of_day_hours = start_ts.dt.hour + start_ts.dt.minute / 60 + start_ts.dt.second / 3600

# Peak windows are open intervals: (07:00, 10:00) and (16:30, 20:30). A trip
# starting exactly on a boundary is not counted as peak.
# NOTE(review): confirm that excluding the exact boundary times is intended.
in_morning_peak = (time_of_day_hours > 7) & (time_of_day_hours < 10)
in_evening_peak = (time_of_day_hours > 16.5) & (time_of_day_hours < 20.5)

# assign() returns a new frame, so no column is written into a filtered slice.
taxi_trips = taxi_trips.assign(
    nonworkday=nonworkday,
    trip_start_time_of_day=start_ts.dt.time,
    trip_start_time_of_day_f=time_of_day_hours,
    trip_during_peak=(in_morning_peak | in_evening_peak).astype(int),
)

print("Manipulated record count:", len(taxi_trips))

Manipulated record count: 1621195


### Save Data

In [None]:
# Write the cleaned trips under the processed-data folder for this SUB_FOLDER.
processed_dir = os.path.join(PROCESSED_DATA_DIR, SUB_FOLDER)
# to_csv does not create missing directories, so make sure the target exists
# before writing; otherwise a fresh checkout fails only after all processing is done.
os.makedirs(processed_dir, exist_ok=True)
taxi_trips.to_csv(os.path.join(processed_dir, "cleaned_taxi_data.csv"), index=False)