In [None]:
import os
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from datetime import datetime
from workalendar.usa import Illinois

# Root folders for input and output data, given as paths relative to the
# notebook's working directory.
RAW_DATA_DIR = "../../data/raw data"
PROCESSED_DATA_DIR = "../../data/processed data"

# Per-year subfolder that is joined onto both data roots when reading the raw
# CSV and writing the processed CSV. The cleaning cell only defines a cutoff
# date for "2020" and "2019".
SUB_FOLDER = "2020"

# Illinois working-day calendar (workalendar); used to flag non-working days.
cal = Illinois()

## TNP Trips Data

### Data Manipulation

In [None]:
# Derive calendar/time-of-day features on the cleaned trips.
#
# NOTE(review): this cell reads `tnp_trips` as produced by the load and
# cleaning cells, which appear further down in this file -- confirm the
# intended cell order so that Restart & Run All works.

# Non-working-day flag (1 = weekend/holiday per the Illinois calendar).
# The calendar is queried once per distinct start date and the answer is
# mapped back onto the rows, instead of calling it once per trip.
nonworkday_by_date = {
    day: int(not cal.is_working_day(day))
    for day in tnp_trips["trip_start_date"].drop_duplicates()
}
tnp_trips["nonworkday"] = tnp_trips["trip_start_date"].map(nonworkday_by_date).astype(int)

# Time of day, both as a `datetime.time` object and as fractional hours
# (e.g. 16:30:00 -> 16.5). The fractional value is computed directly from the
# timestamp components rather than row by row from the time objects.
start_ts = tnp_trips["trip_start_timestamp"]
tnp_trips["trip_start_time_of_day"] = start_ts.dt.time
tnp_trips["trip_start_time_of_day_f"] = (
    start_ts.dt.hour + start_ts.dt.minute / 60 + start_ts.dt.second / 3600
)

# Peak flag: morning window (7, 10) or evening window (16.5, 20.5), in hours.
# Both windows are open intervals, so a trip starting exactly at 07:00:00,
# 10:00:00, 16:30:00 or 20:30:00 is NOT counted as peak.
# NOTE(review): if the source timestamps are rounded to coarse intervals, many
# trips will sit exactly on these boundaries -- confirm the exclusion is intended.
start_hour_f = tnp_trips["trip_start_time_of_day_f"]
in_morning_peak = (start_hour_f > 7) & (start_hour_f < 10)
in_evening_peak = (start_hour_f > 16.5) & (start_hour_f < 20.5)
tnp_trips["trip_during_peak"] = (in_morning_peak | in_evening_peak).astype(int)

print("Manipulated record count:", len(tnp_trips))

Manipulated record count: 7508517


### Save Data

In [None]:
# Write the processed trips to the per-year output folder. The folder is
# created first because `to_csv` does not create missing parent directories.
output_dir = os.path.join(PROCESSED_DATA_DIR, SUB_FOLDER)
os.makedirs(output_dir, exist_ok=True)
tnp_trips.to_csv(os.path.join(output_dir, "cleaned_tnp_data.csv"), index=False)

In [None]:
# Read the raw TNP trips CSV for the selected year and report how many rows
# were loaded before any cleaning.
raw_trips_path = os.path.join(RAW_DATA_DIR, SUB_FOLDER, "chicago_tnp_trips.csv")
tnp_trips = pd.read_csv(raw_trips_path)

original_record_count = len(tnp_trips)
print("Original record count:", original_record_count)

Original record count: 32773716


### Data Cleaning

In [None]:
# Clean the raw trips: drop incomplete rows, keep non-shared same-day trips
# within +/-30 days of the yearly cutoff, and trim duration/distance outliers.

# Cutoff date per supported data year. Checked up front so an unsupported
# SUB_FOLDER fails immediately with a clear message instead of a KeyError on
# the missing "cutoff" column after the expensive parsing steps.
cutoff_by_subfolder = {
    "2020": pd.to_datetime("2020-01-06"),
    "2019": pd.to_datetime("2019-01-07"),
}
if SUB_FOLDER not in cutoff_by_subfolder:
    raise ValueError(
        f"No cutoff date configured for SUB_FOLDER={SUB_FOLDER!r}; "
        f"expected one of {sorted(cutoff_by_subfolder)}"
    )

# 1) Drop rows missing any field needed downstream.
required_columns = [
    'trip_start_timestamp',
    'trip_end_timestamp',
    'trip_seconds',
    'trip_miles',
    'pickup_community_area',
    'fare',
    'shared_trip_authorized',
    'dropoff_community_area',
    'pickup_census_tract',
    'dropoff_census_tract',
]
tnp_trips = tnp_trips.dropna(subset=required_columns)

# 2) Keep only trips where sharing was not authorized.
# `== False` is kept on purpose: if the column was parsed as object dtype,
# `~column` would not act as a boolean negation.
# `.copy()` makes the filtered frame independent so the column assignments
# below do not raise SettingWithCopyWarning.
tnp_trips = tnp_trips[tnp_trips["shared_trip_authorized"] == False].copy()  # noqa: E712

# 3) Parse timestamps and keep trips that start and end on the same calendar day.
tnp_trips["trip_start_timestamp"] = pd.to_datetime(tnp_trips["trip_start_timestamp"])
tnp_trips["trip_end_timestamp"] = pd.to_datetime(tnp_trips["trip_end_timestamp"])
tnp_trips["trip_start_date"] = tnp_trips["trip_start_timestamp"].dt.date
tnp_trips["trip_end_date"] = tnp_trips["trip_end_timestamp"].dt.date
tnp_trips = tnp_trips[tnp_trips["trip_start_date"] == tnp_trips["trip_end_date"]].copy()

# 4) Keep trips starting within 30 days either side of the cutoff (inclusive).
# "cutoff" is stored as a column, so it is also part of the saved output.
tnp_trips["cutoff"] = cutoff_by_subfolder[SUB_FOLDER]
tnp_trips["trip_start_date"] = pd.to_datetime(tnp_trips["trip_start_date"])
tnp_trips["days_from_cutoff"] = (tnp_trips["trip_start_date"] - tnp_trips["cutoff"]).dt.days
tnp_trips = tnp_trips[tnp_trips["days_from_cutoff"].between(-30, 30)]

# 5) Trim outliers: keep trips whose duration AND distance both lie within the
# 5th-95th percentile range (bounds inclusive). Both pairs of bounds are
# computed on the same frame before either filter is applied.
lower_bound_2 = tnp_trips["trip_seconds"].quantile(0.05)
upper_bound_2 = tnp_trips["trip_seconds"].quantile(0.95)
lower_bound_3 = tnp_trips["trip_miles"].quantile(0.05)
upper_bound_3 = tnp_trips["trip_miles"].quantile(0.95)
within_seconds = tnp_trips["trip_seconds"].between(lower_bound_2, upper_bound_2)
within_miles = tnp_trips["trip_miles"].between(lower_bound_3, upper_bound_3)
# Final `.copy()` so later cells can add columns without SettingWithCopyWarning.
tnp_trips = tnp_trips[within_seconds & within_miles].copy()

# Note: despite the label, this count reflects ALL filters above, not only the
# NaN drop. The text is left unchanged to match the recorded output.
print("Nan dropped record count:", len(tnp_trips))

Nan dropped record count: 7508517
