In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import geopandas as gpd
from shapely.geometry import Point
from haversine import haversine
from sklearn.model_selection import train_test_split

# Apply seaborn's default theme to the figures drawn in this notebook.
sns.set_theme()
# Ignore FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Loading the Data

In [2]:
# Read the raw training trips from a path relative to the working directory.
raw_train_path = '../data/raw/train.csv'
df = pd.read_csv(raw_train_path)

In [3]:
# Show the first rows of the raw data to check the parsed columns.
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,0,2,2016-06-29 18:21:02,1,-73.862762,40.768822,-73.891701,40.746689,N,1133
1,1,2,2016-04-25 13:03:26,1,-73.958038,40.783237,-73.97551,40.760853,N,887
2,2,2,2016-05-07 12:36:09,1,-73.96946,40.785519,-73.989243,40.771748,N,686
3,3,1,2016-05-14 18:44:17,1,-73.981743,40.736549,-73.998352,40.72644,N,818
4,4,2,2016-04-10 22:51:25,1,-73.977913,40.752609,-73.975647,40.733139,N,951


## 01 - Identify and Handle Anomalies

In [4]:
# Parse the pickup timestamps and derive the calendar date of each trip.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['pickup_date'] = df['pickup_datetime'].dt.date

# Number of trips recorded on each calendar day.
daily_trips = (
    df.groupby('pickup_date')
    .size()
    .reset_index(name='trip_count')
)

trip_counts = daily_trips['trip_count']
trips_mean = trip_counts.mean()
trips_std = trip_counts.std()

# A day is anomalous when its trip count lies more than `threshold`
# standard deviations away from the mean daily count.
threshold = 1.5

# NOTE(review): "pound" is presumably a typo for "bound"; the names are kept
# unchanged in case other cells reference them.
lower_pound = trips_mean - threshold * trips_std
upper_pound = trips_mean + threshold * trips_std

high_anomalies = daily_trips[trip_counts > upper_pound]
low_anomalies = daily_trips[trip_counts < lower_pound]

anomalous_days = pd.concat([high_anomalies, low_anomalies])

# Mark each trip with 1 when its pickup date is an anomalous day, else 0.
on_anomalous_day = df['pickup_date'].isin(anomalous_days['pickup_date'])
df['is_anomaly'] = on_anomalous_day.astype('int64')

## 02 - Extract useful temporal features from the pickup_datetime column
- pickup_day_of_year (ordinal day of the year)
- pickup_day_of_week (0=Monday, 6=Sunday)
- pickup_hour_of_day (hour of the day).
> These features will help the model capture time-based patterns.

In [5]:
# Calendar features read from the pickup timestamp through a shared `.dt` accessor.
pickup_dt = df['pickup_datetime'].dt
df['pickup_day_of_year'] = pickup_dt.dayofyear
df['pickup_day_of_week'] = pickup_dt.dayofweek  # 0=Monday ... 6=Sunday
df['pickup_hour_of_day'] = pickup_dt.hour

## 03 - Identify Traffic Congestion

In [6]:
def vec_haversine(row):
    """Great-circle distance in kilometres between pickup and dropoff points.

    The haversine formula is evaluated with NumPy ufuncs, so ``row`` may be a
    single record (as passed by ``df.apply(vec_haversine, axis=1)``) or a whole
    DataFrame, in which case every distance is computed in one vectorized call
    (``vec_haversine(df)``) instead of one Python call per trip.

    Parameters
    ----------
    row : mapping, pandas.Series or pandas.DataFrame
        Must provide 'pickup_latitude', 'pickup_longitude',
        'dropoff_latitude' and 'dropoff_longitude' in decimal degrees.

    Returns
    -------
    float or pandas.Series
        The distance in kilometres: a scalar for a single record, a Series
        aligned with the input index for a DataFrame.
    """
    # Mean Earth radius in km (the value used by the `haversine` package).
    earth_radius_km = 6371.0088

    lat1 = np.radians(row['pickup_latitude'])
    lon1 = np.radians(row['pickup_longitude'])
    lat2 = np.radians(row['dropoff_latitude'])
    lon2 = np.radians(row['dropoff_longitude'])

    half_chord_sq = (
        np.sin((lat2 - lat1) * 0.5) ** 2
        + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) * 0.5) ** 2
    )
    # Clamp against rounding error so arcsin never receives a value above 1.
    half_chord_sq = np.minimum(half_chord_sq, 1.0)
    return 2 * earth_radius_km * np.arcsin(np.sqrt(half_chord_sq))

# Pickup-to-dropoff distance, computed one row at a time.
df['trip_distance'] = df.apply(vec_haversine, axis=1)

# Dividing the duration by 3600 converts it to hours (assuming it is stored
# in seconds), which makes the speed km/h.
trip_hours = df['trip_duration'] / 3600
df['speed'] = df['trip_distance'] / trip_hours

# Mean speed for every (weekday, hour) slot.
speed_by_slot = df.groupby(['pickup_day_of_week', 'pickup_hour_of_day'])['speed']
df_avg = speed_by_slot.mean().reset_index()

# Congestion flag: pickup hour 8..18 (inclusive) on weekdays 1..4 (inclusive).
# NOTE(review): with Monday=0 this range is Tuesday-Friday only; confirm that
# leaving Monday out is intentional.
in_busy_hours = df['pickup_hour_of_day'].between(8, 18)
on_busy_days = df['pickup_day_of_week'].between(1, 4)
df['traffic_congestion'] = (in_busy_hours & on_busy_days).astype('int64')

## 04 - Passenger Count Enhancement

In [7]:
# Passenger counts treated as outliers; trips carrying them are dropped.
outliers = [0, 7, 8, 9]
has_outlier_count = df['passenger_count'].isin(outliers)
df = df.loc[~has_outlier_count]

## 05 - Airport Proximity Features

In [8]:
# One (latitude, longitude) reference point per airport.
jfk_coords = (40.6437, -73.7838)
lga_coords = (40.7732, -73.8821)
ewr_coords = (40.6935, -74.1883)

airports = [jfk_coords, lga_coords, ewr_coords]

def is_near_airport(lat, lon, airport_coords, radius=1.5):
    """Tell whether a point lies within ``radius`` of an airport.

    ``lat`` and ``lon`` describe the point; ``airport_coords`` is a
    ``(latitude, longitude)`` pair. The distance is whatever ``haversine``
    returns with its default unit (presumably kilometres; confirm), and a
    point exactly ``radius`` away still counts as near.
    """
    distance = haversine((lat, lon), airport_coords)
    return distance <= radius

def _near_any_airport(lat, lon):
    """Return True when the point is near at least one entry of ``airports``."""
    return any(is_near_airport(lat, lon, airport) for airport in airports)


# Row-wise boolean flags for the pickup and the dropoff location.
# NOTE(review): the sample row shown later whose pickup falls in the
# "LaGuardia Airport" taxi zone has pickup_near_airport == False, so the
# default 1.5 radius may be too small to cover the terminals; confirm.
df['pickup_near_airport'] = df.apply(
    lambda trip: _near_any_airport(trip['pickup_latitude'], trip['pickup_longitude']),
    axis=1,
)
df['dropoff_near_airport'] = df.apply(
    lambda trip: _near_any_airport(trip['dropoff_latitude'], trip['dropoff_longitude']),
    axis=1,
)

## 06 - Boundaries of NYC

In [9]:
# Taxi-zone polygons; the archive is downloaded every time this cell runs.
url = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip"

# Reproject the zones to EPSG:4326 so they share a CRS with the pickup points
# built below.
nyc_zones = gpd.read_file(url).to_crs("EPSG:4326")

# One point geometry per trip, built from the pickup coordinates.
pickup_points = gpd.points_from_xy(df['pickup_longitude'], df['pickup_latitude'])
gdf_taxi = gpd.GeoDataFrame(df, geometry=pickup_points, crs="EPSG:4326")

# Inner join: only trips whose pickup point is within a zone polygon are kept,
# and the matching zone's columns are appended to each trip.
df = gpd.sjoin(gdf_taxi, nyc_zones, how="inner", predicate="within")

## 07 - Analyze and Clean Trip Duration

In [10]:
# From here on trip_duration holds log(1 + duration).
# NOTE: re-running this cell applies the log transform a second time.
df['trip_duration'] = np.log1p(df['trip_duration'])
trip_duration_mean = df['trip_duration'].mean()

# Bounds on the log1p scale: expm1(4) is about 54 and expm1(8) about 2980
# in the original duration unit (presumably seconds).
min_duration, max_duration = 4, 8

# Row count before filtering, then keep only durations inside the bounds
# (both ends inclusive).
previous_trips = df.shape[0]
within_bounds = df['trip_duration'].between(min_duration, max_duration)
df = df[within_bounds]

## 08 - Final Cleaned Data Frame

In [11]:
# Show the first rows of the cleaned frame, including the joined zone columns.
df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,...,pickup_near_airport,dropoff_near_airport,geometry,index_right,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough
0,0,2,2016-06-29 18:21:02,1,-73.862762,40.768822,-73.891701,40.746689,N,7.033506,...,False,False,POINT (-73.86276 40.76882),137,138,0.107467,0.000537,LaGuardia Airport,138,Queens
1,1,2,2016-04-25 13:03:26,1,-73.958038,40.783237,-73.97551,40.760853,N,6.788972,...,False,False,POINT (-73.95804 40.78324),235,236,0.044252,0.000103,Upper East Side North,236,Manhattan
2,2,2,2016-05-07 12:36:09,1,-73.96946,40.785519,-73.989243,40.771748,N,6.532334,...,False,False,POINT (-73.96946 40.78552),237,238,0.060109,0.000185,Upper West Side North,238,Manhattan
3,3,1,2016-05-14 18:44:17,1,-73.981743,40.736549,-73.998352,40.72644,N,6.708084,...,False,False,POINT (-73.98174 40.73655),106,107,0.038041,7.5e-05,Gramercy,107,Manhattan
4,4,2,2016-04-10 22:51:25,1,-73.977913,40.752609,-73.975647,40.733139,N,6.858565,...,False,False,POINT (-73.97791 40.75261),161,162,0.03527,4.8e-05,Midtown East,162,Manhattan


# Feature Processing

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

## 01 - Divide features 
Divide features into numerical and categorical groups.

In [13]:
# Columns scaled as numeric inputs.
numerical_features = [
    'trip_distance', 'passenger_count',
    'pickup_hour_of_day', 'pickup_day_of_year',
]

# Columns that are one-hot encoded.
categorical_features = [
    'pickup_day_of_week', 'pickup_near_airport', 'dropoff_near_airport',
]

# Keep only the model inputs plus the target column.
feature_columns = numerical_features + categorical_features
df = df[feature_columns + ['trip_duration']]

# Shuffled 80/20 split with a fixed seed so the split is reproducible.
train, test = train_test_split(df, test_size=0.2, random_state=14, shuffle=True)
X_train, y_train = train[feature_columns], train['trip_duration']
X_test, y_test = test[feature_columns], test['trip_duration']

## 02 - Preprocess Features
For numerical features:
- Standardize them (zero mean, unit variance) with `StandardScaler` so that all features share a consistent scale.

For categorical features:
- Encode them with one-hot encoding (`OneHotEncoder`).

In [14]:
# Scale the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising, and verbose_feature_names_out=False keeps output column
# names free of the transformer prefix.
preprocessor = ColumnTransformer(
    transformers=[
        ('scaling', StandardScaler(), numerical_features),
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ],
    verbose_feature_names_out=False,
)

# Return pandas DataFrames (with column names) instead of NumPy arrays.
preprocessor.set_output(transform="pandas")

# Learn the scaling statistics and the category set from the training data only...
X_train = preprocessor.fit_transform(X_train)
# ...and reuse them unchanged on the test data. Calling fit_transform here
# would refit on the test set, leaking its statistics and possibly producing
# different one-hot columns than the training matrix.
X_test = preprocessor.transform(X_test)

In [15]:
# Weekday names in pandas' dayofweek order (0=Monday ... 6=Sunday).
day_names = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]

# Maps each one-hot column name to a readable weekday label.
day_mapping = {
    f'pickup_day_of_week_{day_number}': f'pickup_on_{day_name}'
    for day_number, day_name in enumerate(day_names)
}

# Give the one-hot weekday columns their readable names in both splits.
X_train, X_test = (
    frame.rename(columns=day_mapping) for frame in (X_train, X_test)
)

In [16]:
# Show the first rows of the transformed training features.
X_train.head()

Unnamed: 0,trip_distance,passenger_count,pickup_hour_of_day,pickup_day_of_year,pickup_on_Monday,pickup_on_Tuesday,pickup_on_Wednesday,pickup_on_Thursday,pickup_on_Friday,pickup_on_Saturday,pickup_on_Sunday,pickup_near_airport_False,pickup_near_airport_True,dropoff_near_airport_False,dropoff_near_airport_True
1032823,2.832762,-0.505698,-0.71615,0.455003,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
728437,-0.14533,0.254749,0.99505,-0.863832,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1014586,-0.666403,-0.505698,-2.116223,0.028321,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1069756,-0.501012,-0.505698,0.061668,-0.456545,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
732951,1.501551,-0.505698,-0.871714,-0.378966,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
