# init

## imports

In [1]:
import geopandas as gpd
import numpy as np
import os
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

## configs

In [2]:
DATA_PATH = os.path.expanduser('~/class/new_york/data/.data/')

START_DATE = pd.Timestamp('2023-01-01').date()
END_DATE = pd.Timestamp('2023-04-30').date()

POPULAR_AREA_COUNT = 50
LAG_DAYS_COUNT = 10 

OUTPUT_DATASET_PATH = os.path.expanduser('~/class/new_york/data/dataset.csv')

# load data

In [3]:
rides_df = pd.read_parquet(DATA_PATH)
print(rides_df.columns)
rides_df.head()

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


# data preprocessing

## filter by date

In [4]:
rides_df['Date'] = pd.to_datetime(rides_df['tpep_pickup_datetime']).dt.date
rides_df= rides_df[rides_df['Date'] >= START_DATE]
rides_df = rides_df[END_DATE >= rides_df['Date']]
rides_df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,Date
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0,2023-01-01
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0,2023-01-01
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0,2023-01-01
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25,2023-01-01
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0,2023-01-01


## defining popular areas

In [5]:
pop_area_df = (
    rides_df
    .groupby('PULocationID')
    ['VendorID']
    .count()
    .sort_values(ascending=False)
    .head(POPULAR_AREA_COUNT)
    .reset_index(name='count')
)
popular_areas_list = pop_area_df['PULocationID'].tolist()
pop_area_df.head()

Unnamed: 0,PULocationID,count
0,132,635312
1,237,590734
2,161,580792
3,236,542459
4,162,444517


## **labelling** - trips per area and date

In [6]:
daily_trips_df = (
    rides_df
    .groupby(['Date', 'PULocationID'])
    .size()
    .reset_index(name='Daily_trips')
)

daily_trips_df.head()

Unnamed: 0,Date,PULocationID,Daily_trips
0,2023-01-01,1,40
1,2023-01-01,4,174
2,2023-01-01,5,3
3,2023-01-01,6,1
4,2023-01-01,7,126


## filter popular areas daily trips

In [7]:
pop_areas_trips_df = daily_trips_df[daily_trips_df['PULocationID'].isin(popular_areas_list)]
pop_areas_trips_df.head()

Unnamed: 0,Date,PULocationID,Daily_trips
9,2023-01-01,13,398
18,2023-01-01,24,220
34,2023-01-01,43,1129
38,2023-01-01,48,2894
40,2023-01-01,50,868


## **feature extraction** - lag daily trips

In [9]:
for lag in range(1, LAG_DAYS_COUNT+1):
    pop_areas_trips_df[f'{lag}_day_lag'] = (
        pop_areas_trips_df
        .groupby('PULocationID')['Daily_trips']
        .rolling(window=lag+1)
        .apply(lambda x: x.iloc[0] if len(x) >= 1 else None)
        .reset_index(level=0, drop=True)
    )
pop_areas_trips_df.dropna(inplace=True)
pop_areas_trips_df.head()

Unnamed: 0,Date,PULocationID,Daily_trips,1_day_lag,2_day_lag,3_day_lag,4_day_lag,5_day_lag,6_day_lag,7_day_lag,8_day_lag,9_day_lag,10_day_lag
4389,2023-01-21,13,487,546.0,631.0,662.0,546.0,318.0,339.0,420.0,572.0,645.0,683.0
4398,2023-01-21,24,319,364.0,296.0,291.0,274.0,197.0,232.0,278.0,311.0,311.0,300.0
4413,2023-01-21,43,1936,1904.0,1918.0,1562.0,1471.0,1229.0,1399.0,1938.0,1806.0,1696.0,1549.0
4417,2023-01-21,48,3391,2810.0,2914.0,2600.0,2458.0,2178.0,3283.0,3712.0,2966.0,2865.0,2638.0
4419,2023-01-21,50,728,751.0,766.0,551.0,534.0,521.0,573.0,648.0,685.0,632.0,617.0


## save output

In [31]:
pop_areas_trips_df.to_csv(OUTPUT_DATASET_PATH, index=False)