# init

## imports

In [1]:
import os
import pandas as pd
import warnings

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

# NOTE(review): with no category or module filter this silences every warning
# for the rest of the session, including any raised by pandas or scikit-learn
# in the cells below. Consider narrowing it to the specific categories that
# are known to be noise.
warnings.filterwarnings('ignore')

## configs

In [2]:
# Input CSV; the later cells read the Date, Daily_trips and *_day_lag columns.
DATASET_PATH = os.path.expanduser('~/class/new_york/data/dataset.csv')

# Rows dated before this day are used for training, the rest for testing.
TRAIN_TEST_SPLIT_DATE = '2023-04-01'

# Model inputs: '1_day_lag', '2_day_lag', ..., '10_day_lag'.
FEATURES = [f'{lag}_day_lag' for lag in range(1, 11)]

# Target column, kept as a one-element list so it can index a DataFrame
# the same way FEATURES does.
LABEL = ['Daily_trips']

# Output CSV: the test rows with the model predictions appended.
PREDICTION_DATA_PATH = os.path.expanduser('~/class/new_york/data/test.csv')

# load data

In [3]:
dataset_df = pd.read_csv(DATASET_PATH)

# Fail fast, with a readable message, if the file lacks a column that the
# later cells index by name (split column, model inputs, target).
required_columns = {'Date', *FEATURES, *LABEL}
missing_columns = required_columns - set(dataset_df.columns)
assert not missing_columns, f'dataset is missing columns: {sorted(missing_columns)}'

dataset_df.head()

Unnamed: 0,Date,PULocationID,Daily_trips,1_day_lag,2_day_lag,3_day_lag,4_day_lag,5_day_lag,6_day_lag,7_day_lag,8_day_lag,9_day_lag,10_day_lag
0,2023-01-11,13,683,633.0,456.0,278.0,417.0,507.0,614.0,493.0,446.0,381.0,398.0
1,2023-01-11,24,300,271.0,239.0,225.0,320.0,306.0,288.0,277.0,261.0,169.0,220.0
2,2023-01-11,43,1549,1550.0,1376.0,1340.0,1919.0,1994.0,1859.0,1464.0,1474.0,1283.0,1129.0
3,2023-01-11,48,2638,2501.0,2039.0,2725.0,3232.0,2787.0,2632.0,2360.0,2053.0,1624.0,2894.0
4,2023-01-11,50,617,553.0,464.0,578.0,651.0,628.0,532.0,491.0,502.0,447.0,868.0


# train/test split

In [4]:
def train_test_split(dataset_df, split_date, date_col='Date'):
    """Split a dataset chronologically into a training and a test frame.

    Rows whose ``date_col`` value is strictly before ``split_date`` go to the
    training frame; rows on or after it go to the test frame.

    Note: this is a local helper and is unrelated to
    ``sklearn.model_selection.train_test_split``.

    Parameters
    ----------
    dataset_df : pandas.DataFrame
        Frame containing the ``date_col`` column. It is not modified.
    split_date : str or datetime-like
        Cut-off compared against the column with ``<`` / ``>=``. When the
        column holds strings the comparison is lexicographic, which only
        matches chronological order for ISO ``YYYY-MM-DD`` dates.
    date_col : str, default 'Date'
        Name of the column to split on.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``. Both are independent copies, so callers can
        add columns (e.g. predictions) without triggering pandas'
        ``SettingWithCopyWarning`` or touching ``dataset_df``.
    """
    dates = dataset_df[date_col]
    # .copy() detaches the results from the parent frame; a bare boolean
    # selection is flagged by pandas as a potential view on later assignment.
    train_df = dataset_df.loc[dates < split_date].copy()
    test_df = dataset_df.loc[dates >= split_date].copy()
    return train_df, test_df

train_df, test_df = train_test_split(dataset_df, TRAIN_TEST_SPLIT_DATE)

# Sanity checks: a split date outside the data range would leave one side
# empty, and rows with a missing date would silently fall out of both sides.
assert len(train_df) > 0 and len(test_df) > 0, 'split produced an empty train or test set'
assert len(train_df) + len(test_df) == len(dataset_df), 'rows were lost by the date split'

train_df.head()

Unnamed: 0,Date,PULocationID,Daily_trips,1_day_lag,2_day_lag,3_day_lag,4_day_lag,5_day_lag,6_day_lag,7_day_lag,8_day_lag,9_day_lag,10_day_lag
0,2023-01-11,13,683,633.0,456.0,278.0,417.0,507.0,614.0,493.0,446.0,381.0,398.0
1,2023-01-11,24,300,271.0,239.0,225.0,320.0,306.0,288.0,277.0,261.0,169.0,220.0
2,2023-01-11,43,1549,1550.0,1376.0,1340.0,1919.0,1994.0,1859.0,1464.0,1474.0,1283.0,1129.0
3,2023-01-11,48,2638,2501.0,2039.0,2725.0,3232.0,2787.0,2632.0,2360.0,2053.0,1624.0,2894.0
4,2023-01-11,50,617,553.0,464.0,578.0,651.0,628.0,532.0,491.0,502.0,447.0,868.0


In [5]:
# Preview the first rows of the held-out frame.
test_df.head(5)

Unnamed: 0,Date,PULocationID,Daily_trips,1_day_lag,2_day_lag,3_day_lag,4_day_lag,5_day_lag,6_day_lag,7_day_lag,8_day_lag,9_day_lag,10_day_lag
4000,2023-04-01,13,518,672.0,753.0,760.0,746.0,556.0,402.0,480.0,605.0,740.0,710.0
4001,2023-04-01,24,335,318.0,316.0,275.0,238.0,250.0,268.0,269.0,300.0,266.0,272.0
4002,2023-04-01,43,1888,1776.0,1686.0,1489.0,1537.0,1435.0,1568.0,1985.0,1714.0,1831.0,1348.0
4003,2023-04-01,48,3868,3224.0,3112.0,2934.0,2628.0,2392.0,3034.0,3780.0,3073.0,3114.0,2880.0
4004,2023-04-01,50,842,774.0,675.0,668.0,613.0,564.0,712.0,772.0,639.0,645.0,595.0


# methods

## decision tree

### training model

In [7]:
# Exploratory fit: max_depth=500 is set high so the depth reported below is
# the depth the tree reaches on this training set rather than the cap.
# random_state pins scikit-learn's internal feature shuffling so the
# reported depth is reproducible across runs.
dt_regressor = DecisionTreeRegressor(max_depth=500, random_state=42)

# LABEL is a one-element list, so train_df[LABEL] is a single-column frame;
# squeeze it to a 1-D Series, the target shape scikit-learn expects.
dt_regressor.fit(train_df[FEATURES], train_df[LABEL].squeeze(axis='columns'))

max_depth = dt_regressor.tree_.max_depth

print("Maximum tree depth in the Decision Tree Regressor:", max_depth)

Maximum tree depth in the Decision Tree Regressor: 31


In [7]:
# Regularised tree used for the actual predictions.
# NOTE(review): min_samples_leaf=40 already means a node needs at least 80
# samples to be split, so min_samples_split=30 never binds; kept as-is so the
# hyper-parameters match the original run.
# random_state makes the fitted tree reproducible across runs.
dt = DecisionTreeRegressor(
    max_depth=16,
    min_samples_split=30,
    min_samples_leaf=40,
    random_state=42,
)
# Pass the single target column as a 1-D Series rather than an (n, 1) frame.
dt.fit(train_df[FEATURES], train_df[LABEL].squeeze(axis='columns'))

### predict

In [8]:
# assign() returns a new frame instead of writing into test_df, which was
# produced by filtering dataset_df; this avoids pandas' SettingWithCopyWarning
# and keeps the cell idempotent (re-running just replaces the column).
test_df = test_df.assign(Pred_dt=dt.predict(test_df[FEATURES]))
test_df.head()

Unnamed: 0,Date,PULocationID,Daily_trips,1_day_lag,2_day_lag,3_day_lag,4_day_lag,5_day_lag,6_day_lag,7_day_lag,8_day_lag,9_day_lag,10_day_lag,Pred_dt
4000,2023-04-01,13,518,672.0,753.0,760.0,746.0,556.0,402.0,480.0,605.0,740.0,710.0,469.793651
4001,2023-04-01,24,335,318.0,316.0,275.0,238.0,250.0,268.0,269.0,300.0,266.0,272.0,267.230769
4002,2023-04-01,43,1888,1776.0,1686.0,1489.0,1537.0,1435.0,1568.0,1985.0,1714.0,1831.0,1348.0,1891.365079
4003,2023-04-01,48,3868,3224.0,3112.0,2934.0,2628.0,2392.0,3034.0,3780.0,3073.0,3114.0,2880.0,3651.555556
4004,2023-04-01,50,842,774.0,675.0,668.0,613.0,564.0,712.0,772.0,639.0,645.0,595.0,803.0


## random forest

### training model

In [9]:
# random_state pins the bootstrap sampling and feature selection so the
# forest, and therefore the saved predictions, are reproducible across runs.
rf = RandomForestRegressor(n_estimators=500, random_state=42)
# LABEL is a one-element list, so train_df[LABEL] is an (n, 1) frame;
# squeeze it to a 1-D Series so scikit-learn does not have to reshape it
# (it emits a DataConversionWarning for column-vector targets).
rf.fit(train_df[FEATURES], train_df[LABEL].squeeze(axis='columns'))

### predict

In [10]:
# assign() returns a new frame instead of writing into test_df, which was
# produced by filtering dataset_df; this avoids pandas' SettingWithCopyWarning
# and keeps the cell idempotent (re-running just replaces the column).
test_df = test_df.assign(Pred_rf=rf.predict(test_df[FEATURES]))
test_df.head()

Unnamed: 0,Date,PULocationID,Daily_trips,1_day_lag,2_day_lag,3_day_lag,4_day_lag,5_day_lag,6_day_lag,7_day_lag,8_day_lag,9_day_lag,10_day_lag,Pred_dt,Pred_rf
4000,2023-04-01,13,518,672.0,753.0,760.0,746.0,556.0,402.0,480.0,605.0,740.0,710.0,469.793651,464.444
4001,2023-04-01,24,335,318.0,316.0,275.0,238.0,250.0,268.0,269.0,300.0,266.0,272.0,267.230769,302.324
4002,2023-04-01,43,1888,1776.0,1686.0,1489.0,1537.0,1435.0,1568.0,1985.0,1714.0,1831.0,1348.0,1891.365079,1972.774
4003,2023-04-01,48,3868,3224.0,3112.0,2934.0,2628.0,2392.0,3034.0,3780.0,3073.0,3114.0,2880.0,3651.555556,3601.762
4004,2023-04-01,50,842,774.0,675.0,668.0,613.0,564.0,712.0,772.0,639.0,645.0,595.0,803.0,756.816


# save prediction

In [22]:
# Write the test frame, including any prediction columns added above, to
# disk; index=False leaves the row index out of the file.
test_df.to_csv(
    PREDICTION_DATA_PATH,
    index=False,
)