# init

## imports

In [8]:
import os
import pandas as pd
import warnings

from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket 'ignore' presumably also hides pandas / scikit-learn
# warnings raised by later cells (e.g. when a column is assigned onto a
# filtered frame) — confirm this is intended rather than fixing the causes.
warnings.filterwarnings('ignore')

## configs

In [21]:
# Input CSV holding the trip counts and their lag columns.
DATASET_PATH = os.path.expanduser('~/class/new_york/data/dataset.csv')
# Output CSV that receives the test rows together with the model predictions.
PREDICTION_DATA_PATH = os.path.expanduser('~/class/new_york/data/test.csv')

# Rows dated before this day are used for training; rows on or after it are held out.
TRAIN_TEST_SPLIT_DATE = '2023-04-01'

# Model inputs: the '1_day_lag' … '10_day_lag' columns, in ascending lag order.
FEATURES = [f'{lag}_day_lag' for lag in range(1, 11)]
# Regression target, kept as a one-element list of column names.
LABEL = ['Daily_trips']

# load data

In [5]:
# Read the whole dataset CSV into memory; the split step later filters on its 'Date' column.
dataset_df = pd.read_csv(DATASET_PATH)
# Bare last expression so the notebook renders the frame.
dataset_df

Unnamed: 0,Date,PULocationID,Daily_trips,1_day_lag,2_day_lag,3_day_lag,4_day_lag,5_day_lag,6_day_lag,7_day_lag,8_day_lag,9_day_lag,10_day_lag
0,2023-01-11,13,683,633.0,456.0,278.0,417.0,507.0,614.0,493.0,446.0,381.0,398.0
1,2023-01-11,24,300,271.0,239.0,225.0,320.0,306.0,288.0,277.0,261.0,169.0,220.0
2,2023-01-11,43,1549,1550.0,1376.0,1340.0,1919.0,1994.0,1859.0,1464.0,1474.0,1283.0,1129.0
3,2023-01-11,48,2638,2501.0,2039.0,2725.0,3232.0,2787.0,2632.0,2360.0,2053.0,1624.0,2894.0
4,2023-01-11,50,617,553.0,464.0,578.0,651.0,628.0,532.0,491.0,502.0,447.0,868.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5495,2023-04-30,249,2490,3530.0,2734.0,2363.0,2081.0,1774.0,1452.0,2419.0,3201.0,2592.0,2296.0
5496,2023-04-30,261,440,481.0,551.0,583.0,598.0,497.0,441.0,477.0,611.0,570.0,663.0
5497,2023-04-30,262,1214,1445.0,1785.0,1693.0,1735.0,1579.0,1465.0,1221.0,1451.0,1611.0,1795.0
5498,2023-04-30,263,2189,2671.0,2561.0,2178.0,2069.0,1932.0,1735.0,1970.0,2957.0,2464.0,2262.0


# train/test split

In [10]:
def train_test_split(dataset_df, split_date):
    """Split ``dataset_df`` chronologically on its ``Date`` column.

    Parameters
    ----------
    dataset_df : pandas.DataFrame
        Frame containing a ``Date`` column whose values can be compared
        with ``split_date`` (e.g. ISO ``YYYY-MM-DD`` strings).
    split_date : str
        First date that belongs to the test set.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``: the rows with ``Date < split_date`` and the
        rows with ``Date >= split_date``. The original index labels are kept.
        Both frames are independent copies, so callers can add columns
        (such as predictions) without touching ``dataset_df`` and without
        triggering pandas' chained-assignment warning.
    """
    # .copy() detaches each selection from dataset_df; a plain boolean-mask
    # selection is tracked by pandas as a possible view of its parent.
    train_df = dataset_df[dataset_df['Date'] < split_date].copy()
    test_df = dataset_df[dataset_df['Date'] >= split_date].copy()
    return train_df, test_df

# Cut the dataset at TRAIN_TEST_SPLIT_DATE: earlier rows train, the rest are held out.
train_df, test_df = train_test_split(
    dataset_df=dataset_df,
    split_date=TRAIN_TEST_SPLIT_DATE,
)
# Show the training portion.
train_df

Unnamed: 0,Date,PULocationID,Daily_trips,1_day_lag,2_day_lag,3_day_lag,4_day_lag,5_day_lag,6_day_lag,7_day_lag,8_day_lag,9_day_lag,10_day_lag
0,2023-01-11,13,683,633.0,456.0,278.0,417.0,507.0,614.0,493.0,446.0,381.0,398.0
1,2023-01-11,24,300,271.0,239.0,225.0,320.0,306.0,288.0,277.0,261.0,169.0,220.0
2,2023-01-11,43,1549,1550.0,1376.0,1340.0,1919.0,1994.0,1859.0,1464.0,1474.0,1283.0,1129.0
3,2023-01-11,48,2638,2501.0,2039.0,2725.0,3232.0,2787.0,2632.0,2360.0,2053.0,1624.0,2894.0
4,2023-01-11,50,617,553.0,464.0,578.0,651.0,628.0,532.0,491.0,502.0,447.0,868.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,2023-03-31,249,2786,2210.0,1969.0,1768.0,1382.0,2773.0,3887.0,2652.0,2148.0,1811.0,1649.0
3996,2023-03-31,261,518,591.0,610.0,472.0,444.0,584.0,583.0,599.0,545.0,549.0,483.0
3997,2023-03-31,262,1525,1563.0,1445.0,1454.0,1219.0,940.0,1254.0,1499.0,1469.0,1384.0,1404.0
3998,2023-03-31,263,2377,2138.0,1958.0,1830.0,1676.0,1816.0,2402.0,2343.0,1953.0,1935.0,1687.0


In [28]:
# Display the held-out rows (Date >= TRAIN_TEST_SPLIT_DATE) returned by train_test_split.
# NOTE(review): the output saved under this cell lists columns such as
# LocationID, Week_days and 2_days_lag, which do not match the dataset_df
# columns shown elsewhere in the notebook — it looks stale; re-run top to bottom.
test_df

Unnamed: 0,Date,LocationID,Week_days,Daily_trips,1_day_lag,2_days_lag,3_days_lag,4_days_lag,5_days_lag,6_days_lag,7_days_lag,8_days_lag,9_days_lag,10_days_lag
3850,2023-04-01,13,Saturday,518,672.0,753.0,760.0,746.0,556.0,402.0,480.0,605.0,740.0,710.0
3851,2023-04-01,24,Saturday,335,318.0,316.0,275.0,238.0,250.0,268.0,269.0,300.0,266.0,272.0
3852,2023-04-01,43,Saturday,1888,1776.0,1686.0,1489.0,1537.0,1435.0,1568.0,1985.0,1714.0,1831.0,1348.0
3853,2023-04-01,48,Saturday,3868,3224.0,3112.0,2934.0,2628.0,2392.0,3034.0,3780.0,3073.0,3114.0,2880.0
3854,2023-04-01,50,Saturday,842,774.0,675.0,668.0,613.0,564.0,712.0,772.0,639.0,645.0,595.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5345,2023-04-30,246,Sunday,1384,1810.0,1628.0,2119.0,1826.0,1744.0,1394.0,1676.0,2116.0,1568.0,1889.0
5346,2023-04-30,249,Sunday,2490,3530.0,2734.0,2363.0,2081.0,1774.0,1452.0,2419.0,3201.0,2592.0,2296.0
5347,2023-04-30,261,Sunday,440,481.0,551.0,583.0,598.0,497.0,441.0,477.0,611.0,570.0,663.0
5348,2023-04-30,262,Sunday,1214,1445.0,1785.0,1693.0,1735.0,1579.0,1465.0,1221.0,1451.0,1611.0,1795.0


# methods

## decision tree

### training model

In [16]:
# Fit a single regression tree on the training rows.
# random_state is pinned so that Restart & Run All rebuilds the same tree:
# scikit-learn permutes the candidate features at every split, so equally
# good splits can otherwise be resolved differently between runs.
dt = DecisionTreeRegressor(
    max_depth=16,
    min_samples_split=30,
    min_samples_leaf=40,
    random_state=42,
)
# LABEL is a list of column names, so train_df[LABEL] is a single-column
# frame; flatten it to the 1-D target shape the regressor expects.
dt.fit(train_df[FEATURES], train_df[LABEL].to_numpy().ravel())

### predict

In [18]:
# Score the held-out rows with the fitted decision tree.
# assign() returns a new frame instead of writing a column in place onto
# test_df (which was produced by filtering dataset_df), so the cell is safe
# to re-run and does not rely on warnings being suppressed.
test_df = test_df.assign(Pred_dt=dt.predict(test_df[FEATURES]))
test_df

Unnamed: 0,Date,PULocationID,Daily_trips,1_day_lag,2_day_lag,3_day_lag,4_day_lag,5_day_lag,6_day_lag,7_day_lag,8_day_lag,9_day_lag,10_day_lag,Pred_dt
4000,2023-04-01,13,518,672.0,753.0,760.0,746.0,556.0,402.0,480.0,605.0,740.0,710.0,469.793651
4001,2023-04-01,24,335,318.0,316.0,275.0,238.0,250.0,268.0,269.0,300.0,266.0,272.0,267.230769
4002,2023-04-01,43,1888,1776.0,1686.0,1489.0,1537.0,1435.0,1568.0,1985.0,1714.0,1831.0,1348.0,1891.365079
4003,2023-04-01,48,3868,3224.0,3112.0,2934.0,2628.0,2392.0,3034.0,3780.0,3073.0,3114.0,2880.0,3651.555556
4004,2023-04-01,50,842,774.0,675.0,668.0,613.0,564.0,712.0,772.0,639.0,645.0,595.0,803.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5495,2023-04-30,249,2490,3530.0,2734.0,2363.0,2081.0,1774.0,1452.0,2419.0,3201.0,2592.0,2296.0,2547.978261
5496,2023-04-30,261,440,481.0,551.0,583.0,598.0,497.0,441.0,477.0,611.0,570.0,663.0,417.212766
5497,2023-04-30,262,1214,1445.0,1785.0,1693.0,1735.0,1579.0,1465.0,1221.0,1451.0,1611.0,1795.0,1198.690476
5498,2023-04-30,263,2189,2671.0,2561.0,2178.0,2069.0,1932.0,1735.0,1970.0,2957.0,2464.0,2262.0,2056.925000


## random forest

### training model

In [19]:
# Fit a 500-tree random forest on the training rows.
# random_state is pinned because bootstrap sampling and per-split feature
# selection are random; without it every run yields different predictions.
rf = RandomForestRegressor(n_estimators=500, random_state=42)
# LABEL is a list of column names, so train_df[LABEL] is a single-column
# frame; flatten it to 1-D so scikit-learn does not have to convert a
# column-vector target itself.
rf.fit(train_df[FEATURES], train_df[LABEL].to_numpy().ravel())

### predict

In [20]:
# Score the held-out rows with the fitted random forest.
# assign() returns a new frame instead of writing a column in place onto
# test_df, so the cell is safe to re-run and does not rely on warnings
# being suppressed.
test_df = test_df.assign(Pred_rf=rf.predict(test_df[FEATURES]))
test_df

Unnamed: 0,Date,PULocationID,Daily_trips,1_day_lag,2_day_lag,3_day_lag,4_day_lag,5_day_lag,6_day_lag,7_day_lag,8_day_lag,9_day_lag,10_day_lag,Pred_dt,Pred_rf
4000,2023-04-01,13,518,672.0,753.0,760.0,746.0,556.0,402.0,480.0,605.0,740.0,710.0,469.793651,458.082
4001,2023-04-01,24,335,318.0,316.0,275.0,238.0,250.0,268.0,269.0,300.0,266.0,272.0,267.230769,302.600
4002,2023-04-01,43,1888,1776.0,1686.0,1489.0,1537.0,1435.0,1568.0,1985.0,1714.0,1831.0,1348.0,1891.365079,1970.318
4003,2023-04-01,48,3868,3224.0,3112.0,2934.0,2628.0,2392.0,3034.0,3780.0,3073.0,3114.0,2880.0,3651.555556,3615.644
4004,2023-04-01,50,842,774.0,675.0,668.0,613.0,564.0,712.0,772.0,639.0,645.0,595.0,803.000000,758.004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5495,2023-04-30,249,2490,3530.0,2734.0,2363.0,2081.0,1774.0,1452.0,2419.0,3201.0,2592.0,2296.0,2547.978261,2600.132
5496,2023-04-30,261,440,481.0,551.0,583.0,598.0,497.0,441.0,477.0,611.0,570.0,663.0,417.212766,453.692
5497,2023-04-30,262,1214,1445.0,1785.0,1693.0,1735.0,1579.0,1465.0,1221.0,1451.0,1611.0,1795.0,1198.690476,1282.666
5498,2023-04-30,263,2189,2671.0,2561.0,2178.0,2069.0,1932.0,1735.0,1970.0,2957.0,2464.0,2262.0,2056.925000,2096.156


# save prediction

In [22]:
# Write the test rows, including the Pred_dt and Pred_rf columns added by the
# predict cells, to PREDICTION_DATA_PATH.
# index=False leaves the DataFrame index out of the file.
test_df.to_csv(PREDICTION_DATA_PATH, index=False)