# init

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import pyarrow
import geopandas as gpd
import seaborn as sns
import mapclassify
import matplotlib.colors as mcolors
import warnings
import math
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from tabulate import tabulate

## configs

# load data

In [3]:
top_50_areas_df = pd.read_csv('/home/shaghayegh/class/New York project/top50areas.csv')
top_50_areas_df

Unnamed: 0,date,LocationID,Day_of_Week,daily_trips,1_day_ago,2_days_ago,3_days_ago,4_days_ago,5_days_ago,6_days_ago,7_days_ago,8_days_ago,9_days_ago,10_days_ago
0,2023-01-14,13,Saturday,420,572.0,645.0,683.0,633.0,456.0,278.0,417.0,507.0,614.0,493.0
1,2023-01-14,24,Saturday,278,311.0,311.0,300.0,271.0,239.0,225.0,320.0,306.0,288.0,277.0
2,2023-01-14,43,Saturday,1938,1806.0,1696.0,1549.0,1550.0,1376.0,1340.0,1919.0,1994.0,1859.0,1464.0
3,2023-01-14,48,Saturday,3712,2966.0,2865.0,2638.0,2501.0,2039.0,2725.0,3232.0,2787.0,2632.0,2360.0
4,2023-01-14,50,Saturday,648,685.0,632.0,617.0,553.0,464.0,578.0,651.0,628.0,532.0,491.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5345,2023-04-30,246,Sunday,1384,1810.0,1628.0,2119.0,1826.0,1744.0,1394.0,1676.0,2116.0,1568.0,1889.0
5346,2023-04-30,249,Sunday,2490,3530.0,2734.0,2363.0,2081.0,1774.0,1452.0,2419.0,3201.0,2592.0,2296.0
5347,2023-04-30,261,Sunday,440,481.0,551.0,583.0,598.0,497.0,441.0,477.0,611.0,570.0,663.0
5348,2023-04-30,262,Sunday,1214,1445.0,1785.0,1693.0,1735.0,1579.0,1465.0,1221.0,1451.0,1611.0,1795.0


In [14]:
warnings.filterwarnings('ignore')

## top 50 areas mape & rmse

In [38]:
def mape(real, pred):
    return (abs(pred - real) / real).mean()


mape_1_day_ago = mape(top_50_areas_df['daily_trips'], top_50_areas_df['1_day_ago'])
mape_1_day_ago

0.1870031381531779

In [44]:
mape_7_days_ago = mape(top_50_areas_df['daily_trips'], top_50_areas_df['7_days_ago'])
mape_7_days_ago

0.09505533346278026

In [48]:
def rmse(real, pred):
    a =((pred - real) ** 2).mean()
    return math.sqrt(a)

rmse_1_day_ago = rmse(top_50_areas_df['daily_trips'], top_50_areas_df['1_day_ago'])
rmse_1_day_ago

506.86874907124303

In [54]:
rmse_7_days_ago = rmse(top_50_areas_df['daily_trips'], top_50_areas_df['7_days_ago'])
rmse_7_days_ago

275.7492664431317

In [58]:
top_50_areas_df['ratio'] = top_50_areas_df['1_day_ago'] / top_50_areas_df['8_days_ago']
top_50_areas_df

Unnamed: 0,date,LocationID,Day_of_Week,daily_trips,1_day_ago,2_days_ago,3_days_ago,4_days_ago,5_days_ago,6_days_ago,7_days_ago,8_days_ago,9_days_ago,10_days_ago,ratio
2849,2023-01-14,13,Saturday,420,572.0,645.0,683.0,633.0,456.0,278.0,417.0,507.0,614.0,493.0,1.128205
2858,2023-01-14,24,Saturday,278,311.0,311.0,300.0,271.0,239.0,225.0,320.0,306.0,288.0,277.0,1.016340
2873,2023-01-14,43,Saturday,1938,1806.0,1696.0,1549.0,1550.0,1376.0,1340.0,1919.0,1994.0,1859.0,1464.0,0.905717
2877,2023-01-14,48,Saturday,3712,2966.0,2865.0,2638.0,2501.0,2039.0,2725.0,3232.0,2787.0,2632.0,2360.0,1.064227
2879,2023-01-14,50,Saturday,648,685.0,632.0,617.0,553.0,464.0,578.0,651.0,628.0,532.0,491.0,1.090764
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26271,2023-04-30,246,Sunday,1384,1810.0,1628.0,2119.0,1826.0,1744.0,1394.0,1676.0,2116.0,1568.0,1889.0,0.855388
26274,2023-04-30,249,Sunday,2490,3530.0,2734.0,2363.0,2081.0,1774.0,1452.0,2419.0,3201.0,2592.0,2296.0,1.102780
26282,2023-04-30,261,Sunday,440,481.0,551.0,583.0,598.0,497.0,441.0,477.0,611.0,570.0,663.0,0.787234
26283,2023-04-30,262,Sunday,1214,1445.0,1785.0,1693.0,1735.0,1579.0,1465.0,1221.0,1451.0,1611.0,1795.0,0.995865


In [59]:
top_50_areas_df['pred'] = top_50_areas_df['7_days_ago'] * top_50_areas_df['ratio']
top_50_areas_df

Unnamed: 0,date,LocationID,Day_of_Week,daily_trips,1_day_ago,2_days_ago,3_days_ago,4_days_ago,5_days_ago,6_days_ago,7_days_ago,8_days_ago,9_days_ago,10_days_ago,ratio,pred
2849,2023-01-14,13,Saturday,420,572.0,645.0,683.0,633.0,456.0,278.0,417.0,507.0,614.0,493.0,1.128205,470.461538
2858,2023-01-14,24,Saturday,278,311.0,311.0,300.0,271.0,239.0,225.0,320.0,306.0,288.0,277.0,1.016340,325.228758
2873,2023-01-14,43,Saturday,1938,1806.0,1696.0,1549.0,1550.0,1376.0,1340.0,1919.0,1994.0,1859.0,1464.0,0.905717,1738.071214
2877,2023-01-14,48,Saturday,3712,2966.0,2865.0,2638.0,2501.0,2039.0,2725.0,3232.0,2787.0,2632.0,2360.0,1.064227,3439.580911
2879,2023-01-14,50,Saturday,648,685.0,632.0,617.0,553.0,464.0,578.0,651.0,628.0,532.0,491.0,1.090764,710.087580
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26271,2023-04-30,246,Sunday,1384,1810.0,1628.0,2119.0,1826.0,1744.0,1394.0,1676.0,2116.0,1568.0,1889.0,0.855388,1433.629490
26274,2023-04-30,249,Sunday,2490,3530.0,2734.0,2363.0,2081.0,1774.0,1452.0,2419.0,3201.0,2592.0,2296.0,1.102780,2667.625742
26282,2023-04-30,261,Sunday,440,481.0,551.0,583.0,598.0,497.0,441.0,477.0,611.0,570.0,663.0,0.787234,375.510638
26283,2023-04-30,262,Sunday,1214,1445.0,1785.0,1693.0,1735.0,1579.0,1465.0,1221.0,1451.0,1611.0,1795.0,0.995865,1215.951068


In [60]:
mape_prediction = mape(top_50_areas_df['daily_trips'], top_50_areas_df['pred'] )
mape_prediction

0.10537339678804265

In [61]:
rmse_prediction = rmse(top_50_areas_df['daily_trips'], top_50_areas_df['pred'] )
rmse_prediction

308.7434410525133

In [62]:
result_table = [["1_day_ago",mape_1_day_ago, rmse_1_day_ago],
         ["7_days_ago",mape_7_days_ago, rmse_7_days_ago],
         ["Prediction",mape_prediction, rmse_prediction]]
print(tabulate(result_table, headers=["FREQUENCY","MAPE", "RMSE"], tablefmt="simple_grid"))

┌─────────────┬───────────┬─────────┐
│ FREQUENCY   │      MAPE │    RMSE │
├─────────────┼───────────┼─────────┤
│ 1_day_ago   │ 0.187003  │ 506.869 │
├─────────────┼───────────┼─────────┤
│ 7_days_ago  │ 0.0950553 │ 275.749 │
├─────────────┼───────────┼─────────┤
│ Prediction  │ 0.105373  │ 308.743 │
└─────────────┴───────────┴─────────┘


# random forest prediction 

### preparation

## train/test split

In [7]:
def train_test_split(dataset_df, split_date):
    train_df = dataset_df[dataset_df['date'] < split_date]
    test_df = dataset_df[dataset_df['date'] >= split_date]
    return train_df, test_df

train_df, test_df = train_test_split(top_50_areas_df, '2023-04-01')

In [8]:
train_df

Unnamed: 0,date,LocationID,Day_of_Week,daily_trips,1_day_ago,2_days_ago,3_days_ago,4_days_ago,5_days_ago,6_days_ago,7_days_ago,8_days_ago,9_days_ago,10_days_ago,ratio,pred
0,2023-01-14,13,Saturday,420,572.0,645.0,683.0,633.0,456.0,278.0,417.0,507.0,614.0,493.0,1.128205,470.461538
1,2023-01-14,24,Saturday,278,311.0,311.0,300.0,271.0,239.0,225.0,320.0,306.0,288.0,277.0,1.016340,325.228758
2,2023-01-14,43,Saturday,1938,1806.0,1696.0,1549.0,1550.0,1376.0,1340.0,1919.0,1994.0,1859.0,1464.0,0.905717,1738.071214
3,2023-01-14,48,Saturday,3712,2966.0,2865.0,2638.0,2501.0,2039.0,2725.0,3232.0,2787.0,2632.0,2360.0,1.064227,3439.580911
4,2023-01-14,50,Saturday,648,685.0,632.0,617.0,553.0,464.0,578.0,651.0,628.0,532.0,491.0,1.090764,710.087580
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3845,2023-03-31,246,Friday,1526,1798.0,1586.0,1613.0,1306.0,1561.0,1958.0,1561.0,1747.0,1463.0,1340.0,1.029193,1606.570120
3846,2023-03-31,249,Friday,2786,2210.0,1969.0,1768.0,1382.0,2773.0,3887.0,2652.0,2148.0,1811.0,1649.0,1.028864,2728.547486
3847,2023-03-31,261,Friday,518,591.0,610.0,472.0,444.0,584.0,583.0,599.0,545.0,549.0,483.0,1.084404,649.557798
3848,2023-03-31,262,Friday,1525,1563.0,1445.0,1454.0,1219.0,940.0,1254.0,1499.0,1469.0,1384.0,1404.0,1.063989,1594.919673


In [9]:
test_df

Unnamed: 0,date,LocationID,Day_of_Week,daily_trips,1_day_ago,2_days_ago,3_days_ago,4_days_ago,5_days_ago,6_days_ago,7_days_ago,8_days_ago,9_days_ago,10_days_ago,ratio,pred
3850,2023-04-01,13,Saturday,518,672.0,753.0,760.0,746.0,556.0,402.0,480.0,605.0,740.0,710.0,1.110744,533.157025
3851,2023-04-01,24,Saturday,335,318.0,316.0,275.0,238.0,250.0,268.0,269.0,300.0,266.0,272.0,1.060000,285.140000
3852,2023-04-01,43,Saturday,1888,1776.0,1686.0,1489.0,1537.0,1435.0,1568.0,1985.0,1714.0,1831.0,1348.0,1.036173,2056.802800
3853,2023-04-01,48,Saturday,3868,3224.0,3112.0,2934.0,2628.0,2392.0,3034.0,3780.0,3073.0,3114.0,2880.0,1.049138,3965.740319
3854,2023-04-01,50,Saturday,842,774.0,675.0,668.0,613.0,564.0,712.0,772.0,639.0,645.0,595.0,1.211268,935.098592
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5345,2023-04-30,246,Sunday,1384,1810.0,1628.0,2119.0,1826.0,1744.0,1394.0,1676.0,2116.0,1568.0,1889.0,0.855388,1433.629490
5346,2023-04-30,249,Sunday,2490,3530.0,2734.0,2363.0,2081.0,1774.0,1452.0,2419.0,3201.0,2592.0,2296.0,1.102780,2667.625742
5347,2023-04-30,261,Sunday,440,481.0,551.0,583.0,598.0,497.0,441.0,477.0,611.0,570.0,663.0,0.787234,375.510638
5348,2023-04-30,262,Sunday,1214,1445.0,1785.0,1693.0,1735.0,1579.0,1465.0,1221.0,1451.0,1611.0,1795.0,0.995865,1215.951068


In [10]:
FEATURES = ['1_day_ago', '2_days_ago',	'3_days_ago', '4_days_ago',	'5_days_ago', '6_days_ago',	'7_days_ago', '8_days_ago',	'9_days_ago', '10_days_ago']

In [11]:
LABEL = ['daily_trips']

### decision tree

In [15]:
dt = DecisionTreeRegressor(max_depth=16, min_samples_split=30, min_samples_leaf=40)
dt.fit(train_df[FEATURES], train_df[LABEL])

### random forest

In [16]:
rf = RandomForestRegressor(n_estimators=500)
rf.fit(train_df[FEATURES], train_df[LABEL])

  return fit_method(estimator, *args, **kwargs)


In [24]:
result_table = [["1_day_ago",mape(test_df['daily_trips'], test_df['1_day_ago']), rmse(test_df['daily_trips'], test_df['1_day_ago'])],
         ["7_days_ago",mape(test_df['daily_trips'], test_df['7_days_ago']), rmse(test_df['daily_trips'], test_df['7_days_ago'])],
         ["RF",mape(test_df['daily_trips'], test_df['pred']), rmse(test_df['daily_trips'], test_df['pred'])]]
print(tabulate(result_table, headers=["FREQUENCY","MAPE", "RMSE"], tablefmt="simple_grid"))

┌─────────────┬───────────┬─────────┐
│ FREQUENCY   │      MAPE │    RMSE │
├─────────────┼───────────┼─────────┤
│ 1_day_ago   │ 0.179403  │ 492.71  │
├─────────────┼───────────┼─────────┤
│ 7_days_ago  │ 0.0970103 │ 279.278 │
├─────────────┼───────────┼─────────┤
│ RF          │ 0.0855335 │ 241.925 │
└─────────────┴───────────┴─────────┘


In [25]:
(mape(test_df['daily_trips'], test_df['pred']) - mape(test_df['daily_trips'], test_df['7_days_ago'])) / mape(test_df['daily_trips'], test_df['7_days_ago'])

-0.11830453750630131

In [22]:
def rmse(real, pred):
    a =((pred - real) ** 2).mean()
    return math.sqrt(a)

rmse_1_day_ago = rmse(test_df['daily_trips'], test_df['pred'])
rmse_1_day_ago

241.9247887807779

In [23]:
def mape(real, pred):
    return (abs(pred - real) / real).mean()


mape_1_day_ago = mape(test_df['daily_trips'], test_df['1_day_ago'])
mape_1_day_ago

0.1794028571592319

In [21]:
label_pred_test_dt = dt.predict(test_df[FEATURES])
label_pred_test_rf = rf.predict(test_df[FEATURES])

test_df['pred'] = label_pred_test_rf
test_df[['pred', 'daily_trips', '1_day_ago', '7_days_ago']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['pred'] = label_pred_test_rf


Unnamed: 0,pred,daily_trips,1_day_ago,7_days_ago
3850,459.152,518,672.0,480.0
3851,292.014,335,318.0,269.0
3852,1983.086,1888,1776.0,1985.0
3853,3579.504,3868,3224.0,3780.0
3854,750.172,842,774.0,772.0
...,...,...,...,...
5345,1660.346,1384,1810.0,1676.0
5346,2570.674,2490,3530.0,2419.0
5347,459.710,440,481.0,477.0
5348,1275.452,1214,1445.0,1221.0
