In [1]:
import numpy as np
import pandas as pd
import time

In [2]:
df = pd.read_parquet('../time_series/MA+lat_lon+0/TS5-201902_202306.parquet', engine='pyarrow')
df.head()

Unnamed: 0,Name,year,month,day,hour,PULocationID,weekday,is_holiday,count,lat,lon,7DaysMA
0,lyft,2019,2,1,0,3,4,False,4,40.864294,-73.84651,170.285714
1,lyft,2019,2,1,1,3,4,False,2,40.864294,-73.84651,170.285714
2,lyft,2019,2,1,2,3,4,False,6,40.864294,-73.84651,170.285714
3,lyft,2019,2,1,3,3,4,False,1,40.864294,-73.84651,170.285714
4,lyft,2019,2,1,4,3,4,False,1,40.864294,-73.84651,170.285714


In [3]:
df = df.drop(['7DaysMA'], axis=1)
df.head()

Unnamed: 0,Name,year,month,day,hour,PULocationID,weekday,is_holiday,count,lat,lon
0,lyft,2019,2,1,0,3,4,False,4,40.864294,-73.84651
1,lyft,2019,2,1,1,3,4,False,2,40.864294,-73.84651
2,lyft,2019,2,1,2,3,4,False,6,40.864294,-73.84651
3,lyft,2019,2,1,3,3,4,False,1,40.864294,-73.84651
4,lyft,2019,2,1,4,3,4,False,1,40.864294,-73.84651


In [4]:
valid = df[~(df['year'] == 2023)]
valid = valid.drop('year', axis=1)

In [5]:
df = df.drop('year', axis=1)

In [6]:
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
df['Name'] = label.fit_transform(df['Name'])
valid['Name'] = label.fit_transform(valid['Name'])

In [7]:
df = df.astype({'is_holiday': 'int'})
valid = valid.astype({'is_holiday': 'int'})

In [8]:
from sklearn.model_selection import train_test_split

X = df.drop('count', axis=1).values
y = df['count'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=1)

X_valid = valid.drop('count', axis=1).values
y_valid = valid['count'].values

## scikit-learn

In [9]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from lightgbm import LGBMRegressor

# scaler = StandardScaler()
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)
X_valid_scaled = scaler.fit_transform(X_valid)

## LightGBM

In [11]:
start_time = time.time()


from sklearn.model_selection import GridSearchCV
kf = KFold(n_splits=3, shuffle=True, random_state=0)
param_grid = {'n_estimators': range(100, 501, 100),
              'learning_rate': np.logspace(0, -2, 3), 
              'max_depth': range(3, 11)}

lgbm = LGBMRegressor(subsample=0.8, 
                     n_jobs=-1, 
                     random_state=0, 
                     verbose=-1)

model_cv = GridSearchCV(lgbm, param_grid, cv=kf, scoring='r2')
model_cv.fit(X_train_scaled, y_train)
print(model_cv.best_params_, model_cv.best_score_)


end_time = time.time()
execution_time = end_time - start_time
print("執行時間:", execution_time, "秒")

KeyboardInterrupt: 

In [None]:
# 預設
# n_estimators=100
# learning_rate=0.1
# max_depth=-1
# num_leaves=31
# +
# subsample=0.8, 
# n_jobs=-1, 
# random_state=0, 
# verbose=-1



# 自訂
# model = LGBMRegressor(n_estimators=10000, 
#                       learning_rate=0.01, 
#                       max_depth=10, 
#                       n_jobs=-1, 
#                       random_state=0, 
#                       verbose=-1)

In [12]:
start_time = time.time()


model = LGBMRegressor(n_estimators=10000,
                      subsample=0.8, 
                      n_jobs=-1, 
                      random_state=0, 
                      verbose=-1
                     )
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Train')
print(f'MAE: {mae}\nR2: {r2}')
print('-' * 30)


prediction = model.predict(X_valid_scaled)
mae = mean_absolute_error(y_valid, prediction)
r2 = r2_score(y_valid, prediction)
print('Valid')
print(f'MAE: {mae}\nR2: {r2}')


end_time = time.time()
execution_time = end_time - start_time
print("執行時間:", execution_time, "秒")

Train
MAE: 9.012334537941504
R2: 0.9049812963911621
------------------------------
Valid
MAE: 9.069482689553901
R2: 0.9039097930654945
執行時間: 4992.2312178611755 秒


num_leaves  
3: 0.6637288667911974   
4: 0.8329159581614249  
6: 0.8685144536896037  
8: 0.8803630708230998  
10: 0.8848594048353419  
12: 0.8895765185174851   
14: 0.892580113955432  
16: 0.8926464578493545  
18: 0.8934328170771787  
20: 0.8935596196349447  
22: 0.8967957958265972  
23: 0.895428900230507  
24: 0.8973346290296784  
25: 0.8953890750211984  
26: 0.8960582691163776
27: 0.8972003652219567  Valid R2: 0.9210254260471169  
28: 0.897932838788989  Valid R2: 0.9224258657110439  
29: 0.8976192661333579  Valid R2: 0.9223652275276067  
30: 

In [33]:
start_time = time.time()


model = LGBMRegressor(n_estimators=10000, 
                      learning_rate=0.01, 
                      max_depth=10, 
                      num_leaves=29, 
                      subsample=0.8, 
                      #n_jobs=-1, 
                      random_state=0, 
                      verbose=-1)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MAE: {mae}\nR2: {r2}')


end_time = time.time()
execution_time = end_time - start_time
print("執行時間:", execution_time, "秒")

MAE: 11.676067302276069
R2: 0.8976192661333579
執行時間: 599.8441820144653 秒


In [34]:
prediction = model.predict(X_valid_scaled)
mean_absolute_error(y_valid, prediction)
r2 = r2_score(y_valid, prediction)
print(f'MAE: {mae}\nR2: {r2}')

MAE: 11.676067302276069
R2: 0.9223652275276067


## GradientBoostingRegressor

In [10]:
from sklearn.ensemble import GradientBoostingRegressor

start_time = time.time()


model = GradientBoostingRegressor(subsample=0.8,
                                  random_state=0)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print('Train')
print(f'MAE: {mae}\nR2: {r2}')
print('-' * 30)


prediction = model.predict(X_valid_scaled)
mae = mean_absolute_error(y_valid, prediction)
r2 = r2_score(y_valid, prediction)
print('Valid')
print(f'MAE: {mae}\nR2: {r2}')


end_time = time.time()
execution_time = end_time - start_time
print("執行時間:", execution_time, "秒")

Train
MAE: 23.348174572093193
R2: 0.4711923591990671
------------------------------
Valid
MAE: 23.340244120233027
R2: 0.4627151470879144
執行時間: 5297.024292945862 秒


## NN

In [10]:
start_time = time.time()

import tensorflow as tf
from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Input
from tensorflow.keras import layers
from tensorflow.keras.regularizers import L1
tf.random.set_seed(42)

n_cols = X_train_scaled.shape[1]

model = Sequential([layers.Dense(64, input_shape=(n_cols,)),
                    layers.BatchNormalization(),
                    layers.LeakyReLU(),
                    layers.Dropout(0.2),
                    layers.Dense(32, kernel_regularizer=L1(0.01), input_shape=(n_cols,)),
                    layers.BatchNormalization(),
                    layers.LeakyReLU(),
                    layers.Dense(1)
])

model.compile(optimizer='adam', loss='mean_absolute_error', metrics=['mae'])
model.summary()


end_time = time.time()
execution_time = end_time - start_time
print("執行時間:", execution_time, "秒")

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                576       
                                                                 
 batch_normalization (Batch  (None, 64)                256       
 Normalization)                                                  
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 64)                0         
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 batch_normalization_1 (Bat  (None, 32)                128       
 chNormalization)                                       

In [12]:
start_time = time.time()


from keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(X_train_scaled, y_train, 
                    validation_split=0.2, 
                    batch_size=1000, 
                    epochs=100, 
                    verbose=1, 
                    callbacks=early_stopping)

end_time = time.time()
execution_time = end_time - start_time
print("執行時間:", execution_time, "秒")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
執行時間: 159.33337831497192 秒


In [13]:
result = model.evaluate(X_test_scaled, y_test)
print(result)

[16.820343017578125, 16.480937957763672]


In [14]:
from sklearn.metrics import r2_score

y_test_pred = model.predict(X_test_scaled)
r2 = r2_score(y_test, y_test_pred)

print("Test R²:", r2)

Test R²: 0.7460933215646455


In [15]:
y_valid_pred = model.predict(X_valid_scaled)
r2 = r2_score(y_valid, y_valid_pred)

print("Valid R²:", r2)

Valid R²: 0.7542718902910129
