# 範例 : 計程車費率預測
https://www.kaggle.com/c/new-york-city-taxi-fare-prediction

# [作業目標]
- 試著模仿範例寫法, 使用程車費率預測競賽練習時間欄位處理

# [作業重點]
- 新增星期幾(day of week)與第幾周(week of year)這兩項特徵, 觀察有什麼影響 (In[4], Out[4], In[5], Out[5])
- 新增加上年週期與周周期特徵 , 觀察有什麼影響 (In[8], Out[8], In[9], Out[9]) 

In [165]:
# 做完特徵工程前的所有準備
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor

data_path = 'D:/100days/data/'
df = pd.read_csv(data_path + 'taxi_data1.csv')

train_Y = df['fare_amount']
print('train_Y:', train_Y )
df = df.drop(['fare_amount'] , axis=1)
df.head()

train_Y: 0        4.5
1       12.0
2        6.5
3        6.5
4       11.0
5        6.1
6        6.1
7        5.0
8        4.5
9        6.9
10       6.1
11      16.9
12      15.7
13       8.5
14       8.5
15       9.0
16      12.1
17       9.5
18       4.9
19      17.0
20       7.0
21       8.1
22       4.1
23      10.5
24      10.0
25       7.0
26      28.5
27       4.9
28      10.1
29       6.1
        ... 
4970    16.9
4971    16.0
4972     8.1
4973     9.5
4974     5.5
4975     7.7
4976     5.3
4977     8.9
4978     5.7
4979     5.3
4980    13.0
4981    17.7
4982    23.0
4983     7.7
4984     9.7
4985     4.5
4986    12.5
4987     3.3
4988     9.0
4989     7.0
4990    10.5
4991     6.5
4992    13.5
4993     7.5
4994     4.1
4995     5.3
4996     4.5
4997     6.9
4998     4.5
4999    12.7
Name: fare_amount, Length: 5000, dtype: float64


Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2011-10-21 23:54:10 UTC,-73.99058,40.761071,-73.981128,40.758634,2
1,2015-02-03 10:42:03 UTC,-73.988403,40.723431,-73.989647,40.741695,1
2,2014-03-16 18:58:58 UTC,-74.015785,40.71511,-74.012029,40.707888,2
3,2009-06-13 16:10:54 UTC,-73.977322,40.787275,-73.95803,40.778838,3
4,2014-06-12 03:25:56 UTC,-73.989683,40.729717,-73.98249,40.761887,3


In [166]:
# 時間特徵分解方式:使用datetime
df['pickup_datetime'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d %H:%M:%S UTC'))
df['pickup_year'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%Y')).astype('int64')
df['pickup_month'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%m')).astype('int64')
df['pickup_day'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%d')).astype('int64')
df['pickup_hour'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%H')).astype('int64')
df['pickup_minute'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%M')).astype('int64')
df['pickup_second'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%S')).astype('int64')
df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56


In [167]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df_temp = df.drop(['pickup_datetime'] , axis=1)
scaler = MinMaxScaler()
train_X = scaler.fit_transform(df_temp)
Linear = LinearRegression()
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
GDBT = GradientBoostingRegressor()
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

Linear Reg Score : 0.02687687147563771
Gradient Boosting Reg Score : 0.7119041998107072


# 作業1
* 對照範例，試著加入星期幾 (day of week) 與第幾周 (week of year) 這兩項特徵，  
看看結果會比原本只有時間特徵分解的結果更好或更差?=>越來越差, 原本>加上 "年"與"周"週期> 加上"年"與"周"與"日"週期(每次跑,結果會有點不一樣)

1.原本,將結果使用線性迴歸 / 梯度提升樹分別看結果+時間特徵分解方式:
Linear Reg Score : 0.02687687147563771
Gradient Boosting Reg Score : 0.7123651795251604

2.將結果使用線性迴歸 / 梯度提升樹分別看結果+加上"年週期"與"周週期"特徵:
Linear Reg Score : 0.028722639911796445
Gradient Boosting Reg Score : 0.7115431615503276

3.將結果使用線性迴歸 / 梯度提升樹分別看結果+加上"年週期"與"周週期"特徵+ 加上"日週期"特徵 (參考講義"週期循環特徵"):
Linear Reg Score : 0.028234094061822534
Gradient Boosting Reg Score : 0.7104454030679793

In [168]:
# 加上"年週期"與"周週期"特徵


###df['day of week)'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%A')).astype('object')
df['day number of week)'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%w')).astype('int64')
df['week of year)'] = df['pickup_datetime'].apply(lambda x: datetime.datetime.strftime(x, '%U')).astype('int64')

df.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,day number of week),week of year)
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,5,42
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,2,5
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,0,11
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,6,23
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,4,23


In [169]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df_temp = df.drop(['pickup_datetime'] , axis=1)
scaler = MinMaxScaler()
train_X = scaler.fit_transform(df_temp)
Linear = LinearRegression()
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
GDBT = GradientBoostingRegressor()
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')



Linear Reg Score : 0.028722639911796445
Gradient Boosting Reg Score : 0.7105131209971896


In [170]:
# 加上"日週期"特徵 (參考講義"週期循環特徵")
import math
df['day_cycle'] = df['pickup_hour']/12 + df['pickup_minute']/720 + df['pickup_second']/43200
df['day_cycle'] = df['day_cycle'].map(lambda x:math.sin(x*math.pi))
df.head()



Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,day number of week),week of year),day_cycle
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,5,42,-0.02545
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,2,5,0.333601
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,0,11,-0.967083
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,6,23,-0.888817
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,4,23,0.782427


In [171]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df_temp = df.drop(['pickup_datetime'] , axis=1)
train_X = scaler.fit_transform(df_temp)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')



Linear Reg Score : 0.028234094061822534
Gradient Boosting Reg Score : 0.7113567073951753


# 作業2
* 對照範例的日週期效果，試著參考投影片完成年週期與周週期的特徵 (也可以用你自己想到的方式)，  
看看結果會比範例中的結果更好或更差? =>ANS:更差


1.範例:加上"日週期"特徵 (參考講義"週期循環特徵")+將結果使用線性迴歸 / 梯度提升樹分別看結果:
Linear Reg Score : 0.02687687147563771
Gradient Boosting Reg Score : 0.7102911572644972
2.作業2"年週期"與"周週期"特徵+ 將結果使用線性迴歸 / 梯度提升樹分別看結果:
Linear Reg Score : 0.02810639063313978
Gradient Boosting Reg Score : 0.707136187483522

In [172]:
# 加上"年週期"與"周週期"特徵
import math
df['year_cycle'] = df['pickup_month']/6 + df['pickup_day']/180 
df['year_cycle'] = df['year_cycle'].map(lambda x:math.cos(x*math.pi))

df['week_cycle'] = df['pickup_day']/3.5 + df['pickup_hour']/84
df['week_cycle'] = df['week_cycle'].map(lambda x:math.sin(x*math.pi))
df.head(10)



Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,day number of week),week of year),day_cycle,year_cycle,week_cycle
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,5,42,-0.02545,0.777146,0.757972
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,2,5,0.333601,0.45399,0.07473
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,0,11,-0.967083,-0.275637,0.62349
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,6,23,-0.888817,-0.97437,-0.294755
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,4,23,0.782427,-0.978148,-0.993712
5,2011-07-16 01:19:59,-73.997631,40.721805,-74.009757,40.709941,1,2011,7,16,1,19,59,6,28,0.341952,-0.694658,0.965926
6,2009-06-27 18:15:00,-73.954743,40.789303,-73.964755,40.773345,1,2009,6,27,18,15,0,6,25,-0.997859,-0.891007,-0.222521
7,2009-01-23 20:38:16,-73.982279,40.764668,-73.990323,40.756427,1,2009,1,23,20,38,16,5,3,-0.770884,0.601815,0.56332
8,2010-08-09 14:46:03,-73.965039,40.769972,-73.955365,40.779438,1,2010,8,9,14,46,3,1,32,-0.662783,-0.358368,0.733052
9,2011-09-29 21:56:45,-73.98657,40.734727,-74.003757,40.741951,1,2011,9,29,21,56,45,4,39,-0.51223,0.48481,0.993712


In [173]:
# 將結果使用線性迴歸 / 梯度提升樹分別看結果
df_temp = df.drop(['pickup_datetime'] , axis=1)
train_X = scaler.fit_transform(df_temp)
print(f'Linear Reg Score : {cross_val_score(Linear, train_X, train_Y, cv=5).mean()}')
print(f'Gradient Boosting Reg Score : {cross_val_score(GDBT, train_X, train_Y, cv=5).mean()}')

Linear Reg Score : 0.02810639063313978
Gradient Boosting Reg Score : 0.7080672345824187
