In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
import datetime

data_dir = './data/'

df = pd.read_csv(data_dir + 'taxi_data1.csv')
train_label = df.fare_amount
df = df.drop(['fare_amount'], axis = 1)

df.head(5)

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2011-10-21 23:54:10 UTC,-73.99058,40.761071,-73.981128,40.758634,2
1,2015-02-03 10:42:03 UTC,-73.988403,40.723431,-73.989647,40.741695,1
2,2014-03-16 18:58:58 UTC,-74.015785,40.71511,-74.012029,40.707888,2
3,2009-06-13 16:10:54 UTC,-73.977322,40.787275,-73.95803,40.778838,3
4,2014-06-12 03:25:56 UTC,-73.989683,40.729717,-73.98249,40.761887,3


In [2]:
df.pickup_datetime = df.pickup_datetime.apply(lambda x: datetime.datetime.strptime(x,'%Y-%m-%d %H:%M:%S UTC'))
df['pickup_year'] = df.pickup_datetime.apply(lambda x: datetime.datetime.strftime(x,'%Y')).astype('int64')
df['pickup_month'] = df.pickup_datetime.apply(lambda x:datetime.datetime.strftime(x,'%m')).astype('int64')
df['pickup_day'] = df.pickup_datetime.apply(lambda x: datetime.datetime.strftime(x,'%d')).astype('int64')
df['pickup_hour'] = df.pickup_datetime.apply(lambda x: datetime.datetime.strftime(x,'%H')).astype('int64')
df['pickup_minute'] = df.pickup_datetime.apply(lambda x: datetime.datetime.strftime(x,'%M')).astype('int64')
df['pickup_second'] = df.pickup_datetime.apply(lambda x: datetime.datetime.strftime(x,'%S')).astype('int64')

df.head(5)

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56


In [3]:
import warnings
warnings.filterwarnings('ignore')
#LinearRegression / GBR
df_temp = df.drop(['pickup_datetime'], axis = 1)
MMScaler = MinMaxScaler()
train_x = MMScaler.fit_transform(df_temp)
LR = LinearRegression()
print(f' LR score : {cross_val_score(LR,train_x,train_label,cv=5).mean()} ')
GBR = GradientBoostingRegressor()
print(f' GBR score : {cross_val_score(GBR,train_x,train_label,cv=5).mean()} ')

 LR score : 0.026876871475636888 
 GBR score : 0.7100970049831021 


In [4]:
import math
df['day_circle'] = df.pickup_hour / 12 + df.pickup_minute / (12*60) + df.pickup_second / (12*60*60)
df.day_circle = df.day_circle.map(lambda x : math.sin(x*math.pi))
df.head(5)

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,day_circle
0,2011-10-21 23:54:10,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,-0.02545
1,2015-02-03 10:42:03,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,0.333601
2,2014-03-16 18:58:58,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,-0.967083
3,2009-06-13 16:10:54,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,-0.888817
4,2014-06-12 03:25:56,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,0.782427


In [5]:
#LinearRegression / GBR
df_temp = df.drop(['pickup_datetime'], axis = 1)
MMScaler = MinMaxScaler()
train_x = MMScaler.fit_transform(df_temp)
LR = LinearRegression()
print(f' LR score : {cross_val_score(LR,train_x,train_label,cv=5).mean()} ')
GBR = GradientBoostingRegressor()
print(f' GBR score : {cross_val_score(GBR,train_x,train_label,cv=5).mean()} ')

 LR score : 0.026412252675032777 
 GBR score : 0.7109130601157936 


## 作業1
對照範例，試著加入星期幾 (day of week) 與第幾周 (week of year) 這兩項特徵， <br />
看看結果會比原本只有時間特徵分解的結果更好或更差?

In [17]:
df = df.drop(['day_circle'],axis = 1)

In [18]:
temp_df = df.drop(['pickup_datetime'], axis = 1)
temp_df['day_of_week'] = temp_df.pickup_day % 7
temp_df.day_of_week.replace(to_replace = 0, value = 7 , inplace = True)
temp_df['week_of_year'] = ((temp_df.pickup_month * 30 + temp_df.pickup_day) / 7 ).astype('int64')

train_x = MinMaxScaler().fit_transform(temp_df)
LR = LinearRegression()
print(f' LR score : {cross_val_score(LR,train_x,train_label,cv=5).mean()} ')
GBR = GradientBoostingRegressor()
print(f' GBR score : {cross_val_score(GBR,train_x,train_label,cv=5).mean()} ')

temp_df.head(5)

 LR score : 0.026348883048719452 
 GBR score : 0.7117993147227895 


Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,day_of_week,week_of_year
0,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,7,45
1,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,3,9
2,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,2,15
3,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,6,27
4,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,5,27


In [20]:
temp_df['day_circle'] = temp_df.pickup_hour/12 + temp_df.pickup_minute / 12*60 + temp_df.pickup_second/12*60*60
temp_df.day_circle = temp_df.day_circle.map(lambda x: math.sin(x*math.pi))
train_x = MinMaxScaler().fit_transform(temp_df)
LR = LinearRegression()
print(f' LR score : {cross_val_score(LR,train_x,train_label,cv=5).mean()} ')
GBR = GradientBoostingRegressor()
print(f' GBR score : {cross_val_score(GBR,train_x,train_label,cv=5).mean()} ')

temp_df.head(25)

 LR score : 0.026445012527537882 
 GBR score : 0.710888015110399 


Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,day_of_week,week_of_year,day_circle
0,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,7,45,-0.258819
1,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,3,9,0.5
2,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,2,15,-1.0
3,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,6,27,-0.866025
4,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,5,27,-0.707107
5,-73.997631,40.721805,-74.009757,40.709941,1,2011,7,16,1,19,59,2,32,-0.258819
6,-73.954743,40.789303,-73.964755,40.773345,1,2009,6,27,18,15,0,6,29,1.0
7,-73.982279,40.764668,-73.990323,40.756427,1,2009,1,23,20,38,16,2,7,-0.866025
8,-73.965039,40.769972,-73.955365,40.779438,1,2010,8,9,14,46,3,2,35,-0.5
9,-73.98657,40.734727,-74.003757,40.741951,1,2011,9,29,21,56,45,1,42,-0.707107


## 作業2
對照範例的日週期效果，試著參考投影片完成年週期與周週期的特徵 (也可以用你自己想到的方式)， <br />
看看結果會比範例中的結果更好或更差?

In [22]:
temp_df['week_cycle'] = temp_df.day_of_week / 3.5 + temp_df.pickup_hour/84
temp_df.week_cycle = temp_df.week_cycle.map(lambda x: math.sin(x*math.pi))
temp_df['year_cycle']  = temp_df.pickup_month / 6 + temp_df.pickup_day/180
temp_df.year_cycle = temp_df.year_cycle.map(lambda x: math.sin(x*math.pi))

train_x = MinMaxScaler().fit_transform(temp_df)
LR = LinearRegression()
print(f' LR score : {cross_val_score(LR,train_x,train_label,cv=5).mean()} ')
GBR = GradientBoostingRegressor()
print(f' GBR score : {cross_val_score(GBR,train_x,train_label,cv=5).mean()} ')

temp_df.head(5)

 LR score : 0.02586412685447037 
 GBR score : 0.7123000095811877 


Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month,pickup_day,pickup_hour,pickup_minute,pickup_second,day_of_week,week_of_year,day_circle,week_cycle,year_cycle
0,-73.99058,40.761071,-73.981128,40.758634,2,2011,10,21,23,54,10,7,45,-0.258819,0.757972,-0.62932
1,-73.988403,40.723431,-73.989647,40.741695,1,2015,2,3,10,42,3,3,9,0.5,0.07473,0.891007
2,-74.015785,40.71511,-74.012029,40.707888,2,2014,3,16,18,58,58,2,15,-1.0,0.62349,0.961262
3,-73.977322,40.787275,-73.95803,40.778838,3,2009,6,13,16,10,54,6,27,-0.866025,-0.294755,-0.224951
4,-73.989683,40.729717,-73.98249,40.761887,3,2014,6,12,3,25,56,5,27,-0.707107,-0.993712,-0.207912


The result is better when we add the week_cycle and year_cycle