In [2]:
# unix time: https://www.unixtimestamp.com/
import datetime  # Convert to unix time
import math
import os
import pickle
import time  # Convert to unix time
import warnings

import dask.dataframe as dd  # similar to pandas
import matplotlib.pylab as plt
# if numpy is not installed already : pip3 install numpy
import numpy as np  # Do aritmetic operations on arrays
import pandas as pd  # pandas to create small dataframes
import seaborn as sns  # Plots
# to install xgboost: pip3 install xgboost
# if it didnt happen check install_xgboost.JPG
import xgboost as xgb
from matplotlib import rcParams  # Size of plots
from sklearn.cluster import KMeans, MiniBatchKMeans  # Clustering
# to install sklearn: pip install -U scikit-learn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import (mean_absolute_error,
                             mean_absolute_percentage_error,
                             mean_squared_error, r2_score)

warnings.filterwarnings("ignore")

# Data Load

In [3]:
base_year = 2018
base_month_count = 1

In [4]:
def load_data():
    months_frame = []
    months_groupby = []
    for i in range(1,base_month_count+1):
        tmp_frame = pd.read_parquet(f'preprocessing_yellow_tripdata_{base_year+1}_{i}.parquet',engine='pyarrow')
        
        tmp_groupby = tmp_frame[['PULocationID','pickup_bins','trip_distance']].groupby(['PULocationID','pickup_bins'], dropna=False).count()
        tmp_tip_amount = tmp_frame[['PULocationID','pickup_bins','tip_amount']].groupby(['PULocationID','pickup_bins'], dropna=False).mean()
        #print(tmp_groupby)
        # print("-------------------------")
        #print(tmp_tip_amount)
        # print("-------------------------")
        #tmp_groupby = pd.concat([tmp_trip_distance, tmp_tip_amount])
        #print(tmp_groupby)
        tmp_groupby = pd.merge(tmp_groupby, tmp_tip_amount, on = ['PULocationID','pickup_bins'], how = "left")

        
        print(tmp_groupby)
        months_frame.append(tmp_frame)
        months_groupby.append(tmp_groupby)
    return months_frame, months_groupby

In [5]:
months_frame, months_groupby = load_data()

                          trip_distance  tip_amount
PULocationID pickup_bins                           
4            -52                      1    0.000000
              0                       1    0.000000
              1                       4    1.450000
              2                      11    2.130909
              3                       9    1.034444
...                                 ...         ...
263           4459                   17    1.778824
              4460                   25    0.873200
              4461                   23    1.158261
              4462                   17    1.254706
              4463                   24    1.257917

[249105 rows x 2 columns]


## 뉴욕 지역

In [6]:
taxi_zone_df = pd.read_csv('taxi_zone_lookup.csv')

In [7]:
region = "Manhattan"
nyc_region = taxi_zone_df[taxi_zone_df['Borough'] == region]
nyc_region_number = nyc_region['LocationID']
nyc_regions_cnt = len(nyc_region)

# Smoothing

In [8]:
# number of 10min indices for jan 2019= 24*31*60/10 = max_pickup_bins_len
interval = 10
days = [31,29,31,30,31,30,31,31,30,31,30,31]
pickup_bins_len = []

for day in days:
    pickup_bins_len.append(int(24*60*day/interval))
max_pickup_bins_len = max(pickup_bins_len)
print(max_pickup_bins_len)

4464


In [9]:
# Fills a value of zero for every bin where no pickup data is present 
# the count_values: number pickps that are happened in each region for each 10min intravel
# there wont be any value if there are no picksups.
# values: number of unique bins

# for every 10min intravel(pickup_bin) we will check it is there in our unique bin,
# if it is there we will add the count_values[index] to smoothed data
# if not we add smoothed data (which is calculated based on the methods that are discussed in the above markdown cell)
# we finally return smoothed data
def smoothing(count_values,values):
    smoothed_regions=[] # stores list of final smoothed values of each reigion
    ind=0
    repeat=0 
    smoothed_value=0
    for r in range(1,nyc_regions_cnt+1):
        smoothed_bins=[] #stores the final smoothed values
        repeat=0
        for i in range(max_pickup_bins_len):
            if repeat!=0: # prevents iteration for a value which is already visited/resolved
                repeat-=1
                continue
            if i in values[r-1]: #checks if the pickup-bin exists 
                smoothed_bins.append(count_values[ind-1]) # appends the value of the pickup bin if it exists
            else:
                if i!=0:
                    right_hand_limit=0
                    for j in range(i,max_pickup_bins_len):
                        if  j not in values[r-1]: #searches for the left-limit or the pickup-bin value which has a pickup value
                            continue
                        else:
                            right_hand_limit=j
                            break
                    if right_hand_limit==0:
                    #Case 1: When we have the last/last few values are found to be missing,hence we have no right-limit here
                        smoothed_value=count_values[ind-1]*1.0/((max_pickup_bins_len-1-i)+2)*1.0                               
                        for j in range(i,max_pickup_bins_len):                              
                            smoothed_bins.append(math.ceil(smoothed_value))
                        smoothed_bins[i-1] = math.ceil(smoothed_value)
                        repeat=(max_pickup_bins_len-1-i)
                        ind-=1
                    else:
                    #Case 2: When we have the missing values between two known values
                        smoothed_value=(count_values[ind-1]+count_values[ind])*1.0/((right_hand_limit-i)+2)*1.0             
                        for j in range(i,right_hand_limit+1):
                            smoothed_bins.append(math.ceil(smoothed_value))
                        smoothed_bins[i-1] = math.ceil(smoothed_value)
                        repeat=(right_hand_limit-i)
                else:
                    #Case 3: When we have the first/first few values are found to be missing,hence we have no left-limit here
                    right_hand_limit=0
                    for j in range(i,max_pickup_bins_len):
                        if  j not in values[r-1]:
                            continue
                        else:
                            right_hand_limit=j
                            break
                    smoothed_value=count_values[ind]*1.0/((right_hand_limit-i)+1)*1.0
                    for j in range(i,right_hand_limit+1):
                            smoothed_bins.append(math.ceil(smoothed_value))
                    repeat=(right_hand_limit-i)
            ind+=1
        smoothed_regions.extend(smoothed_bins)
    return smoothed_regions

In [10]:
def return_unq_pickup_bins(frame):
    values = []
    for i in nyc_region_number.values:
    # for i in range(1,266):
        new = frame[frame['PULocationID'] == i]
        list_unq = list(set(new['pickup_bins']))
        list_unq.sort()
        values.append(list_unq)
    return values

In [11]:
months_unique = []
for frame in months_frame:
    months_unique.append(return_unq_pickup_bins(frame))

In [12]:
months_smooth = []
months_smooth_tip = []
for groupby, unique in zip(months_groupby, months_unique):
    # smoothing을 할 것인가 filling을 할것인가
    months_smooth.append(smoothing(groupby['trip_distance'].values,unique))
    months_smooth_tip.append(smoothing(groupby['tip_amount'].values,unique))
    
# Making list of all the values of pickup data in every bin for a period of 3 months and storing them region-wise 
regions_cum = []
regions_cum_tip = []

# number of 10min indices for jan 2019= 24*31*60/10 = 4464      # pickup_bins_len[0]
# number of 10min indices for jan 2020 = 24*31*60/10 = 4464     # pickup_bins_len[0]
# number of 10min indices for feb 2020 = 24*29*60/10 = 4176     # pickup_bins_len[1]
# number of 10min indices for march 2020 = 24*31*60/10 = 4464   # pickup_bins_len[2]
# regions_cum: it will contain 40 lists, each list will contain 4464+4176+4464 values which represents the number of pickups 
# that are happened for three months in 2016 data

# nyc_regions_cnt개의 맨허튼 지역
for i in range(1,nyc_regions_cnt+1):
    cum = []
    cum_tip = []
    for index, smooth in enumerate(months_smooth):
        cum += smooth[pickup_bins_len[index]*(i-1):pickup_bins_len[index]*i]
    for index, smooth in enumerate(months_smooth_tip):
        cum_tip += smooth[pickup_bins_len[index]*(i-1):pickup_bins_len[index]*i]
    
    regions_cum.append(cum)
    regions_cum_tip.append(cum_tip)

print(len(regions_cum))
print(len(regions_cum[0]))
print(len(regions_cum_tip))
print(len(regions_cum_tip[0]))

69
4464
69
4464


# Modeling

## 회귀 모델

In [13]:

# Preparing data to be split into train and test, The below prepares data in cumulative form which will be later split into test and train
# number of 10min indices for jan 2019= 24*31*60/10 = 4464      # pickup_bins_len[0]
# number of 10min indices for jan 2020 = 24*31*60/10 = 4464     # pickup_bins_len[0]
# number of 10min indices for feb 2020 = 24*29*60/10 = 4176     # pickup_bins_len[1]
# number of 10min indices for march 2020 = 24*31*60/10 = 4464   # pickup_bins_len[2]
# regions_cum: it will contain 40 lists, each list will contain 4464+4176+4464 values which represents the number of pickups 
# that are happened for three months in 2016 data

# print(len(regions_cum))
# 265
# print(len(regions_cum[0]))
# 4368


# we take number of pickups that are happened in last 5 intravels
number_of_time_stamps = 5

# output varaible
# it is list of lists
# it will contain number of pickups 4368 for each cluster
# len(regions_cum[0]) == 4368
output = []
output_tip = []
sum(pickup_bins_len[:base_month_count])
# 우리 데이터
# len(regions_cum[0]) - 5(:= # of colunms)
# 4368 - 5 = 4363
# 13104 - 5 = 13099

# tsne_lat will contain 13104-5=13099 times lattitude of cluster center for every cluster
# Ex: [[cent_lat 13099times],[cent_lat 13099times], [cent_lat 13099times].... 40 lists]
# it is list of lists
# tsne_lat = []

# tsne_lon will contain 13104-5=13099 times logitude of cluster center for every cluster
# Ex: [[cent_long 13099times],[cent_long 13099times], [cent_long 13099times].... 40 lists]
# it is list of lists
# tsne_lon = []

# 우리는 lat, lon 대신에 목적지 ID (PULocationID: 출발지, DOLocationID: 도착지)를 사용할 것이다.
tsne_PULocationID = []

#tsne_Tip_amount = []
# we will code each day 
# sunday = 0, monday=1, tue = 2, wed=3, thur=4, fri=5, sat=6
# for every cluster we will be adding 13099 values, each value represent to which day of the week that pickup bin belongs to
# it is list of lists
tsne_weekday = []

# its an numbpy array, of shape (523960, 5)
# each row corresponds to an entry in out data
# for the first row we will have [f0,f1,f2,f3,f4] fi=number of pickups happened in i+1th 10min intravel(bin)
# the second row will have [f1,f2,f3,f4,f5]
# the third row will have [f2,f3,f4,f5,f6]
# and so on...
tsne_feature = []


tsne_feature = [0]*number_of_time_stamps
for i in range(1,nyc_regions_cnt+1):
    # tsne_lat.append([kmeans.cluster_centers_[i][0]]*13099) # kmeans.cluster_centers_[i][0] := Coordinates of cluster centers. 클러스트 센터의 상관계수
    # tsne_lon.append([kmeans.cluster_centers_[i][1]]*13099)

    # tsne_PULocationID
    tsne_PULocationID.append([i]*(len(regions_cum[0]) - 5))
    
    #tsne_Tip_amount.append([i]*(len(regions_cum[0]) - 5))

    day_of_the_week_dict = {2015: 4, 2016: 5, 2017: 1, 2018:1, 2019:2, 2020:3, 2021:5, 2022:6}
    # jan 1st 2016 is thursday, so we start our day from 4: "(int(k/144))%7+4"
    # our prediction start from 5th 10min intravel since we need to have number of pickups that are happened in last 5 pickup bins
    
    # jan 1st 2020 is tue -> 3
    tsne_weekday.append([int(((int(k/144))%7+day_of_the_week_dict[base_year+1])%7) for k in range(5,sum(pickup_bins_len[:base_month_count]))])

    # jan 1st 2021 is fri -> 5
    # tsne_weekday.append([int(((int(k/144))%7+5)%7) for k in range(5,sum(pickup_bins_len[:3]))])
    # regions_cum is a list of lists [[x1,x2,x3..x13104], [x1,x2,x3..x13104], [x1,x2,x3..x13104], [x1,x2,x3..x13104], [x1,x2,x3..x13104], .. 40 lsits]
    
    # 우리 데이터 
    # regions_cum [[x_1,x_2,...,x_{len(regions_cum[0]) - 5}],...265 lists] len(regions_cum[0]) - 5 = 4381
    tsne_feature = np.vstack((tsne_feature, [regions_cum[i-1][r:r+number_of_time_stamps] for r in range(0,len(regions_cum[i-1])-number_of_time_stamps)]))

    output.append(regions_cum[i-1][5:])
    output_tip.append(regions_cum_tip[i-1][5:])
tsne_feature = tsne_feature[1:]

In [38]:
print(tsne_feature.shape[0])
print(len(tsne_weekday)*len(tsne_weekday[0]))
print(len(output)*len(output[0]))
print(len(output_tip)*len(output_tip[0]))
print(nyc_regions_cnt*(len(regions_cum[0])-5))
print(len(tsne_PULocationID)*len(tsne_PULocationID[0]))

307671
307671
307671
307671
307671
307671


In [76]:
# Getting the predictions of exponential moving averages to be used as a feature in cumulative form

# upto now we computed 8 features for every data point that starts from 50th min of the day
# 1. cluster center lattitude
# 2. cluster center longitude
# 3. day of the week 
# 4. f_t_1: number of pickups that are happened previous t-1th 10min intravel
# 5. f_t_2: number of pickups that are happened previous t-2th 10min intravel
# 6. f_t_3: number of pickups that are happened previous t-3th 10min intravel
# 7. f_t_4: number of pickups that are happened previous t-4th 10min intravel
# 8. f_t_5: number of pickups that are happened previous t-5th 10min intravel

# from the baseline models we said the exponential weighted moving avarage gives us the best error
# we will try to add the same exponential weighted moving avarage at t as a feature to our data
# exponential weighted moving avarage => p'(t) = alpha*p'(t-1) + (1-alpha)*P(t-1) 
alpha=0.3

# it is a temporary array that store exponential weighted moving avarage for each 10min intravel, 
# for each cluster it will get reset
# for every cluster it contains 13104 values
predicted_values=[]
predicted_values_tip=[]

# it is similar like tsne_lat
# it is list of lists
# predict_list is a list of lists [[x5,x6,x7..x13104], [x5,x6,x7..x13104], [x5,x6,x7..x13104], [x5,x6,x7..x13104], [x5,x6,x7..x13104], .. 40 lsits]
predict_list = []
predict_list_tip = []
tsne_flat_exp_avg = []
for r in range(1,nyc_regions_cnt+1):
    for i in range(0,len(regions_cum[0])):
        if i==0:
            predicted_value= regions_cum[r-1][0]
            predicted_values.append(0)
            predicted_value_tip= regions_cum_tip[r-1][0]
            predicted_values_tip.append(0)
            continue
        predicted_values.append(predicted_value)
        predicted_value =int((alpha*predicted_value) + (1-alpha)*(regions_cum[r-1][i]))
        predicted_values_tip.append(predicted_value_tip)
        predicted_value_tip =(alpha*predicted_value_tip) + (1-alpha)*(regions_cum_tip[r-1][i])
    
    predict_list.append(predicted_values[5:])
    predicted_values=[]
    predict_list_tip.append(predicted_values_tip[5:])
    predicted_values_tip=[]

In [77]:
# train, test split : 70% 30% split
# Before we start predictions using the tree based regression models we take 3 months of 2016 pickup data 
# and split it such that for every region we have 70% data in train and 30% in test,
# ordered date-wise for every region

sizeof_train_data = int((len(regions_cum[0])-5)*0.7)
sizeof_test_data = int((len(regions_cum[0])-5)*0.3)


print("size of train data :", sizeof_train_data)
print("size of test data :", sizeof_test_data)

size of train data : 3121
size of test data : 1337


In [78]:
# extracting first 91nyc_regions_cnt timestamp values i.e 70% of 13099 (total timestamps) for our training data
train_features =  [tsne_feature[i*(len(regions_cum[0])-5):((len(regions_cum[0])-5)*i+sizeof_train_data)] for i in range(0,nyc_regions_cnt)]

test_features = [tsne_feature[((len(regions_cum[0])-5)*(i))+sizeof_train_data:(len(regions_cum[0])-5)*(i+1)] for i in range(0,nyc_regions_cnt)]

In [79]:
print("Number of data clusters",len(train_features), "Number of data points in trian data", len(train_features[0]), "Each data point contains", len(train_features[0][0]),"features")
print("Number of data clusters",len(train_features), "Number of data points in test data", len(test_features[0]), "Each data point contains", len(test_features[0][0]),"features")

Number of data clusters 69 Number of data points in trian data 3121 Each data point contains 5 features
Number of data clusters 69 Number of data points in test data 1338 Each data point contains 5 features


In [80]:
# extracting first sizeof_train_data timestamp values i.e 70% of 13099 (total timestamps) for our training data

tsne_train_flat_PULocationID = [i[:sizeof_train_data] for i in tsne_PULocationID]
tsne_train_flat_Tip_amount = [i[:sizeof_train_data] for i in predict_list_tip]
tsne_train_flat_weekday = [i[:sizeof_train_data] for i in tsne_weekday]
tsne_train_flat_output = [i[:sizeof_train_data] for i in output]
tsne_train_flat_output_tip = [i[:sizeof_train_data] for i in output_tip]
tsne_train_flat_exp_avg = [i[:sizeof_train_data] for i in predict_list]

In [81]:
# extracting the rest of the timestamp values i.e 30% of sizeof_train_data + sizeof_test_data (total timestamps) for our test data

tsne_test_flat_PULocationID = [i[sizeof_train_data:] for i in tsne_PULocationID]
tsne_test_flat_Tip_amount = [i[sizeof_train_data:] for i in predict_list_tip]
tsne_test_flat_weekday = [i[sizeof_train_data:] for i in tsne_weekday]
tsne_test_flat_output = [i[sizeof_train_data:] for i in output]
tsne_test_flat_output_tip = [i[sizeof_train_data:] for i in output_tip]
tsne_test_flat_exp_avg = [i[sizeof_train_data:] for i in predict_list]

In [82]:
# the above contains values in the form of list of lists (i.e. list of values of each region), here we make all of them in one list
train_new_features = []
for i in range(0,nyc_regions_cnt):
    train_new_features.extend(train_features[i])
test_new_features = []
for i in range(0,nyc_regions_cnt):
    test_new_features.extend(test_features[i])

In [83]:
tsne_train_PULocationID = sum(tsne_train_flat_PULocationID, [])
tsne_train_Tip_amount = sum(tsne_train_flat_Tip_amount, [])
tsne_train_weekday = sum(tsne_train_flat_weekday, [])
tsne_train_output = sum(tsne_train_flat_output, [])
tsne_train_output_tip = sum(tsne_train_flat_output_tip, [])
tsne_train_exp_avg = sum(tsne_train_flat_exp_avg,[])

In [72]:
tsne_test_PULocationID = sum(tsne_test_flat_PULocationID, [])
tsne_test_Tip_amount = sum(tsne_test_flat_Tip_amount, [])
tsne_test_weekday = sum(tsne_test_flat_weekday, [])
tsne_test_output = sum(tsne_test_flat_output, [])
tsne_test_output_tip = sum(tsne_test_flat_output_tip, [])
tsne_test_exp_avg = sum(tsne_test_flat_exp_avg,[])

In [84]:
# Preparing the data frame for our train data
columns = ['ft_5','ft_4','ft_3','ft_2','ft_1']
df_train = pd.DataFrame(data=train_new_features, columns=columns) 
# df_train['lat'] = tsne_train_lat
# df_train['lon'] = tsne_train_lon

df_train['PULocationID'] = tsne_train_PULocationID
df_train['Tip_amount'] = tsne_train_Tip_amount
df_train['weekday'] = tsne_train_weekday
df_train['exp_avg'] = tsne_train_exp_avg

print(df_train.shape)

(215349, 9)


In [85]:
# Preparing the data frame for our train data
df_test = pd.DataFrame(data=test_new_features, columns=columns) 
# df_test['lat'] = tsne_test_lat
# df_test['lon'] = tsne_test_lon

df_test['PULocationID'] = tsne_test_PULocationID
df_test['Tip_amount'] = tsne_test_Tip_amount
df_test['weekday'] = tsne_test_weekday
df_test['exp_avg'] = tsne_test_exp_avg
print(df_test.shape)

(92322, 9)


## Using Linear Regression

In [86]:
# find more about LinearRegression function here http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
# -------------------------
# default paramters
# sklearn.linear_model.LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=1)

# some of methods of LinearRegression()
# fit(X, y[, sample_weight])	Fit linear model.
# get_params([deep])	Get parameters for this estimator.
# predict(X)	Predict using the linear model
# score(X, y[, sample_weight])	Returns the coefficient of determination R^2 of the prediction.
# set_params(**params)	Set the parameters of this estimator.
# -----------------------
# video link: https://www.appliedaicourse.com/course/applied-ai-course-online/lessons/geometric-intuition-1-2-copy-8/
# -----------------------

from sklearn.linear_model import LinearRegression
lr_reg=LinearRegression().fit(df_train, tsne_train_output)

y_pred = lr_reg.predict(df_test)
lr_test_predictions = [round(value) for value in y_pred]
y_pred = lr_reg.predict(df_train)
lr_train_predictions = [round(value) for value in y_pred]

## Using Random Forest Regressor

In [87]:
# Training a hyper-parameter tuned random forest regressor on our train data
# find more about LinearRegression function here http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# -------------------------
# default paramters
# sklearn.ensemble.RandomForestRegressor(n_estimators=10, criterion=’mse’, max_depth=None, min_samples_split=2, 
# min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=’auto’, max_leaf_nodes=None, min_impurity_decrease=0.0, 
# min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=1, random_state=None, verbose=0, warm_start=False)

# some of methods of RandomForestRegressor()
# apply(X)	Apply trees in the forest to X, return leaf indices.
# decision_path(X)	Return the decision path in the forest
# fit(X, y[, sample_weight])	Build a forest of trees from the training set (X, y).
# get_params([deep])	Get parameters for this estimator.
# predict(X)	Predict regression target for X.
# score(X, y[, sample_weight])	Returns the coefficient of determination R^2 of the prediction.
# -----------------------
# video link1: https://www.appliedaicourse.com/course/applied-ai-course-online/lessons/regression-using-decision-trees-2/
# video link2: https://www.appliedaicourse.com/course/applied-ai-course-online/lessons/what-are-ensembles/
# -----------------------

regr1 = RandomForestRegressor(max_features='sqrt',min_samples_leaf=4,min_samples_split=3,n_estimators=40, n_jobs=-1)
regr1.fit(df_train, tsne_train_output)

In [88]:
# Predicting on test data using our trained random forest model 

# the models regr1 is already hyper parameter tuned
# the parameters that we got above are found using grid search

y_pred = regr1.predict(df_test)
rndf_test_predictions = [round(value) for value in y_pred]
y_pred = regr1.predict(df_train)
rndf_train_predictions = [round(value) for value in y_pred]

In [52]:
#feature importances based on analysis using random forest
print (df_train.columns)
print (regr1.feature_importances_)

Index(['ft_5', 'ft_4', 'ft_3', 'ft_2', 'ft_1', 'PULocationID', 'Tip_amount',
       'weekday', 'exp_avg'],
      dtype='object')
[0.01527355 0.04986627 0.10413751 0.22040291 0.2385629  0.0040881
 0.0150577  0.0022944  0.35031666]


## Using XgBoost Regressor

In [89]:
# Training a hyper-parameter tuned Xg-Boost regressor on our train data

# find more about XGBRegressor function here http://xgboost.readthedocs.io/en/latest/python/python_api.html?#module-xgboost.sklearn
# -------------------------
# default paramters
# xgboost.XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective='reg:linear', 
# booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, 
# colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, 
# missing=None, **kwargs)

# some of methods of RandomForestRegressor()
# fit(X, y, sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True, xgb_model=None)
# get_params([deep])	Get parameters for this estimator.
# predict(data, output_margin=False, ntree_limit=0) : Predict with data. NOTE: This function is not thread safe.
# get_score(importance_type='weight') -> get the feature importance
# -----------------------
# video link1: https://www.appliedaicourse.com/course/applied-ai-course-online/lessons/regression-using-decision-trees-2/
# video link2: https://www.appliedaicourse.com/course/applied-ai-course-online/lessons/what-are-ensembles/
# -----------------------

x_model = xgb.XGBRegressor(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=3,
 min_child_weight=3,
 gamma=0,
 subsample=0.8,
 reg_alpha=200, reg_lambda=200,
 colsample_bytree=0.8,nthread=4)
x_model.fit(df_train, tsne_train_output)

In [90]:
#predicting with our trained Xg-Boost regressor
# the models x_model is already hyper parameter tuned
# the parameters that we got above are found using grid search

y_pred = x_model.predict(df_test)
xgb_test_predictions = [round(value) for value in y_pred]
y_pred = x_model.predict(df_train)
xgb_train_predictions = [round(value) for value in y_pred]

In [91]:
df_train

Unnamed: 0,ft_5,ft_4,ft_3,ft_2,ft_1,PULocationID,Tip_amount,weekday,exp_avg
0,24,1,1,4,11,1,1.806325,2,8
1,1,1,4,11,9,1,1.266009,2,8
2,1,4,11,9,11,1,1.399257,2,10
3,4,11,9,11,13,1,1.776162,2,12
4,11,9,11,13,10,1,2.388549,2,10
...,...,...,...,...,...,...,...,...,...
215344,44,47,42,36,48,69,1.125631,2,44
215345,47,42,36,48,58,69,1.063396,2,53
215346,42,36,48,58,56,69,1.192769,2,55
215347,36,48,58,56,49,69,1.262974,2,50


In [92]:
tsne_train_output_tip

[1.0344444444444445,
 1.4563636363636363,
 1.9376923076923076,
 2.651,
 1.6978947368421051,
 2.1854545454545455,
 2.930625,
 2.4076470588235295,
 2.0195238095238093,
 1.6878947368421053,
 1.9994736842105265,
 1.139,
 2.2239999999999998,
 2.9709090909090907,
 1.9922222222222221,
 1.5235714285714284,
 2.147692307692308,
 2.234375,
 1.2825,
 1.3807142857142856,
 1.4080000000000001,
 1.358,
 2.131818181818182,
 2.26625,
 2.4739999999999998,
 1.745,
 0.3333333333333333,
 1.2871428571428571,
 1,
 1,
 1,
 1.9525000000000001,
 4.0,
 0.0,
 2.83,
 0.0,
 2.66,
 0.0,
 1.5,
 2.925,
 1.96,
 1.5983333333333334,
 1.3633333333333333,
 0.6666666666666666,
 1.4866666666666666,
 3.2133333333333334,
 0.0,
 0.0,
 0.43714285714285717,
 1.3333333333333333,
 2.33,
 9.6,
 1.5866666666666667,
 2.05,
 1,
 1,
 1,
 2.15,
 0.7166666666666667,
 0.3875,
 0.25,
 1.045,
 0.86,
 0.8866666666666667,
 1.38,
 1.175,
 1.61,
 1.5,
 2.12,
 0.0,
 0.525,
 1,
 1,
 1,
 1,
 1,
 1,
 1.502,
 0.58,
 3.58,
 2,
 2,
 2,
 2.0,
 1.71,
 0.8

In [56]:
#feature importances
x_model.get_booster().get_score(importance_type="weight")

{'ft_5': 842.0,
 'ft_4': 685.0,
 'ft_3': 817.0,
 'ft_2': 950.0,
 'ft_1': 1055.0,
 'PULocationID': 761.0,
 'Tip_amount': 722.0,
 'weekday': 226.0,
 'exp_avg': 702.0}

## Calculating the error metric values for various models

In [57]:
train_R2=[]
test_R2=[]

train_R2.append(r2_score(tsne_train_output,df_train['ft_1'].values))
train_R2.append(r2_score(tsne_train_output,df_train['exp_avg'].values))
train_R2.append(r2_score(tsne_train_output,rndf_train_predictions))
train_R2.append(r2_score(tsne_train_output,xgb_train_predictions))
train_R2.append(r2_score(tsne_train_output,lr_train_predictions))
train_R2.append(r2_score(tsne_train_output_tip,df_train['Tip_amount'].values))

test_R2.append(r2_score(tsne_test_output,df_test['ft_1'].values))
test_R2.append(r2_score(tsne_test_output,df_test['exp_avg'].values))
test_R2.append(r2_score(tsne_test_output,rndf_test_predictions))
test_R2.append(r2_score(tsne_test_output,xgb_test_predictions))
test_R2.append(r2_score(tsne_test_output,lr_test_predictions))
test_R2.append(r2_score(tsne_test_output_tip,df_test['Tip_amount'].values))

In [63]:
print(f"{base_year+1} 1 ~ {base_month_count} {region} R2")
print ("--------------------------------------------------------------------------------------------------------")
print ("Baseline Model\t\t\t","Train: ",train_R2[0],"Test: ",test_R2[0],sep='\t')
print ("Exponential Averages Forecasting","Train: ",train_R2[1],"Test: ",test_R2[1],sep='\t')
print ("Random Forest Regression\t","Train: ",train_R2[2],"Test: ",test_R2[2],sep='\t')
print ("XgBoost Regression\t\t","Train: ",train_R2[3],"Test: ",test_R2[3],sep='\t')
print ("Linear Regression\t\t","Train: ",train_R2[4],"Test: ",test_R2[4],sep='\t')
print ("Baseline Model of Tip\t\t","Train: ",train_R2[5],"Test: ",test_R2[5],sep='\t')

2019 1 ~ 1 Manhattan R2
--------------------------------------------------------------------------------------------------------
Baseline Model				Train: 	0.9275364793206575	Test: 	0.9312412872265359
Exponential Averages Forecasting	Train: 	0.9361166759474355	Test: 	0.9400040382885209
Random Forest Regression		Train: 	0.9695759719040822	Test: 	0.9429384790454273
XgBoost Regression			Train: 	0.9435557112890061	Test: 	0.9437120898550438
Linear Regression			Train: 	0.9378369832128254	Test: 	0.9415483950787751
Baseline Model of Tip			Train: 	-0.030201990127818323	Test: 	0.02006365706213742


In [94]:
train_MAE=[]
test_MAE=[]
train_MAE.append(mean_absolute_error(tsne_train_output,df_train['ft_1'].values))
train_MAE.append(mean_absolute_error(tsne_train_output,df_train['exp_avg'].values))
train_MAE.append(mean_absolute_error(tsne_train_output,rndf_train_predictions))
train_MAE.append(mean_absolute_error(tsne_train_output,xgb_train_predictions))
train_MAE.append(mean_absolute_error(tsne_train_output,lr_train_predictions))
train_MAE.append(mean_absolute_error(tsne_train_output,df_train['Tip_amount'].values))

test_MAE.append(mean_absolute_error(tsne_test_output,df_test['ft_1'].values))
test_MAE.append(mean_absolute_error(tsne_test_output,df_test['exp_avg'].values))
test_MAE.append(mean_absolute_error(tsne_test_output,rndf_test_predictions))
test_MAE.append(mean_absolute_error(tsne_test_output,xgb_test_predictions))
test_MAE.append(mean_absolute_error(tsne_test_output,lr_test_predictions))
test_MAE.append(mean_absolute_error(tsne_test_output,df_test['Tip_amount'].values))

In [96]:
print(f"{base_year+1} 1 ~ {base_month_count} {region} MAE")
print ("--------------------------------------------------------------------------------------------------------")
print ("Baseline Model\t\t\t","Train: ",train_MAE[0],"Test: ",test_MAE[0],sep='\t')
print ("Exponential Averages Forecasting","Train: ",train_MAE[1],"Test: ",test_MAE[1],sep='\t')
print ("Random Forest Regression\t","Train: ",train_MAE[2],"Test: ",test_MAE[2],sep='\t')
print ("XgBoost Regression\t\t","Train: ",train_MAE[3],"Test: ",test_MAE[3],sep='\t')
print ("Linear Regression\t\t","Train: ",train_MAE[4],"Test: ",test_MAE[4],sep='\t')
print ("Baseline Model of Tip\t\t","Train: ",train_MAE[5],"Test: ",test_MAE[5],sep='\t')

2019 1 ~ 1 Manhattan MAE
--------------------------------------------------------------------------------------------------------
Baseline Model				Train: 	4.376946259327882	Test: 	4.48041636879617
Exponential Averages Forecasting	Train: 	4.0952500359880935	Test: 	4.183076623123416
Random Forest Regression		Train: 	2.8067509020241563	Test: 	4.179220554147441
XgBoost Regression			Train: 	3.93488244663314	Test: 	4.153636186391109
Linear Regression			Train: 	4.0569494169928815	Test: 	4.1414830701241305
Baseline Model of Tip			Train: 	20.93751226296253	Test: 	22.493143562747775
