In [3]:
# !pip install geopy
# !pip install feature_engine

In [1]:
# import packages
import pprint
import pandas as pd
import numpy as np
import itertools
import statsmodels.api as sm
from datetime import datetime, date

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

%matplotlib inline

pp = pprint.PrettyPrinter()
from geopy import distance

In [2]:
# load data
train_data = pd.read_csv('https://raw.githubusercontent.com/Team-8-JHB-RegressionPredict/regression-predict-api-template/master/predict%20deliverable/data/Train.csv')
test_data = pd.read_csv('https://raw.githubusercontent.com/Team-8-JHB-RegressionPredict/regression-predict-api-template/master/predict%20deliverable/data/Test.csv')
riders_data = pd.read_csv('https://raw.githubusercontent.com/Team-8-JHB-RegressionPredict/regression-predict-api-template/master/predict%20deliverable/data/Riders.csv')

In [3]:
predictors = ['Distance (KM)',
 'rider_id_bins',
 'rider_speed',
 'No_of_Ratings',
 'No_Of_Orders',
 'Time from Arrival at Pickup to Actual Pickup',
 'Time from Confirmation to Arrival at Pickup',
 'Pickup_hour',
 'add_hour_minute',
 'sin_pickup_time'
]
target = 'Time from Pickup to Arrival'
pp.pprint(predictors + [target])

['Distance (KM)',
 'rider_id_bins',
 'rider_speed',
 'No_of_Ratings',
 'No_Of_Orders',
 'Time from Arrival at Pickup to Actual Pickup',
 'Time from Confirmation to Arrival at Pickup',
 'Pickup_hour',
 'add_hour_minute',
 'sin_pickup_time',
 'Time from Pickup to Arrival']


In [4]:
# merge the train & test data with the riders data.
def mergeRiders(df):
    df = pd.merge(left = df, right = riders_data, how = 'left')
    return df

train_data = mergeRiders(train_data)
test_data = mergeRiders(test_data)

#### RIDER ID

In [5]:
from feature_engine.categorical_encoders import OrdinalCategoricalEncoder
X = train_data.loc[:, ['Rider Id']]
y = train_data.loc[:, 'Time from Pickup to Arrival']

enc = OrdinalCategoricalEncoder(encoding_method='ordered')
enc.fit(X, y)
train_data['rider_id_enc'] = enc.transform(X)['Rider Id']

In [6]:
from feature_engine.discretisers import DecisionTreeDiscretiser
DT_disc = DecisionTreeDiscretiser(
                        cv=10,
                        scoring='neg_root_mean_squared_error',
                        variables=['rider_id_enc'],
                        regression=True,
                        param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 10],
                                    'min_samples_leaf': [10, 4, 2, 1]},
                        random_state=1)
X_train = train_data.loc[:, ['rider_id_enc']]
y_train = train_data.loc[:, 'Time from Pickup to Arrival']
DT_disc.fit(X_train, y_train)
DT_discr = DT_disc.transform(X_train)
train_data['rider_id_bins'] = DT_discr['rider_id_enc']

### RIDER SPEED

In [7]:
def getSpeedPerOrder(df):
    df = df.copy()
    pick_up_to_arr_minutes = df['Time from Pickup to Arrival'].div(60)
    df['Speed_per_order'] = df['Distance (KM)'].div(pick_up_to_arr_minutes)
    return df

In [8]:
# apply the function to train_data
train_data = getSpeedPerOrder(df = train_data)

#generate summary stats
speed_summary_stats = train_data['Speed_per_order'].describe()
speed_summary_stats

count    21201.000000
mean         4.155090
std         31.754955
min          0.015337
25%          0.274390
50%          0.384410
75%          0.504808
max       1020.000000
Name: Speed_per_order, dtype: float64

In [9]:
# find bounds
def findBounds(summary_stats, std = 2.5):
    upper_bound = summary_stats['std'] * std
    lower_bound = summary_stats['std'] * -std
    return lower_bound, upper_bound

In [10]:
def findPercentile(x, i = 15):
    ith_percentile_lower_bound = np.percentile(x, i)
    return ith_percentile_lower_bound

In [11]:
rider_avg_speed = train_data.groupby('Rider Id')['Speed_per_order'].mean()

rider_speed_data = pd.DataFrame(rider_avg_speed).reset_index().rename({'Speed_per_order':'rider_avg_speed'})
rider_speed_data.rename(columns = {'Speed_per_order':'rider_avg_speed'}, inplace = True
)
avg_speed_summary_stats = rider_speed_data['rider_avg_speed'].describe()
avg_speed_summary_stats 

count    924.000000
mean       3.422313
std       12.797333
min        0.063091
25%        0.332414
50%        0.412914
75%        0.796303
max      151.676281
Name: rider_avg_speed, dtype: float64

In [12]:
train_data = pd.merge(
    left = train_data,
    right = rider_speed_data,
    how='left',
    on = 'Rider Id'
)
train_data.head()

Unnamed: 0,Order No,User Id,Vehicle Type,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),Placement - Time,Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),...,Rider Id,Time from Pickup to Arrival,No_Of_Orders,Age,Average_Rating,No_of_Ratings,rider_id_enc,rider_id_bins,Speed_per_order,rider_avg_speed
0,Order_No_4211,User_Id_633,Bike,3,Business,9,5,9:35:46 AM,9,5,...,Rider_Id_432,745,1637,1309,13.8,549,545,1692.627611,0.322148,0.316656
1,Order_No_25375,User_Id_2285,Bike,3,Personal,12,5,11:16:16 AM,12,5,...,Rider_Id_856,1993,396,339,13.6,69,538,1692.627611,0.481686,15.119684
2,Order_No_1899,User_Id_265,Bike,3,Business,30,2,12:39:25 PM,30,2,...,Rider_Id_155,455,1023,242,12.5,114,760,2041.411417,0.395604,0.289148
3,Order_No_9336,User_Id_1402,Bike,3,Business,15,5,9:25:34 AM,15,5,...,Rider_Id_855,1341,886,283,14.5,113,108,1173.795848,0.402685,16.953
4,Order_No_27883,User_Id_1737,Bike,1,Personal,13,1,9:55:18 AM,13,1,...,Rider_Id_770,1214,2311,872,14.1,533,385,1542.175647,0.444811,0.38504


In [13]:
lower, upper = findBounds(avg_speed_summary_stats)
lower, upper

(-31.99333354145434, 31.99333354145434)

In [14]:
ith_percentile_lower_bound = findPercentile(train_data['rider_avg_speed'], i = 2.5)
ith_percentile_lower_bound

0.26458558326025555

In [15]:
high_avg_speed_riders = train_data[train_data['rider_avg_speed'] > upper]['Rider Id'].unique()
print(high_avg_speed_riders[:5])
print(len(high_avg_speed_riders))

['Rider_Id_619' 'Rider_Id_39' 'Rider_Id_239' 'Rider_Id_613' 'Rider_Id_193']
21


In [16]:
low_avg_speed_riders = train_data[train_data['rider_avg_speed'] < ith_percentile_lower_bound]['Rider Id'].unique()
print(low_avg_speed_riders[:5])
print(len(low_avg_speed_riders))

['Rider_Id_302' 'Rider_Id_668' 'Rider_Id_508' 'Rider_Id_21' 'Rider_Id_431']
86


In [17]:
def riderChar(x):
    if x in high_avg_speed_riders:
        value = 'fast'
    elif x in low_avg_speed_riders:
        value = 'slow'
    else:
        value = 'reasonable'
    return value

train_data['rider_speed'] = train_data['Rider Id'].map(riderChar)

In [18]:
new_vars = ['rider_speed', 'rider_id_bins']
test_data = pd.merge(
    left = train_data.loc[:, new_vars + ['Rider Id']].drop_duplicates(),
    right = test_data,
    how = 'right',
    on = 'Rider Id'
)
test_data.head()

Unnamed: 0,rider_speed,rider_id_bins,Rider Id,Order No,User Id,Vehicle Type,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),...,Temperature,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,No_Of_Orders,Age,Average_Rating,No_of_Ratings
0,reasonable,1692.627611,Rider_Id_432,Order_No_5036,User_Id_2423,Bike,3,Business,2,2,...,21.1,,-1.255189,36.782203,-1.278818,36.820668,1637,1309,13.8,549
1,reasonable,1692.627611,Rider_Id_432,Order_No_18077,User_Id_3326,Bike,3,Business,11,4,...,27.1,,-1.255189,36.782203,-1.292824,36.783962,1637,1309,13.8,549
2,reasonable,1692.627611,Rider_Id_432,Order_No_677,User_Id_2064,Bike,2,Personal,22,2,...,21.2,,-1.285033,36.822932,-1.274355,36.811436,1637,1309,13.8,549
3,reasonable,1692.627611,Rider_Id_432,Order_No_22822,User_Id_3613,Bike,3,Business,1,1,...,25.8,,-1.273087,36.770659,-1.269682,36.802869,1637,1309,13.8,549
4,reasonable,1692.627611,Rider_Id_432,Order_No_12884,User_Id_290,Bike,1,Personal,30,3,...,22.6,,-1.280536,36.878096,-1.294037,36.785559,1637,1309,13.8,549


In [19]:
test_data['rider_speed'].fillna(train_data['rider_speed'].mode()[0], inplace = True)
test_data['rider_id_bins'].fillna(train_data['rider_id_bins'].mode()[0], inplace = True)

### TIME VARIABLES

In [20]:
month_day_vars = [
    'Placement - Day of Month', 
    'Confirmation - Day of Month', 
    'Arrival at Pickup - Day of Month', 
    'Pickup - Day of Month']
time_vars = [
    'Placement - Time',
    'Confirmation - Time',
    'Arrival at Pickup - Time',
    'Pickup - Time'
]


In [21]:
def getTimeObjects(df):
    datetime_vars = list()
    df = df.copy()
    for month_col, time_col in zip(month_day_vars, time_vars):
        new_col_name = '{}'.format(time_col.split('-')[0].replace(' ', ''))
        datetime_vars.append(new_col_name)
        print(new_col_name)
        
        values = list()
        Dates = list()
        for row in df.index.values:
            value = '2020' + '-' + '1' + '-' + str(df[month_col][row]) 
            values.append(value)
            
            date_string = values[row]
            time_string = df[time_col][row]

            Datetime = pd.to_datetime(date_string + ' ' + str(time_string))
    #         print(Datetime)
            Dates.append(Datetime)
        df[new_col_name] = Dates
    return df, datetime_vars

In [22]:
train_data, datetime_vars = getTimeObjects(train_data)
test_data, datetime_vars = getTimeObjects(test_data)
train_data.loc[:, datetime_vars].head()

Placement
Confirmation
ArrivalatPickup
Pickup
Placement
Confirmation
ArrivalatPickup
Pickup


Unnamed: 0,Placement,Confirmation,ArrivalatPickup,Pickup
0,2020-01-09 09:35:46,2020-01-09 09:40:10,2020-01-09 10:04:47,2020-01-09 10:27:30
1,2020-01-12 11:16:16,2020-01-12 11:23:21,2020-01-12 11:40:22,2020-01-12 11:44:09
2,2020-01-30 12:39:25,2020-01-30 12:42:44,2020-01-30 12:49:34,2020-01-30 12:53:03
3,2020-01-15 09:25:34,2020-01-15 09:26:05,2020-01-15 09:37:56,2020-01-15 09:43:06
4,2020-01-13 09:55:18,2020-01-13 09:56:18,2020-01-13 10:03:53,2020-01-13 10:05:23


In [23]:
iter_dict = {
    'Time from Placement to Confirmation': ['Confirmation', 'Placement'],
    'Time from Confirmation to Arrival at Pickup': ['ArrivalatPickup', 'Confirmation'],
    'Time from Arrival at Pickup to Actual Pickup': ['Pickup', 'ArrivalatPickup'],
    'Time from Placement to Actual Pickup': ['Pickup', 'Placement'],
    'Time from Placement to Arrival at Pickup': ['ArrivalatPickup', 'Placement'],
    'Time from Confirmation to Actual Pickup': ['Pickup', 'Confirmation']
}
def getTimeDifferences(df, iter_dict):
    df = df.copy()
    numeric_time_vars_sub = list()
    for new_column, inputs in iter_dict.items():
        numeric_time_vars_sub.append(new_column)
        col1 = inputs[0]
        col2 = inputs[1]
        df[new_column] = df[col1] - df[col2]
        df[new_column] = df[new_column].map(lambda timedelt: timedelt.total_seconds())
    return df, numeric_time_vars_sub

In [24]:
# apply function to both train and test data
train_data, numeric_time_vars_sub  = getTimeDifferences(train_data, iter_dict)
test_data, numeric_time_vars_sub  = getTimeDifferences(test_data, iter_dict)
train_data.loc[:, numeric_time_vars_sub].head()

Unnamed: 0,Time from Placement to Confirmation,Time from Confirmation to Arrival at Pickup,Time from Arrival at Pickup to Actual Pickup,Time from Placement to Actual Pickup,Time from Placement to Arrival at Pickup,Time from Confirmation to Actual Pickup
0,264.0,1477.0,1363.0,3104.0,1741.0,2840.0
1,425.0,1021.0,227.0,1673.0,1446.0,1248.0
2,199.0,410.0,209.0,818.0,609.0,619.0
3,31.0,711.0,310.0,1052.0,742.0,1021.0
4,60.0,455.0,90.0,605.0,515.0,545.0


In [25]:
def getHourMinute(df):
    df = df.copy()
    hour_vars = list()
    minute_vars = list()
    for col in datetime_vars:
        hours = list()
        minutes = list()
        for order in df.index.values:
            hour = df.loc[:, col][order].hour
            minute = df.loc[:, col][order].minute
            hours.append(hour)
            minutes.append(minute)
    #     print(col)
    #     print(hours, minutes)
        new_column_name_hour = '{}_hour'.format(col)
        new_column_name_minute = '{}_minute'.format(col)
        hour_vars.append(new_column_name_hour)
        minute_vars.append(new_column_name_minute)
        
        df[new_column_name_hour] = hours
        df[new_column_name_minute] = minutes
    return df, hour_vars, minute_vars

In [26]:
# apply function to both train and test data
train_data, hour_vars, minute_vars = getHourMinute(train_data)
test_data, hour_vars, minute_vars = getHourMinute(test_data)
train_data.loc[:, hour_vars + minute_vars].head()

Unnamed: 0,Placement_hour,Confirmation_hour,ArrivalatPickup_hour,Pickup_hour,Placement_minute,Confirmation_minute,ArrivalatPickup_minute,Pickup_minute
0,9,9,10,10,35,40,4,27
1,11,11,11,11,16,23,40,44
2,12,12,12,12,39,42,49,53
3,9,9,9,9,25,26,37,43
4,9,9,10,10,55,56,3,5


In [27]:
# relacing the outlier value with mean. Dos so fror the test data as well
ind_to_replace = train_data[train_data['Pickup_hour'] == 0].loc[:, 'Pickup_hour'].index
train_data.loc[ind_to_replace, 'Pickup_hour'] = np.mean(train_data['Pickup_hour'])
ind_to_replace = test_data[test_data['Pickup_hour'] == 0].loc[:, 'Pickup_hour'].index
test_data.loc[ind_to_replace, 'Pickup_hour'] = np.mean(train_data['Pickup_hour'])

In [28]:
def generateInteractionHourMinute(df):
    df = df.copy()
    df['add_hour_minute'] = df['Pickup_minute'].div(60).add(df['Pickup_hour'])
    return df

In [29]:
# apply function to both train and test data
train_data = generateInteractionHourMinute(train_data)
test_data = generateInteractionHourMinute(test_data)

In [30]:
def getSecondsPastMidnight(x):
    date = x.date()
    twelve = pd.to_datetime(str(date) + ' ' + '00:00:00 AM')
    diff = x - twelve
    return diff.total_seconds()

In [31]:
# applying the function to both train and test data
train_data['seconds_past_midnight'] = train_data['Pickup'].map(getSecondsPastMidnight)
test_data['seconds_past_midnight'] = test_data['Pickup'].map(getSecondsPastMidnight)

In [32]:
def generateSinCosTime(df):
    seconds_in_day = 24*60*60
    df = df.copy()
    df['sin_pickup_time'] = np.sin(2*np.pi*df['seconds_past_midnight']/seconds_in_day)
    df['cos_pickup_time'] = np.cos(2*np.pi*df['seconds_past_midnight']/seconds_in_day)
    return df

In [33]:
# apply function to both train and test data
train_data = generateSinCosTime(train_data)
test_data = generateSinCosTime(test_data)

### Preprocessing

In [35]:
def getDummies(df):
    df = df.copy()
    df = pd.get_dummies(df, drop_first=True)
    return df

In [38]:
X_train = getDummies(train_data.loc[:, predictors])
X_test = getDummies(test_data.loc[:, predictors])

In [39]:
y_train = train_data.loc[:, target]

In [40]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

In [42]:
X_train.head()

Unnamed: 0,Distance (KM),rider_id_bins,No_of_Ratings,No_Of_Orders,Time from Arrival at Pickup to Actual Pickup,Time from Confirmation to Arrival at Pickup,Pickup_hour,add_hour_minute,sin_pickup_time,rider_speed_reasonable,rider_speed_slow
0,-0.97137,0.369601,0.516144,-0.035206,0.812946,1.018519,-1.126956,-1.149394,1.259389,0.224817,-0.159814
1,1.145469,0.369601,-0.675342,-0.823507,-0.679437,0.35548,-0.73481,-0.643152,0.64848,0.224817,-0.159814
2,-1.147774,1.319525,-0.56364,-0.425228,-0.703084,-0.532934,-0.342665,-0.189507,0.084985,0.224817,-0.159814
3,-0.089354,-1.043453,-0.566123,-0.512252,-0.570398,-0.09527,-1.519101,-1.438676,1.579735,0.224817,-0.159814
4,-0.089354,-0.040159,0.476428,0.392929,-0.859416,-0.467503,-1.126956,-1.294035,1.423206,0.224817,-0.159814


In [62]:
feature_vector = X_train.copy()