In [1]:
# import packages
import pandas as pd
import numpy as np
from datetime import datetime, date

In [2]:
# A single sample order used to exercise the feature pipeline end to end.
# The frame is built from one record (a dict of column -> value) rather than
# from a one-column frame that is then transposed: the transpose approach
# left every column with `object` dtype, so numeric fields such as
# 'Distance (KM)' were carried through the pipeline as generic Python
# objects. Building from a record lets pandas infer a dtype per column and
# keeps each value next to the name it belongs to.
sample_order = {
    'Order No': 'Order_No_21660',
    'User Id': 'User_Id_1329',
    'Vehicle Type': 'Bike',
    'Platform Type': 3,
    'Personal or Business': 'Business',
    'Placement - Day of Month': 31,
    'Placement - Weekday (Mo = 1)': 5,
    'Placement - Time': '12:16:49 PM',
    'Confirmation - Day of Month': 31,
    'Confirmation - Weekday (Mo = 1)': 5,
    'Confirmation - Time': '12:22:48 PM',
    'Arrival at Pickup - Day of Month': 31,
    'Arrival at Pickup - Weekday (Mo = 1)': 5,
    'Arrival at Pickup - Time': '12:23:47 PM',
    'Pickup - Day of Month': 31,
    'Pickup - Weekday (Mo = 1)': 5,
    'Pickup - Time': '12:38:24 PM',
    'Distance (KM)': 4,
    'Temperature': 21.8,
    'Precipitation in millimeters': np.nan,
    'Pickup Lat': -1.2795183,
    'Pickup Long': 36.8238089,
    'Destination Lat': -1.273056,
    'Destination Long': 36.811298,
    'Rider Id': 'Rider_Id_812',
    'No_Of_Orders': 4402,
    'Age': 1090,
    'Average_Rating': 14.3,
    'No_of_Ratings': 1301,
}
# One record -> one row; column order follows the dict's insertion order.
data = pd.DataFrame([sample_order])
data

Unnamed: 0,Order No,User Id,Vehicle Type,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),Placement - Time,Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),...,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Rider Id,No_Of_Orders,Age,Average_Rating,No_of_Ratings
0,Order_No_21660,User_Id_1329,Bike,3,Business,31,5,12:16:49 PM,31,5,...,,-1.27952,36.8238,-1.27306,36.8113,Rider_Id_812,4402,1090,14.3,1301


In [3]:
# Rider-level feature table, fetched over HTTP from the team's GitHub repo.
RIDER_FEATURES_URL = 'https://raw.githubusercontent.com/Team-8-JHB-RegressionPredict/regression-predict-api-template/master/predict%20deliverable/data/rider_features.csv'
rider_features = pd.read_csv(RIDER_FEATURES_URL)
rider_features

Unnamed: 0,rider_id_bins,rider_speed_reasonable,rider_speed_slow,Rider Id
0,1692.627611,1,0,Rider_Id_432
1,1692.627611,1,0,Rider_Id_856
2,2041.411417,1,0,Rider_Id_155
3,1173.795848,1,0,Rider_Id_855
4,1542.175647,1,0,Rider_Id_770
...,...,...,...,...
919,889.392857,0,1,Rider_Id_528
920,2287.263773,0,1,Rider_Id_638
921,2389.871369,0,1,Rider_Id_773
922,634.242857,0,1,Rider_Id_860


In [4]:
# Attach each order's rider features. With a left join, a rider that is not
# present in `rider_features` comes back with NaN in the joined columns.
data = pd.merge(data, rider_features, how = 'left', on = 'Rider Id')

# Fill ONLY those gaps with the most frequent value of each feature.
# Previously the three columns were overwritten with the mode for every row,
# which discarded the rider-specific values the merge had just attached.
for rider_col in ['rider_speed_reasonable', 'rider_speed_slow', 'rider_id_bins']:
    fallback = rider_features[rider_col].mode()[0]
    # Cast back to the source dtype: NaN from the join upcasts integer
    # flags to float, and the fill alone would leave them as floats.
    data[rider_col] = data[rider_col].fillna(fallback).astype(rider_features[rider_col].dtype)

In [5]:
# Each order stage has a "<stage> - Day of Month" and a "<stage> - Time"
# column in the raw order data; both lists are derived from the same stage
# names so they stay aligned pairwise.
_order_stages = ['Placement', 'Confirmation', 'Arrival at Pickup', 'Pickup']
month_day_vars = ['{} - Day of Month'.format(stage) for stage in _order_stages]
time_vars = ['{} - Time'.format(stage) for stage in _order_stages]


In [6]:
def getTimeObjects(df, month_day_cols=None, time_cols=None):
    """Combine each day-of-month / time-of-day column pair into a datetime column.

    For every (day column, time column) pair a new column is added whose name
    is the text before the first '-' in the time column's name with spaces
    removed (e.g. 'Arrival at Pickup - Time' -> 'ArrivalatPickup').

    The raw data only carries a day of month, so every timestamp is placed in
    a fixed placeholder year and month (2020-01). January has 31 days, so any
    valid day of month can be represented.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the day-of-month and time columns. Not modified.
    month_day_cols : list of str, optional
        Day-of-month column names. Defaults to the notebook-level
        ``month_day_vars``.
    time_cols : list of str, optional
        Time-of-day column names, paired positionally with
        ``month_day_cols``. Defaults to the notebook-level ``time_vars``.

    Returns
    -------
    (pandas.DataFrame, list of str)
        A copy of ``df`` with the new datetime columns, and the names of
        those columns in creation order.
    """
    if month_day_cols is None:
        month_day_cols = month_day_vars
    if time_cols is None:
        time_cols = time_vars

    df = df.copy()
    datetime_vars = list()
    for month_col, time_col in zip(month_day_cols, time_cols):
        new_col_name = time_col.split('-')[0].replace(' ', '')
        datetime_vars.append(new_col_name)
        # Progress output: one line per datetime column created.
        print(new_col_name)

        # Walk the two columns by position. The previous version looked up a
        # positional list with the row's index *label*, which only worked
        # when the index happened to be 0..n-1.
        Dates = [
            pd.to_datetime('2020-1-{} {}'.format(day, time_string))
            for day, time_string in zip(df[month_col], df[time_col])
        ]
        df[new_col_name] = Dates
    return df, datetime_vars

In [7]:
# Add one parsed datetime column per order stage; `datetime_vars` keeps the
# new column names for the later hour/minute extraction.
data, datetime_vars =  getTimeObjects(data)

Placement
Confirmation
ArrivalatPickup
Pickup


In [8]:
# Maps each elapsed-time feature to [later column, earlier column]; the
# feature value is later minus earlier.
iter_dict = {
    'Time from Placement to Confirmation': ['Confirmation', 'Placement'],
    'Time from Confirmation to Arrival at Pickup': ['ArrivalatPickup', 'Confirmation'],
    'Time from Arrival at Pickup to Actual Pickup': ['Pickup', 'ArrivalatPickup'],
    'Time from Placement to Actual Pickup': ['Pickup', 'Placement'],
    'Time from Placement to Arrival at Pickup': ['ArrivalatPickup', 'Placement'],
    'Time from Confirmation to Actual Pickup': ['Pickup', 'Confirmation']
}


def getTimeDifferences(df, iter_dict):
    """Add one elapsed-seconds column per entry of ``iter_dict``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the datetime columns named in ``iter_dict``. Not modified.
    iter_dict : dict
        ``{new column name: [later column, earlier column]}``.

    Returns
    -------
    (pandas.DataFrame, list of str)
        A copy of ``df`` with the new columns (difference in seconds), and
        the new column names in dict order.
    """
    result = df.copy()
    created = []
    for feature_name, pair in iter_dict.items():
        created.append(feature_name)
        later_col, earlier_col = pair[0], pair[1]
        elapsed = result[later_col] - result[earlier_col]
        result[feature_name] = elapsed.map(lambda delta: delta.total_seconds())
    return result, created

In [9]:
# Add the elapsed-seconds features defined in `iter_dict`;
# `numeric_time_vars_sub` lists the new column names.
data, numeric_time_vars_sub  = getTimeDifferences(data, iter_dict)

In [11]:
def getHourMinute(df, datetime_cols=None):
    """Split each datetime column into '<col>_hour' and '<col>_minute' columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the datetime columns. Not modified.
    datetime_cols : list of str, optional
        Names of the datetime columns to split. Defaults to the
        notebook-level ``datetime_vars``.

    Returns
    -------
    (pandas.DataFrame, list of str, list of str)
        A copy of ``df`` with the new columns, the hour column names, and the
        minute column names (both in the order of ``datetime_cols``).
    """
    if datetime_cols is None:
        datetime_cols = datetime_vars

    df = df.copy()
    hour_vars = list()
    minute_vars = list()
    for col in datetime_cols:
        new_column_name_hour = '{}_hour'.format(col)
        new_column_name_minute = '{}_minute'.format(col)
        hour_vars.append(new_column_name_hour)
        minute_vars.append(new_column_name_minute)

        # Iterate the column's values directly rather than looking each row
        # up by index label: a label lookup returns a Series (not a single
        # timestamp) when the index contains duplicate labels.
        df[new_column_name_hour] = [stamp.hour for stamp in df[col]]
        df[new_column_name_minute] = [stamp.minute for stamp in df[col]]
    return df, hour_vars, minute_vars

In [12]:
# Add '<stage>_hour' / '<stage>_minute' columns for every datetime column.
data, hour_vars, minute_vars = getHourMinute(data)

In [13]:
def generateInteractionHourMinute(df):
    """Return a copy of ``df`` with 'add_hour_minute' = Pickup_hour + Pickup_minute / 60.

    The new column expresses the pickup time as a fractional hour
    (e.g. 12:30 -> 12.5). ``df`` itself is not modified.
    """
    fractional_hour = df['Pickup_minute'] / 60 + df['Pickup_hour']
    result = df.copy()
    result['add_hour_minute'] = fractional_hour
    return result

In [14]:
# Add the fractional pickup hour ('add_hour_minute') to `data`.
data = generateInteractionHourMinute(data)

In [15]:
def getSecondsPastMidnight(x):
    """Return the number of seconds between midnight of ``x``'s day and ``x``.

    Parameters
    ----------
    x : pandas.Timestamp or datetime.datetime
        The moment to measure.

    Returns
    -------
    float
        Seconds elapsed since 00:00:00 on the same calendar day.
    """
    stamp = pd.Timestamp(x)
    # normalize() yields midnight of the same day and keeps any timezone, so
    # the subtraction works for tz-aware values too. The earlier version
    # rebuilt midnight by formatting the date into a string and re-parsing
    # it, which always produced a tz-naive value.
    return (stamp - stamp.normalize()).total_seconds()

In [16]:
# Pickup time of day expressed as seconds since midnight (input to the
# sin/cos encoding below).
data['seconds_past_midnight'] = data['Pickup'].map(getSecondsPastMidnight)

In [17]:
# Inspect the frame after the hour/minute and seconds-past-midnight features.
data

Unnamed: 0,Order No,User Id,Vehicle Type,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),Placement - Time,Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),...,Placement_hour,Placement_minute,Confirmation_hour,Confirmation_minute,ArrivalatPickup_hour,ArrivalatPickup_minute,Pickup_hour,Pickup_minute,add_hour_minute,seconds_past_midnight
0,Order_No_21660,User_Id_1329,Bike,3,Business,31,5,12:16:49 PM,31,5,...,12,16,12,22,12,23,12,38,12.633333,45504.0


In [18]:
def generateSinCosTime(df):
    """Return a copy of ``df`` with a cyclical encoding of the pickup time.

    'seconds_past_midnight' is mapped onto one full turn per 24 hours and
    stored as 'sin_pickup_time' and 'cos_pickup_time'. ``df`` is not modified.
    """
    seconds_in_day = 24*60*60
    result = df.copy()
    # Angle in radians: a full day corresponds to 2*pi.
    angle = 2*np.pi*result['seconds_past_midnight']/seconds_in_day
    result['sin_pickup_time'] = np.sin(angle)
    result['cos_pickup_time'] = np.cos(angle)
    return result

In [19]:
# Add the sin/cos encoding of the pickup time of day.
data = generateSinCosTime(data)

In [20]:
# Inspect the frame with the sin/cos pickup-time columns added.
data

Unnamed: 0,Order No,User Id,Vehicle Type,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),Placement - Time,Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),...,Confirmation_hour,Confirmation_minute,ArrivalatPickup_hour,ArrivalatPickup_minute,Pickup_hour,Pickup_minute,add_hour_minute,seconds_past_midnight,sin_pickup_time,cos_pickup_time
0,Order_No_21660,User_Id_1329,Bike,3,Business,31,5,12:16:49 PM,31,5,...,12,22,12,23,12,38,12.633333,45504.0,-0.166769,-0.985996


In [30]:
# Columns selected from `data` to form the predictor vector, in this order.
predictors = [
    # order and rider attributes
    'Distance (KM)',
    'rider_id_bins',
    'rider_speed_slow',
    'rider_speed_reasonable',
    'No_of_Ratings',
    'No_Of_Orders',
    # elapsed-time features (seconds)
    'Time from Arrival at Pickup to Actual Pickup',
    'Time from Confirmation to Arrival at Pickup',
    # pickup time-of-day features
    'Pickup_hour',
    'add_hour_minute',
    'sin_pickup_time',
]

In [31]:
# Sanity check: list every predictor that is absent from `data`.
# Prints nothing when all predictors are present.
missing_predictors = [name for name in predictors if name not in data.columns]
for name in missing_predictors:
    print(name, 'Yes')

In [32]:
# Select the predictor columns, in `predictors` order, for every row of `data`.
predictor_vector = data.loc[:, predictors]

In [33]:
# Display the final predictor vector.
predictor_vector

Unnamed: 0,Distance (KM),rider_id_bins,rider_speed_slow,rider_speed_reasonable,No_of_Ratings,No_Of_Orders,Time from Arrival at Pickup to Actual Pickup,Time from Confirmation to Arrival at Pickup,Pickup_hour,add_hour_minute,sin_pickup_time
0,4,1480.366809,0,1,1301,4402,877.0,59.0,12,12.633333,-0.166769
