IMPORTING REQUIRED LIBRARIES

In [None]:
import numpy as np
import pandas as pd
import math
import time
import seaborn as sns   
import xgboost as xgb
from tqdm import tqdm
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV, RandomizedSearchCV
from scipy.stats import randint
from sklearn.utils.class_weight import compute_sample_weight

In [None]:
# Register tqdm's pandas integration so pandas objects gain the
# progress_* methods (e.g. `progress_apply`) that display a progress bar.
tqdm.pandas()

READING THE DATA

In [None]:
# Load the three CSV files from the local `data_set/` directory
# (paths are relative to the notebook's working directory).
train = pd.read_csv(filepath_or_buffer='data_set/train.csv')
test = pd.read_csv(filepath_or_buffer='data_set/test.csv')
# Submission template; it is only loaded here, not used in this part of the notebook.
sample_submission = pd.read_csv(filepath_or_buffer='data_set/sample_submission.csv')

* order_id : unique id for each order
* order_time: time of the creation of order by the client
* order_date : date of the order
* allot_time: time of allocation of order to the rider
* accept_time: time of acceptance of the order by the rider (if available)
* pickup_time: time of pickup of the order (if available)
* delivered_time: time of delivery of the order (if available)
* cancelled_time: time of cancellation of order (if the order was cancelled)
* cancelled: whether the order was cancelled
* rider_id: unique id for each rider
* first_mile_distance: road distance from rider's location to the pickup location
* last_mile_distance: road distance from pickup location to the delivery location
* allotted_orders: total number of orders allotted to the rider in the 30 days before (not including) order_date
* delivered_orders: total number of orders delivered by the rider in the 30 days before (not including) order_date
* undelivered_orders: total number of orders allotted to but not delivered by the rider (i.e. cancelled) in the 30 days before (not including) order_date
* lifetime_order_count: total number of orders delivered by the rider at any time before order_date
* reassigned_order: whether the order was reassigned to this rider
* reassignment_method: if the order was reassigned, whether the reassignment was done manually (by the ops team) or automatically
* reassignment_reason: a more detailed reason for the reassignment
* session_time: total time the rider had been online on order_date before order_time

DEFINING THE FEATURE CATEGORIES

In [None]:
# Identifier column(s): removed from the frames before feature engineering.
id_columns = ['order_id']

# Columns holding timestamps/dates; these are parsed with pd.to_datetime later.
time_columns = [
    'order_time',
    'order_date',
    'allot_time',
    'accept_time',
    'pickup_time',
    'delivered_time',
    'cancelled_time',
]

# Columns that are cast to the generic `object` dtype later.
categorical_columns = [
    'reassignment_method',
    'reassignment_reason',
]

CONVERTING TO DATETIME AND OBJECT


In [None]:
# Keep a copy of the training ids, then remove the id column(s) from both frames.
# NOTE: this cell is not idempotent - re-running it after the ids were dropped
# raises KeyError; re-load the data first.
ids = train[id_columns]
train = train.drop(columns=id_columns)
test = test.drop(columns=id_columns)

# Parse the timestamp columns. Every listed column must exist in `train`;
# `test` is only converted for the columns it actually contains (presumably it
# lacks some outcome columns such as delivered/cancelled times - TODO confirm).
# An explicit membership check replaces the former bare `except`, so genuine
# parsing errors are no longer silently swallowed.
for col in time_columns:
    train[col] = pd.to_datetime(train[col])
    if col in test.columns:
        test[col] = pd.to_datetime(test[col])

# Cast the categorical columns to the generic `object` dtype, again skipping
# any column that is absent from `test`.
for col in categorical_columns:
    train[col] = train[col].astype('object')
    if col in test.columns:
        test[col] = test[col].astype('object')


PREPROCESSING

In [None]:
def peak_hour(x):
    """Return 1 when hour ``x`` lies in 8-11 or 13-19 (bounds inclusive), else 0."""
    is_peak = (8 <= x <= 11) or (13 <= x <= 19)
    return int(is_peak)

def log_normal(x):
    """Return the natural log of ``|x|``, with 0 mapped to 0.

    Despite the name, this is a sign-insensitive log transform, not a
    log-normal distribution: zero yields the integer 0 (avoiding log(0)),
    and any other value yields ``math.log(abs(x))``.
    """
    # For positive x, abs(x) is x itself, so one expression covers both signs.
    return 0 if x == 0 else math.log(abs(x))


In [None]:
def preprocess(df, target_col='cancelled'):
    """Add engineered order/rider features to ``df`` in place and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Order-level frame. The columns read here are ``order_time``,
        ``order_date``, ``allot_time``, ``accept_time`` (these must already be
        datetime64 because the ``.dt`` accessor is used on them), ``rider_id``,
        ``first_mile_distance``, ``last_mile_distance``, ``alloted_orders``,
        ``delivered_orders``, ``undelivered_orders`` and ``reassigned_order``.
    target_col : str or None, default 'cancelled'
        Column left out of the row-wise ``mean``/``std``/``min``/``max``
        aggregates when it is present. Including the label there would leak
        it into the features and make the aggregates differ between a frame
        that has the label and one that does not. Pass ``None`` to aggregate
        over every int/float column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns added.
        Missing values in every column that exists at the fill step are
        replaced by 0.

    Notes
    -----
    The cumulative features (``cum_*``, ``order_today``, ``orders_cum``) follow
    the current row order of ``df``; no sorting is done here. Presumably the
    rows are already in chronological order - TODO confirm.
    """
    # Calendar features and allot/accept latencies (in seconds).
    df['order_day'] = df['order_date'].dt.dayofyear
    df['diff_allot_order'] = (df['allot_time'] - df['order_time']).dt.total_seconds()
    df['diff_accept_allot'] = (df['accept_time'] - df['allot_time']).dt.total_seconds()
    df['diff_accept_order'] = (df['accept_time'] - df['order_time']).dt.total_seconds()
    df['total_dist'] = df['first_mile_distance'] + df['last_mile_distance']
    df['hour_in_day'] = df['order_time'].dt.hour
    df['day_of_week'] = df['order_date'].dt.dayofweek

    # Running per-rider, per-day totals: source column -> new cumulative column.
    cumulative_map = {
        'diff_allot_order': 'cum_diff_allot_order',
        'diff_accept_allot': 'cum_diff_accept_allot',
        'diff_accept_order': 'cum_diff_accept_order',
        'total_dist': 'cum_total_dist',
        'first_mile_distance': 'cum_first_mile_dist',
        'last_mile_distance': 'cum_last_mile_dist',
    }
    running = df.groupby(['rider_id', 'order_date'])[list(cumulative_map)].cumsum()
    for source_col, cum_col in cumulative_map.items():
        df[cum_col] = running[source_col]

    # Zero-based position of the order within the rider's day / whole frame.
    df['order_today'] = df.groupby(['rider_id', 'order_date']).cumcount()
    df['orders_cum'] = df.groupby(['rider_id']).cumcount()

    df['reassigned_order'] = df['reassigned_order'].fillna(0)
    # dayofweek uses Monday=0, so 5 is Saturday. int64 matches the dtype the
    # former element-wise apply produced.
    df['saturday'] = (df['day_of_week'] == 5).astype('int64')

    # Replace every remaining missing value with 0. This also touches
    # non-numeric columns (e.g. NaT in datetime columns), as before.
    for col in df.columns:
        if df[col].isnull().any():
            df[col] = df[col].fillna(0)

    # NOTE(review): the data dictionary spells this column 'allotted_orders';
    # the code reads 'alloted_orders' - confirm against the CSV header.
    df['orders_diff'] = df['alloted_orders'] - df['delivered_orders']

    # Number of rows sharing the same (order_date, hour_in_day). This counts
    # orders, not distinct riders. Vectorised equivalent of the former
    # row-wise dict lookup.
    df['riders_available'] = (
        df.groupby(['order_date', 'hour_in_day'])['rider_id'].transform('count')
    )

    # Percentage of allotted orders that were not undelivered. A zero
    # denominator yields NaN (0/0) or +/-inf (x/0); both are mapped to 0 so no
    # infinity reaches the row aggregates below.
    reliability = (df['alloted_orders'] - df['undelivered_orders']) * 100 / df['alloted_orders']
    df['reliability'] = reliability.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Row-wise summary statistics over all integer/float columns. The dtype
    # helpers accept every width (int32, int64, float32, ...), whereas
    # `dtype == int` only matches the platform default and silently skipped
    # int32 columns such as those returned by `.dt.hour` on recent pandas.
    numeric_cols = [
        col for col, dtype in df.dtypes.items()
        if col != target_col
        and (pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype))
    ]
    df["mean"] = df[numeric_cols].mean(axis=1)
    df["std"] = df[numeric_cols].std(axis=1)
    df["min"] = df[numeric_cols].min(axis=1)
    df["max"] = df[numeric_cols].max(axis=1)

    return df
