In [3]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dateutil.relativedelta import relativedelta
import datetime 
from sklearn.impute import SimpleImputer

In [30]:
# Load the training set, the test set and the submission template in one pass.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in ('/train.csv', '/test.csv', '/sample_submission.csv')
)

In [31]:
# Columns that appear in only one of train/test.
# Use the named set method: `^` between Index objects is deprecated as a set
# operation (it emits a FutureWarning) and performs an element-wise XOR in
# pandas 2.x, which does not compute a set difference at all.
useless_cols = train.columns.symmetric_difference(test.columns)
useless_cols

  """Entry point for launching an IPython kernel.


Index(['cancelled', 'cancelled_time', 'delivered_time', 'pickup_time'], dtype='object')

In [32]:
# Correlation of every numeric column with the target.
# Numeric columns are selected explicitly because DataFrame.corr() no longer
# drops non-numeric columns silently in pandas >= 2.0 (it raises on the
# timestamp strings instead); select_dtypes works on old and new versions.
train.select_dtypes(include=['number', 'bool']).corr()['cancelled']

order_id                0.004266
rider_id                0.058096
first_mile_distance     0.009679
last_mile_distance      0.018696
alloted_orders         -0.034491
delivered_orders       -0.030220
cancelled               1.000000
undelivered_orders      0.007417
lifetime_order_count   -0.016192
reassigned_order             NaN
session_time           -0.019963
Name: cancelled, dtype: float64

In [33]:
# Keep an independent copy of the labels; 'cancelled' is removed from the
# feature frame further down.
target = train['cancelled'].copy()

In [34]:
# Inspect the label series copied from train['cancelled'].
target

0         0
1         0
2         0
3         0
4         0
         ..
449995    0
449996    0
449997    0
449998    0
449999    0
Name: cancelled, Length: 450000, dtype: int64

In [35]:
# Remove the outcome timestamps: per the column comparison above they are not
# shared between train and test, so they cannot be used as features.
# Non-in-place drop with errors='ignore' keeps this cell re-runnable (the
# in-place version raised KeyError on a second execution).
train = train.drop(
    columns=['cancelled_time', 'delivered_time', 'pickup_time'],
    errors='ignore',
)

In [36]:
def preprocess(train_data, hour_offset=6):
  """Turn a raw orders frame into numeric model features.

  Works on a copy, so the caller's frame is not modified and the calling
  cell can be re-run safely.

  Added columns:
    dayOfWeek       -- day of week of ``order_time`` (``.dt.dayofweek``, Monday=0).
    order_allot     -- minutes from ``order_time`` to ``allot_time``.
    order_accept    -- minutes from ``order_time`` to ``accept_time``.
    order_time_hrs  -- hours from ``order_date`` to ``order_time``, plus
                       ``hour_offset``.

  Dropped columns: the four raw timestamp columns, ``order_id``, ``rider_id``,
  ``reassignment_method``, ``reassignment_reason`` and ``reassigned_order``.

  Args:
    train_data: DataFrame holding the columns named above. Timestamp columns
      may be strings or datetimes; they are parsed with ``pd.to_datetime``.
    hour_offset: constant added to ``order_time_hrs``. Defaults to 6, the
      value previously hard-coded here.
      NOTE(review): presumably a timezone shift -- confirm.

  Returns:
    A new DataFrame with the derived columns appended (in the order listed
    above) and the raw columns removed.

  Raises:
    KeyError: if a required column is missing from ``train_data``.
  """
  data = train_data.copy()

  # Parse every timestamp column up front so the arithmetic below is uniform.
  for col in ('order_time', 'order_date', 'allot_time', 'accept_time'):
    data[col] = pd.to_datetime(data[col])

  one_minute = np.timedelta64(1, 'm')
  one_hour = np.timedelta64(1, 'h')

  # Assignment order determines the output column order; keep it stable so the
  # train and test frames line up column-for-column.
  data['dayOfWeek'] = data['order_time'].dt.dayofweek
  data['order_allot'] = (data['allot_time'] - data['order_time']) / one_minute
  data['order_accept'] = (data['accept_time'] - data['order_time']) / one_minute
  data['order_time_hrs'] = (data['order_time'] - data['order_date']) / one_hour + hour_offset

  return data.drop(columns=[
      'order_time', 'order_date', 'allot_time', 'order_id', 'accept_time',
      'reassignment_method', 'reassignment_reason', 'reassigned_order',
      'rider_id',
  ])

In [37]:
# Derive the time-based features and drop identifier/timestamp columns.
train_pro = preprocess(train)

In [14]:
# Display the preprocessed training frame.
train_pro

Unnamed: 0,rider_id,first_mile_distance,last_mile_distance,alloted_orders,delivered_orders,cancelled,undelivered_orders,lifetime_order_count,session_time,dayOfWeek,order_allot,order_accept,order_time_hrs
0,11696.0,1.5666,2.65,46.0,46.0,0.0,0.0,621.0,,1,0.400000,0.550000,8.359722
1,18117.0,2.5207,2.76,8.0,8.0,0.0,0.0,105.0,3.266667,1,0.683333,1.483333,8.554444
2,18623.0,2.2074,4.80,1.0,1.0,0.0,0.0,66.0,9.816667,1,0.133333,0.400000,8.663611
3,15945.0,2.1894,6.38,1.0,1.0,0.0,0.0,127.0,17.533333,1,0.533333,1.216667,8.798056
4,17589.0,2.7870,4.01,34.0,34.0,0.0,0.0,84.0,1.350000,1,0.850000,1.450000,9.108333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
321849,3566.0,0.9843,1.06,61.0,61.0,0.0,0.0,191.0,130.483333,2,0.033333,0.650000,14.731111
321850,561.0,1.4751,3.11,283.0,282.0,0.0,1.0,2437.0,207.866667,2,0.783333,1.033333,14.732500
321851,15868.0,2.2739,4.77,24.0,24.0,1.0,0.0,64.0,120.666667,2,16.300000,16.516667,14.733056
321852,850.0,1.5396,1.71,15.0,15.0,0.0,0.0,78.0,161.033333,2,2.116667,6.266667,14.733889


In [15]:
# Count missing values per column before imputation.
train_pro.isna().sum()

rider_id                    1
first_mile_distance         1
last_mile_distance          1
alloted_orders          12220
delivered_orders        12511
cancelled                   1
undelivered_orders      12511
lifetime_order_count       37
session_time             2638
dayOfWeek                   0
order_allot                 1
order_accept              116
order_time_hrs              0
dtype: int64

In [39]:
# The label must not be part of the feature matrix (it was copied into
# `target` earlier). Non-in-place drop with errors='ignore' keeps this cell
# re-runnable; the in-place version raised KeyError on a second execution.
train_pro = train_pro.drop(columns='cancelled', errors='ignore')

In [40]:
# Fill missing values with the per-column mean learned from the training
# features. `imp` and `cols` are reused later to transform the test set.
cols = train_pro.columns
imp = SimpleImputer(strategy='mean')  # missing_values defaults to np.nan
train2 = imp.fit(train_pro).transform(train_pro)
X = pd.DataFrame(data=train2, columns=cols)
X

Unnamed: 0,first_mile_distance,last_mile_distance,alloted_orders,delivered_orders,undelivered_orders,lifetime_order_count,session_time,dayOfWeek,order_allot,order_accept,order_time_hrs
0,1.5666,2.65,46.0,46.0,0.0,621.0,220.474779,1.0,0.400000,0.550000,8.359722
1,2.5207,2.76,8.0,8.0,0.0,105.0,3.266667,1.0,0.683333,1.483333,8.554444
2,2.2074,4.80,1.0,1.0,0.0,66.0,9.816667,1.0,0.133333,0.400000,8.663611
3,2.1894,6.38,1.0,1.0,0.0,127.0,17.533333,1.0,0.533333,1.216667,8.798056
4,2.7870,4.01,34.0,34.0,0.0,84.0,1.350000,1.0,0.850000,1.450000,9.108333
...,...,...,...,...,...,...,...,...,...,...,...
449995,0.5789,0.19,4.0,4.0,0.0,127.0,369.516667,5.0,0.466667,0.966667,16.054444
449996,1.9863,1.19,81.0,81.0,0.0,105.0,239.133333,5.0,0.016667,1.283333,16.054722
449997,1.5944,1.61,28.0,28.0,0.0,1488.0,204.150000,5.0,0.800000,1.350000,16.055000
449998,2.8939,4.68,72.0,72.0,0.0,105.0,65.583333,5.0,0.000000,2.366667,16.055278


In [24]:
# Verify that no missing values remain in the imputed feature matrix.
X.isna().sum()

rider_id                0
first_mile_distance     0
last_mile_distance      0
alloted_orders          0
delivered_orders        0
undelivered_orders      0
lifetime_order_count    0
session_time            0
dayOfWeek               0
order_allot             0
order_accept            0
order_time_hrs          0
dtype: int64

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

# Hyper-parameter search over tree count and depth, scored by ROC AUC.
# random_state is fixed so repeated runs rank the candidates identically:
# the underlying trees permute features at every split, so tied splits can
# otherwise resolve differently from run to run.
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [3, 5, 7],
}

grid = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid,
    scoring='roc_auc',
    refit=True,
    verbose=10,
    n_jobs=-1,
)
grid.fit(X, target)

In [41]:
# Per-row training weights: rows labelled 1 (cancelled) count 80x as much as
# rows labelled 0; any row with another label keeps a weight of 0.
sample_weights = np.zeros(len(train2))
for label, weight in ((0, 1), (1, 80)):
    sample_weights[target == label] = weight

In [42]:
from sklearn.ensemble import GradientBoostingClassifier

# Final model, trained with the class-weighted rows.
# random_state is fixed so the fitted trees (and therefore the submission
# file) are reproducible across kernel restarts.
clf = GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42)
clf.fit(X, target, sample_weight=sample_weights)

GradientBoostingClassifier(max_depth=5)

In [43]:
# Apply the same feature engineering to the test set, then impute with the
# means learned on the training data and restore the column labels.
test_features = preprocess(test)
test = pd.DataFrame(imp.transform(test_features), columns=cols)

In [44]:
# Score the test set, keep column 1 of predict_proba as the prediction, and
# write it into the submission template.
class_probabilities = clf.predict_proba(test)
y_pred = class_probabilities[:, 1]
sample['cancelled'] = y_pred
sample.to_csv("Submit.csv", index=False)

In [45]:
# Inspect the predicted probabilities written to the submission file.
y_pred

array([0.29226661, 0.36885221, 0.19110888, ..., 0.40480871, 0.31434098,
       0.43119949])