In [3]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dateutil.relativedelta import relativedelta
import datetime 
from sklearn.impute import SimpleImputer

In [6]:
train = pd.read_csv('/train.csv')
test = pd.read_csv('/test.csv')
sample = pd.read_csv('/sample_submission.csv')

In [7]:
useless_cols = train.columns ^ test.columns
useless_cols

  """Entry point for launching an IPython kernel.


Index(['cancelled', 'cancelled_time', 'delivered_time', 'pickup_time'], dtype='object')

In [8]:
train.corr()['cancelled']

order_id                0.003273
rider_id                0.060149
first_mile_distance     0.010604
last_mile_distance      0.019367
alloted_orders         -0.036505
delivered_orders       -0.032025
cancelled               1.000000
undelivered_orders      0.006599
lifetime_order_count   -0.016231
reassigned_order             NaN
session_time           -0.020584
Name: cancelled, dtype: float64

In [9]:
target = train['cancelled'].copy()

In [10]:
target

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
321849    0.0
321850    0.0
321851    1.0
321852    0.0
321853    NaN
Name: cancelled, Length: 321854, dtype: float64

In [11]:
train.drop(['cancelled_time', 'delivered_time', 'pickup_time'],inplace = True,axis = 1)

In [12]:
def preprocess(train_data):
  train_data['order_time'] = pd.to_datetime(train_data['order_time'])
  train_data['dayOfWeek'] = train_data['order_time'].dt.dayofweek
  train_data['order_date'] = pd.to_datetime(train_data['order_date'])
  train_data['allot_time'] = pd.to_datetime(train_data['allot_time'])
  train_data['accept_time'] = pd.to_datetime(train_data['accept_time'])
  train_data['order_allot'] = train_data['allot_time'] -  train_data['order_time'] 
  train_data['order_accept'] = train_data['accept_time'] -  train_data['order_time']
  train_data['order_time_hrs'] = ((train_data['order_time'] - train_data['order_date'])/np.timedelta64(1,'h'))+6
  train_data['order_allot']=train_data['order_allot']/np.timedelta64(1,'m')
  train_data['order_accept']=train_data['order_accept']/np.timedelta64(1,'m')
  train_data.drop(['order_time', 'order_date', 'allot_time','order_id','accept_time','reassignment_method','reassignment_reason','reassigned_order'],axis = 1,inplace = True)
  return train_data

In [13]:
train_pro = preprocess(train)

In [14]:
train_pro

Unnamed: 0,rider_id,first_mile_distance,last_mile_distance,alloted_orders,delivered_orders,cancelled,undelivered_orders,lifetime_order_count,session_time,dayOfWeek,order_allot,order_accept,order_time_hrs
0,11696.0,1.5666,2.65,46.0,46.0,0.0,0.0,621.0,,1,0.400000,0.550000,8.359722
1,18117.0,2.5207,2.76,8.0,8.0,0.0,0.0,105.0,3.266667,1,0.683333,1.483333,8.554444
2,18623.0,2.2074,4.80,1.0,1.0,0.0,0.0,66.0,9.816667,1,0.133333,0.400000,8.663611
3,15945.0,2.1894,6.38,1.0,1.0,0.0,0.0,127.0,17.533333,1,0.533333,1.216667,8.798056
4,17589.0,2.7870,4.01,34.0,34.0,0.0,0.0,84.0,1.350000,1,0.850000,1.450000,9.108333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
321849,3566.0,0.9843,1.06,61.0,61.0,0.0,0.0,191.0,130.483333,2,0.033333,0.650000,14.731111
321850,561.0,1.4751,3.11,283.0,282.0,0.0,1.0,2437.0,207.866667,2,0.783333,1.033333,14.732500
321851,15868.0,2.2739,4.77,24.0,24.0,1.0,0.0,64.0,120.666667,2,16.300000,16.516667,14.733056
321852,850.0,1.5396,1.71,15.0,15.0,0.0,0.0,78.0,161.033333,2,2.116667,6.266667,14.733889


In [15]:
train_pro.isna().sum()

rider_id                    1
first_mile_distance         1
last_mile_distance          1
alloted_orders          12220
delivered_orders        12511
cancelled                   1
undelivered_orders      12511
lifetime_order_count       37
session_time             2638
dayOfWeek                   0
order_allot                 1
order_accept              116
order_time_hrs              0
dtype: int64

In [16]:
train_pro.drop('cancelled',axis = 1,inplace = True)

In [23]:
cols = train_pro.columns
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
train2 = imp.fit_transform(train_pro)
X = pd.DataFrame(train2,columns = cols )
X

Unnamed: 0,rider_id,first_mile_distance,last_mile_distance,alloted_orders,delivered_orders,undelivered_orders,lifetime_order_count,session_time,dayOfWeek,order_allot,order_accept,order_time_hrs
0,11696.000000,1.566600,2.650000,46.00000,46.00000,0.000000,621.000000,220.862876,1.0,0.400000,0.550000,8.359722
1,18117.000000,2.520700,2.760000,8.00000,8.00000,0.000000,105.000000,3.266667,1.0,0.683333,1.483333,8.554444
2,18623.000000,2.207400,4.800000,1.00000,1.00000,0.000000,66.000000,9.816667,1.0,0.133333,0.400000,8.663611
3,15945.000000,2.189400,6.380000,1.00000,1.00000,0.000000,127.000000,17.533333,1.0,0.533333,1.216667,8.798056
4,17589.000000,2.787000,4.010000,34.00000,34.00000,0.000000,84.000000,1.350000,1.0,0.850000,1.450000,9.108333
...,...,...,...,...,...,...,...,...,...,...,...,...
321849,3566.000000,0.984300,1.060000,61.00000,61.00000,0.000000,191.000000,130.483333,2.0,0.033333,0.650000,14.731111
321850,561.000000,1.475100,3.110000,283.00000,282.00000,1.000000,2437.000000,207.866667,2.0,0.783333,1.033333,14.732500
321851,15868.000000,2.273900,4.770000,24.00000,24.00000,0.000000,64.000000,120.666667,2.0,16.300000,16.516667,14.733056
321852,850.000000,1.539600,1.710000,15.00000,15.00000,0.000000,78.000000,161.033333,2.0,2.116667,6.266667,14.733889


In [24]:
X.isna().sum()

rider_id                0
first_mile_distance     0
last_mile_distance      0
alloted_orders          0
delivered_orders        0
undelivered_orders      0
lifetime_order_count    0
session_time            0
dayOfWeek               0
order_allot             0
order_accept            0
order_time_hrs          0
dtype: int64

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

param_grid = {
    
    'n_estimators': [100,150,200],
    'max_depth' : [3,5,7]
}

grid = GridSearchCV(GradientBoostingClassifier(), param_grid, scoring = 'roc_auc',refit = True, verbose = 10,n_jobs=-1)
grid.fit(X,target)

In [25]:
sample_weights = np.zeros(len(train2))
sample_weights[target == 0] = 1
sample_weights[target == 1] = 80

In [29]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100,max_depth=5)
clf.fit(X,target,sample_weight = sample_weights)

ValueError: ignored

In [None]:
test = preprocess(test)
test = imp.transform(test)
test = pd.DataFrame(test, columns = cols)

KeyError: ignored

In [None]:
y_pred = clf.predict_proba(test)[:,1]
sample['cancelled'] = y_pred
sample.to_csv("Submit.csv",index = False)

In [None]:
y_pred

array([0.28517609, 0.32954324, 0.24521657, ..., 0.3585884 , 0.3741491 ,
       0.51861232])