In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import StratifiedKFold,train_test_split, KFold
from sklearn.metrics import log_loss
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.decomposition import PCA

In [2]:
%matplotlib inline

In [187]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sub = pd.read_csv('sample_submission.csv')

In [188]:
split = train.shape[0]
df = pd.concat([train, test], axis=0)
df.shape

(306925, 10)

In [189]:
df.head()

Unnamed: 0,ID,subscriber_id,date_time,agent_id,entry_channel_id,stage_status,subscriber_age,gender,city,label
0,ID_PV1E3QPT,ecc55e48-fc41-4d07-af5b-4538f1644337,2022-07-18T14:27:33.109176+01:00,e06b3324-a7a7-480d-bd1e-3b13ddd432a5,4,,40.0,male,Ibadan,Issued Guarantor Form
1,ID_JV7ZCDMD,dd1fd326-030a-4538-b18d-4152febdd8f6,2022-08-05T11:19:03.876308+01:00,55b84aa7-3545-4e03-a9b4-b0ae08c15042,4,,36.0,male,Abeokuta,Issued Guarantor Form
2,ID_DN225KNK,8cfc6368-85fc-486e-b420-321e08fb781e,2021-08-13T11:56:09.789814+01:00,49c437b1-2abc-45b0-91cb-c5334172faf1,4,,30.0,male,Ibadan,Top of the funnel
3,ID_10O55XLL,2c0bbcfa-f19e-4f99-b812-10c07a84f119,2022-05-07T19:00:36.916704+01:00,07887ae3-2c35-49f0-95ec-dbf22719794f,4,,45.0,male,Ibadan,Top of the funnel
4,ID_RY926X3E,c7cab337-cdbd-41e7-9b65-108982e59c0c,2021-01-21T15:33:41.00366+01:00,49c437b1-2abc-45b0-91cb-c5334172faf1,4,,30.0,male,Ibadan,Top of the funnel


In [190]:
test.head()

Unnamed: 0,ID,subscriber_id,date_time,agent_id,entry_channel_id,stage_status,subscriber_age,gender,city
0,ID_04I3F51N,40a8de13-ed2a-46c9-805b-3603ee5108bc,2021-09-23T11:42:19.449752+01:00,58ffddff-7d92-4668-b3c8-3f6b9e523ee0,4,,35.0,male,Ibadan
1,ID_RL7OZ03G,be6b5a6f-233d-4a3c-85c9-0d14baa8ceef,2021-06-15T14:33:14.917173+01:00,5939bebb-79e4-442c-a556-2d66219bfaac,4,,35.0,male,Akure
2,ID_880UB1KT,90eda8c4-6463-4f66-a458-7850dab3acf1,2022-01-18T12:13:03.105838+01:00,0676b8d9-cb2e-499f-a113-a4ea7cab5290,4,,36.0,male,Ibadan
3,ID_P46W8LVS,05cb42d3-66fa-4aaa-acd0-c0ce2f6775d3,2021-08-10T16:32:58.144678+01:00,a248d29e-c5ea-4cf9-ad3e-f200eac89358,4,,40.0,male,Akure
4,ID_JHON927V,c64aea9b-b0f0-4f91-a2fe-720a6d2e4ec4,2022-05-18T09:58:43.659402+01:00,257532f9-13a5-4304-92e4-3007a734a5e6,4,,30.0,male,Oshogbo


In [191]:
df['label'].unique()

array(['Issued Guarantor Form', 'Top of the funnel', 'Test Scheduled',
       'Received Guarantor Form', 'Awaiting Activation',
       'Awaiting Vehicle Pickup', 'Onboarding', 'Activated', 'Tested',
       'Checked In For Test', 'In Verification', 'Backlog', nan],
      dtype=object)

In [192]:
df['label'].nunique()

12

In [193]:
df.isna().sum()

ID                       0
subscriber_id            0
date_time                0
agent_id              1665
entry_channel_id         0
stage_status        260732
subscriber_age        2499
gender                 567
city                   295
label                92078
dtype: int64

In [194]:
df.shape

(306925, 10)

In [195]:
df.subscriber_id.nunique()

88646

In [196]:
df.agent_id.nunique()

425

In [197]:
df.agent_id.value_counts()[:5]

49c437b1-2abc-45b0-91cb-c5334172faf1    22311
ae31f6f4-2c62-4079-a050-cd88263887d4    15366
2e28d108-8f22-4e8b-ab1e-eb50e144eca9     8755
f8f5f298-e4a1-472f-a7e8-a44c5435f973     8409
58ffddff-7d92-4668-b3c8-3f6b9e523ee0     8142
Name: agent_id, dtype: int64

In [198]:
df.ID.nunique()

306925

In [199]:
df.date_time = pd.to_datetime(df['date_time'])
df['year'] = df.date_time.dt.year
df['month'] = df.date_time.dt.month
df['day'] = df.date_time.dt.day

In [200]:
df.drop(['ID', 'date_time'], axis=1, inplace=True)

In [201]:
df.gender.value_counts()

male           303663
female           2312
undisclosed       363
Male               12
Female              8
Name: gender, dtype: int64

In [202]:
df.gender = df.gender.replace({'Male':'male', 'Female':'female'})

In [203]:
df.gender.mode()[0]

'male'

In [204]:
df.gender.value_counts()

male           303675
female           2320
undisclosed       363
Name: gender, dtype: int64

In [205]:
df.stage_status.unique()

array([nan, 'not_interested', 'schedule', 'in_verification', 'pass',
       'Received Guarantor Form', 'submitted', 'not_reachable', 'no_show',
       'received_guarantor_form', 'fail', 'awaiting_vehicle_pickup',
       'checked_in_for_test', 'Awaiting Vehicle Pickup', 'not_submitted',
       'retrain', 'issued_guarantor_form'], dtype=object)

In [206]:
df.stage_status.replace({'in_verification':'In Verification', 'received_guarantor_form':'Received Guarantor Form',
                         'checked_in_for_test':'Checked In For Test', 'issued_guarantor_form':'Issued Guarantor Form',
                        'awaiting_vehicle_pickup':'Awaiting Vehicle Pickup'}, inplace=True)
df.stage_status.unique()

array([nan, 'not_interested', 'schedule', 'In Verification', 'pass',
       'Received Guarantor Form', 'submitted', 'not_reachable', 'no_show',
       'fail', 'Awaiting Vehicle Pickup', 'Checked In For Test',
       'not_submitted', 'retrain', 'Issued Guarantor Form'], dtype=object)

In [207]:
df.stage_status.value_counts()

pass                       12108
schedule                    9420
Received Guarantor Form     7265
not_interested              5478
submitted                   3250
In Verification             2326
not_reachable               2033
fail                        1916
no_show                     1447
Awaiting Vehicle Pickup      625
not_submitted                303
retrain                       13
Checked In For Test            5
Issued Guarantor Form          4
Name: stage_status, dtype: int64

In [208]:
#data leak
df[df.stage_status == df.label].stage_status.unique()

array(['Received Guarantor Form', 'Awaiting Vehicle Pickup',
       'Checked In For Test', 'Issued Guarantor Form'], dtype=object)

In [209]:
#data leak
df[df.stage_status == df.label].label.unique()

array(['Received Guarantor Form', 'Awaiting Vehicle Pickup',
       'Checked In For Test', 'Issued Guarantor Form'], dtype=object)

In [210]:
df[df.stage_status.isna()]

Unnamed: 0,subscriber_id,agent_id,entry_channel_id,stage_status,subscriber_age,gender,city,label,year,month,day
0,ecc55e48-fc41-4d07-af5b-4538f1644337,e06b3324-a7a7-480d-bd1e-3b13ddd432a5,4,,40.0,male,Ibadan,Issued Guarantor Form,2022,7,18
1,dd1fd326-030a-4538-b18d-4152febdd8f6,55b84aa7-3545-4e03-a9b4-b0ae08c15042,4,,36.0,male,Abeokuta,Issued Guarantor Form,2022,8,5
2,8cfc6368-85fc-486e-b420-321e08fb781e,49c437b1-2abc-45b0-91cb-c5334172faf1,4,,30.0,male,Ibadan,Top of the funnel,2021,8,13
3,2c0bbcfa-f19e-4f99-b812-10c07a84f119,07887ae3-2c35-49f0-95ec-dbf22719794f,4,,45.0,male,Ibadan,Top of the funnel,2022,5,7
4,c7cab337-cdbd-41e7-9b65-108982e59c0c,49c437b1-2abc-45b0-91cb-c5334172faf1,4,,30.0,male,Ibadan,Top of the funnel,2021,1,21
...,...,...,...,...,...,...,...,...,...,...,...
92072,cca0a3bb-f749-414f-b8c6-7219385fcd4b,d52b9e23-5308-4c85-9626-6f1b328caa72,4,,30.0,male,Ibadan,,2022,7,29
92073,f64718a1-64bf-4245-8c5d-29d494e6942f,8cfa8917-7703-442e-b44c-abd13d9df0ab,4,,36.0,male,Oshogbo,,2022,7,27
92074,4b12b1b3-f84b-442a-a70c-731ab0a47168,9b1fa68e-5584-4950-b6c4-a35a1ab88ff4,4,,25.0,male,Oshogbo,,2022,8,17
92076,2f05d8c8-4777-4342-9d1a-7fa8f418f85e,9b6b6411-25ad-487b-9e0b-d8d357d7f27e,4,,31.0,male,Ibadan,,2022,8,24


In [211]:
#mass missing
df.stage_status.isna().sum()/len(df)

0.8494974342266026

In [212]:
df.city.unique()

array(['Ibadan', 'Abeokuta', 'Oshogbo', 'Akure', 'Ado Ekiti', 'Lagos',
       'Gbamugbamu', 'Accra', nan, 'Kano', 'Bonny Island', 'GbamuGbamu',
       'ibadan'], dtype=object)

In [213]:
df.city.value_counts()

Ibadan          158208
Oshogbo          63354
Akure            56731
Abeokuta         15058
Ado Ekiti         6778
Lagos             6097
Accra              268
Gbamugbamu         106
GbamuGbamu          19
Bonny Island         8
Kano                 2
ibadan               1
Name: city, dtype: int64

In [214]:
df.city = df.city.replace({'ibadan':'Ibadan', 'GbamuGbamu':'Gbamugbamu'})

In [215]:
df.city.value_counts()

Ibadan          158209
Oshogbo          63354
Akure            56731
Abeokuta         15058
Ado Ekiti         6778
Lagos             6097
Accra              268
Gbamugbamu         125
Bonny Island         8
Kano                 2
Name: city, dtype: int64

In [216]:
df[df.stage_status.isna()].label.value_counts()

Top of the funnel          51740
Test Scheduled             45768
Checked In For Test        20369
Issued Guarantor Form      13270
Awaiting Vehicle Pickup    12343
Tested                     10462
Activated                   8284
Received Guarantor Form     6695
In Verification             5387
Onboarding                  5287
Awaiting Activation         3129
Name: label, dtype: int64

In [217]:
df.subscriber_age.value_counts()[:10]

35.0    52716
30.0    49349
32.0    26651
40.0    18527
36.0    14073
        ...  
90.0        9
0.0         9
84.0        5
76.0        5
10.0        5
Name: subscriber_age, Length: 66, dtype: int64

In [218]:
df.subscriber_age.unique()

array([ 40.,  36.,  30.,  45.,  28.,  32.,  35.,  34.,  39.,  27., 100.,
        22.,  48.,  38.,  43.,  42.,  21.,  47.,  20.,  29.,  50.,  37.,
        33.,  46.,  25.,  44.,  26.,  51.,  58.,  24.,  31.,  41.,  23.,
        52.,  53.,  55.,   2.,  62.,  60.,  nan,  65.,  18.,  54.,  57.,
        49.,  63.,  69.,   0.,  67.,  56.,  19.,  79.,  68.,  72.,   3.,
        59.,  66.,  64.,  61.,  90.,  84.,  70.,  71.,   4.,  75.,   6.,
         5.,  80.,  81.,  76.,  78.,  10.,  15.,  16., 271.,  88.,   9.,
        91.])

In [219]:
age_mean = df.subscriber_age.mean()
age_mean

34.72781891165669

In [220]:
#filling with max
cat_missing = ['gender', 'agent_id', 'city']
for i in cat_missing:
    max_ = df[i].mode()[0]
    df[i].fillna(max_, inplace=True)
    
df.stage_status.fillna('missing', inplace=True)

df.subscriber_age = df.subscriber_age.fillna(age_mean)

In [221]:
df.isna().sum()

subscriber_id           0
agent_id                0
entry_channel_id        0
stage_status            0
subscriber_age          0
gender                  0
city                    0
label               92078
year                    0
month                   0
day                     0
dtype: int64

In [222]:
df.head()

Unnamed: 0,subscriber_id,agent_id,entry_channel_id,stage_status,subscriber_age,gender,city,label,year,month,day
0,ecc55e48-fc41-4d07-af5b-4538f1644337,e06b3324-a7a7-480d-bd1e-3b13ddd432a5,4,missing,40.0,male,Ibadan,Issued Guarantor Form,2022,7,18
1,dd1fd326-030a-4538-b18d-4152febdd8f6,55b84aa7-3545-4e03-a9b4-b0ae08c15042,4,missing,36.0,male,Abeokuta,Issued Guarantor Form,2022,8,5
2,8cfc6368-85fc-486e-b420-321e08fb781e,49c437b1-2abc-45b0-91cb-c5334172faf1,4,missing,30.0,male,Ibadan,Top of the funnel,2021,8,13
3,2c0bbcfa-f19e-4f99-b812-10c07a84f119,07887ae3-2c35-49f0-95ec-dbf22719794f,4,missing,45.0,male,Ibadan,Top of the funnel,2022,5,7
4,c7cab337-cdbd-41e7-9b65-108982e59c0c,49c437b1-2abc-45b0-91cb-c5334172faf1,4,missing,30.0,male,Ibadan,Top of the funnel,2021,1,21


In [223]:
encode = ['subscriber_id', 'agent_id', 'gender', 'city']
le = LabelEncoder()

for i in encode:
    df[i] = df[i].apply(lambda x: str(x))
    le.fit(df[i])
    df[i] = le.transform(df[i])

In [224]:
df.head()

Unnamed: 0,subscriber_id,agent_id,entry_channel_id,stage_status,subscriber_age,gender,city,label,year,month,day
0,81824,372,4,missing,40.0,1,6,Issued Guarantor Form,2022,7,18
1,76356,136,4,missing,36.0,1,0,Issued Guarantor Form,2022,8,5
2,48773,125,4,missing,30.0,1,6,Top of the funnel,2021,8,13
3,15197,16,4,missing,45.0,1,6,Top of the funnel,2022,5,7
4,69092,125,4,missing,30.0,1,6,Top of the funnel,2021,1,21


In [225]:
df.agent_id.value_counts()

125    23976
290    15366
78      8755
417     8409
144     8142
       ...  
204        2
335        1
155        1
246        1
177        1
Name: agent_id, Length: 425, dtype: int64

In [226]:
sub.shape

(92078, 13)

In [227]:
df.label.nunique()

12

In [228]:
sub.columns

Index(['ID', 'Activated', 'Awaiting Activation', 'Awaiting Vehicle Pickup',
       'Backlog', 'Checked In For Test', 'In Verification',
       'Issued Guarantor Form', 'Onboarding', 'Received Guarantor Form',
       'Test Scheduled', 'Tested', 'Top of the funnel'],
      dtype='object')

In [229]:
df.label.unique()

array(['Issued Guarantor Form', 'Top of the funnel', 'Test Scheduled',
       'Received Guarantor Form', 'Awaiting Activation',
       'Awaiting Vehicle Pickup', 'Onboarding', 'Activated', 'Tested',
       'Checked In For Test', 'In Verification', 'Backlog', nan],
      dtype=object)

In [230]:
df.stage_status.unique()

array(['missing', 'not_interested', 'schedule', 'In Verification', 'pass',
       'Received Guarantor Form', 'submitted', 'not_reachable', 'no_show',
       'fail', 'Awaiting Vehicle Pickup', 'Checked In For Test',
       'not_submitted', 'retrain', 'Issued Guarantor Form'], dtype=object)

In [231]:
#manual encoding
stage_label = {'missing':0, 'not_interested':13, 'schedule':14, 'In Verification':6, 'pass':1,
       'Received Guarantor Form':9, 'submitted':15, 'not_reachable':16, 'no_show':17,
       'fail':18, 'Awaiting Vehicle Pickup':3, 'Checked In For Test':5,
       'not_submitted':-1, 'retrain':-2, 'Issued Guarantor Form':7}

df.stage_status = df.stage_status.map(stage_label)

In [232]:
# #backlog is only 1, so i chabged it to Awaiting activation

# #manual encoding
# to_label = {'Activated':1, 'Awaiting Activation':2, 'Awaiting Vehicle Pickup':3,
#        'Backlog':4, 'Checked In For Test':5, 'In Verification':6,
#        'Issued Guarantor Form':7, 'Onboarding':8, 'Received Guarantor Form':9,
#        'Test Scheduled':10, 'Tested':11, 'Top of the funnel':0}

# df.label = df.label.map(to_label)

In [233]:
df.label.value_counts()

Top of the funnel          61664
Test Scheduled             51211
Checked In For Test        21146
Issued Guarantor Form      15810
Received Guarantor Form    14223
Tested                     14091
Awaiting Vehicle Pickup    12786
Activated                   8284
Onboarding                  6930
In Verification             5571
Awaiting Activation         3130
Backlog                        1
Name: label, dtype: int64

In [234]:
df.head()

Unnamed: 0,subscriber_id,agent_id,entry_channel_id,stage_status,subscriber_age,gender,city,label,year,month,day
0,81824,372,4,0,40.0,1,6,Issued Guarantor Form,2022,7,18
1,76356,136,4,0,36.0,1,0,Issued Guarantor Form,2022,8,5
2,48773,125,4,0,30.0,1,6,Top of the funnel,2021,8,13
3,15197,16,4,0,45.0,1,6,Top of the funnel,2022,5,7
4,69092,125,4,0,30.0,1,6,Top of the funnel,2021,1,21


In [235]:
#train with this..then come back to using one hot encoding for stage_status and entry_channel

In [236]:
train = df[:split]
test = df[split:]
# test.drop('label', axis=1, inplace=True)
# data.reset_index(drop=True, inplace=True)

In [139]:
# train.label = train.label.astype(int)

In [237]:
test = test.drop('label', axis=1)

In [238]:
train.head()

Unnamed: 0,subscriber_id,agent_id,entry_channel_id,stage_status,subscriber_age,gender,city,label,year,month,day
0,81824,372,4,0,40.0,1,6,Issued Guarantor Form,2022,7,18
1,76356,136,4,0,36.0,1,0,Issued Guarantor Form,2022,8,5
2,48773,125,4,0,30.0,1,6,Top of the funnel,2021,8,13
3,15197,16,4,0,45.0,1,6,Top of the funnel,2022,5,7
4,69092,125,4,0,30.0,1,6,Top of the funnel,2021,1,21


In [239]:
test.head()

Unnamed: 0,subscriber_id,agent_id,entry_channel_id,stage_status,subscriber_age,gender,city,year,month,day
0,22317,144,4,0,35.0,1,6,2021,9,23
1,65812,147,4,0,35.0,1,3,2021,6,15
2,50117,14,4,0,36.0,1,6,2022,1,18
3,2013,267,4,0,40.0,1,3,2021,8,10
4,68570,59,4,0,30.0,1,9,2022,5,18


In [240]:
from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split

In [241]:
X = train.drop('label', axis=1)
y = pd.get_dummies(train.label).values

In [248]:
# X, Y = make_multilabel_classification(n_samples=500, n_features=20, n_classes=5, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3 )
train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)

In [249]:
label_name = pd.get_dummies(train.label).columns

In [250]:
X_train.values

array([[5.2494e+04, 2.6100e+02, 4.0000e+00, ..., 2.0210e+03, 7.0000e+00,
        2.9000e+01],
       [4.2159e+04, 2.2400e+02, 4.0000e+00, ..., 2.0210e+03, 8.0000e+00,
        9.0000e+00],
       [1.6663e+04, 2.7900e+02, 4.0000e+00, ..., 2.0220e+03, 7.0000e+00,
        2.1000e+01],
       ...,
       [6.3621e+04, 5.1000e+01, 2.0000e+00, ..., 2.0220e+03, 5.0000e+00,
        1.1000e+01],
       [4.3754e+04, 1.9100e+02, 4.0000e+00, ..., 2.0220e+03, 7.0000e+00,
        2.8000e+01],
       [9.5260e+03, 1.2500e+02, 4.0000e+00, ..., 2.0220e+03, 2.0000e+00,
        1.0000e+00]])

In [251]:
len(y_train)

150392

In [252]:
len(X_train)

150392

In [253]:
clf = CatBoostClassifier(
    loss_function='MultiCrossEntropy',
# #     eval_metric='HammingLoss',
#     iterations=500,
#     class_names=label_name
)
clf.fit(X_train, y_train,verbose=50)

Learning rate set to 0.087621
0:	learn: 0.5680944	total: 2.23s	remaining: 37m 6s
50:	learn: 0.2069423	total: 1m 49s	remaining: 33m 59s
100:	learn: 0.2039258	total: 3m 31s	remaining: 31m 24s
150:	learn: 0.2020044	total: 4m 28s	remaining: 25m 11s
200:	learn: 0.2005362	total: 5m 27s	remaining: 21m 42s
250:	learn: 0.1993389	total: 6m 23s	remaining: 19m 2s
300:	learn: 0.1983459	total: 7m 19s	remaining: 16m 59s
350:	learn: 0.1975255	total: 8m 21s	remaining: 15m 27s
400:	learn: 0.1967545	total: 9m 21s	remaining: 13m 58s
450:	learn: 0.1960992	total: 10m 23s	remaining: 12m 39s
500:	learn: 0.1955103	total: 11m 28s	remaining: 11m 26s
550:	learn: 0.1949356	total: 12m 39s	remaining: 10m 19s
600:	learn: 0.1943529	total: 13m 45s	remaining: 9m 8s
650:	learn: 0.1938477	total: 15m 1s	remaining: 8m 3s
700:	learn: 0.1933507	total: 16m 8s	remaining: 6m 53s
750:	learn: 0.1928823	total: 17m 19s	remaining: 5m 44s
800:	learn: 0.1924688	total: 18m 25s	remaining: 4m 34s
850:	learn: 0.1920166	total: 19m 29s	remai

<catboost.core.CatBoostClassifier at 0x19b9bfd0ac0>

In [None]:
clf = CatBoostClassifier(n_estimators=5000,
    loss_function='MultiCrossEntropy',
# #     eval_metric='HammingLoss',
#     iterations=500,
#     class_names=label_name
)
clf.fit(X_train, y_train,verbose=100)

Learning rate set to 0.020029
0:	learn: 0.6615785	total: 1.68s	remaining: 2h 20m 7s
100:	learn: 0.2149026	total: 2m	remaining: 1h 37m 16s
200:	learn: 0.2075755	total: 3m 52s	remaining: 1h 32m 22s
300:	learn: 0.2055174	total: 5m 48s	remaining: 1h 30m 44s
400:	learn: 0.2044005	total: 7m 42s	remaining: 1h 28m 25s
500:	learn: 0.2034322	total: 9m 45s	remaining: 1h 27m 35s
600:	learn: 0.2026194	total: 11m 57s	remaining: 1h 27m 30s
700:	learn: 0.2018225	total: 13m 58s	remaining: 1h 25m 41s
800:	learn: 0.2011121	total: 15m 49s	remaining: 1h 22m 56s
900:	learn: 0.2005182	total: 17m 31s	remaining: 1h 19m 44s
1000:	learn: 0.1999721	total: 19m 12s	remaining: 1h 16m 46s
1100:	learn: 0.1994744	total: 20m 56s	remaining: 1h 14m 9s
1200:	learn: 0.1990054	total: 22m 37s	remaining: 1h 11m 34s
1300:	learn: 0.1985768	total: 24m 17s	remaining: 1h 9m 4s
1400:	learn: 0.1981639	total: 26m 2s	remaining: 1h 6m 53s
1500:	learn: 0.1977612	total: 29m 2s	remaining: 1h 7m 41s
1600:	learn: 0.1973912	total: 29m 41s	rem

In [151]:
clf = CatBoostClassifier(
    loss_function='MultiCrossEntropy',
# #     eval_metric='HammingLoss',
#     iterations=500,
#     class_names=label_name
)
clf.fit(X_train, y_train,verbose=50)

Learning rate set to 0.087621
0:	learn: 0.5727963	total: 463ms	remaining: 7m 42s
50:	learn: 0.2137288	total: 16.6s	remaining: 5m 8s
100:	learn: 0.2112876	total: 32.6s	remaining: 4m 50s
150:	learn: 0.2100251	total: 49s	remaining: 4m 35s
200:	learn: 0.2087603	total: 1m 5s	remaining: 4m 20s
250:	learn: 0.2078271	total: 1m 22s	remaining: 4m 5s
300:	learn: 0.2070120	total: 1m 40s	remaining: 3m 52s
350:	learn: 0.2063875	total: 1m 57s	remaining: 3m 36s
400:	learn: 0.2058008	total: 2m 15s	remaining: 3m 22s
450:	learn: 0.2052640	total: 2m 32s	remaining: 3m 5s
500:	learn: 0.2047324	total: 2m 50s	remaining: 2m 49s
550:	learn: 0.2043142	total: 3m 7s	remaining: 2m 32s
600:	learn: 0.2038777	total: 3m 25s	remaining: 2m 16s
650:	learn: 0.2035021	total: 3m 43s	remaining: 1m 59s
700:	learn: 0.2031436	total: 4m 1s	remaining: 1m 42s
750:	learn: 0.2028270	total: 4m 18s	remaining: 1m 25s
800:	learn: 0.2024567	total: 4m 35s	remaining: 1m 8s
850:	learn: 0.2021275	total: 4m 53s	remaining: 51.4s
900:	learn: 0.2

<catboost.core.CatBoostClassifier at 0x19bf8967490>

In [254]:
pred = clf.predict_proba(test)

In [255]:
result = pd.DataFrame(pred, columns=col)

In [256]:
result['ID'] =sub['ID']

In [184]:
new['ID'] = sub['ID']
new[sub.columns].to_csv('combo.csv', index=False)

In [186]:
new[sub.columns]

Unnamed: 0,ID,Activated,Awaiting Activation,Awaiting Vehicle Pickup,Backlog,Checked In For Test,In Verification,Issued Guarantor Form,Onboarding,Received Guarantor Form,Test Scheduled,Tested,Top of the funnel
0,ID_04I3F51N,0.059357,0.026378,0.050706,1.357436e-09,0.088505,0.032235,0.060330,0.032920,0.027785,0.272062,0.048753,0.305702
1,ID_RL7OZ03G,0.027989,0.013879,0.043148,3.257203e-09,0.082839,0.023991,0.052692,0.018092,0.019230,0.324867,0.051185,0.306341
2,ID_880UB1KT,0.041082,0.029112,0.064271,6.678092e-09,0.073653,0.032132,0.068938,0.038691,0.029065,0.213664,0.046297,0.357471
3,ID_P46W8LVS,0.026502,0.016749,0.033315,2.371457e-08,0.084132,0.022268,0.046003,0.022181,0.028705,0.343143,0.054567,0.329807
4,ID_JHON927V,0.038863,0.013502,0.084342,1.407462e-08,0.102663,0.045529,0.096573,0.044379,0.079067,0.207912,0.071271,0.215594
...,...,...,...,...,...,...,...,...,...,...,...,...,...
92073,ID_Y52L9ECX,0.028626,0.004730,0.087397,1.611517e-08,0.110005,0.024805,0.087657,0.038692,0.051250,0.240596,0.059427,0.259541
92074,ID_HW1J30MC,0.015399,0.000539,0.019968,4.501507e-08,0.134751,0.011509,0.097741,0.009777,0.036990,0.341489,0.042501,0.313415
92075,ID_JUCRN3HX,0.000004,0.000004,0.001333,2.713104e-08,0.000063,0.000037,0.218927,0.000707,0.232374,0.482405,0.000041,0.000086
92076,ID_PFSCZIBQ,0.029162,0.004502,0.037579,4.330325e-09,0.087983,0.007043,0.046940,0.010491,0.010060,0.362720,0.031731,0.367731


In [257]:
result[sub.columns].to_csv('FINAL.csv', index=False)

In [176]:
result['Backlog'] = 0

In [177]:
result

Unnamed: 0,Activated,Awaiting Activation,Awaiting Vehicle Pickup,Backlog,Checked In For Test,In Verification,Issued Guarantor Form,Onboarding,Received Guarantor Form,Test Scheduled,Tested,Top of the funnel,ID
0,0.062090,0.026677,0.047288,0,0.088893,0.032649,0.058486,0.032878,0.028218,0.271526,0.047354,0.309980,ID_04I3F51N
1,0.026598,0.014710,0.054156,0,0.079436,0.023902,0.054318,0.020115,0.023619,0.315244,0.054228,0.290058,ID_RL7OZ03G
2,0.042379,0.024861,0.065498,0,0.074116,0.029886,0.067302,0.036055,0.030772,0.206408,0.041691,0.374411,ID_880UB1KT
3,0.025653,0.018638,0.030075,0,0.081841,0.025892,0.044783,0.022280,0.023589,0.358368,0.056577,0.321158,ID_P46W8LVS
4,0.037658,0.010513,0.085176,0,0.103654,0.047627,0.091626,0.044106,0.073956,0.221749,0.070741,0.211071,ID_JHON927V
...,...,...,...,...,...,...,...,...,...,...,...,...,...
92073,0.024618,0.004017,0.082638,0,0.122863,0.021154,0.087143,0.033727,0.047197,0.237094,0.063672,0.267238,ID_Y52L9ECX
92074,0.017434,0.000586,0.021529,0,0.137866,0.011416,0.103144,0.010001,0.036364,0.339592,0.038475,0.314444,ID_HW1J30MC
92075,0.000002,0.000002,0.000559,0,0.000050,0.000035,0.224008,0.000401,0.232389,0.492535,0.000033,0.000069,ID_JUCRN3HX
92076,0.025268,0.005012,0.038924,0,0.088922,0.006540,0.046821,0.011695,0.008528,0.365423,0.031537,0.367343,ID_PFSCZIBQ


In [158]:
col = ['Activated', 'Awaiting Activation', 'Awaiting Vehicle Pickup',
       'Backlog', 'Checked In For Test', 'In Verification',
       'Issued Guarantor Form', 'Onboarding', 'Received Guarantor Form',
        'Test Scheduled', 'Tested', 'Top of the funnel']

In [183]:
new = (pd.read_csv('please_abeg.csv')[col] + pd.read_csv('please_full.csv')[col])/2

In [156]:
sub.columns

Index(['ID', 'Activated', 'Awaiting Activation', 'Awaiting Vehicle Pickup',
       'Backlog', 'Checked In For Test', 'In Verification',
       'Issued Guarantor Form', 'Onboarding', 'Received Guarantor Form',
       'Test Scheduled', 'Tested', 'Top of the funnel'],
      dtype='object')

In [155]:
sub

Unnamed: 0,ID,Activated,Awaiting Activation,Awaiting Vehicle Pickup,Backlog,Checked In For Test,In Verification,Issued Guarantor Form,Onboarding,Received Guarantor Form,Test Scheduled,Tested,Top of the funnel
0,ID_04I3F51N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ID_RL7OZ03G,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ID_880UB1KT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ID_P46W8LVS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ID_JHON927V,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
92073,ID_Y52L9ECX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
92074,ID_HW1J30MC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
92075,ID_JUCRN3HX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
92076,ID_PFSCZIBQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [171]:
y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]], dtype=uint8)