In [1]:
## We want to use catboost-classifier for this session 

import pandas as pd 
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt 
%matplotlib inline 

import warnings 
warnings.filterwarnings('ignore')
sns.set_theme(context='notebook', style='darkgrid', palette='deep', font='sans-serif', font_scale=1, color_codes=True, rc=None)

from sklearn.preprocessing import LabelBinarizer

from sklearn import preprocessing 

from catboost import CatBoostClassifier,Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import auc, classification_report, roc_auc_score
from sklearn.metrics import accuracy_score


In [2]:
#lets load both our train and test datasets .
# additionally , lets also load our variable definition file

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
vardef = pd.read_csv('VariableDefinitions.csv')

# lets create a copy for both the train and test dataset 
trainCopy = train.copy()
testCopy = test.copy()


# lets have a look at our variable definition 
vardef

Unnamed: 0,Variable Definitions,Unnamed: 1
0,country,Country interviewee is in.
1,year,Year survey was done in.
2,uniqueid,Unique identifier for each interviewee
3,location_type,"Type of location: Rural, Urban"
4,cellphone_access,"If interviewee has access to a cellphone: Yes, No"
5,household_size,Number of people living in one house
6,age_of_respondent,The age of the interviewee
7,gender_of_respondent,"Gender of interviewee: Male, Female"
8,relationship_with_head,The interviewee’s relationship with the head o...
9,marital_status,The martial status of the interviewee: Married...


In [3]:
train['fam_bin']= pd.qcut(train.household_size,3,labels=['Small','Medium','Large'])

test['fam_bin']= pd.qcut(test.household_size,3,labels=['Small','Medium','Large'])


# let us try binning the age of respondent to find out the distribution for each country 
train['age_bins'] =pd.qcut(train.age_of_respondent, 4, labels=['Young','Youth','Adult','Old-ish'])


test['age_bins'] =pd.qcut(test.age_of_respondent, 4, labels=['Young','Youth','Adult','Old-ish'])



#change the data types 
#train['year']=train['year'].astype('category')
train['household_size'] = train['household_size'].astype('int32')
train['age_of_respondent'] = train['age_of_respondent'].astype('int32')

test['household_size'] = test['household_size'].astype('int32')
test['age_of_respondent'] = test['age_of_respondent'].astype('int32')


In [4]:
train['marital_status']=train['marital_status'].replace(to_replace ="Dont know", value ="Married/Living together")

test['marital_status']=test['marital_status'].replace(to_replace ="Dont know", value ="Married/Living together")


train['education_level'] = train['education_level'].replace(to_replace="Other/Dont know/RTA", value="Primary education")

test['education_level'] = test['education_level'].replace(to_replace="Other/Dont know/RTA", value="Primary education")

In [5]:
# label binarize the following columns 

from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
train['bank_account']=lb.fit_transform(train['bank_account'])

In [6]:
trainId = train['uniqueid']
testId = test['uniqueid']
testCountry = test['country']
y= train['bank_account']
X = train.drop(['uniqueid','bank_account'] ,axis =1)
Xtest = test.drop('uniqueid',axis =1)


In [7]:
train.shape

(23524, 15)

In [8]:
test.shape

(10086, 14)

In [9]:
# Declaring our categorical columns 
train.columns

Index(['country', 'year', 'uniqueid', 'bank_account', 'location_type',
       'cellphone_access', 'household_size', 'age_of_respondent',
       'gender_of_respondent', 'relationship_with_head', 'marital_status',
       'education_level', 'job_type', 'fam_bin', 'age_bins'],
      dtype='object')

In [10]:
cat_cols =['country','location_type',
                'cellphone_access','gender_of_respondent',
               'relationship_with_head','marital_status',
               'education_level','job_type','fam_bin','age_bins']

cat_features = X[cat_cols]
catXtest_features = Xtest[cat_cols]
           

In [11]:
# Looking on label balance on our target data
print('Labels:  {}  '.format(set(y)))

print('Zero count = {}  One count = {}'.format(len(y)-sum(y) ,sum(y)))


Labels:  {0, 1}  
Zero count = 20212  One count = 3312


In [12]:
even_rate = (len(y)-sum(y))/sum(y)
print('Even rate for this study is =  {} '.format(even_rate))

Even rate for this study is =  6.102657004830918 


In [13]:
# Let's use pool to split our data into train and validation data 
from catboost import Pool
pool = Pool(data=X,label=y,cat_features=cat_features)

In [14]:
from sklearn.model_selection import train_test_split 
data = train_test_split(X,y,test_size=0.2,random_state=21)
X_train,X_validation,y_train,y_validation = data 

# this is our training pool
train_pool = Pool(
data=X_train,
label =  y_train,
cat_features =  cat_features
)

# here is our validation pool
validation_pool = Pool(
data=X_validation,
label =  y_validation,
cat_features =  cat_features
)

In [15]:
## model_two using  CatBoostClassifier 
model = CatBoostClassifier( iterations=400,early_stopping_rounds=270,custom_loss=['AUC','Accuracy'])

model.fit(train_pool,eval_set=validation_pool,verbose=False,plot=True)

print('Model is fitted: {}'.format(model.is_fitted()))
print('Model params:\n {}'.format(model.get_params()))

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Model is fitted: True
Model params:
 {'iterations': 400, 'custom_loss': ['AUC', 'Accuracy'], 'early_stopping_rounds': 270}


In [16]:
model = CatBoostClassifier(loss_function='Logloss',
                                               cat_features = cat_features,
                                               iterations = 400,
                                               l2_leaf_reg = 1,
                                               depth = 5
                                               
                                              )

model.fit( train_pool,
    eval_set = validation_pool,
    verbose=False,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x225522c5730>

In [21]:
model.get_feature_importance (prettified=True)

Unnamed: 0,Feature Id,Importances
0,cellphone_access,26.58855
1,education_level,18.947977
2,age_of_respondent,11.049296
3,country,10.979236
4,job_type,9.78751
5,location_type,5.714599
6,relationship_with_head,4.611954
7,year,3.481437
8,marital_status,2.996295
9,gender_of_respondent,2.811182


print(X.shape)
print(Xtest.shape)

In [18]:
test_pool = Pool(data=Xtest, cat_features=catXtest_features)
contest_predictions = model.predict_proba(test_pool)
print('Predictoins:')
print(contest_predictions)

contpreds = model.predict(test_pool)
print(contpreds)

Predictoins:
[[0.15332391 0.84667609]
 [0.16094122 0.83905878]
 [0.98036358 0.01963642]
 ...
 [0.7108712  0.2891288 ]
 [0.96312491 0.03687509]
 [0.92008545 0.07991455]]
[1 1 0 ... 0 0 0]


In [19]:
submission = pd.DataFrame(data={'uniqueid':testId+' x '+ testCountry,'bank_account':contpreds})

submission.to_csv('Submission.csv',index=False)