In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt 
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline 

import catboost
from sklearn import preprocessing 


from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import auc, classification_report, roc_auc_score
from sklearn.metrics import accuracy_score

### 1) Data Preparation

In [2]:
# Load the train and test datasets.
# Additionally, load the variable-definition file that describes the columns.

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
vardef = pd.read_csv('VariableDefinitions.csv')

# Keep a copy of both the train and test frames as loaded.
trainCopy = train.copy()
testCopy = test.copy()


# Display the variable definitions.
vardef

Unnamed: 0,Variable Definitions,Unnamed: 1
0,country,Country interviewee is in.
1,year,Year survey was done in.
2,uniqueid,Unique identifier for each interviewee
3,location_type,"Type of location: Rural, Urban"
4,cellphone_access,"If interviewee has access to a cellphone: Yes, No"
5,household_size,Number of people living in one house
6,age_of_respondent,The age of the interviewee
7,gender_of_respondent,"Gender of interviewee: Male, Female"
8,relationship_with_head,The interviewee’s relationship with the head o...
9,marital_status,The martial status of the interviewee: Married...


In [3]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,country,year,uniqueid,bank_account,location_type,cellphone_access,household_size,age_of_respondent,gender_of_respondent,relationship_with_head,marital_status,education_level,job_type
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3,24,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5,70,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5,26,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5,34,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8,26,Male,Child,Single/Never Married,Primary education,Informally employed


In [4]:
## Feature preparation
# Count nulls per column, then show only the columns with a non-zero count.
# An empty result means the training frame has no missing values.
null_value_stats = train.isnull().sum(axis=0)
null_value_stats[null_value_stats != 0]

Series([], dtype: int64)

In [5]:
from sklearn.preprocessing import LabelBinarizer

# Binarize the target and write it back into `train` (the column is replaced in place).
# NOTE(review): presumably 'No' -> 0 and 'Yes' -> 1; confirm with lb.classes_.
lb = LabelBinarizer()
train['bank_account']=lb.fit_transform(train['bank_account'])

In [6]:
# The dataset has no missing values, so no imputation is done here.
# Separate the feature columns (X) from the label column (y).
# NOTE(review): `uniqueid` stays in X and is therefore fed to the model as a
# feature — confirm this is intended for a per-row identifier.
X = train.drop('bank_account',axis=1)
y= train['bank_account']

In [7]:
# Show every feature's dtype, then record the positions of the object-typed
# columns; these positions are passed to CatBoost as categorical features.
print(X.dtypes)

categorical_features_indices = np.flatnonzero(X.dtypes == 'object')

country                   object
year                       int64
uniqueid                  object
location_type             object
cellphone_access          object
household_size             int64
age_of_respondent          int64
gender_of_respondent      object
relationship_with_head    object
marital_status            object
education_level           object
job_type                  object
dtype: object


In [8]:
# Column positions (within X) that are treated as categorical.
print(categorical_features_indices)

[ 0  2  3  4  7  8  9 10 11]


In [9]:
# Data splitting 
from sklearn.model_selection import train_test_split

# Split the labelled data 75% train / remainder validation; the fixed
# random_state makes the split repeatable.
# NOTE(review): the split is not stratified on y — confirm that is acceptable
# for this target's class balance.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=42,
)

# The competition test frame is used as-is (same object as `test`, not a copy).
X_test = test

In [10]:
from catboost import CatBoostClassifier, Pool, metrics, cv

### 2) Model Training


In [11]:
# Basic model: fixed seed, silent logging, and three additional metrics
# (Logloss, AUC, Accuracy) requested through custom_loss.
model = CatBoostClassifier(
    custom_loss=['Logloss','AUC','Accuracy'],
    random_seed=42,
    logging_level='Silent'
)

In [12]:
# Fit on the training split. The categorical column positions are passed
# explicitly and the validation split is supplied as eval_set.
# plot=True requests the interactive metric chart; the trailing ';' hides
# the returned object's repr.
model.fit(
    X_train, y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_validation, y_validation),
#     logging_level='Verbose',  # you can uncomment this for text output
    plot=True
);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [13]:
# Take the basic model's parameters as the starting point for cross-validation.
cv_params = model.get_params()
print(cv_params)

{'random_seed': 42, 'logging_level': 'Silent', 'custom_loss': ['Logloss', 'AUC', 'Accuracy']}


In [14]:
## Cross-validation

# Reuse the basic model's parameters and set the loss function explicitly.
cv_params['loss_function'] = metrics.Logloss()

# 5-fold cross-validation over all labelled rows, 1200 boosting iterations,
# with the interactive metric chart enabled.
cv_data = cv(
    Pool(X, y, cat_features=categorical_features_indices),
    cv_params,
    nfold=5,
    iterations=1200,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [15]:
# Preview the per-iteration cross-validation metrics (mean and std per metric).
cv_data.head()

Unnamed: 0,iterations,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std,test-AUC-mean,test-AUC-std,test-Accuracy-mean,test-Accuracy-std,train-Accuracy-mean,train-Accuracy-std
0,0,0.665216,0.000629,0.665136,0.000704,0.784448,0.010374,0.882418,0.002113,0.882418,0.002397
1,1,0.639552,0.000316,0.639399,0.000428,0.804137,0.003705,0.884288,0.002042,0.883598,0.000917
2,2,0.615041,0.001589,0.614958,0.001175,0.816444,0.012738,0.883481,0.00177,0.884214,0.001043
3,3,0.592405,0.001937,0.592343,0.001238,0.820827,0.011301,0.884713,0.0022,0.884788,0.00076
4,4,0.570675,0.002186,0.570516,0.001656,0.827013,0.008097,0.884798,0.002498,0.884926,0.000582


In [16]:
# Report the boosting step with the highest mean cross-validated AUC.
# The arg-max position is computed once and every lookup uses .iloc, so the
# mean, the std and the printed step always refer to the same row regardless
# of how cv_data happens to be indexed.
auc_means = cv_data['test-AUC-mean']
best_auc_step = int(np.argmax(auc_means))
print('Best validation AUC score is: {:.2f}±{:.2f} on step {}'.format(
    auc_means.iloc[best_auc_step],
    cv_data['test-AUC-std'].iloc[best_auc_step],
    best_auc_step
))

Best validation AUC score is: 0.87±0.01 on step 611


In [17]:
# Same best mean test AUC as above, printed at full precision.
print('Precise  AUC score: {}'.format(np.max(cv_data['test-AUC-mean'])))

Precise  AUC score: 0.8666699699386807


In [18]:
# Apply the model to the test frame.
# `predictions` holds the hard class labels; `predictions_probs` takes the
# second column of predict_proba (presumably the positive-class probability —
# confirm with model.classes_). Only the first ten of each are printed.
predictions = model.predict(X_test)
predictions_probs = model.predict_proba(X_test)[:,1]
print(predictions[:10])
print(predictions_probs[:10])

[1 1 0 0 0 0 0 1 0 0]
[0.87821491 0.88343476 0.02284834 0.01433218 0.02310715 0.12820832
 0.07891748 0.57254898 0.03700185 0.43037195]


In [19]:
# Columns used to build the submission identifier.
testId = test['uniqueid']
testCountry = test['country']

In [20]:
# Build the submission: the identifier is "<uniqueid> x <country>" and the
# bank_account column carries the predicted probabilities.
submission = pd.DataFrame(
    data={
        'uniqueid': testId + ' x ' + testCountry,
        'bank_account': predictions_probs,
    }
)

submission.to_csv('Submission.csv', index=False)

### 3) CatBoost Features

In [21]:
# Shared training parameters reused (via .copy()) by the cells in this section.
# Accuracy is the evaluation metric and use_best_model is off by default.
params = {
    'iterations': 600,
    'learning_rate': 0.15,
    'eval_metric': metrics.Accuracy(),
    'random_seed': 42,
    'logging_level': 'Silent',
    'use_best_model': False
}
# Pools for the train and validation splits, with the categorical column positions attached.
train_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)
validate_pool = Pool(X_validation, y_validation, cat_features=categorical_features_indices)

In [22]:
# Simple model: trained with the shared params (use_best_model is False there).
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)


# Same settings with use_best_model switched on.
best_model_params = {**params, 'use_best_model': True}
best_model = CatBoostClassifier(**best_model_params)
best_model.fit(train_pool, eval_set=validate_pool);


# Compare both models on the validation split.
print('Simple model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, model.predict(X_validation))
))
print('')

print('Best model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, best_model.predict(X_validation))
))

Simple model validation accuracy: 0.8896

Best model validation accuracy: 0.8924


In [28]:
# Hard class predictions from best_model on the test frame.
predictions = best_model.predict(X_test)

In [29]:
# Build the submission from the current `predictions` (class labels here).
# This writes to the same 'Submission.csv' path as the probability-based
# submission, so that file is replaced.
submission = pd.DataFrame(data={'uniqueid':testId+' x '+ testCountry,'bank_account':predictions})

submission.to_csv('Submission.csv',index=False)

In [25]:
## Early stopping

# Start from the shared params and enable CatBoost's overfitting detector.
# With od_type 'Iter' and od_wait 40, training is expected to stop once the
# eval metric has gone 40 iterations without improving (see the CatBoost
# overfitting-detector docs).
earlystop_params = params.copy()
earlystop_params.update({
    'od_type': 'Iter', # overfitting detector
    'od_wait': 40
})
earlystop_model = CatBoostClassifier(**earlystop_params)
earlystop_model.fit(train_pool, eval_set=validate_pool);

# The label names the early-stopped model, since that is the model scored here.
print('Early-stopped model validation accuracy: {:.4}'.format( accuracy_score(y_validation, earlystop_model.predict(X_validation))))

Best model validation accuracy: 0.8903


In [26]:
# Compare tree count and validation accuracy of the simple and early-stopped models.
# `model` is whatever the name currently refers to; when the notebook is run
# top to bottom that is the model fitted with the shared `params`.
print('Simple model tree count: {}'.format(model.tree_count_))
print('Simple model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, model.predict(X_validation))
))
print('')

print('Early-stopped model tree count: {}'.format(earlystop_model.tree_count_))
print('Early-stopped model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, earlystop_model.predict(X_validation))
))

Simple model tree count: 600
Simple model validation accuracy: 0.8896

Early-stopped model tree count: 133
Early-stopped model validation accuracy: 0.8903


In [27]:
# Class predictions from the early-stopped model on the test frame.
predictions = earlystop_model.predict(X_test)

# Identifier is "<uniqueid> x <country>"; the file path is the same
# 'Submission.csv' used by the other submission cells.
submission = pd.DataFrame(
    data={
        'uniqueid': testId + ' x ' + testCountry,
        'bank_account': predictions,
    }
)

submission.to_csv('Submission.csv', index=False)

In [28]:
# Using a baseline
# Shared params with the iteration count reduced to 10.
current_params = {**params, 'iterations': 10}

# Train a short model first.
model = CatBoostClassifier(**current_params)
model.fit(X_train, y_train, cat_features=categorical_features_indices)

# Get baseline (only with prediction_type='RawFormulaVal')
baseline = model.predict(X_train, prediction_type='RawFormulaVal')

# Fit the same model object again, passing its raw scores as the baseline.
model.fit(
    X_train,
    y_train,
    cat_features=categorical_features_indices,
    baseline=baseline,
);

In [29]:
# Snapshotting
# First run: 5 iterations at learning rate 0.5, verbose logging, snapshot saved.
params_with_snapshot = {
    **params,
    'iterations': 5,
    'learning_rate': 0.5,
    'logging_level': 'Verbose',
}
model = CatBoostClassifier(**params_with_snapshot)
model.fit(train_pool, eval_set=validate_pool, save_snapshot=True)

# Second run: raise the iteration budget to 10 and lower the learning rate,
# again with save_snapshot=True.
params_with_snapshot['iterations'] = 10
params_with_snapshot['learning_rate'] = 0.1
model = CatBoostClassifier(**params_with_snapshot).fit(train_pool, eval_set=validate_pool, save_snapshot=True)


bestTest = 0.8867539534
bestIteration = 9


bestTest = 0.8867539534
bestIteration = 9



In [30]:
# Feature importances
# Train a small 50-iteration model on the training pool and list its
# feature importances from highest to lowest.
model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent')
model.fit(train_pool)
feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train.columns
ranked_features = sorted(zip(feature_importances, feature_names), reverse=True)
for score, name in ranked_features:
    print('{}: {}'.format(name, score))

education_level: 19.983085023899235
cellphone_access: 15.824708793906652
job_type: 15.334053171882058
country: 13.520019103692233
age_of_respondent: 8.529908364632602
year: 7.8569435345738
location_type: 6.527291525776653
gender_of_respondent: 3.606801359063477
relationship_with_head: 3.5670744443284286
marital_status: 2.7185705700304457
household_size: 1.6815603874711262
uniqueid: 0.8499837207432747


In [31]:
# Eval metrics
# Trains a model with the same constructor arguments and training pool as the
# feature-importance cell, then evaluates AUC on the validation pool
# (plot=True requests the interactive chart).
model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, [metrics.AUC()], plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [32]:
# parameter tuning 
#!pip install hyperopt

In [33]:
import hyperopt


def hyperopt_objective(params):
    """Score one hyperparameter candidate for hyperopt.

    Cross-validates a CatBoost configuration on the full labelled data
    (notebook globals ``X``, ``y`` and ``categorical_features_indices``)
    and turns the best mean test accuracy into a loss.

    Parameters
    ----------
    params : mapping
        Must provide ``'l2_leaf_reg'`` (converted with ``int``) and
        ``'learning_rate'``.

    Returns
    -------
    float
        ``1 - max(test-Accuracy-mean)`` over the cross-validation iterations.
    """
    # The classifier is only used as a container for the parameter set;
    # cv() receives its parameters, not the fitted object.
    candidate = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=600,
        eval_metric=metrics.Accuracy(),
        random_seed=42,
        verbose=False,
        loss_function=metrics.Logloss(),
    )

    full_pool = Pool(X, y, cat_features=categorical_features_indices)
    cv_results = cv(full_pool, candidate.get_params())

    # hyperopt minimises, so the accuracy is converted into a loss.
    return 1 - np.max(cv_results['test-Accuracy-mean'])

In [34]:
from numpy.random import RandomState

# Search space: a quantised log-uniform l2_leaf_reg and a uniform learning rate
# between 1e-3 and 5e-1.
params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
}

# Trials object that records every evaluation of the search.
trials = hyperopt.Trials()

# Run 50 TPE evaluations of the objective with a seeded random state.
# This is a long-running cell (the recorded run took about two hours).
# NOTE(review): newer hyperopt releases reportedly expect a numpy Generator
# (np.random.default_rng) for rstate — confirm against the installed version.
best = hyperopt.fmin(
    hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=RandomState(123)
)

print(best)

100%|████████████████████████████████████████████| 50/50 [2:00:11<00:00, 144.23s/trial, best loss: 0.11061045345006981]
{'l2_leaf_reg': 7.0, 'learning_rate': 0.1822559342882682}


In [50]:
# Rebuild the model with the tuned l2_leaf_reg and learning_rate and
# cross-validate it on the full labelled data.
# NOTE(review): iterations is 500 here but 600 inside hyperopt_objective —
# confirm the difference is intentional.
model = CatBoostClassifier(
    l2_leaf_reg=int(best['l2_leaf_reg']),
    learning_rate=best['learning_rate'],
    iterations=500,
    eval_metric=metrics.Accuracy(),
    random_seed=42,
    verbose=False,
    loss_function=metrics.Logloss(),

)
cv_data = cv(Pool(X, y, cat_features=categorical_features_indices), model.get_params())

In [51]:
# Best mean cross-validated test accuracy of the tuned configuration, at full precision.
print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

Precise validation accuracy score: 0.8893895465499302


In [52]:
# Fit the tuned model on all labelled rows (no validation split is held out here).
model.fit(X, y, cat_features=categorical_features_indices)

<catboost.core.CatBoostClassifier at 0x246c39bbc70>

In [53]:
# Class predictions from the tuned model on the test frame.
predictions = model.predict(X_test)

# Identifier is "<uniqueid> x <country>"; this is the last write to
# 'Submission.csv' in the notebook.
submission = pd.DataFrame(
    data={
        'uniqueid': testId + ' x ' + testCountry,
        'bank_account': predictions,
    }
)

submission.to_csv('Submission.csv', index=False)