# Imports

In [1]:
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.ensemble import ExtraTreesClassifier #,  RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [None]:
#conda install -c conda-forge scikit-learn=1.1.0

In [None]:
#pip install scikit-learn==1.1.0

In [3]:
import sklearn
sklearn.__version__

'1.1.0'

In [None]:
#!pip install category_encoders
#!pip install catboost

In [4]:
from catboost import CatBoostClassifier

In [5]:
import category_encoders as ce

In [6]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test.shape, train.shape

((36349, 18), (73799, 19))

In [7]:
SEED = 42
TARGET = 'default'

In [8]:
train['test'] = 0
test['test'] = 1
train = train.astype({'test': 'int32'})
test = test.astype({'test': 'int32'})
data = train.copy()
data = data.append(test)

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110148 entries, 0 to 36348
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   client_id         110148 non-null  int64  
 1   app_date          110148 non-null  object 
 2   education         109670 non-null  object 
 3   sex               110148 non-null  object 
 4   age               110148 non-null  int64  
 5   car               110148 non-null  object 
 6   car_type          110148 non-null  object 
 7   decline_app_cnt   110148 non-null  int64  
 8   good_work         110148 non-null  int64  
 9   score_bki         110148 non-null  float64
 10  bki_request_cnt   110148 non-null  int64  
 11  region_rating     110148 non-null  int64  
 12  home_address      110148 non-null  int64  
 13  work_address      110148 non-null  int64  
 14  income            110148 non-null  int64  
 15  sna               110148 non-null  int64  
 16  first_time        110

In [24]:
def check_model(data, encoder, model, target=TARGET):
    """
    function to fit chosen ML model and calculate statistics
    :param data: dataset to fit model
    :param encoder: feature encoder to transform features
    :param model: ML model to fit
    :param target: name of target vector
    :return: ROC AUC score, dataframe with feature importance, classification report
    """
    # preparing data
    data = data[data.test==0].copy()
    data.drop('test', axis=1, inplace=True)
    x, y = data.drop(target, axis=1), data[target]

    # encode features
    x = encoder.fit_transform(x,y)

    # fit model and make classification report
    x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=SEED)
    model.fit(x_train, y_train)
    predict = model.predict(x_test)
    report = classification_report(y_test, predict)#, output_dict=True)

    # take feature importance for given model
    value = pd.DataFrame(data=model.feature_importances_, index=x.columns, columns=['importance'])
    importance = value.importance.sort_values(ascending=False)

    # calculate ROC AUC score
    predict_prob = model.predict_proba(x_test)[:,1]
    roc_auc = roc_auc_score(y_test,predict_prob)

    return roc_auc, importance, report

# Naive model

In [25]:
list_to_drop = []
parameters = {
    'model': CatBoostClassifier(random_state=SEED), #ROC AUC 0.73, f1(1)=0.06
    #'model': LogisticRegression(solver='liblinear', max_iter=1000, random_state=SEED),  #ROC AUC = 0.61, f1(1)=0.00
    #'model': SGDClassifier(random_state=SEED, loss='modified_huber'), #ROC AUC = 0.5, f1(1)=0.22
    #'model': ExtraTreesClassifier(random_state=SEED), #ROC AUC 0.70, f1(1)=0.04

    'encoder': ce.TargetEncoder()
}
roc_auc, importance, report = check_model(data, **parameters)
print(f'ROC AUC score = {roc_auc:0.00f}:')
print(report)
print(importance)

Learning rate set to 0.058777
0:	learn: 0.6472797	total: 27.3ms	remaining: 27.3s
1:	learn: 0.6076844	total: 52.5ms	remaining: 26.2s
2:	learn: 0.5737663	total: 78.2ms	remaining: 26s
3:	learn: 0.5446447	total: 107ms	remaining: 26.6s
4:	learn: 0.5191018	total: 134ms	remaining: 26.6s
5:	learn: 0.4969717	total: 163ms	remaining: 27s
6:	learn: 0.4781245	total: 194ms	remaining: 27.6s
7:	learn: 0.4622309	total: 219ms	remaining: 27.1s
8:	learn: 0.4479467	total: 239ms	remaining: 26.3s
9:	learn: 0.4347408	total: 258ms	remaining: 25.5s
10:	learn: 0.4230547	total: 281ms	remaining: 25.3s
11:	learn: 0.4139803	total: 305ms	remaining: 25.1s
12:	learn: 0.4061162	total: 324ms	remaining: 24.6s
13:	learn: 0.3991216	total: 339ms	remaining: 23.9s
14:	learn: 0.3930008	total: 354ms	remaining: 23.3s
15:	learn: 0.3875944	total: 370ms	remaining: 22.7s
16:	learn: 0.3830611	total: 391ms	remaining: 22.6s
17:	learn: 0.3791558	total: 408ms	remaining: 22.3s
18:	learn: 0.3755292	total: 428ms	remaining: 22.1s
19:	learn: 0

# Clearing, EDA, FE

In [26]:
importance

score_bki           18.100528
income               9.837968
age                  9.633714
client_id            8.828698
app_date             8.296025
region_rating        7.606334
bki_request_cnt      6.418436
sna                  4.999990
education            4.182564
first_time           4.122536
home_address         4.010949
decline_app_cnt      3.980084
work_address         3.310466
sex                  2.153794
good_work            1.384631
foreign_passport     1.315483
car_type             1.047700
car                  0.770100
Name: importance, dtype: float64

In [68]:
data.isna().sum()

client_id                 0
app_date                  0
education               478
sex                       0
age                       0
car                       0
car_type                  0
decline_app_cnt           0
good_work                 0
score_bki                 0
bki_request_cnt           0
region_rating             0
home_address              0
work_address              0
income                    0
sna                       0
first_time                0
foreign_passport          0
default               36349
test                      0
score_bki_standard        0
dtype: int64

### 'Score BKI' to standard scale

In [None]:
from sklearn.preprocessing import StandardScaler

In [49]:
x = train.loc[:,'score_bki'].values.reshape(-1,1)
scaler = StandardScaler().fit(x)

scaled = scaler.transform(data.loc[:,'score_bki'].values.reshape(-1,1))
data['score_bki_standard'] = scaled
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 110148 entries, 0 to 36348
Data columns (total 21 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   client_id           110148 non-null  int64  
 1   app_date            110148 non-null  object 
 2   education           109670 non-null  object 
 3   sex                 110148 non-null  object 
 4   age                 110148 non-null  int64  
 5   car                 110148 non-null  object 
 6   car_type            110148 non-null  object 
 7   decline_app_cnt     110148 non-null  int64  
 8   good_work           110148 non-null  int64  
 9   score_bki           110148 non-null  float64
 10  bki_request_cnt     110148 non-null  int64  
 11  region_rating       110148 non-null  int64  
 12  home_address        110148 non-null  int64  
 13  work_address        110148 non-null  int64  
 14  income              110148 non-null  int64  
 15  sna                 110148 non-null

In [67]:
list_to_drop = []
parameters = {
    'model': CatBoostClassifier(random_state=SEED), #ROC AUC 0.73, f1(1)=0.06
    'encoder': ce.TargetEncoder()
}
roc_auc, importance, report = check_model(data.drop(list_to_drop, axis=1), **parameters)
print(f'ROC AUC score = {roc_auc:.4f}')
print(report)
#print(importance)

Learning rate set to 0.058777
0:	learn: 0.6474947	total: 46.2ms	remaining: 46.2s
1:	learn: 0.6070594	total: 74.8ms	remaining: 37.3s
2:	learn: 0.5729002	total: 129ms	remaining: 43s
3:	learn: 0.5428802	total: 165ms	remaining: 41s
4:	learn: 0.5178470	total: 200ms	remaining: 39.8s
5:	learn: 0.4955041	total: 242ms	remaining: 40.1s
6:	learn: 0.4765718	total: 271ms	remaining: 38.4s
7:	learn: 0.4608484	total: 295ms	remaining: 36.6s
8:	learn: 0.4458484	total: 321ms	remaining: 35.3s
9:	learn: 0.4344492	total: 357ms	remaining: 35.3s
10:	learn: 0.4234057	total: 382ms	remaining: 34.3s
11:	learn: 0.4140337	total: 406ms	remaining: 33.4s
12:	learn: 0.4059649	total: 429ms	remaining: 32.6s
13:	learn: 0.3993645	total: 459ms	remaining: 32.3s
14:	learn: 0.3931300	total: 483ms	remaining: 31.7s
15:	learn: 0.3877218	total: 507ms	remaining: 31.2s
16:	learn: 0.3827853	total: 529ms	remaining: 30.6s
17:	learn: 0.3783199	total: 551ms	remaining: 30.1s
18:	learn: 0.3747487	total: 582ms	remaining: 30s
19:	learn: 0.37

In [56]:
importance

score_bki_standard    19.340529
income                 9.556519
age                    9.262368
app_date               8.465950
client_id              8.338340
region_rating          7.424892
bki_request_cnt        6.570610
sna                    4.725442
first_time             4.427048
education              4.240926
home_address           4.214655
decline_app_cnt        3.860655
work_address           3.147610
sex                    2.033321
good_work              1.450845
car_type               1.174942
foreign_passport       0.970709
car                    0.794638
Name: importance, dtype: float64

### 'Education' to fill NA

In [70]:
data.education.unique()

array(['SCH', 'GRD', 'UGR', 'PGR', 'ACD', nan], dtype=object)

In [71]:
data.education.value_counts()

SCH    57998
GRD    34768
UGR    14748
PGR     1865
ACD      291
Name: education, dtype: int64

In [74]:
data.groupby('education')[['age', 'income', 'score_bki']].agg('mean')

Unnamed: 0_level_0,age,income,score_bki
education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ACD,40.323024,85949.405498,-2.069584
GRD,38.066584,54315.008916,-1.920431
PGR,38.567292,79753.214477,-1.975544
SCH,41.07302,32033.254681,-1.903968
UGR,34.860523,39009.904868,-1.855721


In [76]:
data[pd.isna(data.education)][['age', 'income', 'score_bki']].agg('mean')

age             41.437238
income       46240.956067
score_bki       -1.945634
dtype: float64

In [81]:
data.groupby('education')[['age']].agg(['min', 'median', 'max'])

Unnamed: 0_level_0,age,age,age
Unnamed: 0_level_1,min,median,max
education,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
ACD,21,38.0,68
GRD,21,35.0,71
PGR,22,37.0,72
SCH,21,40.0,71
UGR,21,32.0,70


In [83]:
data['edu_nan'] = data.education.apply(lambda q: 1 if pd.isna(q) else 0)
data.education.fillna('GRD', inplace=True)

In [84]:
data.isna().sum()

client_id                 0
app_date                  0
education                 0
sex                       0
age                       0
car                       0
car_type                  0
decline_app_cnt           0
good_work                 0
score_bki                 0
bki_request_cnt           0
region_rating             0
home_address              0
work_address              0
income                    0
sna                       0
first_time                0
foreign_passport          0
default               36349
test                      0
score_bki_standard        0
edu_nan                   0
dtype: int64

In [85]:
list_to_drop = []
parameters = {
    'model': CatBoostClassifier(random_state=SEED),  #ROC AUC 0.73, f1(1)=0.06
    'encoder': ce.TargetEncoder()
}
roc_auc, importance, report = check_model(data.drop(list_to_drop, axis=1), **parameters)
print(f'ROC AUC score = {roc_auc:.4f}')
print(report)

Learning rate set to 0.058777
0:	learn: 0.6468880	total: 42ms	remaining: 41.9s
1:	learn: 0.6081279	total: 74.6ms	remaining: 37.2s
2:	learn: 0.5744125	total: 121ms	remaining: 40.2s
3:	learn: 0.5444004	total: 157ms	remaining: 39.2s
4:	learn: 0.5191188	total: 189ms	remaining: 37.5s
5:	learn: 0.4974148	total: 218ms	remaining: 36.1s
6:	learn: 0.4782374	total: 248ms	remaining: 35.2s
7:	learn: 0.4623670	total: 276ms	remaining: 34.2s
8:	learn: 0.4482891	total: 303ms	remaining: 33.4s
9:	learn: 0.4361610	total: 331ms	remaining: 32.7s
10:	learn: 0.4250901	total: 360ms	remaining: 32.4s
11:	learn: 0.4151815	total: 383ms	remaining: 31.5s
12:	learn: 0.4072535	total: 408ms	remaining: 31s
13:	learn: 0.4005279	total: 435ms	remaining: 30.6s
14:	learn: 0.3942642	total: 464ms	remaining: 30.5s
15:	learn: 0.3886479	total: 486ms	remaining: 29.9s
16:	learn: 0.3837278	total: 509ms	remaining: 29.4s
17:	learn: 0.3793502	total: 532ms	remaining: 29s
18:	learn: 0.3756240	total: 558ms	remaining: 28.8s
19:	learn: 0.37

In [86]:
importance

score_bki             10.707869
income                 9.219739
age                    9.144110
score_bki_standard     8.935029
client_id              8.415760
app_date               8.275123
region_rating          7.518235
bki_request_cnt        6.180237
sna                    4.757149
home_address           4.550538
education              4.257770
first_time             4.172129
decline_app_cnt        3.844915
work_address           3.113786
sex                    2.076636
good_work              1.372824
foreign_passport       1.173609
car_type               1.131016
car                    0.885084
edu_nan                0.268442
Name: importance, dtype: float64

### Balancing classes

In [88]:
# I will take 60% of "non default" class and multiply by three "default" class
train_0, train_1 = train[train.default==0], train[train.default==1]

train_bal, _ = train_test_split(train_0, test_size=0.4, random_state=SEED)
print(train.shape, train_0.shape, train_bal.shape)
data_bal = train_bal.copy()
data_bal = data_bal.append(train_1)
data_bal = data_bal.append(train_1)
data_bal = data_bal.append(train_1)

print(data_bal.shape)

(73799, 20) (64427, 20) (38656, 20)
(66772, 20)


In [90]:
list_to_drop = []
parameters = {
    'model': CatBoostClassifier(random_state=SEED),  #ROC AUC 0.73, f1(1)=0.06
    'encoder': ce.TargetEncoder()
}
roc_auc, importance, report = check_model(data_bal.drop(list_to_drop, axis=1), **parameters)
print(f'ROC AUC score = {roc_auc:.4f}')
print(report)

Learning rate set to 0.056319
0:	learn: 0.6838829	total: 26.2ms	remaining: 26.1s
1:	learn: 0.6758565	total: 51.9ms	remaining: 25.9s
2:	learn: 0.6683480	total: 96.4ms	remaining: 32s
3:	learn: 0.6620706	total: 127ms	remaining: 31.5s
4:	learn: 0.6565706	total: 163ms	remaining: 32.4s
5:	learn: 0.6518340	total: 212ms	remaining: 35.2s
6:	learn: 0.6470175	total: 241ms	remaining: 34.2s
7:	learn: 0.6428735	total: 268ms	remaining: 33.2s
8:	learn: 0.6390095	total: 306ms	remaining: 33.7s
9:	learn: 0.6356173	total: 337ms	remaining: 33.4s
10:	learn: 0.6322348	total: 418ms	remaining: 37.5s
11:	learn: 0.6292941	total: 442ms	remaining: 36.4s
12:	learn: 0.6269229	total: 469ms	remaining: 35.6s
13:	learn: 0.6246195	total: 493ms	remaining: 34.7s
14:	learn: 0.6226527	total: 513ms	remaining: 33.7s
15:	learn: 0.6208383	total: 534ms	remaining: 32.9s
16:	learn: 0.6190532	total: 554ms	remaining: 32s
17:	learn: 0.6171090	total: 580ms	remaining: 31.7s
18:	learn: 0.6152874	total: 603ms	remaining: 31.1s
19:	learn: 0

### Resume
Balancing of classes help to achive f1 score = 67% of 'default' class. So lets go to make submission.

# Submission

In [93]:
data_bal.shape, test.shape

((66772, 20), (36349, 19))

In [None]:
x, y = data_bal.drop(['test', TARGET], axis=1), data_bal[TARGET]
encoder = ce.TargetEncoder().fit(x,y)
x = encoder.transform(x)

question = test.drop('test', axis=1)
question = encoder.transform(question)

model = CatBoostClassifier(random_state=SEED).fit(x, y)
predict = model.predict(question)

In [None]:
sub = test.copy()
sub['default'] = predict
sub = sub[['client_id', 'default']].copy()
sub

In [100]:
sub.to_csv('submission_2.csv', index=False)

In [None]:
# score on lederboard 0.35032
# 11 place from 74 (29/07/2022)