#### [Problem 1] Confirmation of competition contents

In [108]:
import gc
import joblib
import warnings
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy import stats
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

gc.enable()

#### [Problem 2] Learning and verification

In [113]:
# Data load 


DEBUG = False
REMOVE_OUTLIERS=True

if DEBUG:
    train_df = pd.read_csv('data/application_train.csv', nrows=1000)
    test_df = pd.read_csv('data/application_test.csv', nrows=1000)
else:
    train_df = pd.read_csv('data/application_train.csv')
    test_df = pd.read_csv('data/application_test.csv')

train_df['isTrain'] = 'Train'
test_df['isTrain'] = 'Test'

test_df['TARGET'] = np.nan

df = pd.concat([train_df, test_df[train_df.columns]], axis=0)
df = df.reset_index(drop=True)

del train_df, test_df
gc.collect()

7764

In [114]:
df.shape

(356255, 123)

#### [Problem 4] Feature engineering

In [117]:
# Data preprocessing
num_cols = []
cat_cols = []
should_be_encode = []
not_useful_cols = ['SK_ID_CURR', 'TARGET', 'isTrain']

for col in df.columns:
    if col not in not_useful_cols:
        unique_len = len(df[col].unique())
        data_type = df[col].dtype

        if unique_len<=20 and data_type!="object":
            cat_cols.append(col)
        elif data_type=='object':
            should_be_encode.append(col)
        else:
            num_cols.append(col)

print('Number of cat cols:', len(cat_cols+should_be_encode))
print('Number of numerical cols:', len(num_cols))

Number of cat cols: 58
Number of numerical cols: 62


In [118]:
# Fill missing values
missing_values = {}
for col in num_cols:
    num_missing_values = df[col].isnull().sum()
    if num_missing_values>0:
        df[col] = df[col].fillna(np.nanmean(df[col].values))
        missing_values[col] = np.nanmean(df[col].values)

In [119]:
missing_values

{'AMT_ANNUITY': 27425.560656506248,
 'AMT_GOODS_PRICE': 528019.9978004196,
 'OWN_CAR_AGE': 12.023741054753998,
 'EXT_SOURCE_1': 0.5019647173606229,
 'EXT_SOURCE_2': 0.5148899751498333,
 'EXT_SOURCE_3': 0.5093502021860116,
 'APARTMENTS_AVG': 0.11813799565530579,
 'BASEMENTAREA_AVG': 0.08867263218785101,
 'YEARS_BEGINEXPLUATATION_AVG': 0.977889190913453,
 'YEARS_BUILD_AVG': 0.7522831436693929,
 'COMMONAREA_AVG': 0.045045127207006805,
 'ELEVATORS_AVG': 0.07981886765365634,
 'ENTRANCES_AVG': 0.15001510278440014,
 'FLOORSMAX_AVG': 0.2273310098158009,
 'FLOORSMIN_AVG': 0.2328165692549523,
 'LANDAREA_AVG': 0.06645421391779162,
 'LIVINGAPARTMENTS_AVG': 0.10149499540944239,
 'LIVINGAREA_AVG': 0.10808933183069531,
 'NONLIVINGAPARTMENTS_AVG': 0.008868181984386712,
 'NONLIVINGAREA_AVG': 0.02850312046308469,
 'APARTMENTS_MODE': 0.11491437095520877,
 'BASEMENTAREA_MODE': 0.08774972119646735,
 'YEARS_BEGINEXPLUATATION_MODE': 0.9772385752922698,
 'YEARS_BUILD_MODE': 0.7594524439553474,
 'COMMONAREA_MO

In [120]:
# Fill missing values and encoding 
for col in should_be_encode:
    num_missing_values = df[col].isnull().sum()
    if num_missing_values > 0:
        df[col] = df[col].fillna(df[col].mode())
        missing_values[col] = (df[col].mode())

In [121]:
# Fill categorical variables

for col in cat_cols:
    num_missing_values = df[col].isnull().sum()
    if num_missing_values > 0:
        df[col] = df[col].fillna(df[col].median())
        missing_values[col] = (df[col].median())

In [123]:
joblib.dump(missing_values, 'data/utils/missing_values.pkl')

['data/utils/missing_values.pkl']

In [124]:
# Label encoding

encoders = {}
for col in should_be_encode:
    encoder = LabelEncoder().fit(df[col])
    encoders[col] = encoder
    df[col] = encoder.transform(df[col])

joblib.dump(encoders, "data/utils/encoders.pkl",)

['data/utils/encoders.pkl']

In [125]:
# Verify

for col in cat_cols+should_be_encode+num_cols:
    if df[col].isnull().sum()>0:
        print(col, df[col].dtype, df[col].isnull().sum())

cat_cols = cat_cols + should_be_encode
del should_be_encode
gc.collect()

0

In [126]:
# Onehot encoding

dummy_cols = []
map_dummy = {}
for col in tqdm(cat_cols, total=len(cat_cols)):
    for value in tqdm(df[col].unique()):
        df[f'dummy_{col}_{value}'] = 0
        df.loc[df[col]==value, f'dummy_{col}_{value}'] = 1
        dummy_cols.append(f'dummy_{col}_{value}')
        map_dummy[col] = f"dummy_{col}_{value}"

joblib.dump(map_dummy, 'data/utils/map_dummy.pkl')

del cat_cols
gc.collect()

  0%|                                                                                           | 0/58 [00:00<?, ?it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 463.42it/s][A

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 666.40it/s][A

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 444.15it/s][A

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 285.74it/s][A

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 400.03it/s][A

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 399.97it/s][A

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 500.10it/s][A

100%|███████

9

In [127]:
df.shape

(356255, 427)

In [128]:
# Outlier remove
train_df = df.loc[df['isTrain']=='Train'].reset_index(drop=True)
test_df = df.loc[df['isTrain']=='Test'].reset_index(drop=True)

del df
gc.collect()

0

In [129]:
outlier_idx = []
for col in num_cols:
    z_score = stats.zscore(train_df[col])
    outlier_idx = outlier_idx + list(train_df.loc[(np.abs(z_score)>3)].index)

outlier_idx = list(set(outlier_idx))

if REMOVE_OUTLIERS:
    train_df = train_df.drop(index=outlier_idx,
                             axis=0).reset_index(drop=True)

joblib.dump(num_cols, 'data/utils/num_cols.pkl')
joblib.dump(dummy_cols, 'data/utils/dummy_cols.pkl')

['data/utils/dummy_cols.pkl']

In [130]:
print(f"Class ratio = {train_df['TARGET'].sum()/train_df.shape[0]}" )

Class ratio = 0.08373467777008199


In [131]:
# Modelling

skf = StratifiedKFold(n_splits=5,
                      shuffle=True,
                      random_state=42)

models = []
for fold, (train_idx, val_idx) in enumerate(skf.split(train_df,
                                                      train_df['TARGET'],
                                                      groups=train_df['TARGET'])):
    train_set = train_df.loc[train_idx]
    val_set = train_df.loc[val_idx]

    model = LogisticRegression().fit(train_set[num_cols+dummy_cols],
                                     train_set['TARGET'])
    models.append(model)

    y_pred = model.predict_proba(val_set[num_cols+dummy_cols])[:, 1]

    auc_score = roc_auc_score(val_set['TARGET'], y_pred)
    print(f"FOLD-{fold}: AUC score={np.round(auc_score, 3)}")

joblib.dump(models, 'data/utils/models.pkl')

FOLD-0: AUC score=0.613
FOLD-1: AUC score=0.617
FOLD-2: AUC score=0.613
FOLD-3: AUC score=0.624
FOLD-4: AUC score=0.624


['data/utils/models.pkl']

#### [Problem 3] Estimation on test data

##### Inference

In [115]:
test_df = pd.read_csv('data/application_test.csv')

In [116]:
# Data preprocess on new data

# Missing values
missing_values = joblib.load('data/utils/missing_values.pkl')
for col in missing_values.keys():
    test_df.loc[test_df[col].isnull(), col] = missing_values[col]

# Encoding
encoders = joblib.load('data/utils/encoders.pkl')
for col in encoders.keys():
    test_df[col] = encoders[col].transform(test_df[col])

# Dummmy
cat_cols = ['CNT_CHILDREN', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'REGION_RATING_CLIENT_W_CITY', 'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION', 'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY', 'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY', 'DEF_30_CNT_SOCIAL_CIRCLE', 'DEF_60_CNT_SOCIAL_CIRCLE', 'FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_QRT', 'NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']
for col in tqdm(cat_cols, total=len(cat_cols)):
    for value in tqdm(test_df[col].unique()):
        test_df[f'dummy_{col}_{value}'] = 0
        test_df.loc[test_df[col]==value, f'dummy_{col}_{value}'] = 1

  0%|                                                                                           | 0/58 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 1289.83it/s][A

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 999.95it/s][A

100%|██████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 1000.07it/s][A

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 667.03it/s][A

100%|██████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 1000.31it/s][A

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 666.77it/s][A

100%|██████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 1000.19it/s][A

100%|███████

In [132]:
# Prediction

models = joblib.load('data/utils/models.pkl')
num_cols = joblib.load('data/utils/num_cols.pkl')
dummy_cols = joblib.load('data/utils/dummy_cols.pkl')

for col in dummy_cols:
    if col not in test_df.columns:
        test_df[col] = 0

y_pred = np.zeros(test_df.shape[0])
for model in models:
    y_pred += model.predict_proba(test_df[num_cols+dummy_cols])[:, 1]/len(models)

test_df['TARGET'] = y_pred
test_df[['SK_ID_CURR', 'TARGET']].to_csv('data/submission.csv', index=False)