In [151]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_log_error, plot_roc_curve, roc_auc_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [152]:
def my_cross_validation(estimator, X, y, k):
    kf = KFold(n_splits=k, shuffle = True, random_state=0)
    rocaucs = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        estimator.fit(X_train, y_train)
        y_pred = estimator.predict_proba(X_test)[:,1]
        rocauc = roc_auc_score(y_test, y_pred)
        rocaucs.append(rocauc)
        print(rocauc)
    return np.mean(rocaucs), np.std(rocaucs)

def normalize(features):
    for f in features:
        if abs(train[f].max()) > abs(train[f].min()):
            train[f] = (train[f] - train[f].mean()) / train[f].max()
            test[f] = (test[f] - test[f].mean()) / test[f].max()
        else:
            train[f] = (train[f] - train[f].mean()) / train[f].min()
            test[f] = (test[f] - test[f].mean()) / test[f].min()

In [153]:
train = pd.read_csv('application_train.csv')
test  = pd.read_csv('application_test.csv')
submission = pd.read_csv('sample_submission.csv')
train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.018801,-9461,-637,-3648.0,-2120,,1,1,0,1,1,0,Laborers,1.0,2,2,WEDNESDAY,10,0,0,0,0,0,0,Business Entity Type 3,0.083037,0.262949,0.139376,0.0247,0.0369,0.9722,0.6192,0.0143,0.0,0.069,0.0833,0.125,0.0369,0.0202,0.019,0.0,0.0,0.0252,0.0383,0.9722,0.6341,0.0144,0.0,0.069,0.0833,0.125,0.0377,0.022,0.0198,0.0,0.0,0.025,0.0369,0.9722,0.6243,0.0144,0.0,0.069,0.0833,0.125,0.0375,0.0205,0.0193,0.0,0.0,reg oper account,block of flats,0.0149,"Stone, brick",No,2.0,2.0,2.0,2.0,-1134.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,Family,State servant,Higher education,Married,House / apartment,0.003541,-16765,-1188,-1186.0,-291,,1,1,0,1,1,0,Core staff,2.0,1,1,MONDAY,11,0,0,0,0,0,0,School,0.311267,0.622246,,0.0959,0.0529,0.9851,0.796,0.0605,0.08,0.0345,0.2917,0.3333,0.013,0.0773,0.0549,0.0039,0.0098,0.0924,0.0538,0.9851,0.804,0.0497,0.0806,0.0345,0.2917,0.3333,0.0128,0.079,0.0554,0.0,0.0,0.0968,0.0529,0.9851,0.7987,0.0608,0.08,0.0345,0.2917,0.3333,0.0132,0.0787,0.0558,0.0039,0.01,reg oper account,block of flats,0.0714,Block,No,1.0,0.0,1.0,0.0,-828.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.010032,-19046,-225,-4260.0,-2531,26.0,1,1,1,1,1,0,Laborers,1.0,2,2,MONDAY,9,0,0,0,0,0,0,Government,,0.555912,0.729567,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-815.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,0.008019,-19005,-3039,-9833.0,-2437,,1,1,0,1,0,0,Laborers,2.0,2,2,WEDNESDAY,17,0,0,0,0,0,0,Business Entity Type 3,,0.650442,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,0.0,2.0,0.0,-617.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,0.028663,-19932,-3038,-4311.0,-3458,,1,1,0,1,0,0,Core staff,1.0,2,2,THURSDAY,11,0,0,0,0,1,1,Religion,,0.322738,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1106.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [154]:
for c in train.columns:
    print(c)

SK_ID_CURR
TARGET
NAME_CONTRACT_TYPE
CODE_GENDER
FLAG_OWN_CAR
FLAG_OWN_REALTY
CNT_CHILDREN
AMT_INCOME_TOTAL
AMT_CREDIT
AMT_ANNUITY
AMT_GOODS_PRICE
NAME_TYPE_SUITE
NAME_INCOME_TYPE
NAME_EDUCATION_TYPE
NAME_FAMILY_STATUS
NAME_HOUSING_TYPE
REGION_POPULATION_RELATIVE
DAYS_BIRTH
DAYS_EMPLOYED
DAYS_REGISTRATION
DAYS_ID_PUBLISH
OWN_CAR_AGE
FLAG_MOBIL
FLAG_EMP_PHONE
FLAG_WORK_PHONE
FLAG_CONT_MOBILE
FLAG_PHONE
FLAG_EMAIL
OCCUPATION_TYPE
CNT_FAM_MEMBERS
REGION_RATING_CLIENT
REGION_RATING_CLIENT_W_CITY
WEEKDAY_APPR_PROCESS_START
HOUR_APPR_PROCESS_START
REG_REGION_NOT_LIVE_REGION
REG_REGION_NOT_WORK_REGION
LIVE_REGION_NOT_WORK_REGION
REG_CITY_NOT_LIVE_CITY
REG_CITY_NOT_WORK_CITY
LIVE_CITY_NOT_WORK_CITY
ORGANIZATION_TYPE
EXT_SOURCE_1
EXT_SOURCE_2
EXT_SOURCE_3
APARTMENTS_AVG
BASEMENTAREA_AVG
YEARS_BEGINEXPLUATATION_AVG
YEARS_BUILD_AVG
COMMONAREA_AVG
ELEVATORS_AVG
ENTRANCES_AVG
FLOORSMAX_AVG
FLOORSMIN_AVG
LANDAREA_AVG
LIVINGAPARTMENTS_AVG
LIVINGAREA_AVG
NONLIVINGAPARTMENTS_AVG
NONLIVINGAREA_AVG
APART

In [None]:
pd.set_option('display.max_rows', 150)
print(train.corr()["TARGET"].sort_values(ascending=False))

# Model 1

In [None]:
features = ["DAYS_BIRTH",
            "REGION_RATING_CLIENT_W_CITY",
            "DAYS_LAST_PHONE_CHANGE",
            "EXT_SOURCE_1",
            "EXT_SOURCE_2",
            "EXT_SOURCE_3"]
for f in features:
    train[f] = train[f].fillna(train[f].median())
    test[f] = test[f].fillna(test[f].median())
train["EXT_SOURCE_3"]

In [None]:
model1 = LogisticRegression()
my_cross_validation(model1, train[features], train["TARGET"], 10)

In [None]:
model1.fit(train[features], train["TARGET"])
y_pred = model1.predict_proba(test[features])[:,1]
submission['TARGET'] = y_pred
submission.to_csv('predict1.csv', index = False)

cv roc-auc mean/std 0.652934248421855, 0.03979582924055982

lb private/public 0.69169 0.70115

# Model 2

In [None]:
normalize(features)
model2 = LogisticRegression()
my_cross_validation(model2, train[features], train["TARGET"], 10)

In [None]:
model2.fit(train[features], train["TARGET"])
y_pred = model2.predict_proba(test[features])[:,1]
submission['TARGET'] = y_pred
submission.to_csv('predict2.csv', index = False)

cv roc-auc mean/std 0.7202821552753629, 0.0049228768914965845

lb private/public 0.70194 0.71383

# Model 3

In [None]:
for c in train.columns:
    if len(train[c].unique()) < 4:
        print(len(train[c].unique()), c)

In [None]:
train["APARTMENTS_AVG"].unique()

In [None]:
replace_dict = {
    "CODE_GENDER": {"M": 0, "F": 1, "XNA": np.nan},
    "NAME_CONTRACT_TYPE": {"Cash loans": 0, "Revolving loans": 1},
    "EMERGENCYSTATE_MODE": {"No": 0, "Yes": 0}
}

for k in replace_dict.keys():
    train[k] = train[k].replace(replace_dict[k])
    test[k] = test[k].replace(replace_dict[k])

for k in replace_dict.keys():
    train[k] = train[k].fillna(train[k].median())
    test[k] = test[k].fillna(test[k].median())

features += replace_dict.keys()

In [None]:
model3 = LogisticRegression()
my_cross_validation(model3, train[features], train["TARGET"], 10)

In [None]:
model3.fit(train[features], train["TARGET"])
y_pred = model3.predict_proba(test[features])[:,1]
submission['TARGET'] = y_pred
submission.to_csv('predict3.csv', index = False)

cv roc-auc mean/std 0.7258636681573927, 0.004075368545835932

lb private/public 0.70649 0.72104

# Model 4

In [None]:
for i in range(2, 22):
    features += ["FLAG_DOCUMENT_" + str(i)]

In [None]:
model4 = LogisticRegression(max_iter=200)
my_cross_validation(model4, train[features], train["TARGET"], 10)

In [None]:
model4.fit(train[features], train["TARGET"])
y_pred = model4.predict_proba(test[features])[:,1]
submission['TARGET'] = y_pred
submission.to_csv('predict4.csv', index = False)

cv roc-auc mean/std 0.7284747632662143, 0.003933530444388552

lb private/public 0.70956 0.72169

# Model 5

In [None]:
train["CREDIT_INCOME"] = train["AMT_CREDIT"] / train["AMT_INCOME_TOTAL"]
train["CREDIT_INCOME"] = train["CREDIT_INCOME"].fillna(train["CREDIT_INCOME"].max())
test["CREDIT_INCOME"] = test["AMT_CREDIT"] / test["AMT_INCOME_TOTAL"]
test["CREDIT_INCOME"] = test["CREDIT_INCOME"].fillna(test["CREDIT_INCOME"].max())
normalize(["CREDIT_INCOME"])
features += ["CREDIT_INCOME"]

In [None]:
model5 = LogisticRegression(max_iter=200)
my_cross_validation(model5, train[features], train["TARGET"], 10)

In [None]:
model5.fit(train[features], train["TARGET"])
y_pred = model5.predict_proba(test[features])[:,1]
submission['TARGET'] = y_pred
submission.to_csv('predict5.csv', index = False)

cv roc-auc mean/std 0.7287039611138694, 0.003960359688432011

lb private/public 0.71004 0.72192

In [None]:
lbs = [0.71004, 0.70956, 0.70649, 0.70194, 0.69169]
cvs = [0.7287039611138694, 0.7284747632662143, 0.7258636681573927, 0.7202821552753629, 0.652934248421855]
names = [5, 4, 3, 2, 1]

fig, ax = plt.subplots()
ax.scatter(cvs, lbs)
plt.title("1-5 models")
for i, txt in enumerate(names):
    ax.annotate(txt, (cvs[i], lbs[i]))

# Mean solution

In [None]:
for i in range(1, 6):
    submission[i] = pd.read_csv("predict{0}.csv".format(i))["TARGET"]

submission["TARGET"] = submission[list(range(1, 6))].mean(axis=1)
submission = submission.drop(list(range(1, 6)), axis=1)
submission.to_csv('predict_mean.csv', index = False)

lb private/public 0.70657 0.71905

# Вывод

Лучшей оказалась 5 модель с наибольшим количеством фичей,
хотя добавление новых фичей улучшало результат совсем не на много.
С добавлением фичей улучшался и результат кросс-валидации и итоговый результат.
Нормализация фичей дает значительное улучшение результата.

Среднее значение так и оказалось средним по результату,
так как все 5 моделей не разные, а каждая следующая - улучшенная версия предыдущей.
