In [4]:
import pandas as pd
import numpy as np
import scipy.stats as ss

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")

from itertools import combinations
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

import xgboost as xgb
from xgboost import XGBClassifier

from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')

In [5]:
TRAIN_FILE = "./train.csv"
TEST_FILE = "./test.csv"
N_FOLDS = 5
CV = StratifiedKFold(n_splits=N_FOLDS, shuffle=True)

In [11]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
def datapreprocess(train_only=False):
    df_train, df_test = (
            pd.read_csv(TRAIN_FILE).drop("Id", axis=1), 
            pd.read_csv(TEST_FILE).drop("id", axis=1)
        )

    for col in ["profession", "city", "state"]:
        df_train[col] = df_train[col].str.replace("_", " ")
        df_test[col] = df_test[col].str.replace("_", " ")

    encode_cols = ["house_ownership", "car_ownership", "married", "profession", "city", "state"]
    for col in encode_cols:
        encoder = LabelEncoder().fit(df_train[col].values)
        df_train[col] = encoder.transform(df_train[col].values)
        df_test[col] = encoder.transform(df_test[col].values)
    
#     drop_cols = ["car_ownership", "house_ownership", "married", "profession", "city", "state", "profession"]
    drop_cols = []
    df_train = df_train.drop(drop_cols, axis=1)
    df_test = df_test.drop(drop_cols, axis=1)
    
    if train_only:
        return df_train
    return df_train, df_test
    
def make_submission(clf):
    df_train, df_test = datapreprocess()
    X, y = df_train.drop("risk_flag", axis=1).values, df_train["risk_flag"].values
    clf.fit(X, y)
    
    preds = clf.predict(df_test.values)
    res = {"id": np.arange(preds.size), "risk_flag": list(preds)}
    return pd.DataFrame.from_dict(res)

def train(clf):
    df_train = datapreprocess(train_only=True)
    X, y = df_train.drop("risk_flag", axis=1).values, df_train["risk_flag"].values
    clf.fit(X, y)
    return clf, roc_auc_score(y, clf.predict(X))

def eval_model(clf, n_jobs=-1):
    df_train = datapreprocess(train_only=True)
    X, y = df_train.drop("risk_flag", axis=1).values, df_train["risk_flag"].values
    scores = cross_val_score(clf, X, y, cv=CV, n_jobs=n_jobs, scoring="roc_auc", verbose=100)
    print(f"AUC: {scores.mean()} +- {scores.std()}")

In [10]:
df_train, df_test = datapreprocess()
df_train.head(10)

Unnamed: 0,income,age,experience,married,house_ownership,car_ownership,profession,city,state,current_job_years,current_house_years,risk_flag
0,1303835,23,3,1,2,0,33,251,13,3,13,0
1,7574516,40,10,1,2,0,43,227,14,9,13,0
2,3991815,66,4,0,2,0,47,8,12,4,10,0
3,6256451,41,2,1,2,1,43,54,17,2,12,1
4,5768871,47,11,1,2,0,11,296,22,3,14,1
5,6915937,64,0,1,2,0,11,130,14,0,12,0
6,3954973,58,14,0,2,0,31,299,22,8,12,0
7,1706172,33,2,1,2,0,20,134,6,2,14,0
8,7566849,24,17,1,2,1,25,165,20,11,11,0
9,8964846,23,12,1,2,0,2,149,23,5,13,0


In [None]:
from sklearn.feature_selection import mutual_info_regression
def make_mi_scores(discrete_features):
    df_train = datapreprocess(train_only=True)
    discrete_features = [col in discrete_features for col in df_train.drop("risk_flag", axis=1).columns]
    X, y = df_train.drop("risk_flag", axis=1), df_train["risk_flag"]
    idx = np.random.permutation(np.arange(y.size))[:int(0.5 * y.size)]
    X, y = X.loc[idx,:], y.loc[idx]
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    width = np.arange(len(mi_scores))
    ticks = list(mi_scores.index)
    plt.barh(width, mi_scores)
    plt.yticks(width, ticks)
    plt.title("Mutual Information Scores")

make_mi_scores(discrete_features=["profession", "state", "city", "lifescore"])

In [None]:
def plot_train_vs_test():
    df_train, df_test = datapreprocess()
    
    cont_vars = ["current_house_years", "current_job_years", "age", "experience", "income"]
    catg_vars = ["car_ownership", "house_ownership", "married", "profession", "city", "state"]
    
#     for var in cont_vars:
#         sns.histplot(df_train[var], label="Train", color="#e74c3c", stat="density", kde=True)
#         sns.histplot(df_test[var], label="Test", color="#2ecc71", stat="density", kde=True)
#         plt.title(var)
#         plt.legend(loc="upper right")
#         plt.show()
    
    for var in catg_vars:
        sns.countplot(df_train[var], label="Train", color="#e74c3c")
        sns.countplot(df_test[var], label="Test", color="#2ecc71")
        plt.title(var)
        plt.legend(loc="upper right")
        plt.show()
plot_train_vs_test()

# RandomForest

## submission:

In [87]:
eval_model(RandomForestClassifier(class_weight="balanced", n_jobs=-1), n_jobs=1)
print(train(RandomForestClassifier(class_weight="balanced", n_jobs=-1))[1])

AUC: 0.9378436675345906 +- 0.0009354050054031958
0.9573912910085283


In [88]:
res = make_submission(RandomForestClassifier(class_weight="balanced", n_jobs=-1))
res.to_csv("./submission1.csv", index=False)

## baseline 2:

In [None]:
eval_model(RandomForestClassifier(class_weight="balanced"), df_cont, df_catg, targets)

## baseline 3:

In [51]:
eval_model(RandomForestClassifier(class_weight="balanced"), include_cont=False, include_catg=True)

TypeError: eval_model() got an unexpected keyword argument 'include_cont'

## baseline4

In [103]:
eval_model(RandomForestClassifier(class_weight="balanced", n_jobs=-1), n_jobs=1)
# print(train(RandomForestClassifier(class_weight="balanced", n_jobs=-1))[1])

AUC: 0.9386525635703752 +- 0.000560765054422955


In [97]:
# features = datapreprocess(train_only=True).drop("risk_flag", axis=1).columns
# importances = model.feature_importances_
# indices = np.argsort(importances)

# plt.title('Feature Importances')
# plt.barh(range(len(indices)), importances[indices], color='b', align='center')
# plt.yticks(range(len(indices)), [features[i] for i in indices])
# plt.xlabel('Relative Importance')
# plt.show()

In [99]:
res = make_submission(RandomForestClassifier(class_weight="balanced", n_jobs=-1))
res.to_csv("./baseline4.csv", index=False)

In [101]:
true, preds = pd.read_csv("./baseline4.csv")["risk_flag"], pd.read_csv("./submission1.csv")["risk_flag"]
roc_auc_score(true, preds)

0.9072682432123436

In [13]:
model = RandomForestClassifier(class_weight="balanced", n_jobs=-1, n_estimators=500, max_depth=25, max_features=1)
eval_model(model, n_jobs=1)
print(train(model)[1])

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV]  ................................................................
[CV] .................................... , score=0.938, total=  46.6s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   46.5s remaining:    0.0s
[CV]  ................................................................
[CV] .................................... , score=0.939, total=  45.3s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.5min remaining:    0.0s
[CV]  ................................................................
[CV] .................................... , score=0.938, total=  44.4s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.3min remaining:    0.0s
[CV]  ................................................................
[CV] .................................... , score=0.938, total=  48.1s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  3.1min remaining:    0.0s
[CV]  .........................

In [11]:
def rf_tuning(params):
    df_train = datapreprocess(train_only=True)
    X, y = df_train.drop("risk_flag", axis=1).values, df_train["risk_flag"].values
        
    clf = GridSearchCV(
            RandomForestClassifier(class_weight="balanced", n_jobs=-1, n_estimators=250, max_depth=25, max_features=1),
            params,
            scoring="roc_auc",
            n_jobs=4,
            cv=CV,
            verbose=3,
            error_score="raise",
            refit=False
        )
    search = clf.fit(X, y)
    return search

In [12]:
params = {
    "max_features": range(1, df_train.drop("risk_flag", axis=1).shape[1]+1)
}
rf_search = rf_tuning(params)
pd.DataFrame.from_dict(rf_search.cv_results_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed: 16.1min
[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed: 41.9min finished


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_features,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,82.242336,0.527038,2.862028,2.510547,1,{'max_features': 1},0.937,0.936057,0.937767,0.937395,0.939018,0.937448,0.00097,1
1,115.900944,1.33108,4.620056,2.068797,2,{'max_features': 2},0.935227,0.935017,0.935109,0.935942,0.937469,0.935753,0.000918,2
2,148.904792,0.929701,5.956887,0.952966,3,{'max_features': 3},0.934107,0.934576,0.934386,0.934098,0.936409,0.934715,0.000866,4
3,179.829542,0.873895,6.863559,0.626431,4,{'max_features': 4},0.934966,0.934057,0.935636,0.935011,0.935995,0.935133,0.000663,3
4,223.043497,5.750922,6.420767,0.367901,5,{'max_features': 5},0.933242,0.933304,0.93443,0.933931,0.935003,0.933982,0.000671,9
5,260.532358,6.004845,6.685801,0.382326,6,{'max_features': 6},0.934137,0.933712,0.934996,0.933506,0.934551,0.93418,0.000544,8
6,284.302349,5.554677,6.882948,0.70974,7,{'max_features': 7},0.934044,0.933078,0.935787,0.933936,0.934292,0.934228,0.00088,6
7,309.869235,2.765259,6.904394,0.604229,8,{'max_features': 8},0.934073,0.933711,0.935738,0.934109,0.933276,0.934181,0.000835,7
8,301.467272,60.036974,3.882371,2.519422,9,{'max_features': 9},0.935143,0.932139,0.935422,0.93338,0.935062,0.934229,0.001268,5


In [None]:
# check which idx differ at disagreeing idxs
# tuning of XGBBoost
# tuning of RandomForestClassifier
# check diff models agreement idea

# XGBoost

In [None]:
def xgboost_train(clf, X, y):
    params = clf.get_xgb_params()
    xgtrain = xgb.DMatrix(X, label=y)
    res = xgb.cv(
        params, 
        xgtrain, 
        num_boost_round=clf.get_params()['n_estimators'], 
        nfold=N_FOLDS,
        metrics='auc', 
        verbose_eval=False
    )
        
    clf.set_params(n_estimators=res.shape[0])
    print(f"Optimal Number of Estimators: {res.shape[0]}")
    print(res.max(axis=0))
    scores = cross_val_score(clf, X, y, cv=StratifiedKFold(n_splits=N_FOLDS, shuffle=True), n_jobs=-1, scoring="roc_auc")
    print(f"AUC: {scores.mean()} +- {scores.std()}")

In [None]:
model = XGBClassifier(
            n_estimators=2000, 
            learning_rate=0.01,
            tree_method="gpu_hist",
            scale_pos_weight=7.13,
            max_depth=25,
            min_child_weight=2,
            colsample_bytree=0.6
        )
eval_model(model, n_jobs=-1)
print(train(model)[1])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.9min


In [4]:
def xgb_tuning(params):
    df_train = datapreprocess(train_only=True)
    X, y = df_train.drop("risk_flag", axis=1).values, df_train["risk_flag"].values
    
    model = XGBClassifier(
            n_estimators=200, 
            learning_rate=0.1,
            tree_method="gpu_hist",
            scale_pos_weight=7.13,
            max_depth=25,
            min_child_weight=2.
            colsample_bytree=0.6
        )
    clf = GridSearchCV(
            model,
            params,
            n_jobs=4,
            cv=CV,
            scoring="roc_auc",
            verbose=3,
            error_score="raise",
            refit=False
        )
    search = clf.fit(X, y)
    return search

In [7]:
params = {
        "max_depth": range(5, 30, 5),
        "min_child_weight": range(1, 20, 5)
    }
search1 = xgb_tuning(params)
pd.DataFrame.from_dict(search1.cv_results_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed: 43.4min finished




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_child_weight,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,9.020955,0.528846,0.10625,0.015312,5,1,"{'max_depth': 5, 'min_child_weight': 1}",0.85279,0.845911,0.843099,0.848783,0.85266,0.848649,0.003783,18
1,8.052688,0.087626,0.093753,0.009887,5,6,"{'max_depth': 5, 'min_child_weight': 6}",0.852393,0.845702,0.844454,0.852196,0.854281,0.849805,0.003948,17
2,7.911961,0.194966,0.103118,0.012498,5,11,"{'max_depth': 5, 'min_child_weight': 11}",0.849446,0.841805,0.839544,0.845127,0.854707,0.846126,0.005434,20
3,7.963289,0.177871,0.103127,0.007653,5,16,"{'max_depth': 5, 'min_child_weight': 16}",0.849814,0.84658,0.843404,0.846601,0.853004,0.847881,0.003267,19
4,37.582785,0.436313,0.175001,0.00625,10,1,"{'max_depth': 10, 'min_child_weight': 1}",0.927225,0.92051,0.919233,0.920159,0.926641,0.922753,0.003443,15
5,33.21795,0.625169,0.178124,0.007651,10,6,"{'max_depth': 10, 'min_child_weight': 6}",0.927029,0.921804,0.919412,0.921352,0.928618,0.923643,0.003543,13
6,30.999671,0.768285,0.183468,0.006056,10,11,"{'max_depth': 10, 'min_child_weight': 11}",0.925947,0.920805,0.920515,0.922791,0.924584,0.922929,0.002108,14
7,29.238418,0.333764,0.197382,0.008118,10,16,"{'max_depth': 10, 'min_child_weight': 16}",0.925977,0.919729,0.916627,0.919176,0.923499,0.921001,0.003319,16
8,108.464858,2.520669,0.345603,0.0245,15,1,"{'max_depth': 15, 'min_child_weight': 1}",0.936922,0.933983,0.932214,0.934281,0.935896,0.934659,0.001627,8
9,80.487555,0.631987,0.325806,0.015933,15,6,"{'max_depth': 15, 'min_child_weight': 6}",0.935387,0.933314,0.931212,0.932749,0.935422,0.933617,0.001614,11


In [7]:
params = {
    "min_child_weight": range(1, 6)
}
search2 = xgb_tuning(params)
pd.DataFrame.from_dict(search2.cv_results_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed: 34.6min finished


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_child_weight,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,368.51366,2.854905,0.895738,0.111382,1,{'min_child_weight': 1},0.936422,0.937697,0.937535,0.935865,0.935225,0.936549,0.000952,1
1,343.718437,2.546192,0.801271,0.050379,2,{'min_child_weight': 2},0.935764,0.936882,0.936622,0.935814,0.935275,0.936072,0.000593,2
2,321.257584,6.919648,1.049935,0.20606,3,{'min_child_weight': 3},0.935153,0.936681,0.936542,0.935726,0.933836,0.935587,0.001037,3
3,310.526265,2.894076,0.921482,0.257556,4,{'min_child_weight': 4},0.934776,0.936757,0.936321,0.935194,0.933901,0.93539,0.001036,4
4,257.454166,67.578403,0.643131,0.180632,5,{'min_child_weight': 5},0.934501,0.93655,0.936551,0.935235,0.933959,0.935359,0.001054,5


In [6]:
params = {
    "colsample_bytree": [i/10 for i in range(6,11)]
}
search3 = xgb_tuning(params)
pd.DataFrame.from_dict(search3.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,325.392567,4.469651,0.761131,0.082723,0.6,{'colsample_bytree': 0.6},0.937366,0.937301,0.937896,0.939736,0.938924,0.938245,0.000946,1
1,309.998892,6.087178,0.61526,0.07436,0.7,{'colsample_bytree': 0.7},0.937309,0.937431,0.93797,0.939434,0.938811,0.938191,0.000817,2
2,306.623595,1.54657,0.637141,0.022735,0.8,{'colsample_bytree': 0.8},0.937238,0.937214,0.93803,0.939062,0.938401,0.937989,0.000705,3
3,310.963393,4.59885,0.613087,0.050074,0.9,{'colsample_bytree': 0.9},0.936655,0.936586,0.937611,0.938709,0.93814,0.93754,0.000828,4
4,272.540355,71.266244,0.558014,0.104456,1.0,{'colsample_bytree': 1.0},0.9358,0.935544,0.936637,0.938015,0.936893,0.936578,0.000877,5


# Results
---
| Name       | Score  | CV                | Train | Description 
|    :-:     |   :-:  |         :-:       |  :-:  | :-      
`baseline1`  | 0.745  | 0.9379 +- 0.00108 | 0.975 | Used all features except `city`
`baseline2`  | 0.848  | 0.9386 +- 0.00138 | 0.958 | Same as above, but with `class_weight="balanced"`
`submission` | 0.856  | 0.9378 +- 0.00093 | 0.957 | Used only continuous features
`baseline3`  |   xx   | 0.7847 +- 0.00342 | 0.821 | Used only categorical features, excluding `city`
`baseline4`  |   xx   | 0.9386 +- 0.00112 | 0.957 | Replaced `[married, house_ownership, car_ownership]` with their count;<br>`[state, city]` with their smoothed median income; dropped `profession`