In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

%matplotlib inline

data_dir = "data/loan_approval_predication"
train_csv = pd.read_csv(os.path.join(data_dir, "train.csv"))


train_csv, eval_csv = train_test_split(train_csv, test_size=0.2, random_state=42, stratify=train_csv["loan_status"])

In [2]:
from itertools import combinations
import numpy as np
from sklearn import base


def division_features(X_pd):
    result = []
    result.append(X_pd["person_income"] / X_pd["person_age"])
    result.append(X_pd["person_income"] / X_pd["cb_person_cred_hist_length"])
    result.append(X_pd["person_income"] / X_pd["loan_amnt"])
    return np.vstack(result).T


def group_features(numric_csv, degree=3):
    data = numric_csv.values
    new_data = []
    m, n = data.shape
    for indexes in combinations(range(n), degree):
        r = np.ones(m)
        for i in range(len(indexes)):
            r *= data[:, indexes[i]]
        new_data.append(r)
    return np.vstack(new_data).T


def polyminal_features(numric_csv, degree=2):
    data = numric_csv.values
    _, n = data.shape
    new_data = []
    for i in range(n):
        new_data.append(data[:, i] ** degree)
    return np.vstack(new_data).T


def generate_features(numric_csv):
    new_data = []
    for i in range(2, 4):
        new_data.append(group_features(numric_csv, i))
    new_data.append(polyminal_features(numric_csv, 2))
    new_data.append(division_features(numric_csv))
    return np.hstack(new_data)


class FeatureGenerater(base.TransformerMixin, base.BaseEstimator):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return np.concatenate([X.values, generate_features(X)], axis=1)

In [3]:
from sklearn import pipeline, preprocessing, compose, impute


train_data = train_csv.drop(["loan_status", "id"], axis=1)
train_labels = train_csv["loan_status"]
eval_data = eval_csv.drop(["loan_status", "id"], axis=1)
eval_labels = eval_csv["loan_status"]
numric_columns = [
    "loan_amnt",
    "loan_int_rate",
    "loan_percent_income",
    "person_income",
    "person_emp_length",
    "person_age",
    "cb_person_cred_hist_length",
]

numric_transformer = pipeline.Pipeline(
    steps=[
        # ("generater", FeatureGenerater()),
        # ("imputer", impute.SimpleImputer(strategy="mean")),
        ("scaler", preprocessing.StandardScaler()),
    ]
)
discretied_columns = [
    "person_age",
    "person_income",
    "person_emp_length",
    "cb_person_cred_hist_length",
]
discretizer_transformer = compose.ColumnTransformer(
    transformers=[
        ("person_age", preprocessing.KBinsDiscretizer(), ["person_age"]),
        ("person_income", preprocessing.KBinsDiscretizer(), ["person_income"]),
        ("person_emp_length", preprocessing.KBinsDiscretizer(), ["person_emp_length"]),
        (
            "cb_person_cred_hist_length",
            preprocessing.KBinsDiscretizer(),
            ["cb_person_cred_hist_length"],
        ),
    ]
)
category_columns = ["person_home_ownership", "loan_intent", "loan_grade"]

In [4]:
classify_pipline = pipeline.Pipeline(
    steps=[
        (
            "preprocessor",
            compose.ColumnTransformer(
                transformers=[
                    ("num", numric_transformer, numric_columns),
                    ("dis", discretizer_transformer, discretied_columns),
                    (
                        "cat",
                        preprocessing.OneHotEncoder(),
                        category_columns,
                    ),
                ]
            ),
        ),
        # ("clf", linear_model.LogisticRegression()),
    ]
)
classify_pipline.set_params(preprocessor__dis__person_age__n_bins=10)
classify_pipline.set_params(preprocessor__dis__person_income__n_bins=10)
classify_pipline.set_params(preprocessor__dis__person_emp_length__n_bins=3)
classify_pipline.set_params(preprocessor__dis__cb_person_cred_hist_length__n_bins=5)

In [5]:
X_train = classify_pipline.fit_transform(train_data)
X_eval = classify_pipline.transform(eval_data)

In [None]:
from sklearn import model_selection, linear_model
from xgboost import XGBClassifier


def cv_loop(X, y, model, cv=5):
    return model_selection.cross_val_score(
        model, X, y, cv=cv, scoring="roc_auc", n_jobs=-1
    ).mean()


def selection_loop(model, score_hist, good_features, X_all, y, current_max_score):
    scores = []
    for i in range(X_all.shape[1]):
        if i not in good_features:
            features = good_features + [i]
            X = X_all[:, features]
            score = cv_loop(
                X,
                y,
                model,
            )
            scores.append((score, i))
            if score > current_max_score:
                print(f"new best score: Feature: {i}, new score: {score} ")
                current_max_score = score
    scores = sorted(scores, key=lambda x: x[0], reverse=True)
    score_hist.append(scores[0])
    good_features.append(scores[0][1])
    return current_max_score


def prune_loop(model, score_hist, good_features, X_all, y):
    to_be_removed = None
    gain = 0
    baseline = score_hist[-1][0]
    for f in good_features:
        features = good_features.copy()
        features.remove(f)
        X = X_all[:, features]
        score = cv_loop(X, y, model)
        if score > baseline and (score - baseline) > gain:
            gain = score - baseline
            to_be_removed = f
    if to_be_removed is not None:
        good_features.remove(to_be_removed)
        score_hist.append((baseline + gain, to_be_removed * -1))
        print(f"remove feature {to_be_removed}, improve gain: {gain}")
    return gain


def feature_selection(model, X_all, y, init_solution=None, threshold=1e-7):
    score_hist = []
    good_features = []
    if init_solution is not None:
        good_features = init_solution
    max_score = 0

    while len(score_hist) < 2 or (score_hist[-1][0] - score_hist[-2][0] > threshold):
        max_score = selection_loop(
            model, score_hist, good_features, X_all, y, max_score
        )
    while True:
        if prune_loop(model, score_hist, good_features, X_all, y) <= 0:
            break
    return good_features


param = {
    "alpha": 0.05,
    "colsample_bytree": 0.8,
    "gamma": 0.1,
    "lambda": 0.95,
    "learning_rate": 0.1,
    "max_depth": 10,
    "n_estimators": 200,
    "scale_pos_weight": 0.166,
    "subsample": 1.0,
}
good_features = feature_selection(XGBClassifier(**param), X_train, train_labels.values)

new best score: Feature: 0, new score: 0.619569154105583 


In [11]:
from sklearn import model_selection, linear_model
from xgboost import XGBClassifier

param_grid = {
    "scale_pos_weight": [0.166],
    "max_depth": [10, 20],  # 控制树的深度
    "subsample": [0.8, 1.0],  # 每棵树的样本采样比例
    "colsample_bytree": [0.1, 0.5, 0.8],  # 每棵树的特征采样比例
    "learning_rate": [0.1],  # 学习率
    "n_estimators": [200, 300, 500],  # 树的数量,
    "gamma": [0.05, 0.1, 0.2],  # 节点分裂所需的最小损失减少
    "lambda": [0.9, 0.95, 1.0],  # L2正则化
    "alpha": [0.05],  # L1正则化
}
gird = model_selection.GridSearchCV(
    XGBClassifier(),
    param_grid=param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
)
grid_result = gird.fit(X_train[:, good_features], train_labels)
xgb_params = grid_result.best_params_
grid_result.best_score_, grid_result.best_params_

(0.9553213529094204,
 {'alpha': 0.05,
  'colsample_bytree': 0.8,
  'gamma': 0.1,
  'lambda': 0.95,
  'learning_rate': 0.1,
  'max_depth': 10,
  'n_estimators': 200,
  'scale_pos_weight': 0.166,
  'subsample': 1.0})

In [16]:
from sklearn import model_selection, ensemble

param_grid = {
    "n_estimators": [200, 300, 500],
    "max_depth": [30, 50, 100],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}
gird = model_selection.GridSearchCV(
    ensemble.RandomForestClassifier(),
    param_grid=param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
)
grid_result = gird.fit(X_train[:, good_features], train_labels)
tree_params = grid_result.best_params_
grid_result.best_score_, grid_result.best_params_

  _data = np.array(data, dtype=dtype, copy=copy,


(0.9422901909386276,
 {'max_depth': 100,
  'min_samples_leaf': 1,
  'min_samples_split': 10,
  'n_estimators': 500})

In [17]:
from sklearn import linear_model

param_grid = {
    "C": [0.1, 1, 10],
    "solver": ["newton-cg", "newton-cholesky"],
    "max_iter": [300, 500, 800],
}
gird = model_selection.GridSearchCV(
    linear_model.LogisticRegression(),
    param_grid=param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
)
grid_result = gird.fit(X_train[:, good_features], train_labels)
linear_params = grid_result.best_params_
grid_result.best_score_, grid_result.best_params_

(0.9049416835964411, {'C': 10, 'max_iter': 300, 'solver': 'newton-cg'})

In [19]:
from sklearn import metrics

xgb_clf = XGBClassifier(**xgb_params)
tree_clf = ensemble.RandomForestClassifier(**tree_params)
linear_clf = linear_model.LogisticRegression(**linear_params)

xgb_clf.fit(X_train[:, good_features], train_labels)
tree_clf.fit(X_train[:, good_features], train_labels)
linear_clf.fit(X_train[:, good_features], train_labels)
(
    metrics.roc_auc_score(
        eval_labels, xgb_clf.predict_proba(X_eval[:, good_features])[:, 1]
    ),
    metrics.roc_auc_score(
        eval_labels, tree_clf.predict_proba(X_eval[:, good_features])[:, 1]
    ),
    metrics.roc_auc_score(
        eval_labels, linear_clf.predict_proba(X_eval[:, good_features])[:, 1]
    ),
)

(0.9556147472427647, 0.9387224358321828, 0.9088125568130069)

In [113]:
from scipy import optimize
from sklearn import metrics


def ensemble_predict(clfs, W, X):
    logtis = np.vstack([w * (clf.predict_proba(X)[:, 1]) for clf, w in zip(clfs, W)]).T
    return np.sum(logtis, axis=1) / np.sum(W)


O = optimize.minimize(
    lambda W: -metrics.roc_auc_score(
        eval_labels,
        ensemble_predict([xgb_clf, tree_clf, linear_clf], W, X_eval[:, good_features]),
    ),
    [1 / 3] * 3,
    bounds=[(-10, 10)] * 3,
    method="Nelder-Mead",
)

O, metrics.roc_auc_score(
    eval_labels,
    ensemble_predict([xgb_clf, tree_clf, linear_clf], O.x, X_eval[:, good_features]),
)

(       message: Optimization terminated successfully.
        success: True
         status: 0
            fun: -0.9557444014446501
              x: [ 8.765e-01 -9.330e-03 -1.269e-03]
            nit: 60
           nfev: 115
  final_simplex: (array([[ 8.765e-01, -9.330e-03, -1.269e-03],
                        [ 8.765e-01, -9.357e-03, -1.276e-03],
                        [ 8.765e-01, -9.368e-03, -1.270e-03],
                        [ 8.765e-01, -9.392e-03, -1.287e-03]]), array([-9.557e-01, -9.557e-01, -9.557e-01, -9.557e-01])),
 0.9557444014446501)

In [109]:
def soft_voting_predict(clfs, W, X):
    logtis = np.vstack([w * (clf.predict_proba(X)[:, 1]) for clf, w in zip(clfs, W)]).T
    return np.max(logtis, axis=1)


O = optimize.minimize(
    lambda W: -metrics.roc_auc_score(
        eval_labels,
        soft_voting_predict(
            [xgb_clf, tree_clf, linear_clf], W, X_eval[:, good_features]
        ),
    ),
    [1 / 3] * 3,
    bounds=[(0, 1)] * 3,
    method="Nelder-Mead",
)

O, metrics.roc_auc_score(
    eval_labels,
    soft_voting_predict([xgb_clf, tree_clf, linear_clf], O.x, X_eval[:, good_features]),
)

(       message: Optimization terminated successfully.
        success: True
         status: 0
            fun: -0.9556147472427647
              x: [ 1.000e+00  0.000e+00  0.000e+00]
            nit: 26
           nfev: 44
  final_simplex: (array([[ 1.000e+00,  0.000e+00,  0.000e+00],
                        [ 1.000e+00,  0.000e+00,  0.000e+00],
                        [ 1.000e+00,  0.000e+00,  0.000e+00],
                        [ 1.000e+00,  0.000e+00,  0.000e+00]]), array([-9.556e-01, -9.556e-01, -9.556e-01, -9.556e-01])),
 0.9556147472427647)

In [107]:
eval_logtis = np.concatenate(
    [
        clf.predict_proba(X_eval[:, good_features])[:, 1]
        for clf in [xgb_clf, tree_clf, linear_clf]
    ]
).reshape(-1, 3)
eval_logtis = preprocessing.StandardScaler().fit_transform(eval_logtis)
param_grid = {
    "C": [0.1, 1, 10],
}
stacking_grid = model_selection.GridSearchCV(
    linear_model.LogisticRegression(),
    param_grid=param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
)
stacking_result = stacking_grid.fit(eval_logtis, eval_labels)
stacking_result.best_score_, stacking_result.best_params_

(0.49233411084927575, {'C': 10})

(nan, {})

In [88]:
from sklearn import ensemble

final_tree_clf = ensemble.RandomForestClassifier(**tree_params)
final_xgb_clf = XGBClassifier(**xgb_params)
final_linear_clf = linear_model.LogisticRegression(**linear_params)


def predict_write_csv(best_clf):
    test_csv = pd.read_csv(os.path.join(data_dir, "test.csv"))
    train_full_csv = pd.read_csv(os.path.join(data_dir, "train.csv"))
    train_full_data = train_full_csv.drop(["loan_status", "id"], axis=1)
    train_full_labels = train_full_csv["loan_status"]
    test_data = test_csv.drop(["id"], axis=1)
    best_clf.fit(classify_pipline.fit_transform(train_full_data), train_full_labels)
    predict_logits = best_clf.predict_proba(classify_pipline.transform(test_data))[:, 1]
    pd.DataFrame({"id": test_csv["id"], "loan_status": predict_logits}).to_csv(
        "data/loan_approval_predication/submission.csv", index=False
    )

In [93]:
data = np.array([1, 2, 3, 4, 6, 5]).reshape(-1, 2)
np.max(data, axis=1)

array([2, 4, 6])

In [94]:
data

array([[1, 2],
       [3, 4],
       [6, 5]])