In [None]:
from scipy import sparse as ssp
from sklearn.model_selection import StratifiedKFold
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import matplotlib.pyplot as plt
import lightgbm as lgbm

plt.rcParams["figure.figsize"] = (7,7)
plt.rcParams["font.size"] = 14
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Directory holding the input CSV files. File names are appended with plain
# string concatenation, so the trailing slash is required.
path = "./data/"

# Read both splits first, then pull out the columns that are reused later.
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')

# Row identifiers of each split and the training target column.
train_id, test_id = train['id'], test['id']
train_label = train['target']

# Feature engineering

In [None]:
# Every column of the training frame except the row id and the target.
feature_names = train.columns.drop(["id", "target"]).tolist()

# Categorical columns: names containing 'cat'.
cat_features = [name for name in feature_names if 'cat' in name]

# Remaining columns: names containing neither 'cat' nor 'calc'
# (the original note labels these as the interval + binary columns).
num_features = [
    name for name in feature_names
    if not ('cat' in name or 'calc' in name)
]

In [None]:
# Per-row count of cells equal to -1, stored as a float 'missing' feature.
# NOTE(review): -1 presumably encodes a missing value in this dataset -- confirm.
train['missing'] = (train == -1).sum(axis=1).astype(float)
test['missing'] = (test == -1).sum(axis=1).astype(float)

# Register the feature only once: without this guard, re-running the cell
# appends 'missing' again and the column is then selected twice when the
# feature matrix is assembled from num_features.
if 'missing' not in num_features:
    num_features.append('missing')

In [None]:
# Re-code each categorical column in place. A fresh encoder is fitted on the
# training column and then reused for the matching test column.
# NOTE(review): values that occur only in test are not handled here --
# confirm they cannot occur.
for c in cat_features:
    le = LabelEncoder()
    train[c] = le.fit_transform(train[c])
    test[c] = le.transform(test[c])

# One-hot encode the re-coded categorical columns; the encoder is fitted on
# the training rows only and applied unchanged to the test rows.
enc = OneHotEncoder()
X_cat = enc.fit_transform(train[cat_features])
X_t_cat = enc.transform(test[cat_features])

In [None]:
# Columns whose name contains 'ind'.
ind_features = [name for name in feature_names if 'ind' in name]

# Build 'new_ind' in both frames: the string form of every 'ind' column,
# each followed by '_', concatenated in column order.
for frame in (train, test):
    for position, name in enumerate(ind_features):
        piece = frame[name].astype(str) + '_'
        if position == 0:
            # The first column creates the new column ...
            frame['new_ind'] = piece
        else:
            # ... the following ones are appended to it.
            frame['new_ind'] += piece

In [None]:
# Frequency encoding: for every categorical column and for 'new_ind', add a
# '<column>_count' feature holding how often the row's value occurs in
# train and test combined.
cat_count_features = []
for c in cat_features + ['new_ind']:
    count_name = '%s_count' % c
    # Occurrences of each distinct value over both splits.
    value_freq = pd.concat([train[c], test[c]]).value_counts()
    # Vectorised lookup instead of a Python-level lambda per row. A value
    # without an entry in the table gets 0, like the former d.get(x, 0),
    # and the result is kept as an integer column.
    train[count_name] = train[c].map(value_freq).fillna(0).astype(np.int64)
    test[count_name] = test[c].map(value_freq).fillna(0).astype(np.int64)
    cat_count_features.append(count_name)

# Training

In [None]:
# Columns taken directly from the frames: the numeric features followed by
# the count features. The one-hot block is placed to their right.
dense_columns = num_features + cat_count_features
train_list = [train[dense_columns].values, X_cat]
test_list = [test[dense_columns].values, X_t_cat]

# Compress the data: Compressed Sparse Row format
X = ssp.hstack(train_list).tocsr()
X_test = ssp.hstack(test_list).tocsr()

In [None]:
def Gini(y_true, y_pred):
    """Normalized Gini coefficient of ``y_pred`` with respect to ``y_true``.

    The labels are ranked twice, once by the predictions and once by the
    labels themselves (both from largest to smallest). For each ranking the
    cumulative share of the labels is compared with the diagonal, and the
    area obtained for the prediction ranking is divided by the area obtained
    for the label ranking. A prediction ranking that reproduces the label
    ranking therefore scores 1.

    Parameters
    ----------
    y_true, y_pred : array-like exposing ``.shape``
        Labels and predictions; their shapes must be identical.

    Returns
    -------
    float
        Ratio of the prediction-ranking area to the label-ranking area.

    Raises
    ------
    AssertionError
        If the two inputs do not have the same shape.
    """
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # One row per sample: column 0 holds the label, column 1 the prediction.
    pairs = np.array([y_true, y_pred]).T
    labels = pairs[:, 0]

    # Diagonal reference: equally spaced cumulative shares ending at 1.
    diagonal = np.linspace(1 / n_samples, 1, n_samples)

    def area_against_diagonal(ascending_order):
        # Labels from the largest to the smallest sort key.
        ranked = labels[ascending_order][::-1]
        lorenz = np.cumsum(ranked) * 1. / np.sum(ranked)
        return np.sum(diagonal - lorenz)

    area_true = area_against_diagonal(np.argsort(labels))
    area_pred = area_against_diagonal(np.argsort(pairs[:, 1]))

    # Normalize by the area of the label ranking.
    return area_pred * 1. / area_true

def evalerror(preds, dtrain):
    """Custom evaluation function handed to ``lgbm.train`` via ``feval``.

    Reads the labels from ``dtrain`` with ``get_label()`` and scores the
    predictions with ``Gini``.

    Returns
    -------
    tuple
        ``('gini', score, True)``; the trailing ``True`` is presumably
        LightGBM's "higher is better" flag -- confirm against the LightGBM
        ``feval`` documentation.
    """
    score = Gini(dtrain.get_label(), preds)
    return 'gini', score, True

In [None]:
# Number of stratified folds used by the cross-validation routines; the
# summed per-fold test predictions are divided by this value.
NFOLDS = 5

In [None]:
def objective(trial, X, y, X_test, y_test):
    """Score one hyper-parameter trial with stratified K-fold LightGBM training.

    A parameter set is sampled from ``trial``, one model is trained per fold
    with early stopping on the custom Gini metric, and the per-fold
    predictions on ``X_test`` are averaged.

    Parameters
    ----------
    trial : object providing ``suggest_int`` / ``suggest_float``
        Presumably an Optuna trial -- confirm against the caller.
    X, y : training features and labels
        Both are indexed with the integer index arrays produced by
        ``StratifiedKFold.split``.
    X_test, y_test : hold-out features and labels
        Used for the returned score.

    Returns
    -------
    float
        Normalized Gini of the fold-averaged predictions on ``X_test``.

    Notes
    -----
    NOTE(review): ``LightGBMPruningCallback`` is not imported in the visible
    part of this notebook; it has to be in scope before this function runs.
    """
    param_grid = {
        'objective': 'binary',
        'verbosity': 0,
        'num_boost_round': 10000,
        "num_leaves": trial.suggest_int("num_leaves", 5, 55, step=5),
        "min_child_samples": trial.suggest_int("min_child_samples", 0, 150, step=10),
        "min_child_weight": trial.suggest_float("min_child_weight", 0.0, 200.0),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.9, step=0.1),
        "seed": 12,  # change the seed here
    }

    kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=218)

    # Out-of-fold predictions for the training rows and the running sum of
    # the per-fold predictions for the hold-out rows.
    cv_train = np.zeros(len(y))
    cv_pred = np.zeros(len(y_test))

    best_trees = []
    fold_scores = []

    for train_fold, validate in kfold.split(X, y):
        X_train, X_validate = X[train_fold, :], X[validate, :]
        label_train, label_validate = y[train_fold], y[validate]

        dtrain = lgbm.Dataset(X_train, label_train)
        dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)

        bst = lgbm.train(param_grid, dtrain, valid_sets=dvalid, feval=evalerror, verbose_eval=100,
                         early_stopping_rounds=100,
                         callbacks=[LightGBMPruningCallback(trial, "gini")])

        best_trees.append(bst.best_iteration)
        cv_pred += bst.predict(X_test, num_iteration=bst.best_iteration)
        # Pin the iteration explicitly, as done for the hold-out prediction
        # above, so both predictions visibly come from the same model state.
        cv_train[validate] += bst.predict(X_validate, num_iteration=bst.best_iteration)

        score = Gini(label_validate, cv_train[validate])
        print(score)
        fold_scores.append(score)

    # Average the summed hold-out predictions over the folds.
    cv_pred /= NFOLDS

    cv_score = Gini(y, cv_train)
    # f-strings: the previous plain strings printed the literal placeholder
    # text instead of the scores.
    print(f"cv score: {cv_score}")
    print(fold_scores)
    test_score = Gini(y_test, cv_pred)
    print(f"test score: {test_score}")
    print(best_trees, np.mean(best_trees))

    return test_score

In [None]:
def cv_gbm(params):
    """Run one stratified K-fold LightGBM pass with the given parameters.

    Uses the module-level ``X``, ``train_label``, ``X_test`` and ``test_id``.
    One model is trained per fold with early stopping on the custom Gini
    metric; the fold number, every fold score and the list of all fold
    scores are printed.

    Parameters
    ----------
    params : dict
        Parameter dictionary passed to ``lgbm.train``.

    Returns
    -------
    tuple
        ``(cv_train, cv_pred)``: the out-of-fold predictions for the
        training rows, and the test predictions averaged over the
        ``NFOLDS`` fold models.
    """
    oof_pred = np.zeros(len(train_label))
    test_pred = np.zeros(len(test_id))
    fold_scores = []

    splitter = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=218)
    for fold_no, (fit_idx, val_idx) in enumerate(splitter.split(X, train_label), start=1):
        print("Fold {}/{}".format(fold_no, NFOLDS))

        val_features = X[val_idx, :]
        val_labels = train_label[val_idx]

        fit_set = lgbm.Dataset(X[fit_idx, :], train_label[fit_idx])
        val_set = lgbm.Dataset(val_features, val_labels, reference=fit_set)

        booster = lgbm.train(params, fit_set, valid_sets=val_set, feval=evalerror, verbose_eval=100,
                             early_stopping_rounds=100)

        # Both predictions use the iteration recorded as best for this fold.
        best_round = booster.best_iteration
        test_pred += booster.predict(X_test, num_iteration=best_round)
        oof_pred[val_idx] += booster.predict(val_features, num_iteration=best_round)

        fold_score = Gini(val_labels, oof_pred[val_idx])
        print(fold_score)
        fold_scores.append(fold_score)

    # Turn the summed test predictions into their mean over the folds.
    test_pred /= NFOLDS

    print(fold_scores)

    return oof_pred, test_pred

In [None]:
# Parameters shared by every run.
FIXED_PARAMS = dict(objective='binary', verbosity=0, num_boost_round=10000)

# One parameter set per run. All sets share num_leaves and feature_fraction;
# they differ in min_child_samples, min_child_weight and seed.
SEARCH_PARAMS = [
    {
        'num_leaves': 15,
        'feature_fraction': 0.5,
        'min_child_samples': min_samples,
        'min_child_weight': min_weight,
        'seed': seed,
    }
    for min_samples, min_weight, seed in (
        (100, 101.94095152382667, 0),
        (60, 124.66803210058042, 2),
        (150, 148.13731979766956, 5),
        (30, 157.2373873709297, 7),
        (10, 125.16184758062035, 12),
    )
]

In [None]:
# Run the cross-validation once per parameter set and accumulate the
# predictions so they can be averaged over all runs.
num_seed = len(SEARCH_PARAMS)
x_score = []
final_cv_train = np.zeros(len(train_label))
final_cv_pred = np.zeros(len(test_id))

for s, s_param in enumerate(SEARCH_PARAMS):
    print("param", s)

    # Start from the run-specific values; the fixed parameters win on a clash.
    params = dict(s_param)
    params.update(FIXED_PARAMS)

    cv_train, cv_pred = cv_gbm(params)
    final_cv_train = final_cv_train + cv_train
    final_cv_pred = final_cv_pred + cv_pred

    # Score of this run alone, then of the average over the runs so far.
    cv_score = Gini(train_label, cv_train)
    x_score.append(cv_score)
    print("cv score:", cv_score)
    print("current score:", Gini(train_label, final_cv_train / (s + 1.)))

print("cv scores:", x_score)

# Write the test predictions averaged over all runs.
submission = pd.DataFrame({'id': test_id, 'target': final_cv_pred / num_seed})
submission.to_csv('./lgbm3_pred_avg.csv', index=False)
# Optional: also export the averaged out-of-fold predictions.
#pd.DataFrame({'id': train_id, 'target': final_cv_train / num_seed}).to_csv('./lgbm3_cv_avg.csv', index=False)