In [None]:
import gc
import os
import joblib
import random
import warnings
import itertools
import scipy as sp
import numpy as np
import pandas as pd
from tqdm import tqdm
import lightgbm as lgb
import xgboost as xgb
from itertools import combinations
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
from sklearn.preprocessing import LabelEncoder
import warnings; warnings.filterwarnings('ignore')
from sklearn.model_selection import StratifiedKFold, train_test_split

In [None]:
def read_data():
    """Load the v1 feature tables from their hard-coded parquet locations.

    Returns:
        tuple: ``(train, test)`` DataFrames, read in that order.
    """
    parquet_paths = (
        '/content/drive/MyDrive/Kaggle/AmericanExpress/data/train_fe_v1.parquet',
        '/content/drive/MyDrive/Kaggle/AmericanExpress/data/test_fe_v1.parquet',
    )
    train, test = (pd.read_parquet(path) for path in parquet_paths)
    return train, test

class CFG:
    # Notebook-wide settings, read as plain class attributes (never instantiated here).
    target = 'target'  # name of the label column in the training frame
    n_folds = 5        # number of StratifiedKFold splits
    seed = 42          # seed handed to seed_everything()

def seed_everything(seed):
    """Seed Python's ``random`` module and NumPy's global RNG, and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to both generators; its string form is stored
            in the ``PYTHONHASHSEED`` environment variable.
    """
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this assignment
    # is only visible to processes spawned afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(CFG.seed)

# Output directory used by every train/predict cell below. Those cells reference
# `save_dir`, so it has to exist in a fresh kernel before they run.
# TODO(review): the location is an assumption (placed next to the data folder) —
# point it at the directory where models and predictions should be stored.
save_dir = '/content/drive/MyDrive/Kaggle/AmericanExpress/models'

def amex_metric(y_true, y_pred):
    """Competition metric: mean of the normalized weighted Gini and the top-4% capture rate.

    Rows whose label is 0 carry weight 20, all other rows weight 1.

    Args:
        y_true: 1-D sequence of labels (0 / 1).
        y_pred: 1-D sequence of scores, same length as ``y_true``.

    Returns:
        ``0.5 * (gini_of_predictions / gini_of_perfect_ordering + top_four)``.
    """
    stacked = np.array([y_true, y_pred]).T  # column 0 = label, column 1 = score

    # Share of all positives found within the first 4% of total weight,
    # walking the rows from the highest score downwards.
    by_score = stacked[np.argsort(stacked[:, 1])[::-1]]
    row_weights = np.where(by_score[:, 0] == 0, 20, 1)
    weight_cutoff = int(0.04 * np.sum(row_weights))
    captured = by_score[np.cumsum(row_weights) <= weight_cutoff]
    top_four = np.sum(captured[:, 0]) / np.sum(by_score[:, 0])

    def _weighted_gini(sort_column):
        # Area between the weighted Lorentz curve and the diagonal when rows are
        # ordered (descending) by the given column.
        ordered = stacked[np.argsort(stacked[:, sort_column])[::-1]]
        w = np.where(ordered[:, 0] == 0, 20, 1)
        random_curve = np.cumsum(w / np.sum(w))
        weighted_pos = ordered[:, 0] * w
        lorentz_curve = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz_curve - random_curve) * w)

    gini_model = _weighted_gini(1)    # ordering induced by the predictions
    gini_perfect = _weighted_gini(0)  # ordering induced by the labels themselves
    return 0.5 * (gini_model / gini_perfect + top_four)

## 1.第一种特征训练

In [None]:
# Load the train/test feature frames through read_data(), which reads the
# `train_fe_v1.parquet` / `test_fe_v1.parquet` files.
train, test = read_data()

In [None]:
# `_last` aggregates of the categorical base columns.
# NOTE: no label encoding happens in this cell and `cat_features` is not
# referenced again in this notebook.
cat_features = [
    f"{base_name}_last"
    for base_name in (
        "B_30", "B_38", "D_114", "D_116", "D_117", "D_120",
        "D_126", "D_63", "D_64", "D_66", "D_68",
    )
]

# For every float32/float64 column whose name contains 'last', add a copy
# rounded to two decimals (suffix `_round2`) to both frames.
num_cols = [
    col
    for col, col_dtype in train.dtypes.items()
    if (col_dtype == 'float32' or col_dtype == 'float64') and 'last' in col
]
for col in num_cols:
    train[col + '_round2'] = train[col].round(2)
    test[col + '_round2'] = test[col].round(2)

# (Disabled experiment: `<col>_last - <col>_mean` difference features.)

# Down-cast every float32/float64 column of `train` — including the `_round2`
# columns created above — to float16 in both frames.
num_cols = [
    col
    for col, col_dtype in train.dtypes.items()
    if col_dtype == 'float32' or col_dtype == 'float64'
]
for col in tqdm(num_cols):
    train[col] = train[col].astype(np.float16)
    test[col] = test[col].astype(np.float16)

In [None]:
def lgb_amex_metric(y_pred, y_true):
    """Adapter exposing amex_metric as a LightGBM ``feval`` callback.

    Args:
        y_pred: Predictions handed over by LightGBM.
        y_true: Object offering ``get_label()`` (the evaluated dataset).

    Returns:
        tuple: ``('amex_metric', score, True)`` — the trailing ``True`` marks the
        metric as higher-is-better.
    """
    score = amex_metric(y_true.get_label(), y_pred)
    return 'amex_metric', score, True

def train_and_evaluate_lgb(train,seed,save_dir,version='v1'):
    """Train one LightGBM (DART) model per stratified fold; save the models and OOF predictions.

    Args:
        train: DataFrame with the feature columns plus 'customer_ID' and CFG.target
            ('S_2', if present, is excluded from the features).
        seed: Seed used for the StratifiedKFold shuffle and for LightGBM itself, so
            that runs with different seeds produce different models.
        save_dir: Directory receiving the per-fold model pickles and the OOF CSV;
            created if it does not exist.
        version: Feature-set tag embedded in the output file names. Defaults to 'v1',
            which reproduces the file names written when no tag is given.

    Returns:
        None. Writes ``lgbm_fold{fold}_seed{seed}_feature_{version}.pkl`` for each fold
        and ``oof_lgbm_baseline_{n_folds}fold_seed{seed}_feature_{version}.csv``.
    """
    os.makedirs(save_dir, exist_ok=True)
    # Everything except the id, the label and the raw date column is a feature.
    features = [col for col in train.columns if col not in ['customer_ID', CFG.target , 'S_2']]
    params = {
        'objective': 'binary',
        'device_type' : 'gpu',
        'metric': "binary_logloss",
        'boosting': 'dart',
        # Use the per-run seed (not the global CFG.seed) so seed-averaged runs
        # differ in the model as well as in the fold split.
        'seed': seed,
        'num_leaves': 100,
        'learning_rate': 0.01,
        'feature_fraction': 0.20,
        'bagging_freq': 10,
        'bagging_fraction': 0.50,
        'n_jobs': -1,
        'lambda_l2': 2,
        'min_data_in_leaf': 40
        }
    # Out-of-fold predictions, filled fold by fold at the validation positions.
    oof_predictions = np.zeros(len(train))
    kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = seed)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[CFG.target])):
        print(' ')
        print('-'*50)
        print(f'Training fold {fold} with {len(features)} features...')
        x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
        y_train, y_val = train[CFG.target].iloc[trn_ind], train[CFG.target].iloc[val_ind]
        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_valid = lgb.Dataset(x_val, y_val)
        # NOTE(review): LightGBM documents early stopping as unavailable in dart mode,
        # so all 10500 rounds are presumably run — confirm. The `early_stopping_rounds`
        # and `verbose_eval` keyword arguments were removed from lgb.train in
        # LightGBM 4.0; switch to callbacks when upgrading.
        model = lgb.train(
            params = params,
            train_set = lgb_train,
            num_boost_round = 10500,
            valid_sets = [lgb_train, lgb_valid],
            early_stopping_rounds = 100,
            verbose_eval = 500,
            feval = lgb_amex_metric
            )
        # Persist the fold model so the prediction step can reload it.
        joblib.dump(model, save_dir + f'/lgbm_fold{fold}_seed{seed}_feature_{version}.pkl')
        # Predict the validation part and report the fold score.
        val_pred = model.predict(x_val)
        oof_predictions[val_ind] = val_pred
        print(amex_metric(y_val, val_pred))
        # Release the fold's data before the next iteration.
        del x_train, x_val, y_train, y_val, lgb_train, lgb_valid, model
        gc.collect()
    score = amex_metric(train[CFG.target], oof_predictions)
    print(f'Our out of folds CV score is {score}')
    # Store the out-of-fold predictions next to the models.
    oof_df = pd.DataFrame({'customer_ID': train['customer_ID'], 'target': train[CFG.target], 'prediction': oof_predictions})
    oof_df.to_csv(save_dir + f'/oof_lgbm_baseline_{CFG.n_folds}fold_seed{seed}_feature_{version}.csv', index = False)

In [None]:
# Three LightGBM cross-validation runs on the first feature set, one per seed.
for run_seed in (42, 52, 62):
    train_and_evaluate_lgb(train, run_seed, save_dir)

In [None]:
def pred_lgboost(test,seed,save_dir,version):
  """Score the test set with every saved LightGBM fold model and write one CSV per fold.

  Args:
    test: DataFrame with 'customer_ID' and the feature columns.
    seed: Seed tag of the training run whose models are loaded.
    save_dir: Directory holding the ``lgbm_fold*`` pickles; the prediction CSVs are
      written there too.
    version: Feature-set tag used in the model and output file names.

  Returns:
    None. Writes
    ``test_lgb_baseline_{n_folds}fold_seed{seed}_{fold}_featurev_{version}.csv`` per fold.
  """
  # NOTE: the feature list is taken from the notebook-level `train` frame (not an
  # argument), so `train` must hold the same feature set the models were fitted on.
  features = [col for col in train.columns if col not in ['customer_ID', CFG.target,'S_2']]
  n_rows = len(test)
  # Predict in roughly five chunks to bound memory use; max(..., 1) keeps the
  # step positive for frames with fewer than five rows.
  step = max(n_rows // 5, 1)
  for fold in range(CFG.n_folds):
    test_predictions = np.zeros(n_rows)
    model = joblib.load(save_dir + f'/lgbm_fold{fold}_seed{seed}_feature_{version}.pkl')
    print('get_model')
    for k, start_row in enumerate(range(0, n_rows, step)):
      end_row = min(start_row + step, n_rows)
      print(k,start_row,end_row)
      # Positional, half-open slices: every row belongs to exactly one chunk, so
      # no boundary row is predicted (and accumulated) twice, and the result does
      # not depend on the frame's index labels.
      chunk = test.iloc[start_row:end_row][features]
      test_predictions[start_row:end_row] = model.predict(chunk)
    test_df = pd.DataFrame({'customer_ID': test['customer_ID'], 'prediction': test_predictions})
    test_df.to_csv(save_dir + f'/test_lgb_baseline_{CFG.n_folds}fold_seed{seed}_{fold}_featurev_{version}.csv', index = False)
    del model, test_df, test_predictions
    gc.collect()

In [None]:
# Test-set predictions for the three LightGBM runs on the first feature set.
for run_seed in (42, 52, 62):
    pred_lgboost(test, run_seed, save_dir, 'v1')

In [None]:
def train_and_evaluate_xgboost(train,seed,save_dir,version):
    """Train one XGBoost model per stratified fold; save the models and OOF predictions.

    Args:
        train: DataFrame with the feature columns plus 'customer_ID' and CFG.target.
        seed: Seed used for the StratifiedKFold shuffle and as XGBoost ``random_state``.
        save_dir: Directory receiving the per-fold model pickles and the OOF CSV;
            created if it does not exist.
        version: Feature-set tag embedded in the output file names.

    Returns:
        None. Writes ``xgbm_fold{fold}_seed{seed}_feature_{version}.pkl`` for each fold
        and ``oof_xgbm_baseline_{n_folds}fold_seed{seed}_feature_{version}.csv``.
    """
    os.makedirs(save_dir, exist_ok=True)
    # Everything except the id and the label is a feature.
    features = [col for col in train.columns if col not in ['customer_ID', CFG.target]]
    # Native booster parameters only. n_estimators / num_leaves / min_child_samples /
    # cat_smooth are deliberately absent: they belong to the sklearn wrapper or to
    # LightGBM and are not consumed by xgb.train (the number of rounds comes from the
    # num_boost_round argument below).
    params = {
          'reg_alpha':1.2825193704597235,
          'reg_lambda':2.8254513129979624,
          'colsample_bytree':0.4,
          'subsample':0.6,
          'learning_rate':0.006,
          'max_depth':7,
          'eval_metric':'logloss',
          'objective':'binary:logistic',
          'tree_method':'gpu_hist',
          'predictor':'gpu_predictor',
          'random_state':seed
        }
    # Out-of-fold predictions, filled fold by fold at the validation positions.
    oof_predictions = np.zeros(len(train))
    kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = seed)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[CFG.target])):
        print(' ')
        print('-'*50)
        print(f'Training fold {fold} with {len(features)} features...')
        x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
        y_train, y_val = train[CFG.target].iloc[trn_ind], train[CFG.target].iloc[val_ind]
        xgb_train = xgb.DMatrix(x_train, y_train)
        xgb_valid = xgb.DMatrix(x_val, y_val)
        # The last entry of the watch list ('val') drives early stopping.
        watchlist = [(xgb_train, 'train'), (xgb_valid, 'val')]
        model = xgb.train(
            params,
            xgb_train,
            10500,
            evals = watchlist,
            early_stopping_rounds = 100,
            verbose_eval = 300,
            )
        # NOTE(review): xgb.train returns the booster from the last round, not the
        # best one; whether predict() limits itself to best_iteration depends on the
        # installed XGBoost version — confirm, or pass iteration_range explicitly.
        joblib.dump(model, save_dir + f'/xgbm_fold{fold}_seed{seed}_feature_{version}.pkl')
        # Predict the validation part and report the fold score.
        val_pred = model.predict(xgb_valid)
        oof_predictions[val_ind] = val_pred
        print(amex_metric(y_val, val_pred))
        # Release the fold's data before the next iteration.
        del x_train, x_val, y_train, y_val, xgb_train, xgb_valid, model
        gc.collect()
    score = amex_metric(train[CFG.target], oof_predictions)
    print(f'Our out of folds CV score is {score}')
    # Store the out-of-fold predictions next to the models.
    oof_df = pd.DataFrame({'customer_ID': train['customer_ID'], 'target': train[CFG.target], 'prediction': oof_predictions})
    oof_df.to_csv(save_dir + f'/oof_xgbm_baseline_{CFG.n_folds}fold_seed{seed}_feature_{version}.csv', index = False)

In [None]:
# Three XGBoost cross-validation runs on the first feature set, one per seed.
for run_seed in (42, 52, 62):
    train_and_evaluate_xgboost(train, run_seed, save_dir, 'v1')

In [None]:
def pred_xgboost(test,seed,save_dir,version):
  """Score the test set with every saved XGBoost fold model and write one CSV per fold.

  Args:
    test: DataFrame with 'customer_ID' and the feature columns.
    seed: Seed tag of the training run whose models are loaded.
    save_dir: Directory holding the ``xgbm_fold*`` pickles; the prediction CSVs are
      written there too.
    version: Feature-set tag used in the model and output file names.

  Returns:
    None. Writes
    ``test_xgb_baseline_{n_folds}fold_seed{seed}_{fold}_feature_{version}.csv`` per fold.
  """
  # NOTE: the feature list is taken from the notebook-level `train` frame (not an
  # argument), so `train` must hold the same feature set the models were fitted on.
  features = [col for col in train.columns if col not in ['customer_ID', CFG.target]]
  n_rows = len(test)
  # Predict in roughly five chunks to bound memory use; max(..., 1) keeps the
  # step positive for frames with fewer than five rows.
  step = max(n_rows // 5, 1)
  for fold in range(CFG.n_folds):
    test_predictions = np.zeros(n_rows)
    model = joblib.load(save_dir + f'/xgbm_fold{fold}_seed{seed}_feature_{version}.pkl')
    print('get_model')
    for k, start_row in enumerate(range(0, n_rows, step)):
      end_row = min(start_row + step, n_rows)
      print(k,start_row,end_row)
      # Positional, half-open slices: every row belongs to exactly one chunk, so
      # no boundary row is predicted (and accumulated) twice, and the result does
      # not depend on the frame's index labels.
      xgb_test = xgb.DMatrix(data = test.iloc[start_row:end_row][features])
      test_predictions[start_row:end_row] = model.predict(xgb_test)
      del xgb_test
    test_df = pd.DataFrame({'customer_ID': test['customer_ID'], 'prediction': test_predictions})
    test_df.to_csv(save_dir + f'/test_xgb_baseline_{CFG.n_folds}fold_seed{seed}_{fold}_feature_{version}.csv', index = False)
    del model, test_df, test_predictions
    gc.collect()

In [None]:
# Test-set predictions for the three XGBoost runs on the first feature set.
# Every call reads from and writes to the same `save_dir`.
for run_seed in (42, 52, 62):
    pred_xgboost(test, run_seed, save_dir, 'v1')

## 2.第二种特征训练

In [None]:
# Rebind the notebook-level `train` / `test` names to the second feature set
# (`*_parquet_cols_v2.parquet`). The frames are replaced one after the other.
train = pd.read_parquet('/content/drive/MyDrive/Kaggle/AmericanExpress/data/train_parquet_cols_v2.parquet')
test = pd.read_parquet('/content/drive/MyDrive/Kaggle/AmericanExpress/data/test_parquet_cols_v2.parquet')

In [None]:
# LightGBM runs on the second feature set.
# train_and_evaluate_lgb is called as (train, seed, save_dir), so the third argument
# must be a directory. Called this way it names its files `..._feature_v1`, therefore
# the second feature set gets its own sub-directory to keep it from overwriting the
# first feature set's models and OOF predictions.
lgb_dir_v2 = os.path.join(save_dir, 'v2')
os.makedirs(lgb_dir_v2, exist_ok=True)
for run_seed in (42, 52, 62):
    train_and_evaluate_lgb(train, run_seed, lgb_dir_v2)

In [None]:
# LightGBM test predictions for the second feature set.
# pred_lgboost requires (test, seed, save_dir, version). The models are read from the
# `v2` sub-directory of save_dir; version is 'v1' because train_and_evaluate_lgb,
# when called with (train, seed, save_dir), names its model files `..._feature_v1.pkl`.
lgb_dir_v2 = os.path.join(save_dir, 'v2')
for run_seed in (42, 52, 62):
    pred_lgboost(test, run_seed, lgb_dir_v2, 'v1')

In [None]:
# XGBoost runs on the second feature set.
# train_and_evaluate_xgboost requires (train, seed, save_dir, version); 'v2' is the
# version tag that ends up in the model / OOF file names.
for run_seed in (42, 52, 62):
    train_and_evaluate_xgboost(train, run_seed, save_dir, 'v2')

In [None]:
# XGBoost test predictions for the second feature set.
# pred_xgboost requires (test, seed, save_dir, version); 'v2' selects the
# `xgbm_fold*_feature_v2.pkl` models inside save_dir.
for run_seed in (42, 52, 62):
    pred_xgboost(test, run_seed, save_dir, 'v2')