# Main notebook for model train and tracking

In [1]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np 
from sklearn.model_selection import KFold, train_test_split
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import mlflow.xgboost
from xgboost import XGBClassifier
import pickle
import warnings 
import xgboost as xgb

warnings.filterwarnings("ignore")


In [2]:
cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

train = pd.read_parquet("../data/train_woe_balanced.parquet")
train = train.fillna(-127)

In [3]:
train.drop('WOE_target',axis=1, inplace=True)

In [4]:
#train = train[:1000]

In [5]:
FEATURES = train.columns.to_list()
FEATURES.remove("target")
FEATURES = FEATURES[2:]

In [6]:
#Competition metric
def amex_metric(y_true, y_pred):

    labels     = np.transpose(np.array([y_true, y_pred]))
    labels     = labels[labels[:, 1].argsort()[::-1]]
    weights    = np.where(labels[:,0]==0, 20, 1)
    cut_vals   = labels[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four   = np.sum(cut_vals[:,0]) / np.sum(labels[:,0])

    gini = [0,0]
    for i in [1,0]:
        labels         = np.transpose(np.array([y_true, y_pred]))
        labels         = labels[labels[:, i].argsort()[::-1]]
        weight         = np.where(labels[:,0]==0, 20, 1)
        weight_random  = np.cumsum(weight / np.sum(weight))
        total_pos      = np.sum(labels[:, 0] *  weight)
        cum_pos_found  = np.cumsum(labels[:, 0] * weight)
        lorentz        = cum_pos_found / total_pos
        gini[i]        = np.sum((lorentz - weight_random) * weight)

    return 0.5 * (gini[1]/gini[0] + top_four)

In [7]:
# Model Parameters

xgb_params = {
    'max_depth':4, 
    'learning_rate':0.05,
    'subsample': 0.8,
    'colsample_bytree':0.6,
    'eval_metric': 'logloss',
    'objective':'binary:logistic',
    'tree_method':'gpu_hist',
    'predictor':'gpu_predictor',
    'random_state':42
}

# xgb_params = {
#         'num_leaves': 10,
#         'max_bin': 127,
#         'min_data_in_leaf': 11,
#         'learning_rate': 0.035,
#         'bagging_fraction': 1.0, 
#         'bagging_freq': 5, 
#         'feature_fraction': 0.05,
#         'lambda_l1': 4.972,
#         'lambda_l2': 2.276,
#         'min_gain_to_split': 0.65,
#         'max_depth': 14,
#         'save_binary': True,
#         'seed': 1337,
#         'feature_fraction_seed': 1337,
#         'bagging_seed': 1337,
#         'drop_seed': 1337,
#         'data_random_seed': 1337,
#         'objective': 'binary',
#         'boosting_type': 'dart',
#         'verbose': 1,
#         'is_unbalance': True,
#         'boost_from_average': False,
#         'device': 'gpu',
#         'tree_method':'gpu_hist',
#         'predictor':'gpu_predictor',
#         'gpu_platform_id': 0,
#         'gpu_device_id': 0
#         }

reg_log_params={
        'penalty': 'l2',
        'max_iter': 200,
        'warm_start': True,
        'n_jobs': 1,
            
                }
fores_params={
        'bootstrap': True,
        'criterion': 'gini',
        'max_depth': None,
        'max_features': 'auto',
        'max_leaf_nodes': None,
        'min_impurity_decrease': 0.0,
        'min_samples_leaf': 1,
        'min_samples_split': 2,
        'min_weight_fraction_leaf': 0.0,
        'n_estimators': 10,
        'n_jobs': -1,
        'oob_score': False,
        'random_state': 42,
        'verbose': 0,
        'warm_start': False
            }

In [8]:
# Setting MLFlow
experiment_name = "RegLog, Forest, XgBoost comparissom"
try:
    exp_id = mlflow.create_experiment(name=experiment_name)
except Exception as e:
    exp_id = mlflow.get_experiment_by_name(experiment_name).experiment_id 

In [9]:
modelclasses = [
        ["xgb", xgb, xgb_params],
        ["reg_log",LogisticRegression,reg_log_params],
        ["forest", RandomForestClassifier, fores_params]
                ]

# VERSION NAME FOR SAVED MODEL FILES
VER = '03'

# TRAIN RANDOM SEED
SEED = 42

# FILL NAN VALUE
NAN_VALUE = -127 # will fit in int8

# FOLDS PER MODEL
FOLDS = 5

# TRAIN FOLD
#TRAIN_PATH = "../data/processed/train_woe_balanced.parquet"

importances = []
oof = []
TRAIN_SUBSAMPLE = 1.0

skf = KFold(n_splits = FOLDS, shuffle=True, random_state=42)
with mlflow.start_run(experiment_id=exp_id):
        for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train.target)):
                

                if TRAIN_SUBSAMPLE<1.0:
                        np.random.seed(SEED)
                        train_idx = np.random.choice(train_idx, 
                                        int(len(train_idx)*TRAIN_SUBSAMPLE), replace=False)
                        np.random.seed(None)
                
                X_train = train.loc[train_idx, FEATURES]
                y_train = train.loc[train_idx, 'target']
                X_valid = train.loc[valid_idx, FEATURES]
                y_valid = train.loc[valid_idx, 'target']

                for modelname, Model, param_list in modelclasses:
                        print('#'*25)
                        print('### Fold',fold+1)
                        print('### Train size',len(train_idx),'Valid size',len(valid_idx))
                        print(f'### Training model {modelname.upper()}')
                        print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
                        print('#'*25)

                        if modelname == 'xgb':

                                batch_size = 200000
                                iterations = 10
                                model = None
                                for i in range(iterations):
                                        for start in range(0, len(X_train), batch_size):

                                                dtrain=xgb.DMatrix(X_train[start:start+batch_size], 
                                                                        y_train[start:start+batch_size])
                                                dvalid = xgb.DMatrix(X_valid[start:start+batch_size], 
                                                                    y_valid[start:start+batch_size],)
                                                model = xgb.train(
                                                {
                                                'max_depth':4, 
                                                'learning_rate':0.05,
                                                'subsample': 0.8,
                                                'colsample_bytree':0.6,
                                                'eval_metric': 'logloss',
                                                'objective':'binary:logistic',
                                                'tree_method':'gpu_hist',
                                                'predictor':'gpu_predictor',
                                                'random_state':42
                                                },
                                                dtrain=dtrain,
                                                xgb_model=model,
                                                evals=[(dtrain, 'train'), (dvalid, 'valid')],
                                                num_boost_round= 9999,
                                                early_stopping_rounds = 100,
                                                verbose_eval= 100
                                                                         
                                                )
                                        
                                
                                        
                                mlflow.xgboost.log_model(model, f"{Model}")
                                pickle.dump(model, open(f'../models/{modelname}_fold{fold}.pkl','wb'))
                                oof_preds = model.predict(dvalid)
                                acc = amex_metric(y_valid.values, oof_preds)
                                print("Kaggle Metric=", acc,'\n')
                                mlflow.log_metric(f"Kaggle Metric for {modelname}", acc)
                                mlflow.sklearn.log_model(model, f"{Model}")
                                print("Model saved in run %s" % mlflow.active_run().info.run_uuid)
                                
                                df = train.loc[valid_idx, ['customer_ID', 'target']].copy()
                                df['oof_pred']= oof_preds
                                df['model_name'] = modelname
                                oof.append(df)
                                del dtrain, dvalid
                                
                        else:

                                model = Model(**param_list)
                                #print(model)
                                model.fit(X_train,y_train)

                        pickle.dump(model, open(f'../models/{modelname}_fold{fold}.pkl','wb'))

                        oof_preds = model.predict(X_valid)
                        acc = amex_metric(y_valid.values, oof_preds)
                        print("Kaggle Metric=", acc,'\n')
                        mlflow.log_metric(f"Kaggle Metric for {modelname}", acc)
                        mlflow.sklearn.log_model(model, f"{Model}")
                        print("Model saved in run %s" % mlflow.active_run().info.run_uuid)
                        
                        mlflow.sklearn.log_model(model, f"{Model}")
                        df = train.loc[valid_idx, ['customer_ID', 'target']].copy()
                        df['oof_pred']= oof_preds
                        df['model_name'] = modelname
                        oof.append(df)

                        del df, model

                del X_train, y_train, dvalid, dtrain
                del X_valid, y_valid

        
print('#'*25)
oof = pd.concat(oof, axis=0, ignore_index=True).set_index('customer_ID')
for n in range(len(modelclasses)):
    target = oof.loc[oof['model_name'] ==modelclasses[n][0], ['target'] ].reset_index()
    preds = oof.loc[oof['model_name'] == modelclasses[n][0], ['oof_pred']].reset_index()
    acc= amex_metric(target.target.values, preds.oof_pred.values)
    print(f'OVERAL CV Kaggle Metric for {modelclasses[n][0]} = {acc}')



#########################
### Fold 1
### Train size 4425160 Valid size 1106291
### Training model XGB
### Training with 100% fold data...
#########################
[0]	train-logloss:0.66640	valid-logloss:0.66623
[100]	train-logloss:0.27995	valid-logloss:0.28575
[200]	train-logloss:0.26017	valid-logloss:0.27488
[300]	train-logloss:0.24874	valid-logloss:0.27078
[400]	train-logloss:0.23963	valid-logloss:0.26812
[500]	train-logloss:0.23225	valid-logloss:0.26637
[600]	train-logloss:0.22561	valid-logloss:0.26510
[700]	train-logloss:0.21959	valid-logloss:0.26408
[800]	train-logloss:0.21443	valid-logloss:0.26325
[900]	train-logloss:0.20922	valid-logloss:0.26249
[1000]	train-logloss:0.20441	valid-logloss:0.26188
[1100]	train-logloss:0.19993	valid-logloss:0.26136
[1200]	train-logloss:0.19555	valid-logloss:0.26082
[1300]	train-logloss:0.19112	valid-logloss:0.26035
[1400]	train-logloss:0.18723	valid-logloss:0.26005
[1500]	train-logloss:0.18326	valid-logloss:0.25971
[1600]	train-logloss:0.17924	val

NameError: name 'dvalid' is not defined