In [1]:
import sys
import warnings

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, matthews_corrcoef
from sklearn.model_selection import cross_val_score, cross_validate, StratifiedKFold

from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import SMOTE

from xgboost import XGBClassifier, XGBRFClassifier

sys.path.append("../../")

from helpers.split import tag_label_feature_split
from helpers.assess import make_classification_report, make_confusion_matrix

# Relative path of the folder holding the pickled data sets. The trailing
# slash matters: file names are appended by plain string concatenation.
DATASET_FOLDER = "../../datasets/"

In [2]:
# Load the pickled data set from the shared dataset folder.
# NOTE: unpickling executes arbitrary code, so only load files you trust.
dataset_path = DATASET_FOLDER + "dataset_00_all.pickle"
df = pd.read_pickle(dataset_path)

In [3]:
# Split the frame into three parts. The first element is discarded
# (presumably the tags, going by the helper's name - confirm); the second
# bundles the encoded labels with their label encoder; the third holds the
# features.
_, labels_with_encoder, X = tag_label_feature_split(df, label_format="encoded")
y, le = labels_with_encoder

In [4]:
# Hold out 20% of the samples for final testing. The split is shuffled with a
# fixed seed and stratified on the labels so both parts keep the class ratios.
split_options = dict(test_size=0.2, random_state=1962, shuffle=True, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [5]:
# Standardize the features. The scaling statistics are learned from the
# training split only and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# 5-fold stratified cross-validation: one XGBoost model is trained per fold on
# SMOTE+Tomek-resampled data and collected in `classifiers`.
# NOTE(review): X_train_scaled was standardized with statistics computed on the
# whole training split, so every validation fold has influenced the scaling of
# its own training folds. Fit the scaler per fold if unbiased fold metrics matter.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1962)

# skf.split() yields positions relative to the training split, so the fold
# labels must be sliced from y_train (slicing the full `y` pairs features with
# the wrong labels). np.asarray guarantees positional indexing even if the
# labels arrive as a pandas Series.
y_train_array = np.asarray(y_train)

classifiers = []

for train_idx, validate_idx in skf.split(X_train, y_train):
    X_fold_train, X_fold_validate = X_train_scaled[train_idx], X_train_scaled[validate_idx]
    y_fold_train, y_fold_validate = y_train_array[train_idx], y_train_array[validate_idx]

    # Rebalance only the training part of the fold; the validation part is
    # passed to the model untouched.
    smt = SMOTETomek(random_state=1962, n_jobs=-1)

    print("resample")
    X_fold_train_resampled, y_fold_train_resampled = smt.fit_resample(X_fold_train, y_fold_train)

    print("create classifier")
    classifier = XGBClassifier(
        learning_rate=.221461,
        n_estimators=827,
        max_depth=4,
        gamma=.524969,
        reg_alpha=4.327827,
        use_label_encoder=False,
        tree_method="gpu_hist",
        sampling_method="gradient_based",
        objective="multi:softprob",
        eval_metric=["mlogloss", "auc"],
        early_stopping_rounds=40,
        seed=1962,
    )

    # Entry 0 tracks the resampled training fold, entry 1 the validation fold.
    # Per the XGBoost documentation, early stopping follows the last metric
    # ("auc") on the last eval_set entry (the validation fold).
    eval_set = [(X_fold_train_resampled, y_fold_train_resampled), (X_fold_validate, y_fold_validate)]
    classifier.fit(
        X_fold_train_resampled,
        y_fold_train_resampled,
        eval_set=eval_set,
        verbose=False,
    )

    classifiers.append(classifier)
    

In [None]:
# Inspect the model trained on the fifth fold (index 4) and list the names
# under which its per-round evaluation histories are stored.
test_model = classifiers[4]
test_model_results = test_model.evals_result()
for eval_set_name in test_model_results.keys():
    print(eval_set_name)

In [None]:
# Pull the two evaluation histories out of the results dict.
# NOTE(review): XGBoost names the histories after the position in eval_set, so
# "validation_0" should be the resampled training fold and "validation_1" the
# validation fold. `test_results` therefore holds validation-fold metrics, not
# metrics for the held-out test split.
train_results, test_results = (
    test_model_results[eval_name] for eval_name in ("validation_0", "validation_1")
)

In [None]:
# Learning curve: multi-class log loss per boosting round for the inspected
# fold model. The second curve comes from the cross-validation fold that was
# passed as the last eval_set entry, not from the held-out test split, so it is
# labelled as validation.
fig, ax = plt.subplots()
ax.plot(train_results['mlogloss'], label='train')
ax.plot(test_results['mlogloss'], label='validation')
ax.set_xlabel('boosting round')
ax.set_ylabel('mlogloss')
ax.set_title('Log loss per boosting round')
ax.legend()
plt.show()

In [None]:
# Learning curve: AUC per boosting round for the inspected fold model. The
# second curve comes from the cross-validation fold that was passed as the last
# eval_set entry, not from the held-out test split, so it is labelled as
# validation.
fig, ax = plt.subplots()
ax.plot(train_results['auc'], label='train')
ax.plot(test_results['auc'], label='validation')
ax.set_xlabel('boosting round')
ax.set_ylabel('auc')
ax.set_title('AUC per boosting round')
ax.legend()
plt.show()

In [None]:
# Accuracy of the inspected fold model on the held-out test split.
test_predictions = test_model.predict(X_test_scaled)
accuracy_score(y_test, test_predictions)

In [None]:
# Matthews correlation coefficient of the inspected fold model on the held-out
# test split.
test_predictions = test_model.predict(X_test_scaled)
matthews_corrcoef(y_test, test_predictions)