## Finding the best parameters

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score, log_loss
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support
from sklearn.metrics import roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn.preprocessing import label_binarize
from scipy import interp
from numpy import unique, concatenate
import seaborn as sns


# Read the full dataset from disk
df = pd.read_csv('Data.csv')

# Model inputs (Var1..Var8) and the class label column
feature_columns = ['Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7', 'Var8']
X = df[feature_columns]
y = df['Label']

# Hold out 15% of the rows for validation; the seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=22,
)

# Hyperopt search space, filled in one hyperparameter at a time.
# hp.choice draws one of the listed integers; hp.uniform draws a float
# between the two bounds.
space = {}
space['max_depth'] = hp.choice('max_depth', range(2, 8))
space['eta'] = hp.uniform('eta', 0.01, 0.5)
space['min_child_weight'] = hp.choice('min_child_weight', range(3, 10))
space['gamma'] = hp.uniform('gamma', 0.0, 0.5)
space['subsample'] = hp.uniform('subsample', 0.5, 1.0)
space['colsample_bytree'] = hp.uniform('colsample_bytree', 0.5, 1.0)
space['lambda'] = hp.uniform('lambda', 1, 10)
space['alpha'] = hp.uniform('alpha', 1, 10)

# Function for hyperopt
def objective(params):
    """Train one XGBoost model for a sampled configuration and score it.

    Reads the module-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.

    Args:
        params: Hyperparameter values sampled by hyperopt from the search
            space. The dict is not modified.

    Returns:
        dict: ``{'loss': <validation log loss>, 'status': STATUS_OK}``.
    """
    num_class = 4

    # Work on a copy so the dict handed in by hyperopt is left untouched
    xgb_params = dict(params)

    # Make sure parameters that need to be integers are integers
    xgb_params['max_depth'] = int(xgb_params['max_depth'])
    xgb_params['min_child_weight'] = int(xgb_params['min_child_weight'])

    xgb_params.update({
        'objective': 'multi:softprob',
        'num_class': num_class,
        'eval_metric': 'aucpr'
    })

    # Converting data into DMatrix format
    dtrain = xgb.DMatrix(X_train, label=y_train, missing=np.nan)
    dval = xgb.DMatrix(X_val, label=y_val, missing=np.nan)

    # Training the model; early stopping watches the validation metric
    bst = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=2500,
        evals=[(dval, 'Validation')],
        early_stopping_rounds=200,
        verbose_eval=False,
    )

    # Score the rounds up to the best iteration found by early stopping,
    # not the extra rounds trained while waiting for the stop.
    # NOTE(review): assumes an xgboost version whose Booster.predict accepts
    # `iteration_range` - confirm against the pinned version.
    best_iteration = getattr(bst, 'best_iteration', None)
    if best_iteration is None:
        preds = bst.predict(dval)
    else:
        preds = bst.predict(dval, iteration_range=(0, best_iteration + 1))

    # Pass the full label set explicitly so log_loss still lines up with the
    # num_class probability columns when a class is absent from y_val.
    loss = log_loss(y_val, preds, labels=list(range(num_class)))
    return {'loss': loss, 'status': STATUS_OK}

# Running the hyperparameter optimization
from hyperopt import space_eval

trials = Trials()
best_raw = fmin(fn=objective,
                space=space,
                algo=tpe.suggest,
                max_evals=100,  # You can adjust this number
                trials=trials)

# fmin reports hp.choice parameters as the INDEX of the chosen option (e.g.
# max_depth=0 means the first entry of range(2, 8), i.e. 2). space_eval maps
# the raw result back onto the search space so every entry is a real value.
best_hyperparams = space_eval(space, best_raw)

print("The best hyperparameters are: ", best_hyperparams)


## Training and saving the best model

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import f1_score, accuracy_score, log_loss
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support
from sklearn.metrics import roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn.preprocessing import label_binarize
from scipy import interp
from numpy import unique, concatenate
import seaborn as sns


# Saving the best model
class SaveBestModel(xgb.callback.TrainingCallback):
    """Training callback that saves the model whenever the tracked loss improves.

    After each boosting round the latest value of ``metric_name`` recorded for
    the evaluation set ``data_name`` is compared with the lowest value seen so
    far; on a strict improvement the current model is written to ``save_path``
    (overwriting the previous file).

    Args:
        save_path: File the improving model is saved to.
        data_name: Name of the evaluation set to watch; must match the name
            given in the ``evals`` list passed to ``xgb.train``.
        metric_name: Name of the metric to watch (lower is treated as better).
    """

    def __init__(self, save_path='model_with_lowest_val_loss.json',
                 data_name='validation', metric_name='mlogloss'):
        super().__init__()
        self.best_loss = float('Inf')
        self.save_path = save_path
        self.data_name = data_name
        self.metric_name = metric_name

    def after_iteration(self, model, epoch, evals_log):
        """Save ``model`` if the watched metric reached a new minimum.

        Does nothing when ``data_name`` is not present in ``evals_log``.
        Raises ``KeyError`` if the evaluation set is present but the metric
        was not recorded for it.

        Returns:
            bool: Always ``False`` so training is never stopped by this callback.
        """
        if self.data_name in evals_log:
            current_loss = evals_log[self.data_name][self.metric_name][-1]
            if current_loss < self.best_loss:
                self.best_loss = current_loss
                model.save_model(self.save_path)
        return False  # Return True if training must stop

# Loading the data
df = pd.read_csv('Data.csv')

# Separating features and target
X = df[['Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7', 'Var8']]
y = df['Label']

# Splitting the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=21)

# Converting data into DMatrix format
dtrain = xgb.DMatrix(X_train, label=y_train, missing=np.nan)
dval = xgb.DMatrix(X_val, label=y_val, missing=np.nan)


# Defining XGBoost parameters
params = {
    # NOTE(review): if this was copied from fmin's raw output, 5 would be an
    # index into range(2, 8) (i.e. max_depth 7) - confirm.
    'max_depth': 5,
    'eta': 0.314,
    'objective': 'multi:softprob',
    'num_class': 4,
    'eval_metric': ['mlogloss'],
    # NOTE(review): 0 is outside the range(3, 10) searched in the tuning cell;
    # hyperopt's fmin reports hp.choice picks as indices, so this is possibly
    # index 0 (i.e. min_child_weight 3) - confirm.
    'min_child_weight': 0,
    'gamma': 0.102,
    'subsample': 0.76,
    'colsample_bytree': 0.83,
    'lambda': 4.04,
    'alpha': 1.52
}

# Single source of truth for where the best model is written and read back
best_model_path = 'model_with_lowest_val_loss.json'

# Training the model; the callback saves the booster each time the
# 'validation' mlogloss reaches a new minimum
bst = xgb.train(params, dtrain, num_boost_round=3000, evals=[(dval, 'validation')],
                early_stopping_rounds=1000,
                callbacks=[SaveBestModel(save_path=best_model_path)])

# Loading the best model (replaces the last-round booster returned above)
bst = xgb.Booster(model_file=best_model_path)

# Predictions: one row of class probabilities per sample, then the most
# probable class per row
preds = bst.predict(dval)
best_preds = np.argmax(preds, axis=1)

# Evaluation
f1 = f1_score(y_val, best_preds, average='macro')
accuracy = accuracy_score(y_val, best_preds)
print(f"F1 Score: {f1}")
print(f"Accuracy: {accuracy}")

# Adding predictions to validation dataFrame. Copy first: X_val comes out of
# train_test_split as a slice, and writing a column onto a slice can trigger
# pandas' SettingWithCopyWarning.
X_val = X_val.copy()
X_val['xgboost_output'] = best_preds

# Merging the predictions with the original dataFrame (validation rows only)
result_df = pd.merge(df, X_val[['xgboost_output']], left_index=True, right_index=True, how='right')

# Exporting results
result_df.to_csv('XGB_validation_Output.csv', index=False)


## Loading the trained model

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb

# Location of the JSON model file to restore
model_path_json = "model_with_lowest_val_loss.json"

# Start from an empty booster, then fill it from the JSON file
bst_loaded = xgb.Booster()
bst_loaded.load_model(model_path_json)


## Loading the test set

In [None]:
# Load the test dataset
test_df = pd.read_csv('Test.csv')

# Assuming test dataset has the same feature columns
X_test = test_df[['Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6', 'Var7', 'Var8']]

# Convert test data into DMatrix
dtest = xgb.DMatrix(X_test)

# Predict using the loaded model: preds_test holds one row of class
# probabilities per sample; the predicted class is the most probable column
preds_test = bst_loaded.predict(dtest)
best_preds_test = np.argmax(preds_test, axis=1)

# Add predictions to test DataFrame
test_df['xgboost_test_output'] = best_preds_test

# Export to CSV
test_output_path = 'XGB_Test_Output.csv'
test_df.to_csv(test_output_path, index=False)

# 'Label' is the name of the column containing true class labels
y_test = test_df['Label'].values

# Calculate F1 Score
# A multi-class classification, with 'macro' averaging
f1 = f1_score(y_test, best_preds_test, average='macro')
print("F1 Score:", f1)

# Calculate Accuracy
accuracy = accuracy_score(y_test, best_preds_test)
print("Accuracy:", accuracy)



## Performance metrics

In [None]:
import warnings
# Suppress deprecation warning
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Fixed class set. Passing it explicitly keeps every per-class output at
# length 4 even if a class never appears in y_test or in the predictions.
class_labels = [0, 1, 2, 3]
class_names = ['Class1', 'Class2', 'Class3', 'Class4']

# Calculating precision, recall, F1 score, and support for each class
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, best_preds_test, labels=class_labels, average=None)

# Confusion matrix (rows = actual, columns = predicted); reused for the
# specificity calculation and for the heatmap below
cm = confusion_matrix(y_test, best_preds_test, labels=class_labels)

# Per-class specificity = TN / (TN + FP), computed one-vs-rest
true_positive = np.diag(cm)
false_positive = cm.sum(axis=0) - true_positive
false_negative = cm.sum(axis=1) - true_positive
true_negative = cm.sum() - true_positive - false_positive - false_negative
negatives = true_negative + false_positive
# Classes with no negatives get 0 instead of a division by zero
specificity = np.divide(
    true_negative,
    negatives,
    out=np.zeros(len(class_labels), dtype=float),
    where=negatives > 0,
).tolist()


# Creating a dataFrame for precision, recall, F1 score, and specificity
metrics_df = pd.DataFrame({
    'Class': class_names,
    'Precision': precision,
    'Recall': recall,
    'F1 Score': f1,
    'Specificity': specificity
})

# Round the dataFrame
metrics_df_rounded = metrics_df.round(3)

# Print the dataFrame
print(" ")
print(metrics_df_rounded)
print(" ")
print(" ")


# AUC-ROC for each class and macro
# Converting y_test into one hot encoding for multi-class ROC AUC calculation
y_test_bin = label_binarize(y_test, classes=class_labels)
n_classes = y_test_bin.shape[1]

# Computing ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], preds_test[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Computing micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test_bin.ravel(), preds_test.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Computing macro-average ROC curve and ROC area:
# collect every false-positive rate at which any per-class curve has a point
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Interpolating all ROC curves at these points. np.interp is used because the
# scipy.interp alias is deprecated and removed in recent SciPy releases.
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Averaging and computing AUC
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])


# Confusion matrix heatmap (same matrix as computed above)
conf_mat = cm
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_mat, annot=True, annot_kws={"size": 12}, fmt='g', cmap='Blues', cbar=False,
            xticklabels=class_names,
            yticklabels=class_names)
plt.ylabel('Actual', fontsize=14)
plt.xlabel('Predicted', fontsize=14)
plt.title('Confusion Matrix', fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()
