In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn import metrics

from xgboost import XGBClassifier
from xgboost import plot_importance

In [None]:
# Read the training split from parquet (path is relative to the notebook's working directory).
trainData = pd.read_parquet(
    "../../data/datasets/train.parquet"
)

In [None]:
# Inspect the dataframe structure: first rows, column dtypes and row count.
# Only the last bare expression of a cell is rendered, so every view is
# printed explicitly; otherwise head() and dtypes would be silently discarded.

print(trainData.head())
print(trainData.dtypes)
print(trainData.shape[0])

In [None]:
# Separate the training frame into the feature matrix and the target column.

# features: every column except the target
x = trainData.drop(columns=["player_bufferIndex"])

# targets: the "player_bufferIndex" column
y = trainData["player_bufferIndex"]

In [None]:
# Class weights to reduce dominance of the '0' class.
#
# Start from scikit-learn's "balanced" class weights, then scale them:
# the weight of label 0 is multiplied by 0.3 and every other label's weight
# by 2.0. (NOTE(review): the two factors look hand-tuned; confirm the values.)

DOWNWEIGHTED_LABEL = 0      # label whose influence is reduced
DOWNWEIGHT_FACTOR = 0.3     # multiplier applied to the weight of that label
UPWEIGHT_FACTOR = 2.0       # multiplier applied to every other label's weight

classes = np.unique(y)
class_weights = compute_class_weight("balanced", classes=classes, y=y)

# Key the mapping by the actual class label, not by its position in `classes`,
# so the per-row lookup below stays correct even when the labels are not
# exactly 0..n-1 (a positional key would raise KeyError or pick a wrong weight).
adjusted_weights = {
    label: weight * (DOWNWEIGHT_FACTOR if label == DOWNWEIGHTED_LABEL else UPWEIGHT_FACTOR)
    for label, weight in zip(classes.tolist(), class_weights)
}

# One weight per training row, aligned with the order of `y`.
sample_weights = np.array([adjusted_weights[label] for label in y])

In [None]:

# Base XGBoost classifier with the multi:softmax objective; the fixed seed keeps runs repeatable.
xgb = XGBClassifier(
    objective="multi:softmax",
    random_state=42,
)

In [None]:
# Load the testing dataset and split it into features / target,
# the same way as the training frame.

testData = pd.read_parquet(
    "../../data/datasets/test.parquet"
)
x_test = testData.drop(columns=["player_bufferIndex"])
y_test = testData["player_bufferIndex"]

In [None]:
# Parameter grid for hyperparameter tuning: every combination of the values
# below is evaluated by the grid search.
#
# learning_rate: XGBoost documents the valid range of eta / learning_rate as
# [0, 1]. The previous third value, 1.5, fell outside that range and broke the
# 0.05 / 0.1 progression; it is corrected to 0.15.
# NOTE(review): confirm 0.15 was the intended value.

cv_params = {
    "max_depth": [6, 7, 8],
    "learning_rate": [0.05, 0.1, 0.15],
    "subsample": [0.7, 0.8],
    "colsample_bytree": [0.7, 0.8],
    "n_estimators": [200, 250, 300],
}

In [None]:
# Scoring criteria reported for every grid-search candidate.
# Precision, recall and F1 use the "weighted" average; accuracy uses
# scikit-learn's built-in scorer name.

scoring_criteria = dict(
    precision=make_scorer(precision_score, average="weighted"),
    accuracy="accuracy",
    recall=make_scorer(recall_score, average="weighted"),
    f1=make_scorer(f1_score, average="weighted"),
)

In [None]:
# Grid search setup: 5-fold cross-validation over cv_params, scored with all
# entries of scoring_criteria; the "f1" scorer selects the model that is refit.

xgb_cv = GridSearchCV(
    estimator=xgb,
    param_grid=cv_params,
    scoring=scoring_criteria,
    refit="f1",
    cv=5,
)

In [None]:

# Run the search on the training data; the per-row weights are handed to fit
# as the sample_weight fit parameter.
xgb_cv = xgb_cv.fit(
    X=x,
    y=y,
    sample_weight=sample_weights,
)
xgb_cv

In [None]:
# Confusion matrix of the tuned model's predictions on the test split,
# drawn as an annotated heatmap (rows: actual class, columns: predicted class).

y_pred = xgb_cv.predict(x_test)
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# Test-set metrics for the tuned model: accuracy plus weighted precision,
# recall and F1.
y_pred = xgb_cv.predict(x_test)

ac_score = metrics.accuracy_score(y_test, y_pred)
print('accuracy score:', ac_score)

pc_score = metrics.precision_score(y_test, y_pred, average="weighted")
print('precision score:', pc_score)

rc_score = metrics.recall_score(y_test, y_pred, average="weighted")
print('recall score:', rc_score)

# Stored as f1_test_score rather than f1_score: the latter is the function
# imported from sklearn.metrics and used by the scoring cell, so overwriting
# it with a float would break that cell when it is re-run.
f1_test_score = metrics.f1_score(y_test, y_pred, average="weighted")
print('f1 score:', f1_test_score)



In [None]:
# Feature importances of the refit best estimator, listed from most to least important.
bestModel = xgb_cv.best_estimator_
importances = bestModel.feature_importances_

# Use the frame's column names when available, otherwise generic f0, f1, ... labels.
if hasattr(x, 'columns'):
    feature_names = x.columns
else:
    feature_names = [f"f{i}" for i in range(x.shape[1])]

df_importance = pd.DataFrame({"Feature": feature_names, "Importance": importances})
df_importance = df_importance.sort_values(by="Importance", ascending=False)

print(df_importance)