# XGB

In [2]:
import sklearn as skl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from prep2 import DataPreparation
from sklearn.metrics import f1_score

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedShuffleSplit
from sklearn import tree

from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import ConfusionMatrixDisplay

from sklearn.metrics import classification_report
%matplotlib inline

In [3]:
# One DataPreparation instance handles all three frames: prep_train is called
# on the hippocampus training data first, then transform on the other two.
# NOTE(review): presumably prep_train fits state that transform reuses — confirm in prep2.
prep = DataPreparation()

train_raw, test_raw, control_raw = (
    pd.read_csv(csv_path)
    for csv_path in (
        "dataset/train_radiomics_hipocamp.csv",
        "dataset/test_radiomics_hipocamp.csv",
        "dataset/train_radiomics_occipital_CONTROL.csv",
    )
)

train_db = prep.prep_train(train_raw)
test_db = prep.transform(test_raw)
control_db = prep.transform(control_raw)

In [4]:
# Print a summary (index range, column count, dtypes, memory) of the prepared training frame.
train_db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Columns: 382 entries, diagnostics_Image-original_Mean to Transition
dtypes: float64(381), object(1)
memory usage: 910.4+ KB


In [5]:
# Print the same summary for the prepared test frame, to compare its columns with the training frame.
test_db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Columns: 381 entries, diagnostics_Image-original_Mean to Age
dtypes: float64(381)
memory usage: 297.8 KB


In [6]:
# Separate the target column from the prepared training frame:
# y holds the "Transition" labels, X holds every remaining column.
y = train_db["Transition"]
X = train_db.drop("Transition", axis=1)

In [7]:
# Load the train/test split stored as CSV files under prep2/.
# Every object is a DataFrame because each one comes from pd.read_csv.
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'prep2/train_X.csv',
        'prep2/test_X.csv',
        'prep2/train_y.csv',
        'prep2/test_y.csv',
    )
)

In [27]:
# --- Label encoding -------------------------------------------------------
# The classifier is trained on integer class ids, so the string labels in
# `y` are mapped to integers with a LabelEncoder fitted on the full label set.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print("Rótulos após codificação:", label_encoder.classes_)

# --- Model and search space -----------------------------------------------
clf = XGBClassifier(
    random_state=2024,
    eval_metric='logloss'
)

# Each hyperparameter currently has a single candidate value, so the grid
# search only produces a cross-validated score for this one configuration.
param_grid = {
    'n_estimators': [150],
    'max_depth': [3],
    'learning_rate': [0.1],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'min_child_weight': [2],
    'gamma': [0.1]
}

cv_strategy = StratifiedKFold(n_splits=10)

grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=cv_strategy,
    verbose=1,
    refit=True,
    n_jobs=-1
)

# NOTE(review): the search is fitted on the full X / y. If the prep2 hold-out
# rows are a subset of X, widening the grid would let hyperparameter selection
# see the hold-out data — confirm how prep2/*.csv was produced.
grid_search.fit(X, y_encoded)

print("Melhores parâmetros encontrados:", grid_search.best_params_)

best_model = grid_search.best_estimator_

print("F1 Score médio com validação cruzada:", grid_search.best_score_)

# --- Hold-out evaluation --------------------------------------------------
# y_train / y_test are DataFrames (loaded with pd.read_csv). Passing them
# directly to LabelEncoder.transform triggers sklearn's column-vector
# DataConversionWarning, so flatten them to 1-D label arrays first.
y_train_labels = np.ravel(y_train)
y_test_labels = np.ravel(y_test)

# Refit the selected estimator on the training split only, so the hold-out
# metrics below are computed on rows this fit has not seen.
y_train_encoded = label_encoder.transform(y_train_labels)
best_model.fit(X_train, y_train_encoded)

y_test_encoded = label_encoder.transform(y_test_labels)
y_pred_encoded = best_model.predict(X_test)

# Decode the predictions back to the original string labels
y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Confusion matrix
print("Matriz de Confusão:")
ConfusionMatrixDisplay.from_predictions(y_test_labels, y_pred, cmap='Blues')
plt.show()

# Classification report
print("\nRelatório de Classificação:")
print(classification_report(y_test_labels, y_pred))


Rótulos após codificação: ['AD-AD' 'CN-CN' 'CN-MCI' 'MCI-AD' 'MCI-MCI']
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Melhores parâmetros encontrados: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 150, 'subsample': 0.8}
F1 Score médio com validação cruzada: 0.3149666092361745


  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Matriz de Confusão:
[[ 5  1  1  4  1]
 [ 1 15  0  1  2]
 [ 0  1  0  0  1]
 [ 6  2  0  4  2]
 [ 1  7  0  4  2]]

Relatório de Classificação:
              precision    recall  f1-score   support

       AD-AD       0.38      0.42      0.40        12
       CN-CN       0.58      0.79      0.67        19
      CN-MCI       0.00      0.00      0.00         2
      MCI-AD       0.31      0.29      0.30        14
     MCI-MCI       0.25      0.14      0.18        14

    accuracy                           0.43        61
   macro avg       0.30      0.33      0.31        61
weighted avg       0.38      0.43      0.40        61



  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


### Testar Controlo

In [None]:
# Evaluate the model on the occipital CONTROL frame.
# NOTE(review): assumes prep.transform keeps the "Transition" column on
# control_db and that it holds the same string labels as train_db — confirm.
CX = control_db.drop(columns=["Transition"])
Cy = control_db["Transition"]

# best_model was fitted on integer-encoded labels, so predict() returns class
# ids. Decode them to the original labels before comparing against Cy;
# otherwise the metrics would compare integers with the raw label values.
control_pred_encoded = best_model.predict(CX)
control_pred = label_encoder.inverse_transform(control_pred_encoded)

control_f1_score = f1_score(Cy, control_pred, average='macro')
print("F1-Score no conjunto de controle:", control_f1_score)

# Confusion matrix
ConfusionMatrixDisplay.from_predictions(Cy, control_pred, cmap='Blues')
plt.show()

# Classification report
print("\nRelatório de Classificação no conjunto de controle:")
print(classification_report(Cy, control_pred))


### Guardar as previsões do melhor modelo XGB

In [9]:
# Refit the selected estimator on every labelled row before predicting the
# test frame.
y_encoded_full = label_encoder.fit_transform(y)
best_model.fit(X, y_encoded_full)

# Predict class ids for the test frame and map them back to the label strings.
test_predictions_encoded = best_model.predict(test_db)
test_predictions = label_encoder.inverse_transform(test_predictions_encoded)

# Pair each prediction with a 1-based row id and write the result to CSV.
row_ids = range(1, len(test_predictions) + 1)
output_df = pd.DataFrame(
    list(zip(row_ids, test_predictions)),
    columns=["RowId", "Result"],
)

output_df.to_csv("resultados/xgb2.csv", index=False)