# Imports

In [None]:
!pip uninstall -y scikit-learn
!pip install scikit-learn==1.5.2

In [None]:
# scikit-learn is imported first so the active version is reported up front.
import sklearn as skl
print(f"New scikit-learn version: {skl.__version__}")

# Data handling, statistics and plotting.
import numpy as np
import pandas as pd
from numpy import median
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# scikit-learn components, grouped by sub-package.
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor

# Seed constant for the notebook.
# NOTE(review): the cells visible further down pass the literal 2023 as
# random_state instead of this constant — confirm which seed is intended.
RANDOM_SEED = 1507

# Datasets

In [None]:
# Read the three radiomics CSV files in one pass: the train and test files
# ("hipocamp") plus the occipital CONTROL file.
data_train, data_test, data_occ = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/sample_data/train_radiomics_hipocamp.csv',
        '/content/sample_data/test_radiomics_hipocamp.csv',
        '/content/sample_data/train_radiomics_occipital_CONTROL.csv',
    )
]

# Tratamento de Dados

**Verificação de Missing Values**

In [None]:
# A notebook cell only displays its LAST expression, so two bare expressions in
# a row would silently discard the first result. Collect the total number of
# missing values of every dataset into a single summary and display that.
missing_values = pd.Series(
    {
        'data_train': data_train.isna().sum().sum(),
        'data_test': data_test.isna().sum().sum(),
        'data_occ': data_occ.isna().sum().sum(),
    },
    name='missing_values',
)
missing_values

**Verificação das Colunas com o Mesmo Valor em Todas as Entradas**

In [None]:
# Decide which columns are uninformative (a single distinct value) using the
# TRAINING set only, then remove exactly those columns from the test set.
# Filtering each dataset on its own values could leave train and test with
# different feature sets.
constant_cols = data_train.columns[data_train.nunique() <= 1]
data_train = data_train.drop(columns=constant_cols)
# errors='ignore': a column absent from the test file is simply skipped.
data_test = data_test.drop(columns=constant_cols, errors='ignore')

# The occipital control set is still filtered on its own values; it is
# re-aligned to the training columns later in the notebook.
data_occ = data_occ.loc[:, (data_occ.nunique() > 1)]

**Verificação de Linhas Duplicadas**

In [None]:
# Print the number of fully duplicated rows: first for train, then for test.
for frame in (data_train, data_test):
    duplicate_count = frame.duplicated().sum()
    print(duplicate_count)

**Remoção de Colunas Irrelevantes**

In [None]:
# Identifier and diagnostics columns that are removed from every dataset.
columns_to_drop = [
    "ID",
    "Image",
    "Mask",
    'diagnostics_Image-original_Hash',
    'diagnostics_Mask-original_Hash',
    'diagnostics_Mask-original_BoundingBox',
    'diagnostics_Mask-original_CenterOfMassIndex',
    'diagnostics_Mask-original_CenterOfMass',
]

# Apply the same drop to the three frames and rebind the names in one step.
data_train, data_test, data_occ = [
    frame.drop(columns=columns_to_drop)
    for frame in (data_train, data_test, data_occ)
]

**Converter a Coluna *\[Age\]* para o tipo INT**

In [None]:
# Cast the 'Age' column of every dataset to integer, in place on each frame.
for frame in (data_train, data_test, data_occ):
    frame['Age'] = frame['Age'].astype(int)

**Converter a Coluna *\[Transition\]* para valores Numéricos**

In [None]:
# Integer code assigned to each 'Transition' label; shared by both labelled
# datasets so the encoding cannot drift between them.
transition_codes = {'CN-CN': 0, 'CN-MCI': 1, 'MCI-MCI': 2, 'MCI-AD': 3, 'AD-AD': 4}

# The test set is not encoded here: only the two labelled frames are touched.
for labelled_frame in (data_train, data_occ):
    labelled_frame['Transition'] = (
        labelled_frame['Transition'].replace(transition_codes).astype(int)
    )

**Verificação de Valores *Outliers***

In [None]:
# --- Training set -----------------------------------------------------------
# Inter-quartile range (IQR) of every column.
Q1 = data_train.quantile(0.25)
Q3 = data_train.quantile(0.75)
IQR = Q3 - Q1

# Lower and upper limits: 1.5 * IQR beyond the quartiles.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Boolean mask of the values that fall outside the limits.
outliers = (data_train < lower_bound) | (data_train > upper_bound)
print(outliers.sum())  # number of outliers per column (training set)

# --- Occipital control set --------------------------------------------------
# Same computation; Q1, Q3 and IQR are overwritten with the control-set values.
Q1 = data_occ.quantile(0.25)
Q3 = data_occ.quantile(0.75)
IQR = Q3 - Q1

occ_lower_bound = Q1 - 1.5 * IQR
occ_upper_bound = Q3 + 1.5 * IQR

occ_outliers = (data_occ < occ_lower_bound) | (data_occ > occ_upper_bound)
# Report the control-set mask here, not the training-set mask a second time.
print(occ_outliers.sum())  # number of outliers per column (control set)


**Tratamento de Valores *Outliers* (Substituição pelos Limites do IQR)**

In [None]:
# Substituir valores fora dos limites
data_train = data_train.clip(lower=lower_bound, upper=upper_bound, axis=1)
data_test = data_test.clip(lower=lower_bound, upper=upper_bound, axis=1)
data_occ = data_occ.clip(lower=occ_lower_bound, upper=occ_upper_bound, axis=1)

outliers = (data_train < lower_bound) | (data_train > upper_bound)
print(outliers.sum())  # Número de outliers por coluna

**Verificação do *Dataset* Tratado**

In [None]:
# Show the first five rows of the processed test set.
data_test.head(n=5)

In [None]:
# Give the occipital frame exactly the columns (and column order) of the
# training frame. Columns it lacks are created and filled with 0; columns the
# training frame does not have are discarded.
train_columns = data_train.columns
data_occ = data_occ.reindex(train_columns, axis=1, fill_value=0)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import GridSearchCV, cross_val_score

# Name of the label column present in both labelled datasets.
target_name = 'Transition'

# Feature matrix / label vector of the training set.
X_treino, y_treino = data_train.drop(columns=target_name), data_train[target_name]

# Feature matrix / label vector of the occipital control set.
X_treino_occ, y_treino_occ = data_occ.drop(columns=target_name), data_occ[target_name]

# The test set is used as-is (same object, no copy).
X_teste = data_test


In [None]:
from sklearn.model_selection import train_test_split

# Stratified 75/25 hold-out split of the training data, seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_treino,
    y_treino,
    test_size=0.25,
    stratify=y_treino,
    random_state=2023,
)

# Modelação

In [None]:
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# XGBoost classifier for the five 'Transition' classes.
xgb = XGBClassifier(objective='multi:softprob', num_class=5, random_state=2023)

# Shuffled, stratified 5-fold splitter used by the grid search.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=2023)

# Each hyper-parameter lists a single candidate, so the search evaluates
# exactly one configuration across the folds.
param_grid = dict(
    learning_rate=[0.01],
    n_estimators=[800],
    max_depth=[5],
    gamma=[0.1],
    min_child_weight=[1],
    colsample_bytree=[1.0],
)

# refit=True: after cross-validation the best configuration is refitted on all
# of X_train, so the search object can predict directly.
grid_searchXGB = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=cv,
    refit=True,
    verbose=3,
    return_train_score=True,
)
grid_searchXGB.fit(X_train, y_train)

# Predictions of the refitted model on the occipital control features.
grid_predictionXGB = grid_searchXGB.predict(X_treino_occ)

XGB_best = grid_searchXGB.best_estimator_
print(XGB_best)

In [None]:
# Per-class precision / recall / F1 of the model on the occipital control set.
occ_report = classification_report(y_treino_occ, grid_predictionXGB)
print("Performance on occipital-control data:", occ_report, sep="\n")

In [None]:
# Columns of the cross-validation results worth showing.
shown_columns = [
    'param_learning_rate',
    'param_n_estimators',
    'mean_train_score',
    'mean_test_score',
]

# Tabulate the grid-search results and print the train vs. validation means.
cv_results = pd.DataFrame(grid_searchXGB.cv_results_)
print(cv_results.loc[:, shown_columns])

In [None]:
# best_score_ is the mean cross-validated score of the best configuration,
# measured with the metric the search was configured with. When `scoring` is
# None the estimator's own score method is used, which is accuracy for a
# classifier, so the label is derived from the search object instead of being
# hard-coded to a metric that may not be the one computed.
scoring_used = grid_searchXGB.scoring if grid_searchXGB.scoring is not None else "accuracy"
print(f"Best mean CV score ({scoring_used}):", grid_searchXGB.best_score_)

In [None]:
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

# Learning curve of the best configuration found by the grid search.
# shuffle=True draws the training subsets at random, so random_state is set on
# both learning_curve and the estimator; without it the curve changes on every
# run. 2023 matches the seed used by the other modelling cells.
train_sizes, train_scores, val_scores = learning_curve(
    XGBClassifier(random_state=2023, **grid_searchXGB.best_params_),
    X_train, y_train, cv=5, scoring='accuracy',
    train_sizes=[0.1, 0.3, 0.5, 0.7, 1.0],
    shuffle=True, random_state=2023,
)

# Average the per-fold scores for every training-set size.
train_scores_mean = train_scores.mean(axis=1)
val_scores_mean = val_scores.mean(axis=1)

plt.plot(train_sizes, train_scores_mean, 'o-', label="Training score")
plt.plot(train_sizes, val_scores_mean, 'o-', label="Validation score")
plt.legend(loc="best")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.title("Learning Curve - XGBoost")
plt.show()


# Converter para *CSV*

In [None]:
# The submission must contain one prediction per row of the TEST set.
# grid_predictionXGB holds the predictions for the occipital control set, so
# it is not used here; the refitted model predicts on the test features.
# reindex puts the test columns in the exact order the model was trained on;
# a training column missing from the test frame becomes NaN, which XGBoost
# handles as a missing value.
X_submission = X_teste.reindex(columns=X_train.columns)
test_predictions = grid_searchXGB.predict(X_submission)

submission = pd.DataFrame({
    "RowId": range(1, len(test_predictions) + 1),
    "Transition": test_predictions,
})

print(submission)

# Convert the numeric class codes back to their text labels.
submission['Transition'] = submission['Transition'].replace({0 : 'CN-CN', 1 : 'CN-MCI', 2 : 'MCI-MCI', 3 : 'MCI-AD', 4 : 'AD-AD'})

print(submission)

# Write the submission to a CSV file.
submission.to_csv('submission_Kaggle.csv', index=False)