In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import imblearn
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load dataset
radi = pd.read_csv("sbsppdaa24/train_radiomics_hipocamp.csv")

# Drop unique identifier columns
radi.drop(columns=["Mask", "ID", "Image"], inplace=True)

# Drop non-numeric columns except for 'Transition'
columns_to_drop = [col for col in radi.columns if radi[col].dtype == 'object' and col != 'Transition']
radi.drop(columns=columns_to_drop, inplace=True)

# Apply MinMax scaling to columns
float_cols = radi.select_dtypes(include=['float','int']).columns
scaler = MinMaxScaler()
radi[float_cols] = scaler.fit_transform(radi[float_cols])

# Intantiate Report
classification_reports = {}

# Check final dataset
radi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Columns: 2162 entries, diagnostics_Image-original_Dimensionality to Transition
dtypes: float64(2161), object(1)
memory usage: 5.0+ MB


In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV,StratifiedKFold
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, f1_score

# Estado vai ser comum para todos os modelos, 
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2025)


## Bagging

In [4]:
# Split data into features and target
df = radi.copy()
X = df.drop(columns=["Transition"])
y = df["Transition"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2025)

bagging_params = {"n_estimators": [100, 500, 800, 1000]}
bagging_model = BaggingClassifier(random_state=2025)
bagging_grid = GridSearchCV(bagging_model, bagging_params, scoring='f1_micro', cv=skf, n_jobs=-1)
bagging_grid.fit(X_train, y_train)
y_pred_bagging = bagging_grid.best_estimator_.predict(X_test)
classification_reports["Bagging"] = classification_report(y_test, y_pred_bagging, output_dict=True)
print(f"Best Bagging Model Parameters: {bagging_grid.best_params_}")
print(f"Bagging Classification Report:\n", classification_report(y_test, y_pred_bagging))

Best Bagging Model Parameters: {'n_estimators': 1000}
Bagging Classification Report:
               precision    recall  f1-score   support

       AD-AD       0.56      0.36      0.43        14
       CN-CN       0.51      0.69      0.59        26
      CN-MCI       0.00      0.00      0.00         1
      MCI-AD       0.32      0.43      0.36        14
     MCI-MCI       0.21      0.14      0.17        22

    accuracy                           0.42        77
   macro avg       0.32      0.32      0.31        77
weighted avg       0.39      0.42      0.39        77



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Random Forest

In [5]:
# Split data into features and target
df = radi.copy()
X = df.drop(columns=["Transition"])
y = df["Transition"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2025)

# Params Definition
rf_params = {"n_estimators": [100, 500, 800, 1000], "max_depth": [5, 10, 20, None]}
rf_model = RandomForestClassifier()

# model, params, scoring using f1, 5 folds, full processor
rf_grid = GridSearchCV(rf_model, rf_params, scoring='f1_micro', cv=skf, n_jobs=-1)
rf_grid.fit(X_train, y_train)

y_pred_rf = rf_grid.best_estimator_.predict(X_test)
classification_reports["RandomForest"] = classification_report(y_test, y_pred_rf, output_dict=True)
print(f"Best RandomForest Model Parameters: {rf_grid.best_params_}")
print(f"RandomForest Classification Report:\n", classification_report(y_test, y_pred_rf))

Best RandomForest Model Parameters: {'max_depth': 5, 'n_estimators': 1000}
RandomForest Classification Report:
               precision    recall  f1-score   support

       AD-AD       0.50      0.36      0.42        14
       CN-CN       0.54      0.81      0.65        26
      CN-MCI       0.00      0.00      0.00         1
      MCI-AD       0.28      0.36      0.31        14
     MCI-MCI       0.40      0.18      0.25        22

    accuracy                           0.45        77
   macro avg       0.34      0.34      0.33        77
weighted avg       0.44      0.45      0.42        77



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Gradient Boosting

In [None]:
# Split data into features and target
df = radi.copy()
X = df.drop(columns=["Transition"]) 
y = df["Transition"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2025)

# Params Definition
gb_params = {"n_estimators": [500, 600, 800], "learning_rate": [0.1, 0.3, 0.05]}
gb_model = GradientBoostingClassifier()

# model, params, scoring using f1, 5 folds, full processor
gb_grid = GridSearchCV(gb_model, gb_params, scoring='f1_micro',cv=skf, n_jobs=-1)
gb_grid.fit(X_train, y_train)

y_pred_gb = gb_grid.best_estimator_.predict(X_test)
classification_reports["GradientBoosting"] = classification_report(y_test, y_pred_gb, output_dict=True)
print(f"Best GradientBoosting Model Parameters: {gb_grid.best_params_}")
print(f"GradientBoosting Classification Report:\n", classification_report(y_test, y_pred_gb))


## XGBoosting

In [None]:
from sklearn.preprocessing import LabelEncoder

# Copy the dataframe and apply label encoding to the target variable
df = radi.copy()
label_encoder = LabelEncoder()
df['Transition'] = label_encoder.fit_transform(df['Transition'])

# Define features and target
X = df.drop(columns=["Transition"])
y = df["Transition"]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define XGBoost hyperparameters and model with a multi-class objective and compatible eval_metric
xgb_params = {
    "n_estimators": [500,800,600],
    "learning_rate": [0.1,0.3],
    "max_depth": [5,6,8],
    
}

xgb_model = XGBClassifier(eval_metric='mlogloss')

# Run GridSearchCV to find the best parameters
xgb_grid = GridSearchCV(xgb_model, xgb_params, scoring='f1_micro', cv=skf, n_jobs=-1)
xgb_grid.fit(X_train, y_train)

# Predict and evaluate
y_pred_xgb = xgb_grid.best_estimator_.predict(X_test)
classification_reports["XGBoost"] = classification_report(y_test, y_pred_xgb, output_dict=True)
print(f"Best XGBoost Model Parameters: {xgb_grid.best_params_}")
print(f"XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))


## Ver Resultados

In [None]:
f1_scores = {model: report["weighted avg"]["f1-score"] for model, report in classification_reports.items()}
accuracies = {model: report["accuracy"] for model, report in classification_reports.items()}

# Plotting
plt.figure(figsize=(12, 5))

# F1 Score Graph
plt.subplot(1, 2, 1)
plt.bar(f1_scores.keys(), f1_scores.values(), color='skyblue')
plt.xlabel("Model")
plt.ylabel("F1 Score")
plt.title("Model Comparison - F1 Scores")
plt.ylim(0, 1)

# Accuracy Graph
plt.subplot(1, 2, 2)
plt.bar(accuracies.keys(), accuracies.values(), color='salmon')
plt.xlabel("Model")
plt.ylabel("Accuracy")
plt.title("Model Comparison - Accuracy")
plt.ylim(0, 1)

plt.tight_layout()
plt.show()

## Generating csv

In [11]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Load the test dataset
test_data = pd.read_csv("sbsppdaa24/test_radiomics_hipocamp.csv")

# Apply the same preprocessing as in the training phase
# Drop unique identifier columns
test_data.drop(columns=["Mask", "ID", "Image"], inplace=True)

# Ensure 'columns_to_drop' is available for test data
# If you haven't redefined this variable, you need to redo this step for the test set.
# Use the same method to identify non-numeric columns for dropping
non_numeric_columns = [col for col in test_data.columns if test_data[col].dtype == 'object']
test_data.drop(columns=non_numeric_columns, inplace=True)

# Apply the same MinMaxScaler that was fit on the training data
test_data[float_cols] = scaler.transform(test_data[float_cols])  # Correctly reference columns in test_data

# Generate predictions using the RandomForest model
bg_predictions_test = bagging_grid.predict(test_data)

# Generate predictions using the RandomForest model
rf_predictions_test = rf_grid.predict(test_data)


res0 = pd.DataFrame({
    'RowId': range(1, len(bg_predictions_test) + 1),
    'Result': bg_predictions_test 
})


# Store the results in a DataFrame and save to CSV
res1 = pd.DataFrame({
    'RowId': range(1, len(rf_predictions_test) + 1),
    'Result': rf_predictions_test
})

# Save predictions to a CSV file
res0.to_csv('BagginGrid1.0.csv', index=False)
res1.to_csv('RandomForestGrid1.0.csv', index=False)
