# Notebook do Grupo23 para o dataset Controlo!

## Primeira análise

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import imblearn

In [None]:
# Load the training radiomics table into the `control` DataFrame.
control = pd.read_csv("sbsppdaa24/train_radiomics_hipocamp.csv")

In [None]:
# Print a summary of the frame (index, columns, dtypes, memory usage).
control.info()

In [None]:
# Display the column index of the training frame.
control.columns

## Drop das primeiras Features

In [None]:
# Number of distinct values in the 'Mask' column.
control['Mask'].unique().size

In [None]:
# Number of distinct values in the 'ID' column.
control['ID'].unique().size

In [None]:
# Number of distinct values in the 'Image' column.
control['Image'].unique().size

In [None]:
# Remove the 'Mask', 'ID' and 'Image' columns from `control` in place.
control.drop(columns=["Mask","ID","Image"], inplace=True)

### Os três atributos são únicos para todas as entradas e não são numéricos.

In [None]:
# Collect the columns that hold exactly one distinct non-null value and
# remove them from `control` in place.
distinct_counts = control.nunique()
columns_to_drop = distinct_counts[distinct_counts == 1].index.tolist()

control.drop(columns=columns_to_drop, inplace=True)
print(f"Dropped {len(columns_to_drop)} constant columns.")

In [None]:

# Drop object-dtype columns whose number of distinct values equals the number
# of rows, i.e. text columns with a different value on every row.
n_rows = len(control)
columns_to_drop = []
for col in control.columns:
    series = control[col]
    if series.dtype == 'object' and series.nunique() == n_rows:
        columns_to_drop.append(col)

control.drop(columns=columns_to_drop, inplace=True)
print(f"Dropped {len(columns_to_drop)} unique non-numerical columns.")


In [None]:
# Remove every column whose name contains "Hash", then report what was removed.
hash_columns = [name for name in control.columns if 'Hash' in name]
control.drop(columns=hash_columns, inplace=True)

print(f"Number of columns containing 'Hash': {len(hash_columns)}")
print("Columns containing 'Hash':", hash_columns)

# Preview the first rows of what is left.
control.head()

## Missing values

In [None]:
# Heat map of the boolean null mask of `control` (row tick labels and colour bar hidden).
sns.heatmap(control.isnull(),yticklabels=False,cbar=False,cmap="viridis")

In [None]:
# Number of null entries in each column.
control.isnull().sum()

In [None]:
# Count the nulls per column and keep only the columns that have at least one.
null_counts = control.isnull().sum()
missing_values = null_counts[null_counts > 0]

print(missing_values)

## Handling Categorical Data

In [None]:
# Names of the columns stored as 'object' or 'category' dtype.
categorical_columns = control.select_dtypes(include=['object', 'category']).columns

# Show the distinct values found in each of those columns.
for col, column_values in control[categorical_columns].items():
    unique_values = column_values.unique()
    print(f"Column '{col}' has the following categories: {unique_values}")


In [None]:
# Bar chart of the number of rows per 'Transition' value.
transition_count = control['Transition'].value_counts()
sns.barplot(x=transition_count.index,y=transition_count.values)
plt.show()

In [None]:
# Target muito unbalanced

## Handling Dates

In [None]:
# Identify columns with datetime64 data type.
# NOTE(review): the CSV was read without date parsing options, so date-like
# text would presumably show up as 'object' rather than here — confirm.
date_columns = control.select_dtypes(include=['datetime64']).columns

# Print the columns with date data type
print(f"Columns with datetime data type: {date_columns}")

## Handling highly correlated features

In [None]:
# One-hot encode the target into 'category_<value>' indicator columns.
encoded_target = pd.get_dummies(control['Transition'], prefix='category')

# Features first, indicator columns appended at the end; the raw 'Transition'
# label column is left out of the combined frame.
control_encoded = pd.concat(
    [control.drop(columns=['Transition']), encoded_target],
    axis=1,
)

In [None]:
# Pearson correlation heat map over all columns of `control_encoded`
# (features plus the one-hot encoded target).
# NOTE(review): figsize is given in inches; 500x500 is an extremely large
# canvas and may be very slow or memory-hungry to render — confirm it is intended.
fig = plt.figure(figsize=(500, 500))
control_corr = control_encoded.corr(method='pearson')
sns.heatmap(control_corr, linecolor='black', linewidths=0.5)
plt.show()

In [None]:
# Demasiadas features para retirar alguma conclusão

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Learn which columns of `control_encoded` have variance above 0.01, then keep
# only those columns. The threshold is a tunable choice.
selector = VarianceThreshold(threshold=0.01).fit(control_encoded)
control_cleaned = selector.transform(control_encoded)

print(f"Remaining columns after variance thresholding: {control_cleaned.shape[1]}")


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Split the frame into the target and the candidate features.
y = control['Transition']
X = control.drop(columns=['Transition'])

# Keep the k features with the highest ANOVA F-score against the target.
k = 8
selector = SelectKBest(score_func=f_classif, k=k)
X_new = selector.fit_transform(X, y)

# Translate the selector's boolean support mask back into column names.
selected_features_mask = selector.get_support()
selected_features = X.columns[selected_features_mask]

# Selected features followed by the target column.
controlKbest = control[[*selected_features, 'Transition']]

print(f"Reduced dataset has {controlKbest.shape[1]} columns after feature selection.")
print("Selected features DataFrame:")
controlKbest.head()


In [None]:
from sklearn.ensemble import RandomForestClassifier

X = control.drop(columns=['Transition'])  # Features
y = control['Transition']  # Target variable

# Fit a random forest only to rank the features by importance. The forest is
# seeded (same seed as the models trained later in this notebook) so that the
# selected feature set is identical on every run.
model = RandomForestClassifier(n_estimators=500, random_state=2025)
model.fit(X, y)

# Column positions within X, most important feature first.
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

# Keep the top_n most important features.
top_n = 80
top_indices = indices[:top_n]

# Feature-only frame with the selected columns, in importance order.
X_new = X.iloc[:, top_indices]

# Same columns plus the target. `top_indices` are positions within X, which
# has no 'Transition' column, so they only coincide with positions in
# `control` when 'Transition' is the last column. Selecting by name is
# correct regardless of where the target sits.
controlKbestRandomForest = control[X_new.columns.tolist() + ['Transition']]

print(f"Reduced dataset has {controlKbestRandomForest.shape[1]} columns after Random Forest feature selection.")
print("Selected features DataFrame:")
controlKbestRandomForest.head()



In [None]:
# Pearson correlation heat map of the SelectKBest features together with the
# one-hot encoded target.
encoded_target = pd.get_dummies(controlKbest['Transition'], prefix='category')
controlKbest_encoded = pd.concat(
    [controlKbest.drop(columns=['Transition']), encoded_target],
    axis=1,
)

fig = plt.figure(figsize=(5, 5))
controlKbest_corr = controlKbest_encoded.corr(method="pearson")
sns.heatmap(controlKbest_corr, linecolor='black', linewidths=0.5)
plt.show()

In [None]:
# Pearson correlation heat map of the random-forest-selected features together
# with the one-hot encoded target.
encoded_target = pd.get_dummies(controlKbestRandomForest['Transition'], prefix='category')
controlKbest_encoded = pd.concat(
    [controlKbestRandomForest.drop(columns=['Transition']), encoded_target],
    axis=1,
)

fig = plt.figure(figsize=(50, 50))
controlKbest_corr = controlKbest_encoded.corr(method="pearson")
# The heat map call stays the last expression of the cell (no plt.show()).
sns.heatmap(controlKbest_corr, linecolor='black', linewidths=0.5)

In [None]:
# Remove redundant features: whenever two of the selected features have an
# absolute Pearson correlation above the threshold, drop the later one.
# `X_new` is the feature-only frame produced by the random-forest selection.

# Only the features take part in the redundancy check. The one-hot target
# columns are not part of X_new, so if one of them were flagged the drop
# below would fail with a KeyError.
correlation_matrix = X_new.corr().abs()

# Keep the strict upper triangle so every pair is looked at once and a column
# is only compared with the columns that precede it.
upper = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))

# Flag each column that is too correlated with at least one earlier column.
threshold = 0.9
to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]

# Drop the highly correlated features
X_new_reduced = X_new.drop(columns=to_drop)

# Re-attach the target column to the reduced feature set.
controlKbestRandomForestReduced = pd.concat([X_new_reduced, control['Transition']], axis=1)

print(f"Reduced dataset has {controlKbestRandomForestReduced.shape[1]} columns after removing highly correlated features.")
print("Reduced DataFrame:")

# Correlation heat map of the reduced features plus the one-hot encoded target.
encoded_target = pd.get_dummies(controlKbestRandomForestReduced['Transition'], prefix='category')
controlKbest_encoded = pd.concat([controlKbestRandomForestReduced, encoded_target], axis=1)
controlKbest_encoded.drop(columns=['Transition'], inplace=True)
fig = plt.figure(figsize=(25, 25))
controlKbest_corr = controlKbest_encoded.corr(method="pearson")
sns.heatmap(controlKbest_corr, linecolor='black', linewidths=0.5)
plt.show()

### Agora Vamos modelar! ( Sem esquecer preparação de dados claro )

In [None]:
# Print a summary (columns, dtypes, memory usage) of the reduced frame.
controlKbestRandomForestReduced.info()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Normalise: min-max scale the float columns of the reduced frame.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# done in the modelling cells, so the hold-out rows influence the scaling.
# Consider fitting on the training split only.

# Select only the float columns
float_cols = controlKbestRandomForestReduced.select_dtypes(include=['float'])

# Apply Min-Max scaling. The original row index is kept because the
# assignment below aligns on index labels; a fresh 0..n-1 index would only
# line up when the frame happens to have a default RangeIndex.
scaler = MinMaxScaler()
scaled_float_cols = pd.DataFrame(
    scaler.fit_transform(float_cols),
    columns=float_cols.columns,
    index=float_cols.index,
)

# Replace the original float columns with the scaled columns
controlKbestRandomForestReduced[float_cols.columns] = scaled_float_cols
controlKbestRandomForestReduced.head()

In [None]:
# Work on an independent copy so the modelling cells do not modify the reduced frame.
df = controlKbestRandomForestReduced.copy()

In [None]:
# Gradient Boosting model (no resampling)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils.multiclass import unique_labels

X = df.drop(columns=['Transition'])  # Feature columns
y = df['Transition']  # Target column

# Train-test split: 25% of the rows are held out, with a fixed seed.
# NOTE(review): the split is not stratified although the target was found to
# be unbalanced earlier in the notebook — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2025)

# Initialize Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(
    n_estimators=1000,      # Number of boosting stages (trees)
    learning_rate=0.01,     # Step size shrinkage (lower values make training more robust but slower)
    max_depth=20,           # Maximum depth of the individual regression estimators
    random_state=2025        # Seed for reproducibility
)

# Fit the model on the training data
gb_model.fit(X_train, y_train)

# Predict on the held-out data
y_pred = gb_model.predict(X_test)

# Evaluate the model using accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Compute the confusion matrix with an explicit label order (the same order
# confusion_matrix derives by default) so the plot can show class names
# instead of the positions 0..k-1.
class_labels = unique_labels(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Display the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap=plt.cm.Blues)  # Use a blue color map for visualization
plt.title("Confusion Matrix")
plt.show()



In [None]:
# Gradient Boosting model trained on SMOTE-resampled data
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.utils.multiclass import unique_labels

# Prepare the data
X = df.drop(columns=['Transition'])  # Feature columns
y = df['Transition']  # Target column

# Train-test split: 25% of the rows are held out, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2025)

# Apply SMOTE to the training split only; the held-out rows are left untouched.
smote = SMOTE(random_state=2025)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check the class distribution before and after SMOTE
print(f"Original class distribution: {Counter(y_train)}")
print(f"Resampled class distribution: {Counter(y_train_resampled)}")

# Initialize Gradient Boosting Classifier.
# Note: this rebinds `gb_model`; from here on the name refers to the
# SMOTE-trained model.
gb_model = GradientBoostingClassifier(
    n_estimators=1000,      # Number of boosting stages (trees)
    learning_rate=0.01,     # Step size shrinkage (lower values make training more robust but slower)
    max_depth=20,           # Maximum depth of the individual regression estimators
    random_state=2025        # Seed for reproducibility
)

# Fit the model on the resampled training data
gb_model.fit(X_train_resampled, y_train_resampled)

# Predict on the original held-out data (not resampled)
y_pred = gb_model.predict(X_test)

# Evaluate the model using accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Compute the confusion matrix with an explicit label order (the same order
# confusion_matrix derives by default) so the plot can show class names
# instead of the positions 0..k-1.
class_labels = unique_labels(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Display the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap=plt.cm.Blues)  # Use a blue color map for visualization
plt.title("Confusion Matrix")
plt.show()


In [None]:
from sklearn.base import clone

# Load the test dataset
test_data = pd.read_csv("sbsppdaa24/test_radiomics_hipocamp.csv")

# Apply the same preprocessing steps as for the training data.
# 1. Drop the columns that were removed by name from the training data.
test_data.drop(columns=["Mask", "ID", "Image"], inplace=True)

# 2. Drop Hash columns
hash_columns = [col for col in test_data.columns if 'Hash' in col]
test_data.drop(columns=hash_columns, inplace=True)

# Constant columns are deliberately NOT re-detected on the test set: a feature
# the models were trained on may be constant here, and dropping it would make
# the column selection below fail. Selecting by the training column names
# already discards everything that is not needed.

# 3. Keep exactly the features used for training, in the training column order.
test_features = test_data[X.columns].copy()

# 4. Apply the scaling learned on the training data. Fitting a new scaler on
#    the test set would map the test values with different minima/maxima than
#    the ones the models saw during training.
scaled_columns = list(scaler.feature_names_in_)
test_features[scaled_columns] = scaler.transform(test_features[scaled_columns])

# Make predictions using both models.
# `gb_model` was rebound by the SMOTE cell, so at this point it is the
# SMOTE-trained model. The baseline is recovered by fitting an unfitted copy
# (same hyper-parameters and seed) on the un-resampled training split.
gb_model_smote = gb_model
gb_model_no_smote = clone(gb_model).fit(X_train, y_train)

# Model without SMOTE
predictions_no_smote = gb_model_no_smote.predict(test_features)

# Model with SMOTE
predictions_with_smote = gb_model_smote.predict(test_features)

# Create DataFrames with predictions, with row ids starting from 1
results_no_smote = pd.DataFrame({
    'RowId': range(1, len(predictions_no_smote) + 1),
    'Result': predictions_no_smote
})

results_with_smote = pd.DataFrame({
    'RowId': range(1, len(predictions_with_smote) + 1),
    'Result': predictions_with_smote
})

# Save predictions to CSV files
results_no_smote.to_csv('predictions_no_smote.csv', index=False)
results_with_smote.to_csv('predictions_with_smote.csv', index=False)

# Print first few predictions from both models
print("First few predictions without SMOTE:")
print(results_no_smote.head())
print("\nFirst few predictions with SMOTE:")
print(results_with_smote.head())

# Calculate prediction distribution
print("\nPrediction distribution without SMOTE:")
print(pd.Series(predictions_no_smote).value_counts())
print("\nPrediction distribution with SMOTE:")
print(pd.Series(predictions_with_smote).value_counts())