<a href="https://colab.research.google.com/github/SofJRL/AIMaster_ActivityModule2/blob/main/ML_identifySCDsubjects.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report
from imblearn.over_sampling import SMOTE
import plotly.express as px

# Step 2: Upload dataset
from google.colab import files

# files.upload() opens the Colab file picker and returns a dict keyed by the
# names of the uploaded files.
uploaded = files.upload()  # Upload the dataset file

# Prefer the expected file name. If the upload arrived under a different name
# (for example a renamed copy or a re-upload saved with a suffix), read that
# file instead of failing on a hard-coded path. With no upload at all we still
# try the expected name, which matches the previous behavior.
default_csv_name = "DATASET_Activity2.csv"
if default_csv_name in uploaded or not uploaded:
    csv_name = default_csv_name
else:
    csv_name = next(iter(uploaded))

df = pd.read_csv(csv_name, sep=";")  # The file uses semicolons as separator
print("Data loaded:")
print(df.head())

# Step 3: Initial exploration
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("\nDataset information:")
df.info()
print("\nDescriptive statistics:")
print(df.describe())

# Step 4: Variable visualization
# Numerical variables: each column present in df gets two separate figures,
# a histogram with a KDE overlay and a boxplot.
num_vars = ['Age', 'MMSE', 'GDS']
for num_col in num_vars:
    # Guard clause: report the missing column and move on to the next one.
    if num_col not in df.columns:
        print(f"Column '{num_col}' not found in DataFrame. Skipping...")
        continue

    values = df[num_col]

    # Figure 1: histogram (15 bins; adjust as needed) with a KDE curve.
    plt.figure(figsize=(8, 5))
    sns.histplot(values, kde=True, bins=15, color="skyblue")
    plt.title(f"Distribution of {num_col} (Histogram)")
    plt.show()

    # Figure 2: boxplot of the same column, useful for spotting outliers.
    plt.figure(figsize=(8, 5))
    sns.boxplot(y=values, color="lightcoral")
    plt.title(f"Distribution of {num_col} (Boxplot)")
    plt.show()


# Categorical variables: one count plot per column that exists in df.
cat_vars = ['Gender', 'Education', 'HTA', 'DLP', 'Smoker']
for cat_col in cat_vars:
    # Guard clause: report the missing column and continue with the next one.
    if cat_col not in df.columns:
        print(f"Column '{cat_col}' not found in DataFrame. Skipping...")
        continue

    plt.figure(figsize=(8, 5))
    sns.countplot(data=df, x=cat_col, palette='viridis')
    plt.title(f"Distribution of {cat_col}")
    # Tilt the tick labels so longer category names stay readable.
    plt.xticks(rotation=45, ha='right')
    plt.show()

# Class counts of the target column (Conversion)
plt.figure(figsize=(6, 4))
sns.countplot(data=df, x='Conversion', palette='pastel')
plt.title("Distribution of the target variable")
plt.show()

# Pairwise correlations between the numerical columns and the target,
# rendered as an annotated heatmap (values shown with two decimals).
corr_columns = ['Age', 'MMSE', 'GDS', 'Conversion']
corr_matrix = df[corr_columns].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap (Numerical Variables)")
plt.show()

# Step 5: Inspect the class balance and build the feature matrix / target
print("\nDistribution before balancing:")
print(df['Conversion'].value_counts())

X = df.drop(columns=['Conversion', 'ID'])  # Remove the target variable and ID
y = df['Conversion']

# Step 6: Split FIRST, then balance only the training partition.
# Oversampling before the split leaks information: SMOTE interpolates between
# neighbouring minority samples, so synthetic training rows would be derived
# from rows that end up in the test set, and the test set itself would contain
# synthetic rows. Both effects inflate every metric computed on X_test.
# stratify=y keeps the original class ratio in both partitions, which matters
# because the classes are imbalanced before resampling.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Apply SMOTE to the training data only; the test set keeps real samples and
# the real class distribution.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
X_train, y_train = X_resampled, y_resampled

# NOTE(review): cross-validation run later on this already-resampled training
# set can still share synthetic neighbours across folds; resampling inside an
# imblearn Pipeline would remove that as well.
print("\nDistribution after balancing:")
print(pd.Series(y_resampled).value_counts())

# Step 7: Train a Random Forest model
# Baseline model: default hyperparameters with a fixed seed. fit() returns the
# estimator itself, so construction and training are chained.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict on the held-out split and print the per-class metrics.
y_pred = rf.predict(X_test)

print("\nClassification report:")
baseline_report = classification_report(y_test, y_pred)
print(baseline_report)

# Step 8: Hyperparameter optimization with GridSearchCV
# Candidate values per hyperparameter; the grid is their full cross product
# (3 * 4 * 2 * 3 = 72 combinations), each scored with 5-fold cross-validation.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    max_features=['sqrt', 'log2'],
    min_samples_split=[2, 5, 10],
)

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,   # use every available core
    verbose=2,
)
grid_search.fit(X_train, y_train)

print("\nBest hyperparameters:", grid_search.best_params_)

# Keep the estimator that the search exposes for the best parameter set.
best_rf = grid_search.best_estimator_

# Step 9: Identify the most important variables
# Pair each feature name with the importance reported by the tuned forest,
# then order the table from most to least important.
importances = best_rf.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the sorted importances.
fig = px.bar(
    importance_df,
    x='Importance',
    y='Feature',
    orientation='h',
    title='Feature Importance',
)
fig.show()

# Step 10: Model evaluation
# --- Confusion matrix of the tuned model on the test split ---
tuned_pred = best_rf.predict(X_test)
conf_matrix = confusion_matrix(y_test, tuned_pred)
class_labels = [0, 1]

plt.figure(figsize=(6, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Confusion Matrix")
plt.ylabel("True Class")
plt.xlabel("Predicted Class")
plt.show()

# --- ROC curve ---
# Column 1 of predict_proba is used as the score passed to roc_curve.
y_prob = best_rf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}', color='darkorange')
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

# Step 11: Interactive summary
# One interactive Plotly histogram per numerical column that exists in df;
# absent columns are skipped silently.
for hist_col in ['Age', 'MMSE', 'GDS']:
    if hist_col not in df.columns:
        continue
    fig_hist = px.histogram(df, x=hist_col, title=f"Interactive Histogram of {hist_col}")
    fig_hist.show()

# Interactive bar charts for categorical variables
# px.bar does not aggregate: called with only `x`, it draws one bar per row
# rather than one bar per category. Count the rows per (category, Conversion)
# pair explicitly and plot those counts instead.
for col in ['Gender', 'Education', 'HTA', 'DLP', 'Smoker']:
    if col in df.columns:
        category_counts = (
            df.groupby([col, 'Conversion'])
            .size()
            .reset_index(name='Count')
            # Cast to str so Plotly treats both the axis and the colour as
            # discrete categories even when the columns hold numeric codes.
            .astype({col: str, 'Conversion': str})
        )
        fig_cat = px.bar(
            category_counts,
            x=col,
            y='Count',
            color='Conversion',
            title=f"Interactive Bar Chart of {col}",
        )
        fig_cat.show()

# Interactive feature-importance chart, reusing the table built in Step 9
fig_importance = px.bar(
    importance_df,
    x='Importance',
    y='Feature',
    orientation='h',
    title='Interactive Feature Importance',
)
fig_importance.show()

# Interactive ROC curve, reusing fpr / tpr / thresholds / roc_auc from Step 10
roc_title = f'Interactive ROC Curve (AUC = {roc_auc:.2f})'
roc_axis_labels = {'FPR': 'False Positive Rate', 'TPR': 'True Positive Rate'}
roc_data = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Threshold': thresholds})
fig_roc = px.area(roc_data, x='FPR', y='TPR', title=roc_title, labels=roc_axis_labels)
fig_roc.show()