# EEG-Based Neurological Disorder Classification
## Rice Datathon 2025 - Neurotech Track

This notebook implements machine learning models to classify neurological disorders using EEG data.

### Models Implemented:
- **XGBoost** (Primary model)
- Support Vector Machine (SVM)
- Random Forest
- Logistic Regression (binary: each disorder vs. healthy control)

## Configuration

In [None]:
# Run-mode switch:
#   True  -> train on all labelled data and write competition predictions
#   False -> hold out a validation split and report metrics
PRODUCTION_MODE = False

# File paths (relative to the notebook's working directory)
TRAIN_DATA_PATH = '../data/Train_and_Validate_EEG.csv'
TEST_DATA_PATH = '../data/Test_Set_EEG.csv'
RESULTS_PATH = '../results/'  # used as a string prefix, so keep the trailing slash

# Model parameters
RANDOM_STATE = 42             # seed shared by the train/validation split and the models
TEST_SIZE = 0.2               # fraction of labelled rows held out in validation mode
CORRELATION_THRESHOLD = 0.95  # features correlated above this (absolute value) are dropped

# Prediction CSVs are only written in production mode. Create the output
# directory up front so DataFrame.to_csv does not fail on a fresh checkout.
if PRODUCTION_MODE:
    import os
    os.makedirs(RESULTS_PATH, exist_ok=True)

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay

# Models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

# Set style for better visualizations.
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*".
# Try the new name first and fall back to the legacy one, so that older
# installations do not raise when the style is not found.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")

## 2. Data Loading

In [None]:
# Read the labelled (train/validate) CSV and the unlabelled test CSV
print("Loading data...")
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_DATA_PATH, TEST_DATA_PATH)
)

# Report the dimensions of both frames and the distinct target labels
train_shape, test_shape = train_df.shape, test_df.shape
print(f"Training data shape: {train_shape}")
print(f"Test data shape: {test_shape}")
target_classes = train_df['main.disorder'].unique()
print(f"\nTarget classes: {target_classes}")

## 3. Data Preprocessing

### 3.1 Initial Data Cleaning

In [None]:
# --- Test set ---
# Keep the first 120 columns, remember the IDs for the submission file,
# then discard the non-feature columns.
test_df = test_df.iloc[:, :120]
test_ids = test_df.loc[:, ['ID']].copy()
test_df = test_df.drop(columns=['eeg.date', 'ID'], errors='ignore')

# --- Training set ---
# Keep the first 123 columns and drop whichever bookkeeping columns are present.
columns_to_drop = ['specific.disorder', 'ID', 'eeg.date']
train_df = train_df.iloc[:, :123].drop(columns=columns_to_drop, errors='ignore')

print(f"Shape after initial cleaning - Train: {train_df.shape}, Test: {test_df.shape}")

### 3.2 Handle Missing Values

In [None]:
# Count missing entries per training column, keeping only columns that have any
missing_counts = train_df.isnull().sum()
missing_counts = missing_counts[missing_counts > 0].sort_values(ascending=False)

# Bar chart of the 20 columns with the most missing values (skipped if none)
if not missing_counts.empty:
    plt.figure(figsize=(12, 6))
    missing_counts.head(20).plot(kind='bar', color='coral', edgecolor='black')
    plt.title('Top 20 Columns with Missing Values', fontsize=14, fontweight='bold')
    plt.xlabel('Column Names')
    plt.ylabel('Number of Missing Values')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# Handle missing values in test data.
# Fill gaps with means computed on the TRAINING data so that every
# preprocessing statistic applied to the test set is learned from training
# rows only (and a test column that is entirely missing still gets a value).
# The test column's own mean is used only if the training frame lacks the column.
for impute_col in ('IQ', 'education'):
    if impute_col in train_df.columns:
        reference_values = train_df[impute_col]
    else:
        reference_values = test_df[impute_col]
    test_df[impute_col] = test_df[impute_col].fillna(reference_values.mean())

# Remove rows with missing values from training data.
# 'Unnamed: 122' is dropped first (if present) so that it cannot cause rows
# to be discarded by dropna().
train_df_clean = train_df.drop(columns=['Unnamed: 122'], errors='ignore').dropna()
print(f"\nRows removed due to missing values: {len(train_df) - len(train_df_clean)}")
print(f"Final training data shape: {train_df_clean.shape}")

### 3.3 Target Distribution Analysis

In [None]:
# Number of cleaned training rows per disorder label, most frequent first
target_distribution = train_df_clean['main.disorder'].value_counts()
n_classes = len(target_distribution)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: absolute counts as a bar chart
target_distribution.plot.bar(ax=ax1, color='skyblue', edgecolor='black')
ax1.set_title('Distribution of Neurological Disorders', fontsize=12, fontweight='bold')
ax1.set(xlabel='Disorder Type', ylabel='Count')
ax1.tick_params(axis='x', rotation=45)

# Right panel: relative shares as a pie chart, one Set3 colour per class
colors = plt.cm.Set3(np.linspace(0, 1, n_classes))
ax2.pie(
    target_distribution,
    labels=target_distribution.index,
    autopct='%1.1f%%',
    colors=colors,
    startangle=90,
)
ax2.set_title('Disorder Distribution (%)', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

print("\nClass distribution:")
print(target_distribution)

### 3.4 Feature Selection - Remove Highly Correlated Features

In [None]:
# Absolute pairwise correlations between the numeric training columns
numerical_features = train_df_clean.select_dtypes(include=[np.number])
correlation_matrix = numerical_features.corr().abs()

# Keep only the entries strictly above the diagonal so each pair is seen once
above_diagonal = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
upper_triangle = correlation_matrix.where(above_diagonal)

# A column is dropped when it correlates above the threshold with any
# column that precedes it in the matrix
exceeds_threshold = (upper_triangle > CORRELATION_THRESHOLD).any(axis=0)
features_to_drop = exceeds_threshold[exceeds_threshold].index.tolist()

print(f"Removing {len(features_to_drop)} highly correlated features (correlation > {CORRELATION_THRESHOLD})")
if len(features_to_drop) > 5:
    print(f"Features removed: {features_to_drop[:5]}...")
else:
    print(f"Features removed: {features_to_drop}")

# Apply the same column removal to both frames
train_df_clean = train_df_clean.drop(columns=features_to_drop, errors='ignore')
test_df = test_df.drop(columns=features_to_drop, errors='ignore')

print(f"\nShape after removing correlated features - Train: {train_df_clean.shape}, Test: {test_df.shape}")

### 3.5 Encode Categorical Variables

In [None]:
# Learn the 'sex' label mapping from the training column, then apply that
# same mapping to both the training and the test frame
label_encoder_sex = LabelEncoder().fit(train_df_clean['sex'])
train_df_clean['sex'] = label_encoder_sex.transform(train_df_clean['sex'])
test_df['sex'] = label_encoder_sex.transform(test_df['sex'])

print("Categorical encoding completed")

## 4. Model Training and Evaluation

### 4.1 Prepare Data for Modeling

In [None]:
# Separate features and target
X = train_df_clean.drop(columns=['main.disorder'])
y = train_df_clean['main.disorder']

# Create train-test split or use full data for production
if PRODUCTION_MODE:
    # Select the test columns by the training feature names so the scaler and
    # the models receive exactly the same features, in the same order, as
    # they were fitted on. A missing feature raises a KeyError here instead
    # of failing (or silently misaligning) further downstream.
    X_train, X_test = X, test_df[X.columns]
    y_train, y_test = y, None
    print("Production mode: Using full training data")
else:
    # Stratified hold-out split keeps the class proportions in both parts
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )
    print(f"Validation mode: Train size: {X_train.shape}, Test size: {X_test.shape}")

# Standardize features: statistics are fitted on the training part only and
# then applied unchanged to the evaluation/test part
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data preprocessing completed")

### 4.2 XGBoost Model (Primary)

In [None]:
print("Training XGBoost model...")

# Encode the string disorder labels as integer codes for the classifier
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Gradient-boosted tree classifier settings
xgb_params = dict(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    objective='multi:softprob',
    eval_metric='mlogloss',
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train_scaled, y_train_encoded)

# Predict integer class codes, then decode them once to disorder names
y_pred_xgb = xgb_model.predict(X_test_scaled)
xgb_pred_labels = label_encoder.inverse_transform(y_pred_xgb)

if not PRODUCTION_MODE:
    # Validation run: score the predictions against the held-out labels
    y_test_encoded = label_encoder.transform(y_test)
    accuracy = accuracy_score(y_test_encoded, y_pred_xgb)
    xgb_true_labels = label_encoder.inverse_transform(y_test_encoded)

    print(f"\nXGBoost Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(xgb_true_labels, xgb_pred_labels))

    # Confusion matrix over the decoded (human-readable) labels
    disp = ConfusionMatrixDisplay.from_predictions(
        xgb_true_labels,
        xgb_pred_labels,
        xticks_rotation='vertical',
        cmap='Blues',
    )
    disp.ax_.set_title("XGBoost Confusion Matrix")
    plt.tight_layout()
    plt.show()
else:
    # Production run: write one predicted disorder name per test ID
    xgb_output_file = f'{RESULTS_PATH}xgb_predictions.csv'
    predictions_df = pd.DataFrame({
        'ID': test_ids['ID'],
        'main.disorder.class': xgb_pred_labels,
    })
    predictions_df.to_csv(xgb_output_file, index=False)
    print(f"XGBoost predictions saved to {xgb_output_file}")

### 4.3 Feature Importance Analysis

In [None]:
# Pair every feature name with its XGBoost importance, most important first
feature_importance = (
    pd.DataFrame({
        'feature': X.columns,
        'importance': xgb_model.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

# Horizontal bar chart of the 20 highest-ranked features
top_features = feature_importance.head(20)
bar_positions = list(range(len(top_features)))

importance_fig, importance_ax = plt.subplots(figsize=(10, 8))
importance_ax.barh(bar_positions, top_features['importance'].values, color='steelblue')
importance_ax.set_yticks(bar_positions)
importance_ax.set_yticklabels(top_features['feature'].values)
importance_ax.set_xlabel('Feature Importance')
importance_ax.set_title('Top 20 Most Important Features (XGBoost)', fontweight='bold')
importance_ax.invert_yaxis()  # put the most important feature at the top
plt.tight_layout()
plt.show()

print("\nTop 10 Most Important Features:")
print(feature_importance.head(10).to_string(index=False))

### 4.4 Support Vector Machine

In [None]:
print("Training SVM model...")

# RBF-kernel support vector classifier with balanced class weights,
# trained directly on the string disorder labels
svm_settings = {
    'kernel': 'rbf',
    'C': 1.0,
    'gamma': 'scale',
    'class_weight': 'balanced',
    'random_state': RANDOM_STATE,
}
svm_model = SVC(**svm_settings)
svm_model.fit(X_train_scaled, y_train)

# Predicted disorder names for the evaluation/test rows
y_pred_svm = svm_model.predict(X_test_scaled)

if not PRODUCTION_MODE:
    # Validation run: compare against the held-out labels
    accuracy = accuracy_score(y_test, y_pred_svm)
    print(f"\nSVM Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_svm))
else:
    # Production run: write one predicted disorder name per test ID
    svm_output_file = f'{RESULTS_PATH}svm_predictions.csv'
    predictions_df = pd.DataFrame({
        'ID': test_ids['ID'],
        'main.disorder.class': y_pred_svm,
    })
    predictions_df.to_csv(svm_output_file, index=False)
    print(f"SVM predictions saved to {svm_output_file}")

### 4.5 Random Forest

In [None]:
print("Training Random Forest model...")

# Random forest settings; trained on the integer-encoded labels produced
# by label_encoder in the XGBoost cell
rf_settings = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_split': 5,
    'min_samples_leaf': 2,
    'class_weight': 'balanced',
    'random_state': RANDOM_STATE,
    'n_jobs': -1,
}
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X_train_scaled, y_train_encoded)

# Predict integer class codes, then decode them once to disorder names
y_pred_rf = rf_model.predict(X_test_scaled)
rf_pred_labels = label_encoder.inverse_transform(y_pred_rf)

if not PRODUCTION_MODE:
    # Validation run: y_test_encoded comes from the XGBoost cell
    accuracy = accuracy_score(y_test_encoded, y_pred_rf)
    rf_true_labels = label_encoder.inverse_transform(y_test_encoded)
    print(f"\nRandom Forest Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(rf_true_labels, rf_pred_labels))
else:
    # Production run: write one predicted disorder name per test ID
    rf_output_file = f'{RESULTS_PATH}rf_predictions.csv'
    predictions_df = pd.DataFrame({
        'ID': test_ids['ID'],
        'main.disorder.class': rf_pred_labels,
    })
    predictions_df.to_csv(rf_output_file, index=False)
    print(f"Random Forest predictions saved to {rf_output_file}")

### 4.6 Binary Classification Analysis (Disorder vs. Healthy Control)

In [None]:
# For each disorder, train a logistic regression that separates that disorder
# from healthy controls only, and report its accuracy on the matching part of
# the hold-out split. Runs in validation mode only (needs y_test).
if not PRODUCTION_MODE:
    print("\nPerforming binary classification analysis (disorder vs healthy control)...\n")

    disorders = ['Addictive disorder', 'Anxiety disorder', 'Mood disorder',
                'Obsessive compulsive disorder', 'Schizophrenia',
                'Trauma and stress related disorder']

    binary_results = []

    for disorder in disorders:
        # Keep only samples of this disorder and of the healthy-control group
        binary_mask_train = y_train.isin([disorder, 'Healthy control'])
        binary_mask_test = y_test.isin([disorder, 'Healthy control'])

        # The masks are converted to plain NumPy arrays so that they select
        # rows of the scaled feature matrices purely by position
        X_train_binary = X_train_scaled[binary_mask_train.to_numpy()]
        y_train_binary = y_train[binary_mask_train]
        X_test_binary = X_test_scaled[binary_mask_test.to_numpy()]
        y_test_binary = y_test[binary_mask_test]

        # A two-class model needs both groups in the training subset and at
        # least one sample to score; otherwise skip instead of crashing
        if y_train_binary.nunique() < 2 or y_test_binary.empty:
            print(f"Skipping {disorder}: not enough samples for a binary comparison")
            continue

        # Encode binary labels
        le_binary = LabelEncoder()
        y_train_binary_encoded = le_binary.fit_transform(y_train_binary)
        y_test_binary_encoded = le_binary.transform(y_test_binary)

        # Train logistic regression
        lr_model = LogisticRegression(
            penalty='l2',
            C=1.0,
            max_iter=1000,
            random_state=RANDOM_STATE
        )

        lr_model.fit(X_train_binary, y_train_binary_encoded)
        y_pred_binary = lr_model.predict(X_test_binary)

        # Calculate accuracy
        accuracy = accuracy_score(y_test_binary_encoded, y_pred_binary)
        binary_results.append({
            'Disorder': disorder,
            'Accuracy': accuracy,
            'Test Samples': len(y_test_binary)
        })

    if binary_results:
        # Display results, best-scoring disorder first
        results_df = pd.DataFrame(binary_results)
        results_df = results_df.sort_values('Accuracy', ascending=False)

        plt.figure(figsize=(10, 6))
        plt.barh(results_df['Disorder'], results_df['Accuracy'], color='teal')
        plt.xlabel('Accuracy')
        plt.title('Binary Classification Performance (Disorder vs Healthy Control)', fontweight='bold')
        plt.xlim([0, 1])

        # Add accuracy values on bars
        for bar_index, acc in enumerate(results_df['Accuracy']):
            plt.text(acc + 0.01, bar_index, f'{acc:.3f}', va='center')

        plt.tight_layout()
        plt.show()

        print("\nBinary Classification Results:")
        print(results_df.to_string(index=False))
    else:
        # Every disorder was skipped, so there is nothing to plot or tabulate
        print("\nBinary Classification Results: none (no disorder had enough samples)")

## 5. Model Comparison Summary

In [None]:
if not PRODUCTION_MODE:
    # Rank the three multi-class models by their hold-out accuracy instead of
    # assuming a winner. Uses the predictions produced by the model cells
    # above (XGBoost and Random Forest predict encoded labels, SVM predicts
    # the original string labels).
    model_accuracies = {
        'XGBoost': accuracy_score(y_test_encoded, y_pred_xgb),
        'SVM': accuracy_score(y_test, y_pred_svm),
        'Random Forest': accuracy_score(y_test_encoded, y_pred_rf),
    }
    # On a tie, max() keeps the first entry, i.e. the earlier-listed model
    best_model_name = max(model_accuracies, key=model_accuracies.get)

    print("\n" + "="*50)
    print("MODEL PERFORMANCE SUMMARY")
    print("="*50)
    for model_name, model_accuracy in sorted(
        model_accuracies.items(), key=lambda item: item[1], reverse=True
    ):
        print(f"{model_name} accuracy: {model_accuracy:.4f}")
    print(f"\nBest performing model: {best_model_name}")
    print(f"\nRecommendation: Use {best_model_name} predictions for final submission")
else:
    # Production run: list the submission files written by the model cells
    print("\n" + "="*50)
    print("PRODUCTION RUN COMPLETE")
    print("="*50)
    print(f"\nPrediction files saved to: {RESULTS_PATH}")
    print("- xgb_predictions.csv (RECOMMENDED)")
    print("- svm_predictions.csv")
    print("- rf_predictions.csv")