Peek at the power spectrum of the data

In [None]:
import os
import mne
import matplotlib.pyplot as plt

# Define the folder containing the .fif files
processed_folder = "AllData/Preprocessed_FIF_OpenNeuroHC"  # Replace with the actual path to the folder

# Define the folder for PSD plots
psd_plot_folder = os.path.join(processed_folder, "PSD_Plot")
os.makedirs(psd_plot_folder, exist_ok=True)  # Create the folder if it doesn't exist

# Save one PSD figure per .fif recording found in the folder
for file_name in os.listdir(processed_folder):
    if not file_name.endswith(".fif"):  # Process only .fif files
        continue
    file_path = os.path.join(processed_folder, file_name)

    # Load the .fif file
    print(f"Processing: {file_name}")
    raw = mne.io.read_raw_fif(file_path, preload=True)

    # plot_psd builds and returns its own figure, so no plt.figure() call is
    # made here: a separately created figure would stay empty and would never
    # be closed, leaking one figure per file.
    fig = raw.plot_psd(fmax=45, show=False)  # Limit frequency range to 45 Hz for clarity
    fig.axes[0].set_title(f"PSD for {file_name}")

    # Save the plot to the PSD_Plot folder (only the extension is swapped)
    stem = os.path.splitext(file_name)[0]
    output_plot_path = os.path.join(psd_plot_folder, f"{stem}_psd.png")
    fig.savefig(output_plot_path)
    plt.close(fig)  # Close exactly the figure that was saved

    print(f"PSD plot saved for {file_name} at {output_plot_path}")


Decomposing periodic and aperiodic components

In [None]:
import os
import mne
import numpy as np
from fooof import FOOOF
import pandas as pd

# Define the folder containing the .fif files
processed_folder = "AllData/preprocessed_dryad"  # Replace with your path

# One dict per (recording, channel) with the fitted FOOOF parameters
results = []

# Loop through all .fif files in the folder
for file_name in os.listdir(processed_folder):
    if not file_name.endswith(".fif"):  # Process only .fif files
        continue
    file_path = os.path.join(processed_folder, file_name)

    # Load the .fif file
    raw = mne.io.read_raw_fif(file_path, preload=True)

    # Compute PSD for each channel
    psd = raw.compute_psd(fmin=1, fmax=45, n_fft=1024, verbose=False)
    freqs = psd.freqs  # Extract frequencies
    psds = psd.get_data()  # Power values, one row per returned channel

    # Label rows with the spectrum's own channel names rather than
    # raw.ch_names: per the MNE docs, get_data() with default arguments
    # returns good data channels only, so raw.ch_names can be longer or
    # shifted relative to the rows of `psds`.
    bad_channels = set(psd.info["bads"])
    psd_ch_names = [name for name in psd.ch_names if name not in bad_channels]
    if len(psd_ch_names) != len(psds):
        raise RuntimeError(
            f"{file_name}: {len(psds)} PSD rows but {len(psd_ch_names)} channel names; "
            "cannot label FOOOF results reliably"
        )

    # Fit one FOOOF model per channel
    for ch_name, channel_psd in zip(psd_ch_names, psds):
        # Initialize FOOOF model
        fm = FOOOF(aperiodic_mode='knee', max_n_peaks=7, min_peak_height=0.1)  # Use 'knee' mode

        # Fit FOOOF to the PSD
        fm.fit(freqs, channel_psd)

        # Collect aperiodic and periodic parameters
        aperiodic_params = fm.aperiodic_params_
        peak_params = fm.peak_params_

        # FOOOF orders the aperiodic parameters as [offset, (knee), exponent]:
        # the exponent is always LAST and the knee, when fitted, sits in the
        # middle. Indexing [1] as exponent / [2] as knee swaps the two in
        # 'knee' mode.
        # NOTE(review): CSV files written before this fix have the Exponent
        # and Knee columns swapped; regenerate them and redo any feature
        # selection that was based on those columns.
        has_knee = len(aperiodic_params) > 2

        # Append the results for this channel
        results.append({
            'Subject': file_name,
            'Channel': ch_name,
            'Aperiodic_Offset': aperiodic_params[0],
            'Aperiodic_Exponent': aperiodic_params[-1],
            'Aperiodic_Knee': aperiodic_params[1] if has_knee else None,
            'Peak_Params': peak_params.tolist(),
        })

# Save results to a CSV file
results_df = pd.DataFrame(results)
output_path = os.path.join(processed_folder, "fooof_results_channels.csv")
results_df.to_csv(output_path, index=False)

print(f"FOOOF analysis completed and results saved to {output_path}")

In [None]:
Coherence features

In [None]:
import os
import numpy as np
import pandas as pd
from scipy.signal import coherence
import mne

# Define the folder containing the .fif files
processed_folder = "AllData/preprocessed_dryad"  # Replace with your folder path

# Define frequency bands of interest (Hz, both edges inclusive)
bands = {
    "Delta": (1, 4),
    "Theta": (4, 8),
    "Alpha": (8, 12),
    "Beta": (12, 30),
    "Gamma": (30, 45),
}

# One dict per (recording, channel pair, band)
results = []

# Loop through each .fif file in the folder
for file_name in os.listdir(processed_folder):
    if not file_name.endswith(".fif"):  # Process only .fif files
        continue
    file_path = os.path.join(processed_folder, file_name)

    # Load the .fif file
    raw = mne.io.read_raw_fif(file_path, preload=True)
    data, channel_names = raw.get_data(), raw.ch_names

    # Take the sampling rate from the recording itself: a hard-coded value
    # silently mislabels every frequency bin if a file was resampled to a
    # different rate.
    fs = raw.info['sfreq']
    nperseg = int(round(fs * 2))  # 2-second Welch segments; must be an integer

    # The frequency axis is the same for every pair within a file, so the
    # band masks are built once from the first coherence result.
    band_masks = None

    # Calculate coherence between every pair of channels
    n_channels = len(channel_names)
    for i in range(n_channels):
        for j in range(i + 1, n_channels):  # Avoid duplicate pairs
            # Calculate coherence
            freqs, coh_values = coherence(data[i, :], data[j, :], fs=fs, nperseg=nperseg)

            if band_masks is None:
                band_masks = {
                    band_name: (freqs >= fmin) & (freqs <= fmax)
                    for band_name, (fmin, fmax) in bands.items()
                }

            pair_label = f"{channel_names[i]}-{channel_names[j]}"
            for band_name, band_indices in band_masks.items():
                # Average coherence over the bins that fall inside the band
                results.append({
                    "Subject": file_name,
                    "Channel_Pair": pair_label,
                    "Band": band_name,
                    "Coherence": np.mean(coh_values[band_indices])
                })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

# Save the results to a CSV file
output_path = os.path.join(processed_folder, "coherence_results.csv")
results_df.to_csv(output_path, index=False)

print(f"Coherence calculation completed and saved to {output_path}")


Entropy features

In [None]:
import os
import mne
import pandas as pd
from antropy import spectral_entropy  # Or use other entropy functions like 'perm_entropy', 'svd_entropy'

# Folder holding the preprocessed .fif recordings
processed_fif_folder = 'AllData/Figshare/Processed_FIF'  # Replace with your folder path

# Accumulates one record per (subject, channel)
combined_results = []

for file_name in os.listdir(processed_fif_folder):
    # Anything that is not a .fif recording is ignored
    if not file_name.endswith('.fif'):
        continue

    print(f"Processing: {file_name}")
    raw = mne.io.read_raw_fif(os.path.join(processed_fif_folder, file_name), preload=True)

    # Signal matrix (one row per channel) and its sampling frequency
    data = raw.get_data()
    sfreq = raw.info['sfreq']
    subject_id = file_name.replace('.fif', '')  # Subject ID is the file name minus ".fif"

    # Spectral entropy of every channel, paired with the channel's name
    for channel_name, channel_data in zip(raw.ch_names, data):
        combined_results.append({
            'Subject': subject_id,
            'Channel': channel_name,
            'Entropy': spectral_entropy(channel_data, sfreq, method='fft')
        })

# Tabulate the records and write them next to the recordings
entropy_df = pd.DataFrame(combined_results)
output_path = os.path.join(processed_fif_folder, 'all_entropy_features.csv')
entropy_df.to_csv(output_path, index=False)

print(f"Combined entropy features saved to: {output_path}")

I didn't use this feature in the end. I looked at it once, but it was not as helpful as the other three.

In [None]:
import os
import mne
import pandas as pd
import numpy as np
from scipy.signal import hilbert

# Folder holding the preprocessed .fif recordings
processed_fif_folder = 'AllData/Figshare/Processed_FIF'  # Update with your folder path

# Accumulates one record per (subject, channel)
results = []

for file_name in os.listdir(processed_fif_folder):
    # Anything that is not a .fif recording is ignored
    if not file_name.endswith('.fif'):
        continue

    print(f"Processing: {file_name}")
    raw = mne.io.read_raw_fif(os.path.join(processed_fif_folder, file_name), preload=True)

    # Signal matrix (one row per channel) and its sampling frequency
    data = raw.get_data()
    sfreq = raw.info['sfreq']
    subject_id = file_name.replace('.fif', '')  # Subject ID is the file name minus ".fif"

    for channel_name, channel_data in zip(raw.ch_names, data):
        # Amplitude envelope = magnitude of the Hilbert analytic signal
        amplitude_envelope = np.abs(hilbert(channel_data))

        # Both features summarise the spread of the envelope: its standard
        # deviation ("fluctuate dynamics") and its variance.
        results.append({
            'Subject': subject_id,
            'Channel': channel_name,
            'Fluctuate_Dynamics': np.std(amplitude_envelope),
            'Amplitude_Envelope_Variance': np.var(amplitude_envelope)
        })

# Tabulate the records and write them next to the recordings
results_df = pd.DataFrame(results)
output_path = os.path.join(processed_fif_folder, 'oscillation_fluctuation_features.csv')
results_df.to_csv(output_path, index=False)

print(f"Oscillation features saved to: {output_path}")


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score
import numpy as np

# Per-channel envelope features produced by the previous cell
file_path = 'AllData/Figshare/Processed_FIF/oscillation_fluctuation_features.csv'  # Replace with the actual file path
features_df = pd.read_csv(file_path)

# Long -> wide: one row per subject, one column per (metric, channel)
pivot_df = features_df.pivot(
    index='Subject',
    columns='Channel',
    values=['Fluctuate_Dynamics', 'Amplitude_Envelope_Variance'],
)

# Collapse the two-level column index into "<channel>_<metric>" names
pivot_df.columns = [f"{channel}_{metric}" for metric, channel in pivot_df.columns]
pivot_df = pivot_df.reset_index()

# Label: subject IDs starting with 'H' are healthy (0), everything else is MDD (1)
pivot_df['Target'] = [0 if subject.startswith('H') else 1 for subject in pivot_df['Subject']]

# Feature matrix, target vector, and the grouping key for subject-wise CV
X = pivot_df.drop(columns=['Subject', 'Target'])
y = pivot_df['Target']
groups = pivot_df['Subject']

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
group_kfold = GroupKFold(n_splits=10)

# Fit on each training split and record the held-out accuracy
cv_scores = []
for fold_train, fold_test in group_kfold.split(X, y, groups):
    rf_model.fit(X.iloc[fold_train], y.iloc[fold_train])
    fold_pred = rf_model.predict(X.iloc[fold_test])
    cv_scores.append(accuracy_score(y.iloc[fold_test], fold_pred))

# Summarise the fold accuracies
mean_accuracy, std_accuracy = np.mean(cv_scores), np.std(cv_scores)

print(f"10-Fold Subject-wise Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {mean_accuracy:.4f} ± {std_accuracy:.4f}")


Feature get-together

In [None]:
import pandas as pd

# Locations of the three per-feature CSV files
entropy_path = 'AllData/Figshare/Processed_FIF/all_entropy_features.csv'
coherence_path = 'AllData/Figshare/Processed_FIF/coherence_results.csv'
aperiodic_path = 'AllData/Figshare/Processed_FIF/fooof_results_channels.csv'

entropy_df = pd.read_csv(entropy_path)
coherence_df = pd.read_csv(coherence_path)
aperiodic_df = pd.read_csv(aperiodic_path)

# Subject IDs must match across tables, so drop any ".fif" left in them
for table in (entropy_df, coherence_df, aperiodic_df):
    table['Subject'] = table['Subject'].str.replace('.fif', '', regex=False)

# Entropy: one "<channel>_Entropy" column per channel
entropy_pivot = entropy_df.pivot(index='Subject', columns='Channel', values='Entropy')
entropy_pivot.columns = [f"{col}_Entropy" for col in entropy_pivot.columns]
entropy_pivot = entropy_pivot.reset_index()

# Coherence: one "<pair>_<band>_Coherence" column per channel pair and band
coherence_df['Feature'] = coherence_df['Channel_Pair'] + "_" + coherence_df['Band']
coherence_pivot = coherence_df.pivot(index='Subject', columns='Feature', values='Coherence')
coherence_pivot.columns = [f"{col}_Coherence" for col in coherence_pivot.columns]
coherence_pivot = coherence_pivot.reset_index()

# Aperiodic: one "<parameter>_<channel>" column per parameter and channel
aperiodic_pivot = aperiodic_df.pivot(
    index='Subject',
    columns='Channel',
    values=['Aperiodic_Offset', 'Aperiodic_Exponent', 'Aperiodic_Knee'],
)
aperiodic_pivot.columns = ['_'.join(col).strip() for col in aperiodic_pivot.columns.values]
aperiodic_pivot = aperiodic_pivot.reset_index()

# Outer joins keep subjects that are missing from any one of the tables
merged_df = (
    entropy_pivot
    .merge(coherence_pivot, on='Subject', how='outer')
    .merge(aperiodic_pivot, on='Subject', how='outer')
)

# Write the wide feature table
output_path = 'AllData/Figshare/Processed_FIF/all_combined_features.csv'
merged_df.to_csv(output_path, index=False)

print(f"Combined features saved to {output_path}")


In [None]:
import pandas as pd

# Read the wide feature table written by the merge step
file_path = 'AllData/Figshare/Processed_FIF/all_combined_features.csv'
data = pd.read_csv(file_path)

# Group label: subject IDs starting with 'H' are 'H', all others are 'MDD'
data['Group'] = ['H' if subject.startswith('H') else 'MDD' for subject in data['Subject']]

# Function to fill missing values with group-specific mean
def fill_missing_with_group_mean(df, group_col, value_cols):
    """Return a copy of ``df`` with non-finite feature values imputed per group.

    Infinite values in ``value_cols`` are treated as missing. Every missing
    value is then replaced by the mean of the same column computed over the
    rows that share its ``group_col`` value. A value stays NaN when its whole
    group is missing for that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    group_col : str
        Name of the column that defines the groups.
    value_cols : iterable of str
        Names of the numeric columns to clean and impute.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the selected columns imputed.

    Notes
    -----
    NOTE(review): when ``group_col`` is also the prediction target, imputing
    with per-group means writes label information into the features.
    """
    filled_df = df.copy()
    cols = list(value_cols)
    if not cols:  # nothing to impute
        return filled_df

    # Replace +/-inf with a float NaN, not pd.NA: pd.NA cannot be stored in a
    # float64 column and can turn it into object dtype, which breaks the
    # numeric mean below.
    cleaned = filled_df[cols].replace([float('inf'), float('-inf')], float('nan'))

    # Per-row group mean of every column (NaNs are skipped by the mean)
    group_means = cleaned.groupby(filled_df[group_col])[cols].transform('mean')

    filled_df[cols] = cleaned.fillna(group_means)
    return filled_df

# Every column except the identifier and the group label is a feature
feature_columns = data.columns.difference(['Subject', 'Group'])

# Impute missing / infinite feature values with the per-group mean
filled_data = fill_missing_with_group_mean(data, 'Group', feature_columns)

# Write the imputed table
output_path = 'AllData/Figshare/Processed_FIF/all_combined_features_filled.csv'
filled_data.to_csv(output_path, index=False)

print(f"Filled dataset saved to {output_path}")


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the combined dataset
file_path = 'AllData/Figshare/Processed_FIF/all_combined_features.csv'
data = pd.read_csv(file_path)

# Extract the group label from the Subject column
data['Group'] = data['Subject'].apply(lambda x: 'H' if x.startswith('H') else 'MDD')

# Function to fill missing values with group-specific mean and normalize data
def fill_and_normalize_group_mean(df, group_col, value_cols):
    """Return a copy of ``df`` with features imputed per group and z-scored.

    Infinite values in ``value_cols`` are treated as missing. Missing values
    are replaced by the mean of the same column over rows sharing the same
    ``group_col`` value. The selected columns are then standardised with
    ``StandardScaler`` fitted on all rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    group_col : str
        Name of the column that defines the groups.
    value_cols : iterable of str
        Names of the numeric columns to clean, impute and scale.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the selected columns imputed and standardised.

    Notes
    -----
    NOTE(review): both steps use every row, and the imputation uses the group
    label. If the result feeds a cross-validated classifier that predicts
    ``group_col``, this leaks test-fold and label information.
    """
    filled_df = df.copy()
    cols = list(value_cols)
    if not cols:  # nothing to impute or scale
        return filled_df

    # Replace +/-inf with a float NaN, not pd.NA: pd.NA cannot be stored in a
    # float64 column and can turn it into object dtype, which breaks the
    # numeric mean and the scaler below.
    cleaned = filled_df[cols].replace([float('inf'), float('-inf')], float('nan'))

    # Fill NaN values with the group-specific mean of the same column
    group_means = cleaned.groupby(filled_df[group_col])[cols].transform('mean')
    filled_df[cols] = cleaned.fillna(group_means)

    # Normalize the features
    scaler = StandardScaler()
    filled_df[cols] = scaler.fit_transform(filled_df[cols])

    return filled_df

# Identify feature columns (excluding Subject and Group columns)
feature_columns = data.columns.difference(['Subject', 'Group'])

# Fill missing values and normalize features
filled_data = fill_and_normalize_group_mean(data, group_col='Group', value_cols=feature_columns)

# Save the filled and normalized dataset
output_path = 'AllData/Figshare/Processed_FIF/all_combined_features_filled_normalized.csv'
filled_data.to_csv(output_path, index=False)

print(f"Filled and normalized dataset saved to {output_path}")


Feature selection

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load the filled dataset
file_path = 'AllData/Figshare/Processed_FIF/all_combined_features_filled_normalized.csv'
data = pd.read_csv(file_path)

# Treat +/-inf as missing, force every feature column to numeric, then drop
# any row that still has a missing value.
data = data.replace([np.inf, -np.inf], np.nan)
feature_cols = [col for col in data.columns if col not in ['Subject', 'Group']]
data[feature_cols] = data[feature_cols].apply(pd.to_numeric, errors='coerce')
data = data.dropna()

# Separate features, target, and groups
X = data.drop(columns=['Subject', 'Group'])
y = data['Group'].apply(lambda x: 0 if x == 'H' else 1)  # Binary target: 0 for 'H', 1 for 'MDD'
groups = data['Subject']

# Initialize Random Forest model with best parameters
rf_model = RandomForestClassifier(
    max_depth=None,
    max_features='log2',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=100,
    random_state=22
)

# --- Final feature set, reported and reused by later cells -----------------
# This selection uses ALL subjects. It is for reporting and plotting only and
# must not be used to estimate accuracy (see the cross-validation below).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Recursive Feature Elimination (RFE) for top 25 features
rfe = RFE(estimator=rf_model, n_features_to_select=25, step=10)
X_selected = rfe.fit_transform(X_scaled, y)

# Get the selected feature names
selected_features = X.columns[rfe.support_].tolist()
print(f"Selected Top 25 Features: {selected_features}")

# --- Subject-wise cross-validation without selection leakage ---------------
# Scaling and RFE are refitted on the training subjects of every fold.
# Selecting features on the full dataset first and then cross-validating on
# the selected columns lets the held-out labels shape the feature set, which
# inflates the accuracy estimate.
# NOTE(review): the input file name suggests the features were already imputed
# and normalised over all subjects upstream; that part is not undone here.
group_kfold = GroupKFold(n_splits=10)
cv_scores = []
X_values = X.to_numpy()

for train_idx, test_idx in group_kfold.split(X_values, y, groups):
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Fit the scaler on the training fold only
    fold_scaler = StandardScaler()
    X_train = fold_scaler.fit_transform(X_values[train_idx])
    X_test = fold_scaler.transform(X_values[test_idx])

    # RFE clones rf_model, selects 25 features on the training fold, and
    # predicts with a forest trained on those features only.
    fold_rfe = RFE(estimator=rf_model, n_features_to_select=25, step=10)
    fold_rfe.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = fold_rfe.predict(X_test)
    cv_scores.append(accuracy_score(y_test, y_pred))

# Leave rf_model fitted on the reported feature set, so that code reading
# rf_model.feature_importances_ gets one value per entry of selected_features.
rf_model.fit(X_selected, y)

# Calculate mean and standard deviation of the scores
mean_accuracy = np.mean(cv_scores)
std_accuracy = np.std(cv_scores)

print(f"10-Fold Subject-wise Cross-Validation Accuracy Scores: {cv_scores}")
print(f"Mean Accuracy: {mean_accuracy:.4f} ± {std_accuracy:.4f}")


In [None]:
Visualizing Feature Importance

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Pair each selected feature name with the importance the fitted forest
# assigned to it, most important first.
# NOTE(review): relies on `selected_features` and a fitted `rf_model` left in
# the kernel by an earlier cell.
feature_importances = pd.DataFrame({
    'Feature': selected_features,
    'Importance': rf_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the ranked importances
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importances['Feature'], feature_importances['Importance'], color='tab:blue')
ax.set_xlabel('Importance Score', fontsize=12)
ax.set_ylabel('Features', fontsize=12)
ax.set_title('Feature Importance (Top 25 Selected Features)', fontsize=14)
ax.invert_yaxis()  # Largest importance at the top of the chart
fig.tight_layout()
plt.show()


In [None]:
Visualizing Feature Importance 2

In [None]:
import pandas as pd

# Define brain regions and feature types
brain_regions = {
    'frontal_left': ['Fp1', 'F3', 'F7'],
    'frontal_right': ['Fp2', 'F4', 'F8'],
    'temporal_left': ['T3', 'T5'],
    'temporal_right': ['T4', 'T6'],
    'central_left': ['C3'],
    'central_right': ['C4'],
    'parietal_left': ['P3'],
    'parietal_right': ['P4'],
    'occipital_left': ['O1'],
    'occipital_right': ['O2']
}


# Feature type -> substring that identifies it in a feature name
feature_types = {
    'aperiodic_offset': 'Offset',
    'aperiodic_knee': 'Knee',
    'coherence': 'Coherence'
}

# Reverse lookup: channel name -> region. Channels not listed above
# (e.g. midline electrodes) have no region and add nothing to region_weights.
channel_to_region = {
    channel: region
    for region, channels_list in brain_regions.items()
    for channel in channels_list
}

# Initialize dictionaries to store weighted sums
region_weights = {region: 0 for region in brain_regions.keys()}
feature_type_weights = {ftype: 0 for ftype in feature_types.keys()}

# Add each feature's importance to its feature type and to the region(s) of
# the channel(s) it was computed from. Features whose name matches none of
# the identifiers above are ignored.
for feature_name, importance in zip(feature_importances['Feature'], feature_importances['Importance']):
    ftype = next(
        (name for name, identifier in feature_types.items() if identifier in feature_name),
        None,
    )
    if ftype is None:
        continue

    feature_type_weights[ftype] += importance

    if ftype == 'coherence':
        # Coherence names look like "<chA>-<chB>_<band>_Coherence": the two
        # channels are joined by '-' inside the FIRST '_' token. Splitting on
        # '_' alone gives ['<chA>-<chB>', '<band>'], which never matches a
        # channel name.
        channels = feature_name.split('_')[0].split('-')
    else:
        # Aperiodic names look like "Aperiodic_<param>_<channel>"
        channels = [feature_name.rsplit('_', 1)[-1]]

    # Split the importance evenly over the feature's channels, so a coherence
    # feature gives half to each side and is never counted at full weight
    # twice. Exact name lookup replaces the substring match on the whole
    # feature name.
    weight = importance / len(channels)
    for channel in channels:
        region = channel_to_region.get(channel)
        if region is not None:
            region_weights[region] += weight

# Print corrected results
print("Feature Type Importance Weights:")
print(feature_type_weights)
print("\nBrain Region Importance Weights:")
print(region_weights)

import matplotlib.pyplot as plt

# Colour per feature type. An earlier palette for the same keys was defined
# and immediately overwritten; only the one actually used is kept.
feature_type_colors = {
    'aperiodic_offset': '#a6bddb',  # Light blue-gray
    'aperiodic_knee': '#3690c0',    # Medium blue-gray
    'coherence': '#8c6bb1'          # Purple
}

# Colour per brain region; left and right share a colour
brain_region_colors = {
    'frontal_left': '#c7e9c0',   # Soft green
    'frontal_right': '#c7e9c0',  # Soft green
    'temporal_left': '#9e9ac8',  # Light purple
    'temporal_right': '#9e9ac8', # Light purple
    'central_left': '#3690c0',   # Medium blue-gray
    'central_right': '#3690c0',  # Medium blue-gray
    'parietal_left': '#bdbdbd',  # Light gray
    'parietal_right': '#bdbdbd', # Light gray
    'occipital_left': '#6a51a3', # Dark purple
    'occipital_right': '#6a51a3' # Dark purple
}

# Rank the weights, largest first, and drop zero entries: a zero-weight wedge
# only adds an overlapping "0.0%" label, and an all-zero input cannot be
# normalised into a pie at all.
sorted_feature_type_weights = {
    k: v
    for k, v in sorted(feature_type_weights.items(), key=lambda item: item[1], reverse=True)
    if v > 0
}
sorted_region_weights = {
    k: v
    for k, v in sorted(region_weights.items(), key=lambda item: item[1], reverse=True)
    if v > 0
}

# One pie per aggregation, side by side
fig, (ax_types, ax_regions) = plt.subplots(1, 2, figsize=(12, 6))
panels = (
    (ax_types, sorted_feature_type_weights, feature_type_colors, 'Feature Type Importance'),
    (ax_regions, sorted_region_weights, brain_region_colors, 'Brain Region Importance'),
)
for ax, weights, palette, title in panels:
    if weights:
        ax.pie(list(weights.values()),
               labels=list(weights.keys()),
               colors=[palette[key] for key in weights],
               autopct='%1.1f%%',
               startangle=140)
    else:
        # Nothing was assigned: say so instead of failing inside pie()
        ax.text(0.5, 0.5, 'No importance assigned', ha='center', va='center')
        ax.axis('off')
    ax.set_title(title, fontsize=14)

fig.tight_layout()
plt.show()


In [None]:
Grid search for the best params

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score

# Read the imputed + normalised feature table
file_path = 'AllData/Figshare/Processed_FIF/all_combined_features_filled_normalized.csv'
data = pd.read_csv(file_path)

# Treat +/-inf as missing and drop incomplete rows
data = data.replace([np.inf, -np.inf], np.nan)
data = data.dropna()

# Hand-picked feature subset used for all models below
# NOTE(review): presumably copied from an earlier RFE run over the whole
# dataset — confirm; if so, the scores below are optimistic.
selected_features = [
    'C3-T3_Alpha_Coherence', 'F7-O2_Gamma_Coherence', 'Fp1-F4_Theta_Coherence',
    'O2-T4_Alpha_Coherence', 'P4-Cz_Alpha_Coherence',
    'Aperiodic_Offset_C3', 'Aperiodic_Offset_C4', 'Aperiodic_Offset_Fp2',
    'Aperiodic_Offset_P3', 'Aperiodic_Offset_P4', 'Aperiodic_Offset_Pz',
    'Aperiodic_Offset_T4', 'Aperiodic_Offset_T5', 'Aperiodic_Offset_T6',
    'Aperiodic_Knee_C3', 'Aperiodic_Knee_Cz', 'Aperiodic_Knee_Fz',
    'Aperiodic_Knee_O1', 'Aperiodic_Knee_O2', 'Aperiodic_Knee_P3',
    'Aperiodic_Knee_P4', 'Aperiodic_Knee_Pz', 'Aperiodic_Knee_T4',
    'Aperiodic_Knee_T5', 'Aperiodic_Knee_T6',
]
X = data[selected_features]
y = data['Group'].apply(lambda x: 0 if x == 'H' else 1)  # Binary target: 0 for 'H', 1 for 'MDD'
groups = data['Subject']

# Standardise, then project onto the first three principal components
# (both fitted on all rows)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA(n_components=3)  # Adjust number of components as needed
X_pca = pca.fit_transform(X_scaled)

# Subject-wise splitter shared by every search
group_kfold = GroupKFold(n_splits=10)

# Estimators to tune
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'LogisticRegression': LogisticRegression(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(random_state=42)
}

# Hyper-parameter grid for each estimator, keyed like `models`
param_grids = {
    'RandomForest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    },
    'KNN': {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
        'p': [1, 2]
    },
    'LogisticRegression': {
        'C': [0.1, 1, 10],
        'max_iter': [100, 200, 500],
        'penalty': ['l2'],
        'solver': ['lbfgs']
    },
    'GradientBoosting': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    }
}

# Exhaustive search per estimator, scored by subject-wise CV accuracy
best_params = {}
for model_name, model in models.items():
    print(f"\n--- Grid Search for {model_name} ---")
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        cv=group_kfold,  # the splitter gets `groups` through fit() below
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_pca, y, groups=groups)
    best_params[model_name] = grid_search.best_params_
    print(f"Best Parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best Accuracy for {model_name}: {grid_search.best_score_:.4f}")

print("\nSummary of Best Parameters:")
for model_name, params in best_params.items():
    print(f"{model_name}: {params}")


PCA and Classifiers

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.stats.multitest import multipletests
from scipy.stats import ttest_ind

# Define the folder path where the logs are stored
folder_path = "AllData/Figshare/Processed_FIF"

# File paths for the logs
rejected_subjects_file = os.path.join(folder_path, "rejected_subjects.txt")
rejected_channels_file = os.path.join(folder_path, "rejected_channels.txt")
rejected_components_file = os.path.join(folder_path, "rejected_components.txt")

# Read the logs. Every file is handled the same way: whitespace is stripped
# and blank lines are dropped. The subject log used to keep raw lines, so a
# blank or trailing empty line was counted as one rejected MDD subject.
with open(rejected_subjects_file, 'r') as f:
    rejected_subjects = [line.strip() for line in f if line.strip()]

with open(rejected_channels_file, 'r') as f:
    rejected_channels = [line.strip() for line in f if line.strip()]

with open(rejected_components_file, 'r') as f:
    rejected_components = [line.strip() for line in f if line.strip()]

# Initialize data storage for groups
data = {"H": {"subjects": 0, "channels": [], "independent components": [], "variance_explained": []},
        "MDD": {"subjects": 0, "channels": [], "independent components": [], "variance_explained": []}}

# Process rejected subjects: one subject ID per line; IDs starting with 'H'
# are healthy controls, everything else is MDD
for subject in rejected_subjects:
    group = "H" if subject.startswith("H") else "MDD"
    data[group]["subjects"] += 1

# SECURITY NOTE(review): the two loops below eval() text read from log files.
# That runs arbitrary code if the files are not trusted. If the logs only
# ever hold plain Python literals, ast.literal_eval is the safe replacement.
# It is left unchanged here because the log format cannot be confirmed from
# this notebook.

# Process rejected channels: "<subject>: <list of channel names>"
for line in rejected_channels:
    # Split on the first ": " only, so the payload is kept in one piece
    parts = line.split(": ", 1)
    group = "H" if parts[0].startswith("H") else "MDD"
    channels_list = eval(parts[1]) if len(parts) > 1 and parts[1] else []
    data[group]["channels"].append(len(channels_list))

# Process rejected independent components: "<subject>: <dict of statistics>"
for line in rejected_components:
    parts = line.split(": ", 1)
    group = "H" if parts[0].startswith("H") else "MDD"
    stats_dict = eval(parts[1]) if len(parts) > 1 else {}
    data[group]["independent components"].append(stats_dict.get('rejected_components', 0))
    data[group]["variance_explained"].append(stats_dict.get('variance_explained', 0))

# Calculate means and standard errors
def calc_stats(data_list):
    """Return ``(mean, standard error of the mean)`` for a sequence of numbers.

    The standard error is the sample standard deviation (``ddof=1``) divided
    by ``sqrt(n)``. Dividing the population standard deviation (``ddof=0``)
    by ``sqrt(n)`` understates the error, most visibly for small groups.

    Parameters
    ----------
    data_list : sequence of numbers
        Observations for one group.

    Returns
    -------
    tuple of float
        ``(mean, sem)``. Both are NaN for an empty input. ``sem`` is NaN for
        a single observation, where the sample spread is undefined.
    """
    values = np.asarray(data_list, dtype=float)
    n = values.size
    if n == 0:
        # Avoids the "mean of empty slice" warnings and the 0/0 division
        return float('nan'), float('nan')

    mean = float(values.mean())
    if n < 2:
        return mean, float('nan')
    return mean, float(values.std(ddof=1) / np.sqrt(n))

# The three per-subject measures compared between groups, in report order
metric_keys = ("channels", "independent components", "variance_explained")

# Per group: the rejected-subject count plus (mean, standard error) of each measure
stats = {}
for group in data.keys():
    stats[group] = {"subjects": data[group]["subjects"]}
    for key in metric_keys:
        stats[group][key] = calc_stats(data[group][key])

# Welch t-test (unequal variances) between H and MDD for each measure
p_channels, p_components, p_variance = (
    ttest_ind(data["H"][key], data["MDD"][key], equal_var=False).pvalue
    for key in metric_keys
)

# Apply Bonferroni correction across the three tests
p_values = [p_channels, p_components, p_variance]
bonferroni_corrected = multipletests(p_values, method='bonferroni')[1]

# Print corrected p-values
print("Bonferroni Corrected p-values:")
for label, corrected in zip(("Channels", "Components", "Variance Explained"), bonferroni_corrected):
    print(f"{label}: {corrected:.3f}")

# Four panels: the subject counts, then one bar chart per measure
fig, axes = plt.subplots(1, 4, figsize=(22, 6))
group_labels = ["H", "MDD"]

# Rejected Subjects
axes[0].bar(group_labels, [stats[g]["subjects"] for g in group_labels])
axes[0].set_title("Rejected Subjects")
axes[0].set_ylabel("Count")

# Group means with standard-error bars, annotated with the corrected p-value
panel_specs = [
    ("channels", "Rejected Channels", "Mean Count"),
    ("independent components", "Rejected Independent Components", "Mean Count"),
    ("variance_explained", "Variance Explained by Rejected Independent Components", "Mean Variance (%)"),
]
for ax, (key, title, ylabel), corrected in zip(axes[1:], panel_specs, bonferroni_corrected):
    means = [stats[g][key][0] for g in group_labels]
    errors = [stats[g][key][1] for g in group_labels]
    ax.bar(group_labels, means, yerr=errors, capsize=5)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    # Label sits midway between the two bars, at the height of the taller one
    ax.text(0.5, max(means), f"p={corrected:.3f}", ha='center')

plt.tight_layout()
plt.show()


Visualizing model performance

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Fixed seed for the estimators that draw random numbers (random forest,
# gradient boosting) so the reported scores are identical on every re-run.
EVAL_RANDOM_STATE = 42

# Models with their best parameters
models = {
    'RandomForest': RandomForestClassifier(
        max_depth=None, min_samples_split=2, n_estimators=100, random_state=EVAL_RANDOM_STATE
    ),
    'KNN': KNeighborsClassifier(n_neighbors=3, p=2, weights='uniform'),
    'LogisticRegression': LogisticRegression(C=1, max_iter=100, penalty='l2', solver='lbfgs'),
    'GradientBoosting': GradientBoostingClassifier(
        learning_rate=0.01, max_depth=3, n_estimators=50, random_state=EVAL_RANDOM_STATE
    ),
    'SVM': SVC(C=1, gamma='auto', kernel='rbf')
}

# One metrics dict per model, keyed by the model names themselves so the two
# dictionaries cannot drift apart.
metrics = {model_name: {} for model_name in models}

# Scorers applied to every fold, in the order they are reported
fold_scorers = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1-Score': f1_score,
}


def _format_mean_std(values):
    """Return 'mean ± std' of the per-fold values, each with four decimals."""
    return f"{np.mean(values):.4f} ± {np.std(values):.4f}"


# Evaluate each model with the group-aware splitter
for model_name, model in models.items():
    fold_values = {metric_name: [] for metric_name in fold_scorers}

    for train_idx, test_idx in group_kfold.split(X_pca, y, groups):
        X_train, X_test = X_pca[train_idx], X_pca[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # fit() re-trains the estimator from scratch on this fold's training rows
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        for metric_name, scorer in fold_scorers.items():
            fold_values[metric_name].append(scorer(y_test, y_pred))

    # Store metrics as "mean ± std" across folds
    for metric_name, values in fold_values.items():
        metrics[model_name][metric_name] = _format_mean_std(values)

# Print results
for model_name, values in metrics.items():
    print(f"=== {model_name} ===")
    for metric_name, metric_value in values.items():
        print(f"{metric_name}: {metric_value}")
    print("\n")


In [None]:
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_predict, cross_val_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Initialize models (estimator objects created in earlier cells)
models = {
    "Random Forest": rf_model,
    "KNN": knn,
    "Logistic Regression": log_reg,
    "Gradient Boosting": gb_model,
    "SVM": svm_model
}

# Use the same group-aware splitter as the per-fold evaluation cell. A plain
# `cv=10` ignores `groups`, so rows belonging to one group could end up in both
# the training and the test fold, which leaks information and inflates scores.
# The folds are materialised once so that predictions and per-fold accuracies
# are guaranteed to refer to exactly the same partition.
cv_splits = list(group_kfold.split(X_pca, y, groups))
n_folds = len(cv_splits)
y_true = np.asarray(y)

# Initialize storage for metrics
metrics_summary = []

for model_name, model in models.items():
    print(f"Running {n_folds}-Fold Grouped CV for {model_name}...")

    # Out-of-fold predictions: every sample is predicted by a model that never saw it
    y_pred = cross_val_predict(model, X_pca, y, cv=cv_splits)

    # Per-fold accuracy computed from those same predictions. This avoids
    # running the whole cross-validation a second time and keeps the accuracy
    # consistent with the confusion matrix even for stochastic estimators.
    scores = [np.mean(y_pred[test_idx] == y_true[test_idx]) for _, test_idx in cv_splits]

    # Confusion Matrix
    cm = confusion_matrix(y, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    plt.title(f"Confusion Matrix ({n_folds}-Fold Grouped CV) - {model_name}")
    plt.show()

    # Classification Report: readable text for the output, dict for the summary table
    print(f"Classification Report ({n_folds}-Fold Grouped CV) - {model_name}:")
    print(classification_report(y, y_pred))
    report = classification_report(y, y_pred, output_dict=True)

    # Store metrics for summary
    metrics_summary.append({
        "Model": model_name,
        "Mean Accuracy": np.mean(scores),
        "Accuracy Std Dev": np.std(scores),
        "Precision": report["weighted avg"]["precision"],
        "Recall": report["weighted avg"]["recall"],
        "F1-Score": report["weighted avg"]["f1-score"]
    })

# Convert metrics summary to DataFrame
metrics_df = pd.DataFrame(metrics_summary)
print(metrics_df)

# Visualization of Metrics (re-assign instead of mutating in place)
metrics_df = metrics_df.set_index("Model")
metrics_df[["Mean Accuracy", "Precision", "Recall", "F1-Score"]].plot(kind='bar', figsize=(10, 6))
plt.title(f"Performance Metrics Comparison Across Models ({n_folds}-Fold Grouped CV)")
plt.ylabel("Score")
plt.xticks(rotation=45)
plt.legend(loc="lower right")
plt.tight_layout()
plt.show()


Visualizing PCA

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Share of the total variance captured by each fitted principal component
explained_variance = pca.explained_variance_ratio_
# 1-based component numbers for the x axis
components = np.arange(len(explained_variance)) + 1

# Wide, short bar chart with one bar per component
fig, ax = plt.subplots(figsize=(8, 2))
ax.bar(components, explained_variance, alpha=0.7, color='tab:blue', label='Explained Variance')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Variance Ratio')
ax.set_title('Explained Variance by Principal Components')
ax.set_xticks(components)  # one tick per component
ax.legend()
plt.show()


t-test and visualization

In [None]:
import pandas as pd
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
import numpy as np

# Load the data
file_path = 'AllData/Figshare/Processed_FIF/all_combined_features_filled.csv'  # Replace with the actual path to your CSV file
data = pd.read_csv(file_path)

# Separate the "Group" column and keep only the numeric feature columns
group_column = 'Group'  # Column name for the group
data_features = data.drop(columns=[group_column]).select_dtypes(include=[np.number])

# Re-attach the labels and drop rows with a missing feature value or label
data = pd.concat([data_features, data[[group_column]]], axis=1).dropna()

# Split data by group
group1 = data[data[group_column] == 'H']  # Group H
group2 = data[data[group_column] == 'MDD']  # Group MDD

# Welch's t-test needs a variance estimate, i.e. at least two rows per group.
# Fail loudly instead of silently writing a table full of NaNs.
if len(group1) < 2 or len(group2) < 2:
    raise ValueError(
        f"Need at least two complete rows per group for the t-test "
        f"(H: {len(group1)}, MDD: {len(group2)})."
    )

# Feature matrices (rows = subjects, columns = features) as float arrays
group1_features = group1.drop(columns=[group_column]).to_numpy(dtype=float)
group2_features = group2.drop(columns=[group_column]).to_numpy(dtype=float)

# Perform Welch's t-test (unequal variances) for all features in one
# vectorised call; axis=0 tests each column independently.
t_statistics, p_values = ttest_ind(group1_features, group2_features, axis=0, equal_var=False)
t_statistics = np.asarray(t_statistics, dtype=float)
p_values = np.asarray(p_values, dtype=float)

# Apply FDR correction for multiple comparisons.
# A feature that is constant in both groups yields a NaN p-value. Passing NaNs
# to multipletests poisons the Benjamini-Hochberg step and turns every
# corrected p-value into NaN, so the correction is run on the finite p-values
# only; untestable features stay NaN and are reported as not significant.
testable = np.isfinite(p_values)
rejected = np.zeros(p_values.shape, dtype=bool)
corrected_pvals = np.full(p_values.shape, np.nan)
if testable.any():
    rejected_testable, corrected_testable, _, _ = multipletests(
        p_values[testable], alpha=0.05, method='fdr_bh'
    )
    rejected[testable] = rejected_testable
    corrected_pvals[testable] = corrected_testable
if not testable.all():
    print(f"Skipped {int((~testable).sum())} feature(s) with an undefined p-value in the FDR correction.")

# Prepare results for output
results = pd.DataFrame({
    'Feature': data_features.columns,
    't_statistic': t_statistics,
    'p_value': p_values,
    'corrected_p_value': corrected_pvals,
    'significant': rejected
})

# Save results to a new CSV file
results.to_csv('t_test_results_corrected.csv', index=False)

# Print summary of significant features
print("Summary of significant features:")
print(results[results['significant']])



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the per-feature t-test table back from disk
results_path = "t_test_results_corrected.csv"  # Replace with the correct file path
results = pd.read_csv(results_path)

# Split the table into feature families using a case-insensitive
# substring match on the feature name
feature_names = results['Feature']
aperiodic_features = results[feature_names.str.contains("Aperiodic", case=False)]
entropy_features = results[feature_names.str.contains("Entropy", case=False)]
coherence_features = results[feature_names.str.contains("Coherence", case=False)]

# Define a helper function to plot boxplots for groups
def plot_group(data, group_name, group1_name='H', group2_name='MDD', group1_data=None, group2_data=None):
    """Draw side-by-side boxplots of the significant features of one feature family.

    Parameters
    ----------
    data : pandas.DataFrame
        Test results with a 'Feature' column (feature names) and a boolean
        'significant' column; only rows flagged significant are plotted.
    group_name : str
        Name of the feature family, used in the figure title and messages.
    group1_name, group2_name : str
        Legend labels for the two groups.
    group1_data, group2_data : pandas.DataFrame
        Per-group feature tables whose columns include every significant
        feature name. Both are required despite the ``None`` defaults.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when the
        family has no significant features.

    Raises
    ------
    TypeError
        If ``group1_data`` or ``group2_data`` is not supplied.
    """
    # The None defaults only keep the signature keyword-friendly; without the
    # actual tables there is nothing to plot, so fail with a clear message.
    if group1_data is None or group2_data is None:
        raise TypeError("plot_group() requires both group1_data and group2_data DataFrames")

    significant_features = data[data['significant']]
    if significant_features.empty:
        # An empty selection would otherwise be handed to seaborn as an empty table
        print(f"No significant {group_name} features to plot.")
        return

    features = significant_features['Feature'].tolist()

    # Collect group data for significant features
    group1_values = group1_data[features]
    group2_values = group2_data[features]

    # Combine into one long-format DataFrame (Feature, Value, Group) for plotting
    combined = pd.concat([
        group1_values.melt(var_name="Feature", value_name="Value").assign(Group=group1_name),
        group2_values.melt(var_name="Feature", value_name="Value").assign(Group=group2_name)
    ])

    # Create the boxplot
    plt.figure(figsize=(22, 8))
    sns.boxplot(x="Feature", y="Value", hue="Group", data=combined, showmeans=True)
    plt.xticks(rotation=45)  # Rotate x-axis labels for better visibility
    plt.title(f"{group_name} Features")
    plt.xlabel("Features")
    plt.ylabel("Values")
    plt.tight_layout()
    plt.show()

# Load your original data
data_path = "AllData/Figshare/Processed_FIF/all_combined_features_filled.csv"  # Replace with the correct file path
data = pd.read_csv(data_path)

# Group separation
group_column = 'Group'
group1_name = 'H'
group2_name = 'MDD'

# Extract each group's rows, drop the label column and coerce every remaining
# column to numeric (values that cannot be parsed become NaN)
group1_data = (
    data[data[group_column] == group1_name]
    .drop(columns=[group_column])
    .apply(pd.to_numeric, errors='coerce')
)
group2_data = (
    data[data[group_column] == group2_name]
    .drop(columns=[group_column])
    .apply(pd.to_numeric, errors='coerce')
)

# One boxplot figure per feature family: Aperiodic, Entropy, then Coherence
feature_families = [
    (aperiodic_features, "Aperiodic"),
    (entropy_features, "Entropy"),
    (coherence_features, "Coherence"),
]
for family_results, family_name in feature_families:
    plot_group(family_results, family_name, group1_name, group2_name, group1_data, group2_data)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

# Load the t-test results and the full feature table
significant_results_path = 't_test_results_corrected.csv'  # Replace with actual file path
all_features_path = 'AllData/Figshare/Processed_FIF/all_combined_features_filled.csv'    # Replace with actual file path

significant_results = pd.read_csv(significant_results_path)
all_features = pd.read_csv(all_features_path)

# Keep rows that are both coherence features and flagged significant.
# (The boolean columns are combined directly; comparing the result to True
# would not change it.)
is_coherence = significant_results['Feature'].str.contains('Coherence')
is_significant = significant_results['significant']
significant_coherence = significant_results[is_coherence & is_significant]

# Frequency bands to draw, one figure each
bands = ['Alpha', 'Beta', 'Delta', 'Gamma', 'Theta']

# Hand-placed 2D (x, y) drawing coordinates for the 19 channels, laid out
# row by row from the top of the figure (y near 1) to the bottom (y near 0)
channel_positions = {
    'Fp1': (0.2, 0.9), 'Fp2': (0.8, 0.9),
    'F7': (0.1, 0.7), 'F3': (0.35, 0.8), 'Fz': (0.5, 0.85), 'F4': (0.65, 0.8), 'F8': (0.9, 0.7),
    'T3': (0.05, 0.5), 'C3': (0.35, 0.5), 'Cz': (0.5, 0.5), 'C4': (0.65, 0.5), 'T4': (0.95, 0.5),
    'T5': (0.1, 0.3), 'P3': (0.35, 0.2), 'Pz': (0.5, 0.15), 'P4': (0.65, 0.2), 'T6': (0.9, 0.3),
    'O1': (0.2, 0.1), 'O2': (0.8, 0.1),
}
# Function to extract and calculate mean difference
def get_band_connectivity(band):
    """Build the edge list for one frequency band.

    Reads the notebook-level tables ``significant_coherence`` (significant
    coherence features) and ``all_features`` (per-subject feature values).

    Parameters
    ----------
    band : str
        Band name; a feature is selected when this text occurs anywhere in
        its name (case-sensitive substring match).

    Returns
    -------
    list of tuple
        One ``(channel_a, channel_b, mean_diff)`` tuple per selected feature.
        The channels are the first two '-'-separated parts of the text before
        the first '_' in the feature name, and ``mean_diff`` is the mean over
        subjects whose ID starts with 'MDD' minus the mean over subjects whose
        ID starts with 'H'.
    """
    # The group masks do not depend on the feature, so build them once
    # instead of once per matching feature.
    subject_ids = all_features['Subject']
    healthy_rows = subject_ids.str.startswith('H')
    mdd_rows = subject_ids.str.startswith('MDD')

    edges = []
    # Only the feature name is needed, so iterate the column directly
    for feature in significant_coherence['Feature']:
        if band not in feature:
            continue
        channels = feature.split('_')[0].split('-')
        group_1_mean = all_features.loc[healthy_rows, feature].mean()
        group_2_mean = all_features.loc[mdd_rows, feature].mean()
        # Positive difference = higher mean in the 'MDD' subjects
        edges.append((channels[0], channels[1], group_2_mean - group_1_mean))
    return edges

# Function to plot functional connectivity map
def plot_connectivity(edges, title, width_scale=15):
    """Draw a connectivity graph from weighted channel pairs.

    Parameters
    ----------
    edges : iterable of tuple
        ``(node_a, node_b, value)`` triples. ``abs(value)`` sets the line
        width and the sign of ``value`` sets the colour. Every node must have
        an entry in the notebook-level ``channel_positions`` layout.
    title : str
        Figure title.
    width_scale : float, optional
        Multiplier turning ``abs(value)`` into a line width (default 15).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    G = nx.Graph()
    for node_a, node_b, value in edges:
        # Keep the sign on the edge itself so the colour can never be
        # separated from the edge it belongs to.
        G.add_edge(node_a, node_b, weight=abs(value), increased=value > 0)

    pos = channel_positions

    # G.edges() iterates in adjacency order, which can differ from the order of
    # `edges` and collapses repeated pairs. Derive the edge list, widths and
    # colours from the same iteration so they always line up.
    drawn_edges = list(G.edges(data=True))
    edgelist = [(u, v) for u, v, _ in drawn_edges]
    widths = [attrs['weight'] * width_scale for _, _, attrs in drawn_edges]
    # Purple for a positive value (increase), blue otherwise (decrease)
    edge_colors = ['purple' if attrs['increased'] else 'tab:blue' for _, _, attrs in drawn_edges]

    plt.figure(figsize=(5, 5))
    nx.draw_networkx_nodes(G, pos, node_size=700, node_color='grey')
    nx.draw_networkx_edges(
        G, pos, edgelist=edgelist, edge_color=edge_colors, width=widths, alpha=0.8
    )
    nx.draw_networkx_labels(G, pos, font_size=8)
    plt.title(title, fontsize=10)
    plt.show()

# Render one connectivity figure per frequency band
for band in bands:
    band_edges = get_band_connectivity(band)
    band_title = f'Functional Connectivity Changes in MDD - {band} Band'
    plot_connectivity(band_edges, band_title)


In [None]:
from nilearn import datasets, surface, plotting
import numpy as np
from matplotlib import pyplot as plt

# Fetch the fsaverage brain template and load its left-hemisphere pial mesh
fsaverage = datasets.fetch_surf_fsaverage()
pial_surface = surface.load_surf_mesh(fsaverage['pial_left'])

# Hand-written example [x, y, z] positions for the 19 channels, listed as
# left/right pairs followed by the three midline channels
node_positions = {
    'Fp1': [-30, 90, 60], 'Fp2': [30, 90, 60],
    'F3': [-50, 50, 60], 'F4': [50, 50, 60],
    'C3': [-70, 0, 50], 'C4': [70, 0, 50],
    'P3': [-50, -50, 60], 'P4': [50, -50, 60],
    'O1': [-30, -90, 60], 'O2': [30, -90, 60],
    'F7': [-90, 50, 40], 'F8': [90, 50, 40],
    'T3': [-100, 0, 20], 'T4': [100, 0, 20],
    'T5': [-90, -50, 40], 'T6': [90, -50, 40],
    'Fz': [0, 70, 70], 'Cz': [0, 0, 80], 'Pz': [0, -70, 70]
}

# Example coherence edges as (source, destination, weight)
edges = [
    ('Fp1', 'Fp2', 0.8), ('F3', 'F4', 0.7), ('C3', 'Cz', 0.6), ('Cz', 'C4', 0.9),
    ('P3', 'P4', 0.5), ('O1', 'O2', 0.4), ('F7', 'F8', 0.3), ('T3', 'T4', 0.6)
]

# Min-max scale the weights to [0, 1] so they can index a colormap
edge_weights = [weight for _, _, weight in edges]
min_weight = min(edge_weights)
max_weight = max(edge_weights)
normalized_weights = [(weight - min_weight) / (max_weight - min_weight) for weight in edge_weights]

# Pair up the end-point coordinates of every edge
edge_coords = [(node_positions[src], node_positions[dest]) for src, dest, _ in edges]

# Plot the brain surface on a 3D axis
fig = plt.figure(figsize=(12, 12))
ax = fig.add_subplot(111, projection='3d')
plotting.plot_surf(pial_surface, axes=ax, cmap='coolwarm', alpha=0.6)

# Draw each edge as a straight segment coloured by its normalised weight
for (start, end), scaled_weight in zip(edge_coords, normalized_weights):
    xs, ys, zs = zip(start, end)  # regroup the two points into per-axis pairs
    ax.plot(xs, ys, zs, color=plt.cm.viridis(scaled_weight), linewidth=2)

# Mark every channel position with a red dot labelled by its channel name
for channel, (x, y, z) in node_positions.items():
    ax.scatter(x, y, z, color='red', s=50, label=channel)

ax.set_title("3D Functional Connectivity")
plt.show()


Flowchart

In [None]:
from graphviz import Digraph

# Create a directed graph
dot = Digraph()

# Workflow stages in execution order; the first and last entries are terminators
stages = [
    "Start",
    "Data Preprocessing",
    "Feature Extraction",
    "Feature Standardization",
    "Feature Selection & Dimensional Reduction",
    "Model Building & Fine-tuning",
    "Discussion & Conclusion",
    "End",
]

# Define nodes: ellipses for the two terminators, boxes for every step in between
for stage in stages:
    dot.node(stage, shape="ellipse" if stage in ("Start", "End") else "box")

# Define edges: connect each stage to the one that follows it
dot.edges(list(zip(stages, stages[1:])))

# Render and save as a PNG file (the intermediate DOT source is removed)
dot.render("workflow_diagram", format="png", cleanup=True)

# Display the image as the cell output
from IPython.display import Image
Image("workflow_diagram.png")
