In [8]:
import pandas as pd
import numpy as np
from scipy.signal import welch
from scipy.stats import linregress

# Step 1: Read the downsampled EEG recording, skipping the 'Unnamed: ...' columns
# (presumably index columns left over from earlier CSV exports - verify against the file)
df = pd.read_csv(
    'downsampled_final_125Hz.csv',
    usecols=lambda name: name not in ('Unnamed: 0.1', 'Unnamed: 0'),
)

# Order rows by participant, then epoch, then timestep, so that the samples of
# each epoch are in timestep order before feature extraction
df = df.sort_values(by=['participant_id', 'epoch', 'timestep'])
df = df.reset_index(drop=True)

# EEG channel columns (assumes 'channel_1' .. 'channel_8' are the relevant EEG channels)
eeg_channels = [f'channel_{index}' for index in range(1, 8 + 1)]

# Sampling rate in Hz (assumed to be 125 Hz, matching the input file name)
fs = 125

# Step 2: Define a function to extract features from each epoch
def extract_features_by_epoch(df, eeg_channels, fs=125):
    """Build one row of summary features per (participant_id, epoch) group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'participant_id', 'epoch', 'timestep',
        'trigger' and every column named in ``eeg_channels``. Rows are used
        in the order they appear inside each group, so the caller should
        sort by timestep beforehand.
    eeg_channels : list of str
        Names of the signal columns to extract features from.
    fs : int or float, default 125
        Sampling rate in Hz, used for the Welch PSD and the FFT frequency axis.

    Returns
    -------
    pandas.DataFrame
        One row per (participant_id, epoch). Columns, in order: identifiers
        and timestep range; per-channel max/min/mean/amplitude_range;
        per-channel alpha/beta/gamma power; per-channel FFT total power and
        dominant frequency; per-channel slope; pairwise mean channel
        differences; and finally 'trigger'.
    """
    feature_list = []

    # Group data by participant and epoch
    for (participant_id, epoch), group in df.groupby(['participant_id', 'epoch']):
        # Identifiers plus the first and last timestep present in the epoch
        features = {
            'participant_id': participant_id,
            'epoch': epoch,
            'timestep_start': group['timestep'].iloc[0],
            'timestep_end': group['timestep'].iloc[-1],
        }

        # Pull every channel's samples out once so that each feature family
        # below is guaranteed to work on the channel it is named after.
        signals = {channel: group[channel].values for channel in eeg_channels}

        # 1. Time-Domain Features (Max, Min, Mean, Amplitude Range)
        for channel in eeg_channels:
            values = signals[channel]
            highest = np.max(values)
            lowest = np.min(values)
            features[f'{channel}_max'] = highest
            features[f'{channel}_min'] = lowest
            features[f'{channel}_mean'] = np.mean(values)
            features[f'{channel}_amplitude_range'] = highest - lowest

        # 2. Frequency-Domain Features (Alpha, Beta, Gamma Power using Welch method)
        for channel in eeg_channels:
            values = signals[channel]
            # Segment length is capped at the epoch length so short epochs still work
            nperseg = min(len(values), fs)
            freqs, psd = welch(values, fs=fs, nperseg=nperseg)
            # Band power is the plain sum of the PSD bins inside each band.
            # NOTE(review): a bin at exactly 30 Hz is counted in both beta and
            # gamma; kept as-is so existing feature values do not change.
            features[f'{channel}_alpha_power'] = np.sum(psd[(freqs >= 8) & (freqs <= 12)])
            features[f'{channel}_beta_power'] = np.sum(psd[(freqs >= 13) & (freqs <= 30)])
            features[f'{channel}_gamma_power'] = np.sum(psd[(freqs >= 30) & (freqs <= 50)])

        # 3. Fast Fourier Transform (FFT) Features
        for channel in eeg_channels:
            values = signals[channel]
            fft_values = np.fft.fft(values)
            fft_freqs = np.fft.fftfreq(len(fft_values), d=1/fs)

            # Consider only the first half of the spectrum (non-negative frequencies)
            half = len(fft_values) // 2
            magnitudes = np.abs(fft_values[:half])

            # Total power in the frequency domain
            features[f'{channel}_fft_total_power'] = np.sum(magnitudes ** 2)
            # Frequency with the maximum amplitude.
            # NOTE(review): the 0 Hz (DC) bin is part of the search, so a signal
            # with a large non-zero mean may report 0 Hz here.
            features[f'{channel}_fft_dominant_frequency'] = fft_freqs[:half][np.argmax(magnitudes)]

        # 4. Signal Slope (Dynamic Feature)
        for channel in eeg_channels:
            # Regress THIS channel's samples against the sample index. The
            # previous code reused the array left over from the FFT loop, so
            # every channel received the slope of the last channel.
            values = signals[channel]
            features[f'{channel}_slope'] = linregress(np.arange(len(values)), values).slope

        # 5. Inter-Channel Differences (Spatial Features)
        for i in range(len(eeg_channels)):
            for j in range(i + 1, len(eeg_channels)):
                first, second = eeg_channels[i], eeg_channels[j]
                features[f'{first}_{second}_diff'] = np.mean(signals[first] - signals[second])

        # Add the trigger value (taken from the last timestep in the epoch)
        features['trigger'] = group['trigger'].iloc[-1]

        # Append extracted features for this epoch
        feature_list.append(features)

    # Convert list of feature dicts into a DataFrame
    return pd.DataFrame(feature_list)

# Step 3: Run the per-epoch feature extraction over the loaded recording
features_df = extract_features_by_epoch(df=df, eeg_channels=eeg_channels, fs=fs)

# Step 4: Persist the feature table as a CSV file, leaving out the DataFrame index
features_df.to_csv(path_or_buf='extracted_features.csv', index=False)

# Step 5: Optional sanity check - print the first five rows of the feature table
print(features_df.head(n=5))

   participant_id  epoch  timestep_start  timestep_end  channel_1_max  \
0               1      1     1303.574029   1356.425971      20.649432   
1               1      2     1348.574029   1401.425971      18.230486   
2               1      3     1391.574029   1444.425971      -1.165833   
3               1      4     1432.574029   1485.425971       8.848805   
4               1      5     1479.574029   1532.425971      13.029421   

   channel_1_min  channel_1_mean  channel_1_amplitude_range  channel_2_max  \
0     -13.017932        4.339864                  33.667364      13.707586   
1     -15.943325       -3.003764                  34.173811      13.845616   
2     -13.979912       -6.508167                  12.814079       2.827503   
3      -9.891811       -0.109365                  18.740615       6.261853   
4     -11.033756        1.093455                  24.063177      17.102832   

   channel_2_min  ...  channel_4_channel_6_diff  channel_4_channel_7_diff  \
0     -18.88350

| **Feature Name**                        | **Description**                                                                             |
|-----------------------------------------|---------------------------------------------------------------------------------------------|
| `participant_id`                        | Unique identifier for the participant, used to differentiate signal data from different individuals. |
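| `epoch`                                 | Identifier of the epoch; each (participant, epoch) group is the window over which all features below are computed. |
| `timestep_start`                        | The first timestep in the epoch, marking the beginning of the data segment.                 |
| `timestep_end`                          | The last timestep in the epoch, marking the end of the data segment.                        |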
| `channel_X_max`                         | Maximum value of channel `X` within the current window, capturing the peak signal.          |
| `channel_X_min`                         | Minimum value of channel `X` within the current window, capturing the lowest signal.        |
| `channel_X_mean`                        | Mean value of channel `X` within the current window, representing the overall signal level. |
| `channel_X_amplitude_range`             | Difference between the max and min values of channel `X`, showing the signal's amplitude range. |
| `channel_X_alpha_power`                 | Power of the alpha band (8-12 Hz) for channel `X`, linked to relaxation and attention.      |
| `channel_X_beta_power`                  | Power of the beta band (13-30 Hz) for channel `X`, associated with cognitive activity and focus. |
| `channel_X_gamma_power`                 | Power of the gamma band (30-50 Hz) for channel `X`, related to high-level cognitive processes. |
| `channel_X_fft_total_power`             | Total power in the frequency domain of channel `X`, representing the signal's overall energy. |
| `channel_X_fft_dominant_frequency`      | Dominant frequency of channel `X`, indicating the most significant frequency component in the signal. |
| `channel_X_slope`                       | Slope of the signal for channel `X` in the current window, indicating the trend (rising or falling) of the signal. |
| `channel_X_channel_Y_diff`              | Average difference between signals of channel `X` and channel `Y`, showing inter-channel relationships. |
| `trigger`                               | Event label or trigger value taken from the last timestep of the epoch, used to mark whether a specific event (e.g., P300) is associated with that epoch. |


In [3]:
# Display the loaded EEG DataFrame as the cell's output (the rendering below is abbreviated for large frames).
df

Unnamed: 0,participant_id,timestep,channel_1,channel_2,channel_3,channel_4,channel_5,channel_6,channel_7,channel_8,trigger,epoch
0,1,1303.574029,-7.943176,-10.110693,-8.009797,-10.577976,-6.892772,-18.811226,-11.462834,-0.436110,-1,1
1,1,1309.674672,-6.554362,-6.599419,-3.267752,-8.753163,-6.157648,-12.270321,-8.666925,0.687801,-1,1
2,1,1310.910740,-5.489454,-6.252415,-2.324811,-6.142551,-3.595988,-12.739827,-6.922504,3.100068,-1,1
3,1,1314.023307,-7.141268,-9.458043,-6.022758,-11.024632,-8.385484,-11.659327,-8.548741,0.253816,-1,1
4,1,1314.157032,-11.904786,-16.947476,-10.692527,-21.666799,-13.816173,-14.655011,-10.792701,-1.403710,-1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
149995,5,59613.842968,21.726968,7.991888,7.587836,11.370193,12.626895,10.287490,4.742663,5.315820,-1,1200
149996,5,59613.976693,23.632508,6.045436,3.164952,5.246110,9.995644,8.950257,3.230958,2.587833,-1,1200
149997,5,59617.089260,19.377458,1.491491,0.717728,-3.406795,8.014163,9.036889,5.544034,1.326462,-1,1200
149998,5,59618.325328,25.052931,4.940412,2.978819,-0.966594,9.234394,10.379730,4.078566,1.504858,-1,1200
