In [1]:
import pandas as pd
import numpy as np


# Load EEG data
data = pd.read_csv('https://raw.githubusercontent.com/NolanRink/CS4540/refs/heads/main/HW7/EEG_data.csv')
print("Data shape:", data.shape)
data.head()


Data shape: (12811, 15)


Unnamed: 0,SubjectID,VideoID,Attention,Mediation,Raw,Delta,Theta,Alpha1,Alpha2,Beta1,Beta2,Gamma1,Gamma2,predefinedlabel,user-definedlabeln
0,0.0,0.0,56.0,43.0,278.0,301963.0,90612.0,33735.0,23991.0,27946.0,45097.0,33228.0,8293.0,0.0,0.0
1,0.0,0.0,40.0,35.0,-50.0,73787.0,28083.0,1439.0,2240.0,2746.0,3687.0,5293.0,2740.0,0.0,0.0
2,0.0,0.0,47.0,48.0,101.0,758353.0,383745.0,201999.0,62107.0,36293.0,130536.0,57243.0,25354.0,0.0,0.0
3,0.0,0.0,47.0,57.0,-5.0,2012240.0,129350.0,61236.0,17084.0,11488.0,62462.0,49960.0,33932.0,0.0,0.0
4,0.0,0.0,44.0,53.0,-8.0,1005145.0,354328.0,37102.0,88881.0,45307.0,99603.0,44790.0,29749.0,0.0,0.0


# Part A
**Answer**: The dataset contains EEG recordings from 10 college students who watched video clips. The inputs include details like SubjectID and VideoID, as well as measures such as Attention and Mediation that indicate the students' focus and calmness. It also includes various EEG signal features, like the raw signal and power bands (Delta, Theta, Alpha1, Alpha2, Beta1, Beta2, Gamma1, Gamma2) that show brain activity in different frequency ranges. The outputs are two labels: one (predefinedlabel) shows if the video is expected to be confusing, and the other (user-definedlabeln) shows if the student actually felt confused. Since the target is a binary yes/no label, this is a classification problem.



In [2]:
from sklearn.model_selection import train_test_split

# Define the features and target variable
features = ['Attention', 'Mediation', 'Delta', 'Theta', 'Alpha1', 'Alpha2', 'Beta1', 'Beta2', 'Gamma1', 'Gamma2']
X = data[features]
y = data['user-definedlabeln']

# Split into training and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training samples:", X_train.shape[0], "Test samples:", X_test.shape[0])

Training samples: 10248 Test samples: 2563


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Create and train the logistic regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Predict on test data
preds_lr = log_reg.predict(X_test)
accuracy_lr = accuracy_score(y_test, preds_lr)
print("Logistic Regression Accuracy: {:.2f}%".format(accuracy_lr * 100))

Logistic Regression Accuracy: 60.16%


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Convert our training and test data to PyTorch tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

# Create DataLoader for the training set
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Define neural network classifier
class NNClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(NNClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

input_dim = X_train.shape[1]
hidden_dim = 16
# For binary classification, we set output_dim = 2.
output_dim = 2

# Initialize the model, loss function, and optimizer
model = NNClassifier(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
num_epochs = 250
for epoch in range(num_epochs):
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    if (epoch+1) % 10 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}')

# Evaluate the model on the test set
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, preds_nn = torch.max(outputs, 1)
    accuracy_nn = (preds_nn == y_test_tensor).float().mean()
    print("Neural Network Accuracy: {:.2f}%".format(accuracy_nn.item() * 100))

Epoch 10/250, Loss: 0.6779
Epoch 20/250, Loss: 0.6946
Epoch 30/250, Loss: 0.6830
Epoch 40/250, Loss: 0.6910
Epoch 50/250, Loss: 0.6916
Epoch 60/250, Loss: 0.6940
Epoch 70/250, Loss: 0.6978
Epoch 80/250, Loss: 0.6799
Epoch 90/250, Loss: 0.6879
Epoch 100/250, Loss: 0.6927
Epoch 110/250, Loss: 0.7033
Epoch 120/250, Loss: 0.7270
Epoch 130/250, Loss: 0.6895
Epoch 140/250, Loss: 0.7150
Epoch 150/250, Loss: 0.6966
Epoch 160/250, Loss: 0.6586
Epoch 170/250, Loss: 0.7081
Epoch 180/250, Loss: 0.6935
Epoch 190/250, Loss: 0.7000
Epoch 200/250, Loss: 0.7087
Epoch 210/250, Loss: 0.7185
Epoch 220/250, Loss: 0.6931
Epoch 230/250, Loss: 0.6805
Epoch 240/250, Loss: 0.7019
Epoch 250/250, Loss: 0.6758
Neural Network Accuracy: 49.36%


In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("Normalized features shape:", X_scaled.shape)

Normalized features shape: (12811, 10)


In [6]:
z_scores = np.abs((X_scaled - np.mean(X_scaled, axis=0)) / np.std(X_scaled, axis=0))
mask = (z_scores < 3).all(axis=1)
print("Number of samples before outlier removal:", X_scaled.shape[0])
print("Number of samples after outlier removal:", np.sum(mask))

# Create new datasets without outliers
X_no_outliers = X_scaled[mask]
y_no_outliers = y[mask]

Number of samples before outlier removal: 12811
Number of samples after outlier removal: 11725


In [7]:
subset_features = ['Delta', 'Theta', 'Alpha1', 'Alpha2', 'Beta1', 'Beta2', 'Gamma1', 'Gamma2']
X_subset = data[subset_features]
X_subset_scaled = scaler.fit_transform(X_subset)
print("Subset features shape:", X_subset_scaled.shape)

Subset features shape: (12811, 8)


In [8]:
mask_subjects = data['SubjectID'] < 5
X_subjects = data.loc[mask_subjects, features]
y_subjects = data.loc[mask_subjects, 'user-definedlabeln']
X_subjects_scaled = scaler.fit_transform(X_subjects)
print("Subset (SubjectID < 5) features shape:", X_subjects_scaled.shape)

Subset (SubjectID < 5) features shape: (6455, 10)


In [9]:
# Split the engineered data into training and test sets
X_train_eng, X_test_eng, y_train_eng, y_test_eng = train_test_split(X_no_outliers, y_no_outliers, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train_eng_tensor = torch.tensor(X_train_eng, dtype=torch.float32)
y_train_eng_tensor = torch.tensor(y_train_eng.values, dtype=torch.long)
X_test_eng_tensor = torch.tensor(X_test_eng, dtype=torch.float32)
y_test_eng_tensor = torch.tensor(y_test_eng.values, dtype=torch.long)

# Create a DataLoader for the engineered training data
train_dataset_eng = TensorDataset(X_train_eng_tensor, y_train_eng_tensor)
train_loader_eng = DataLoader(train_dataset_eng, batch_size=16, shuffle=True)

# Reinitialize the neural network
model_eng = NNClassifier(input_dim, hidden_dim, output_dim)
optimizer_eng = optim.Adam(model_eng.parameters(), lr=0.01)

# Train the neural network on the engineered data
num_epochs = 250
for epoch in range(num_epochs):
    for batch_x, batch_y in train_loader_eng:
        optimizer_eng.zero_grad()
        outputs = model_eng(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer_eng.step()
    if (epoch+1) % 10 == 0:
        print(f'[Engineered] Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}')

# Evaluate the engineered model
with torch.no_grad():
    outputs = model_eng(X_test_eng_tensor)
    _, preds_eng = torch.max(outputs, 1)
    accuracy_eng = (preds_eng == y_test_eng_tensor).float().mean()
    print("Neural Network Accuracy after Feature Engineering: {:.2f}%".format(accuracy_eng.item() * 100))

[Engineered] Epoch 10/250, Loss: 0.7677
[Engineered] Epoch 20/250, Loss: 0.6504
[Engineered] Epoch 30/250, Loss: 0.5875
[Engineered] Epoch 40/250, Loss: 0.5807
[Engineered] Epoch 50/250, Loss: 0.5241
[Engineered] Epoch 60/250, Loss: 0.4207
[Engineered] Epoch 70/250, Loss: 0.4775
[Engineered] Epoch 80/250, Loss: 0.8242
[Engineered] Epoch 90/250, Loss: 0.6484
[Engineered] Epoch 100/250, Loss: 0.6640
[Engineered] Epoch 110/250, Loss: 0.7604
[Engineered] Epoch 120/250, Loss: 0.7511
[Engineered] Epoch 130/250, Loss: 0.7709
[Engineered] Epoch 140/250, Loss: 0.9634
[Engineered] Epoch 150/250, Loss: 0.5669
[Engineered] Epoch 160/250, Loss: 0.9132
[Engineered] Epoch 170/250, Loss: 0.7238
[Engineered] Epoch 180/250, Loss: 0.8800
[Engineered] Epoch 190/250, Loss: 0.7723
[Engineered] Epoch 200/250, Loss: 0.5165
[Engineered] Epoch 210/250, Loss: 0.6807
[Engineered] Epoch 220/250, Loss: 0.3110
[Engineered] Epoch 230/250, Loss: 0.5952
[Engineered] Epoch 240/250, Loss: 0.8256
[Engineered] Epoch 250/25

# Part B/C
**Answer:** The logistic regression model achieved an accuracy of about 60.16%, while the initial neural network classifier reached around 49.36% accuracy. After applying feature engineering, normalizing the features to a standard scale, eliminating outliers to remove extreme values, selecting a subset of EEG frequency features, and filtering for a specific range of SubjectIDs the neural network’s accuracy improved to approximately 62.43%. This improvement shows that careful preprocessing and feature selection can significantly boost model performance by reducing noise and focusing on the most informative aspects of the data.
