# Binary Classification of Neutral vs Emotional States Using BVP Signals

This notebook implements a binary classification model to distinguish between neutral and emotional states using Blood Volume Pulse (BVP) signals.

## Import Required Libraries

Import libraries such as NumPy, pandas, matplotlib, scikit-learn, and any other necessary libraries for signal processing and classification.

In [None]:
# Import Required Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import signal
from scipy.stats import skew, kurtosis

# Scikit-learn imports
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, classification_report,
                             roc_curve, auc, roc_auc_score)

# Suppress warnings
# NOTE(review): the 'ignore' filter silences every warning category for the
# rest of the session, so numerical or convergence warnings raised by the
# libraries used below will not be shown -- confirm this is intended.
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# This seeds NumPy's global random number generator only.
np.random.seed(42)

print("Libraries imported successfully!")

## Load and Explore Dataset

Load the dataset containing BVP signals and explore its structure, including the distribution of classes and signal characteristics.

In [None]:
# Load the dataset
# Note: Update the file path according to your dataset location
data_path = r'e:\Final Year Project\MyCodeSpace\Current(2026-02-21)\bvp_data.csv'

# Read the dataset
df = pd.read_csv(data_path)

# The rest of the notebook reads the class from a 'label' column, so stop here
# with a clear message instead of failing with a bare KeyError later on.
if 'label' not in df.columns:
    raise KeyError(
        f"Expected a 'label' column in {data_path!r}; found columns: {list(df.columns)[:10]}"
    )

# Display basic information
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
df.info()
print("\nBasic Statistics:")
print(df.describe())

In [None]:
# Explore class distribution
label_counts = df['label'].value_counts()
print("\nClass Distribution:")
print(label_counts)

# Visualize class distribution: absolute counts (left) and percentage share (right)
bar_pie_colors = ['skyblue', 'salmon']
fig, (ax_counts, ax_share) = plt.subplots(1, 2, figsize=(10, 5))

label_counts.plot(kind='bar', color=bar_pie_colors, ax=ax_counts)
ax_counts.set_title('Original Class Distribution')
ax_counts.set_xlabel('Class')
ax_counts.set_ylabel('Count')
plt.setp(ax_counts.get_xticklabels(), rotation=45)

label_counts.plot(kind='pie', autopct='%1.1f%%', colors=bar_pie_colors, ax=ax_share)
ax_share.set_title('Class Distribution (%)')
ax_share.set_ylabel('')

plt.tight_layout()
plt.show()

In [None]:
# Visualize sample BVP signals from different classes
plt.figure(figsize=(15, 6))

# One panel per class, using the first row found for that class
classes = df['label'].unique()
for panel_no, cls in enumerate(classes[:2], start=1):  # Show first 2 classes
    first_row = df.loc[df['label'] == cls].iloc[0]
    waveform = first_row.drop('label').values

    axis = plt.subplot(1, 2, panel_no)
    axis.plot(waveform)
    axis.set_title(f'Sample BVP Signal - Class: {cls}')
    axis.set_xlabel('Sample Index')
    axis.set_ylabel('BVP Amplitude')
    axis.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Preprocess Data

Clean the data by handling missing values and applying a band-pass filter to the BVP signals. Feature normalization is applied later, after the train/test split.

In [None]:
# Check for missing values
missing_per_column = df.isnull().sum()
print("Missing values per column:")
print(missing_per_column)

# Drop incomplete rows only when at least one value is missing
if missing_per_column.sum() > 0:
    print("\nHandling missing values...")
    df = df.dropna()  # or use df.fillna() with appropriate strategy
    remaining_missing = df.isnull().sum().sum()
    print("Missing values after handling:", remaining_missing)

In [None]:
# Apply bandpass filter to BVP signals
def apply_bandpass_filter(signal_data, lowcut=0.5, highcut=4.0, fs=64, order=4):
    """
    Apply a zero-phase Butterworth bandpass filter to a BVP signal.

    Typical BVP frequency range: 0.5-4.0 Hz (30-240 BPM)

    Parameters
    ----------
    signal_data : array-like
        One-dimensional sequence of samples.
    lowcut, highcut : float
        Pass-band edges in Hz; must satisfy 0 < lowcut < highcut < fs / 2.
    fs : float
        Sampling frequency in Hz.
    order : int
        Order passed to ``scipy.signal.butter``.

    Returns
    -------
    numpy.ndarray
        Filtered signal with the same length as the input.

    Raises
    ------
    ValueError
        If the cut-off frequencies are not inside (0, fs / 2) in increasing
        order, or (raised by SciPy) if the signal is too short to be padded.
    """
    nyquist = 0.5 * fs
    if not 0 < lowcut < highcut < nyquist:
        raise ValueError(
            f"Cut-offs must satisfy 0 < lowcut < highcut < fs/2; "
            f"got lowcut={lowcut}, highcut={highcut}, fs={fs}"
        )

    low = lowcut / nyquist
    high = highcut / nyquist

    # Second-order sections are the numerically robust representation for
    # band-pass designs; the (b, a) polynomial form can lose precision when
    # the pass band is narrow relative to the sampling rate, as it is here.
    sos = signal.butter(order, [low, high], btype='band', output='sos')

    # Forward-backward filtering keeps the output phase-aligned with the input.
    filtered_signal = signal.sosfiltfilt(sos, np.asarray(signal_data, dtype=float))
    return filtered_signal

# Separate features and labels
X = df.drop(columns='label').to_numpy()
y = df['label'].to_numpy()

# Run the bandpass filter over every row (one row = one signal)
print("Applying bandpass filter to BVP signals...")
X_filtered = np.array(list(map(apply_bandpass_filter, X)))

print("Filtering completed!")
print("Filtered data shape:", X_filtered.shape)

In [None]:
# Visualize effect of filtering
sample_idx = 0
before_after = [
    (X, 'Original BVP Signal'),
    (X_filtered, 'Filtered BVP Signal'),
]

# Same row shown before (left) and after (right) filtering
fig, panel_axes = plt.subplots(1, 2, figsize=(15, 5))
for axis, (signals, heading) in zip(panel_axes, before_after):
    axis.plot(signals[sample_idx])
    axis.set_title(heading)
    axis.set_xlabel('Sample Index')
    axis.set_ylabel('Amplitude')
    axis.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Balance Classes

Use undersampling to balance the neutral class with other emotional classes, ensuring a balanced dataset for binary classification.

In [None]:
# Create binary labels: Neutral (0) vs Emotional (1)
# Assuming 'neutral' is one specific class and all others are emotional
print("Original unique labels:", np.unique(y))

# Define neutral and emotional classes
# Adjust this based on your dataset's labeling scheme
neutral_label = 'neutral'  # or 0, depending on your dataset

# Compare element-wise through pandas so that a type mismatch between the
# stored labels and `neutral_label` (e.g. integer labels vs. a string) yields
# a full-length boolean mask of False values rather than a scalar.
is_neutral = pd.Series(y).eq(neutral_label).to_numpy()

# Without both classes the binary task is undefined and the balancing and
# splitting steps that follow cannot work, so stop here with a clear message.
if not is_neutral.any():
    raise ValueError(
        f"neutral_label={neutral_label!r} does not match any value in the "
        f"'label' column; update neutral_label to one of the labels printed above."
    )
if is_neutral.all():
    raise ValueError(
        f"Every sample is labelled {neutral_label!r}; there are no emotional samples."
    )

# Create binary labels
y_binary = np.where(is_neutral, 0, 1)

print("\nBinary Class Distribution:")
unique, counts = np.unique(y_binary, return_counts=True)
for label, count in zip(unique, counts):
    class_name = "Neutral" if label == 0 else "Emotional"
    print(f"{class_name} (Class {label}): {count}")

In [None]:
# Apply undersampling to balance classes
from sklearn.utils import resample

# Attach the binary label to the filtered signals
df_balanced = pd.DataFrame(X_filtered).assign(label=y_binary)

# Split by class, then name the larger group "majority"
# (the neutral group keeps that role when both groups have the same size)
neutral_rows = df_balanced[df_balanced['label'] == 0]
emotional_rows = df_balanced[df_balanced['label'] == 1]
if len(neutral_rows) >= len(emotional_rows):
    df_majority, df_minority = neutral_rows, emotional_rows
else:
    df_majority, df_minority = emotional_rows, neutral_rows

print(f"Majority class size: {len(df_majority)}")
print(f"Minority class size: {len(df_minority)}")

# Draw, without replacement, as many majority rows as there are minority rows
target_size = len(df_minority)
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=target_size,
    random_state=42,
)

# Stack both groups and shuffle the rows
df_balanced = (
    pd.concat([df_majority_downsampled, df_minority])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

print(f"\nBalanced dataset size: {len(df_balanced)}")
print("\nBalanced Class Distribution:")
print(df_balanced['label'].value_counts())

# Separate features and labels again
X_balanced = df_balanced.drop(columns='label').values
y_balanced = df_balanced['label'].values

In [None]:
# Visualize balanced class distribution
stages = [
    (y_binary, 'Before Balancing'),
    (y_balanced, 'After Balancing'),
]

# Class counts before (left) and after (right) undersampling
fig, stage_axes = plt.subplots(1, 2, figsize=(10, 5))
for axis, (labels, heading) in zip(stage_axes, stages):
    pd.Series(labels).value_counts().plot(kind='bar', color=['skyblue', 'salmon'], ax=axis)
    axis.set_title(heading)
    axis.set_xlabel('Class (0=Neutral, 1=Emotional)')
    axis.set_ylabel('Count')
    plt.setp(axis.get_xticklabels(), rotation=0)

plt.tight_layout()
plt.show()

## Feature Extraction

Extract relevant features from the BVP signals, such as time-domain and frequency-domain features, to use as input for the model.

In [None]:
# Define feature extraction functions
def extract_time_domain_features(signal_data):
    """Extract time-domain features from BVP signal.

    Parameters
    ----------
    signal_data : array-like
        One-dimensional sequence of samples (list, tuple or NumPy array of
        any numeric dtype).

    Returns
    -------
    dict
        Keys: 'mean', 'std', 'min', 'max', 'range', 'median', 'skewness',
        'kurtosis', 'rms', 'peak_to_peak'. 'range' and 'peak_to_peak' hold
        the same value (max - min); both are kept so the feature set is
        unchanged for existing callers.
    """
    # Work on a float array: squaring a narrow integer dtype (e.g. int16)
    # wraps around silently, and plain Python lists do not support `** 2`.
    samples = np.asarray(signal_data, dtype=float)

    lowest = np.min(samples)
    highest = np.max(samples)

    features = {
        'mean': np.mean(samples),
        'std': np.std(samples),
        'min': lowest,
        'max': highest,
        'range': highest - lowest,
        'median': np.median(samples),
        'skewness': skew(samples),
        'kurtosis': kurtosis(samples),
        'rms': np.sqrt(np.mean(samples**2)),
        'peak_to_peak': np.ptp(samples)
    }
    return features

def extract_frequency_domain_features(signal_data, fs=64):
    """Summarise the Welch power spectrum of a BVP signal.

    The spectrum is estimated with ``scipy.signal.welch`` using segments of
    256 samples, or the whole signal when it is shorter than that.

    Returns a dict with the mean, standard deviation, maximum and sum of the
    PSD values, plus the frequency of the strongest PSD bin.
    """
    segment_length = min(256, len(signal_data))
    freqs, psd = signal.welch(signal_data, fs=fs, nperseg=segment_length)

    # Index of the bin carrying the most power
    strongest_bin = np.argmax(psd)

    return {
        'spectral_mean': np.mean(psd),
        'spectral_std': np.std(psd),
        'spectral_max': np.max(psd),
        'dominant_frequency': freqs[strongest_bin],
        'spectral_energy': np.sum(psd),
    }

def extract_hrv_features(signal_data, fs=64):
    """Extract heart rate variability features.

    Peaks are located with ``scipy.signal.find_peaks`` using a minimum
    spacing of ``fs // 2`` samples; the gaps between consecutive peaks are
    converted to milliseconds and summarised.

    Parameters
    ----------
    signal_data : array-like
        One-dimensional sequence of samples.
    fs : float
        Sampling frequency in Hz.

    Returns
    -------
    dict
        'hrv_mean'  - mean inter-peak interval (ms)
        'hrv_std'   - standard deviation of the intervals (ms)
        'hrv_rmssd' - root mean square of successive interval differences (ms)
        All three are 0 when fewer than two peaks are found; 'hrv_rmssd' is
        0.0 when only one interval exists (no successive difference).
    """
    # Detect peaks (simplified peak detection).
    # find_peaks requires distance >= 1, so guard against fs < 2.
    min_peak_spacing = max(1, fs // 2)
    peaks, _ = signal.find_peaks(signal_data, distance=min_peak_spacing)

    if len(peaks) < 2:
        return {'hrv_mean': 0, 'hrv_std': 0, 'hrv_rmssd': 0}

    # Calculate RR intervals
    rr_intervals = np.diff(peaks) / fs * 1000  # in milliseconds

    # With exactly two peaks there is a single interval, so the array of
    # successive differences is empty; np.mean of an empty array is NaN,
    # which would end up as a NaN entry in the feature matrix.
    successive_diffs = np.diff(rr_intervals)
    if successive_diffs.size:
        rmssd = np.sqrt(np.mean(successive_diffs**2))
    else:
        rmssd = 0.0

    features = {
        'hrv_mean': np.mean(rr_intervals),
        'hrv_std': np.std(rr_intervals),
        'hrv_rmssd': rmssd
    }
    return features

print("Feature extraction functions defined successfully!")

In [None]:
# Extract features from all signals
print("Extracting features from BVP signals...")

# One record per signal: time-domain, frequency-domain and HRV features merged
# into a single dict (later groups would win on a key clash)
all_features = [
    {
        **extract_time_domain_features(row),
        **extract_frequency_domain_features(row),
        **extract_hrv_features(row),
    }
    for row in X_balanced
]

# Convert to DataFrame
X_features = pd.DataFrame(all_features)

print("Feature extraction completed!")
print(f"Feature matrix shape: {X_features.shape}")
print("\nExtracted features:")
print(list(X_features.columns))
print("\nFeature statistics:")
print(X_features.describe())

In [None]:
# Visualize feature correlations
correlation_matrix = X_features.corr()

# Annotated heatmap of the pairwise feature correlations
fig, corr_ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    square=True,
    linewidths=0.5,
    ax=corr_ax,
)
corr_ax.set_title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

## Split Data into Training and Testing Sets

Split the dataset into training and testing sets, ensuring that the split is stratified to maintain class balance.

In [None]:
# Split data into training and testing sets
# 80/20 split, stratified on the balanced labels, fixed seed
split_options = dict(test_size=0.2, random_state=42, stratify=y_balanced)
X_train, X_test, y_train, y_test = train_test_split(X_features, y_balanced, **split_options)

print("Data split completed!")
print(f"Training set size: {X_train.shape[0]}")
print(f"Testing set size: {X_test.shape[0]}")

train_distribution = pd.Series(y_train).value_counts()
test_distribution = pd.Series(y_test).value_counts()
print("\nTraining set class distribution:")
print(train_distribution)
print("\nTesting set class distribution:")
print(test_distribution)

In [None]:
# Normalize features using StandardScaler
# The scaler learns its statistics from the training split only and the same
# transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature normalization completed!")
for split_name, scaled in (("training", X_train_scaled), ("testing", X_test_scaled)):
    print(f"Scaled {split_name} data shape: {scaled.shape}")

## Train Binary Classification Model

Train several binary classification models (logistic regression, linear and RBF support vector machines, and a random forest) to classify neutral vs emotional states.

In [None]:
# Initialize multiple models for comparison
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'SVM (Linear)': SVC(kernel='linear', random_state=42, probability=True),
    'SVM (RBF)': SVC(kernel='rbf', random_state=42, probability=True),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
}

# Fit every model on the scaled training data and report its training accuracy
print("Training models...\n")
trained_models = {}

for name, model in models.items():
    print(f"Training {name}...")
    trained_models[name] = model.fit(X_train_scaled, y_train)

    # Accuracy on the data the model was fitted on
    train_acc = trained_models[name].score(X_train_scaled, y_train)
    print(f"{name} - Training Accuracy: {train_acc:.4f}")

print("\nAll models trained successfully!")

In [None]:
# Perform cross-validation for each model
print("\nCross-Validation Results (5-fold):\n")

cv_results = {}
for name, model in trained_models.items():
    # Scale inside each fold: a scaler fitted on the whole training set has
    # already seen every validation fold, which leaks their statistics into
    # the scores. Wrapping scaler + model in a pipeline makes cross_val_score
    # refit the scaler on the training part of each fold only.
    # cross_val_score works on clones, so the fitted model stored in
    # trained_models is left untouched.
    cv_pipeline = make_pipeline(StandardScaler(), model)
    cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='accuracy')
    cv_results[name] = cv_scores
    print(f"{name}:")
    print(f"  Mean CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print(f"  Individual Fold Scores: {cv_scores}")
    print()

## Evaluate Model Performance

Evaluate each model's performance on the held-out test set using accuracy, precision, recall, F1-score, ROC-AUC, and confusion matrices.

In [None]:
# Evaluate all models on test set
separator = "=" * 80
print("Model Evaluation on Test Set:\n")
print(separator)

evaluation_results = {}

for name, model in trained_models.items():
    # Hard predictions and the predicted probability of class 1
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    # Score the predictions against the held-out labels
    scores = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred),
        'roc_auc': roc_auc_score(y_test, y_pred_proba),
    }

    # Keep the raw predictions next to the metrics for the plots further down
    evaluation_results[name] = {**scores, 'y_pred': y_pred, 'y_pred_proba': y_pred_proba}

    print(f"\n{name}:")
    print(f"  Accuracy:  {scores['accuracy']:.4f}")
    print(f"  Precision: {scores['precision']:.4f}")
    print(f"  Recall:    {scores['recall']:.4f}")
    print(f"  F1-Score:  {scores['f1_score']:.4f}")
    print(f"  ROC-AUC:   {scores['roc_auc']:.4f}")

print("\n" + separator)

In [None]:
# Create comparison DataFrame
# evaluation_results also stores prediction arrays, so the frame built from it
# has object dtype. Keep only the scalar metric columns and cast them to float;
# pandas plotting and seaborn's heatmap both require numeric data.
metric_columns = ['accuracy', 'precision', 'recall', 'f1_score', 'roc_auc']
results_df = pd.DataFrame(evaluation_results).T[metric_columns].astype(float)

print("\nModel Performance Summary:")
print(results_df)

# Visualize model comparison
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Bar plot: one group of bars per model, one bar per metric
results_df.plot(kind='bar', ax=axes[0], rot=45)
axes[0].set_title('Model Performance Comparison')
axes[0].set_ylabel('Score')
axes[0].set_ylim([0, 1])
axes[0].legend(loc='lower right')
axes[0].grid(True, alpha=0.3)

# Heatmap: metrics as rows, models as columns
sns.heatmap(results_df.T, annot=True, fmt='.3f', cmap='YlGnBu', ax=axes[1], 
            cbar_kws={'label': 'Score'})
axes[1].set_title('Model Performance Heatmap')
axes[1].set_xlabel('Model')
axes[1].set_ylabel('Metric')

plt.tight_layout()
plt.show()

In [None]:
# Display confusion matrices for all models
class_names = ['Neutral', 'Emotional']
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
axes = axes.ravel()

# One panel per trained model, filled in dictionary order
for idx, name in enumerate(trained_models):
    panel = axes[idx]
    cm = confusion_matrix(y_test, evaluation_results[name]['y_pred'])

    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=panel,
        xticklabels=class_names,
        yticklabels=class_names,
    )
    panel.set_title(f'Confusion Matrix - {name}')
    panel.set_ylabel('True Label')
    panel.set_xlabel('Predicted Label')

plt.tight_layout()
plt.show()

In [None]:
# Display detailed classification reports
print("\nDetailed Classification Reports:\n")
print("=" * 80)

# Per-class precision / recall / F1 for every model, from the stored predictions
for name in trained_models:
    report = classification_report(
        y_test,
        evaluation_results[name]['y_pred'],
        target_names=['Neutral', 'Emotional'],
        digits=4,
    )
    print(f"\n{name}:")
    print(report)
    print("-" * 80)

In [None]:
# Plot ROC curves for all models
plt.figure(figsize=(10, 8))

# One curve per model, built from the class-1 probabilities stored earlier
for name in trained_models:
    stored = evaluation_results[name]
    fpr, tpr, _ = roc_curve(y_test, stored['y_pred_proba'])
    curve_label = f"{name} (AUC = {stored['roc_auc']:.3f})"
    plt.plot(fpr, tpr, linewidth=2, label=curve_label)

# Diagonal reference line
plt.plot([0, 1], [0, 1], 'k--', linewidth=2, label='Random Classifier (AUC = 0.500)')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curves - Binary Classification (Neutral vs Emotional)', fontsize=14)
plt.legend(loc='lower right', fontsize=10)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Select best model based on F1-score
# Cast to float first: results_df is derived from a dict that also holds
# arrays, so the column can have object dtype, on which idxmax is not allowed.
f1_by_model = results_df['f1_score'].astype(float)
best_model_name = f1_by_model.idxmax()
best_model = trained_models[best_model_name]

print(f"\nBest Model: {best_model_name}")
print(f"F1-Score: {f1_by_model.loc[best_model_name]:.4f}")
print(f"\nBest Model Parameters:")
print(best_model.get_params())

In [None]:
# Feature importance (for Random Forest)
if 'Random Forest' in trained_models:
    rf_model = trained_models['Random Forest']

    # Pair each feature name with its importance, largest first
    feature_importance = (
        pd.DataFrame({
            'feature': X_features.columns,
            'importance': rf_model.feature_importances_,
        })
        .sort_values('importance', ascending=False)
    )
    top_features = feature_importance.head(10)

    print("\nTop 10 Most Important Features (Random Forest):")
    print(top_features)

    # Horizontal bars, most important feature at the top
    plt.figure(figsize=(12, 6))
    plt.barh(top_features['feature'], top_features['importance'])
    plt.xlabel('Importance')
    plt.title('Top 10 Feature Importances - Random Forest')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

## Summary and Conclusions

Summary of the binary classification results for neutral vs emotional states using BVP signals.

In [None]:
# Generate final summary
banner = "=" * 80
print("\n" + banner)
print("FINAL SUMMARY - BINARY CLASSIFICATION (NEUTRAL VS EMOTIONAL)")
print(banner)

# Dataset sizes at each stage
print("\nDataset Information:")
print(f"  Total samples after balancing: {len(X_balanced)}")
print(f"  Number of features extracted: {X_features.shape[1]}")
print(f"  Training samples: {len(X_train)}")
print(f"  Testing samples: {len(X_test)}")

# Test-set metrics of the selected model; headings are padded so values align
print(f"\nBest Performing Model: {best_model_name}")
metric_headings = [
    ("Accuracy:  ", 'accuracy'),
    ("Precision: ", 'precision'),
    ("Recall:    ", 'recall'),
    ("F1-Score:  ", 'f1_score'),
    ("ROC-AUC:   ", 'roc_auc'),
]
for heading, column in metric_headings:
    print(f"  {heading}{results_df.loc[best_model_name, column]:.4f}")

print("\n" + banner)
print("\nClassification task completed successfully!")

In [None]:
# Generate final summary
# NOTE(review): this cell prints the same summary as the cell directly before
# it, so the report appears twice on Run All -- confirm whether the repetition
# is intentional or one of the two cells can be dropped.
print("\n" + "="*80)
print("FINAL SUMMARY - BINARY CLASSIFICATION (NEUTRAL VS EMOTIONAL)")
print("="*80)

# Dataset sizes: balanced total, number of feature columns, and split sizes
print(f"\nDataset Information:")
print(f"  Total samples after balancing: {len(X_balanced)}")
print(f"  Number of features extracted: {X_features.shape[1]}")
print(f"  Training samples: {len(X_train)}")
print(f"  Testing samples: {len(X_test)}")

# Metrics of the model chosen in the "best model" cell, read from results_df
print(f"\nBest Performing Model: {best_model_name}")
print(f"  Accuracy:  {results_df.loc[best_model_name, 'accuracy']:.4f}")
print(f"  Precision: {results_df.loc[best_model_name, 'precision']:.4f}")
print(f"  Recall:    {results_df.loc[best_model_name, 'recall']:.4f}")
print(f"  F1-Score:  {results_df.loc[best_model_name, 'f1_score']:.4f}")
print(f"  ROC-AUC:   {results_df.loc[best_model_name, 'roc_auc']:.4f}")

print("\n" + "="*80)
print("\nClassification task completed successfully!")