# Binary Classification of Neutral vs Emotional States Using BVP Signals

This notebook implements a binary classification system to distinguish between neutral and emotional states using Blood Volume Pulse (BVP) signals.

## Import Required Libraries

Import NumPy, pandas, matplotlib, seaborn, SciPy, scikit-learn, and imbalanced-learn, then fix the random seed and the plotting style.

In [None]:
# Import Required Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import signal
from scipy.stats import skew, kurtosis

# Scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, confusion_matrix, classification_report)

# Imbalanced-learn for undersampling
from imblearn.under_sampling import RandomUnderSampler

# Set random seed for reproducibility
# This seeds NumPy's global RNG; estimators and samplers that take a
# random_state argument should still be given one explicitly.
np.random.seed(42)

# Display settings
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*'.
# Use whichever spelling this installation provides so the notebook runs on
# both older and newer releases; if neither exists the default style is kept.
for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
sns.set_palette("husl")
print("Libraries imported successfully!")

## Load and Explore Dataset

Load the BVP signal dataset, display basic statistics, and visualize the data distribution.

In [None]:
# Load the BVP signal dataset
# Adjust the file path according to your dataset location
data_path = 'path_to_your_bvp_dataset.csv'  # Update this path
df = pd.read_csv(data_path)

# Display basic information
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\nBasic Statistics:")
print(df.describe())

In [None]:
# Count the samples of every emotion class once; both charts and the printed
# summary below are driven by this single Series.
class_counts = df['emotion'].value_counts()

fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: absolute number of samples per class
class_counts.plot(kind='bar', ax=bar_ax)
bar_ax.set_title('Original Class Distribution')
bar_ax.set_xlabel('Emotion Class')
bar_ax.set_ylabel('Count')
plt.setp(bar_ax.get_xticklabels(), rotation=45)

# Right panel: share of each class in percent
class_counts.plot(kind='pie', autopct='%1.1f%%', ax=pie_ax)
pie_ax.set_title('Class Distribution (%)')
pie_ax.set_ylabel('')

plt.tight_layout()
plt.show()

print("\nClass Distribution:")
print(class_counts)

## Preprocess Data

Clean the dataset by dropping rows with missing values and removing duplicate rows. Feature normalization is applied later, after the train/test split.

In [None]:
# Report missing values per column; the same counts decide whether any
# cleaning is needed.
missing_per_column = df.isnull().sum()
print("Missing values per column:")
print(missing_per_column)

# Handle missing values (if any)
if missing_per_column.sum() > 0:
    print("\nHandling missing values...")
    # Rows containing any missing value are dropped. Imputation, e.g.
    # df.fillna(df.mean()), would be the alternative if rows must be kept.
    df = df.dropna()
    print(f"Dataset shape after handling missing values: {df.shape}")

# Report and remove rows that are exact duplicates of an earlier row
duplicates = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")
if duplicates > 0:
    df = df.drop_duplicates()
    print(f"Dataset shape after removing duplicates: {df.shape}")

print("\nData preprocessing completed!")

## Extract Neutral and Emotional Classes

Filter the dataset to separate the neutral class and combine all other classes as emotional.

In [None]:
# Create binary classification labels
# Assuming 'neutral' is one of the emotion classes, adjust as needed
neutral_label = 'neutral'  # Update this based on your dataset

# Fail early with a clear message when the configured neutral label does not
# occur in the data; otherwise every row would silently become "emotional".
if not (df['emotion'] == neutral_label).any():
    available_classes = sorted(map(str, df['emotion'].dropna().unique()))
    raise ValueError(
        f"Neutral label {neutral_label!r} not found in df['emotion']; "
        f"available classes: {available_classes}"
    )

# Create binary labels: 0 for neutral, 1 for emotional
df['binary_label'] = (df['emotion'] != neutral_label).astype(int)

# value_counts() orders its result by frequency, not by class id. Sort by the
# class id so that 0/1 always appear in the same order and the human-readable
# names below are attached to the correct class.
binary_counts = df['binary_label'].value_counts().sort_index()
binary_class_names = {0: 'Neutral', 1: 'Emotional'}

# Display the new distribution
print("Binary Classification Distribution:")
print(binary_counts)

# Visualize binary distribution
fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(10, 4))

binary_counts.plot(kind='bar', ax=bar_ax)
bar_ax.set_title('Binary Class Distribution (Before Balancing)')
bar_ax.set_xlabel('Class (0: Neutral, 1: Emotional)')
bar_ax.set_ylabel('Count')
plt.setp(bar_ax.get_xticklabels(), rotation=0)

# Derive the wedge labels from the index instead of assuming a fixed order
labels = [binary_class_names[class_id] for class_id in binary_counts.index]
binary_counts.plot(kind='pie', labels=labels, autopct='%1.1f%%', ax=pie_ax)
pie_ax.set_title('Binary Class Distribution (%)')
pie_ax.set_ylabel('')

plt.tight_layout()
plt.show()

## Undersample Emotional Classes

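Randomly undersample the majority class (expected to be the emotional class, since it combines every non-neutral emotion) so that both classes end up with the same number of samples.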

In [None]:
# Build the feature matrix and the target vector.
# Every column other than 'emotion' and 'binary_label' is treated as part of
# the BVP signal -- presumably the file holds nothing but signal samples and
# the emotion label; verify against the dataset.
label_columns = ['emotion', 'binary_label']
feature_columns = [col for col in df.columns if col not in label_columns]
X, y = df[feature_columns], df['binary_label']

print(f"Features shape: {X.shape}")
print(f"Labels shape: {y.shape}")

# Randomly drop samples of the larger class until both classes are equal.
# NOTE(review): resampling happens before the train/test split, so the test
# set is balanced as well -- confirm that this is the intended evaluation.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

balanced_counts = pd.Series(y_resampled).value_counts()

print(f"\nAfter undersampling:")
print(f"Features shape: {X_resampled.shape}")
print(f"Labels shape: {y_resampled.shape}")
print(f"\nClass distribution after undersampling:")
print(balanced_counts)

# Bar chart of the balanced class counts
fig, ax = plt.subplots(figsize=(8, 4))
balanced_counts.plot(kind='bar', ax=ax)
ax.set_title('Balanced Class Distribution (After Undersampling)')
ax.set_xlabel('Class (0: Neutral, 1: Emotional)')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=0)
plt.tight_layout()
plt.show()

## Feature Engineering

Extract relevant features from the BVP signals, such as mean, standard deviation, and frequency-domain features.

In [None]:
def extract_bvp_features(signal_data, fs=1.0):
    """
    Extract time-domain and frequency-domain features from one BVP signal

    Parameters:
    signal_data: 1-D array-like (list, numpy array or pandas Series) holding
        the samples of a single BVP signal; must contain at least one sample
    fs: sampling frequency used to scale 'dominant_freq'. With the default of
        1.0 the frequency is expressed in cycles per sample.

    Returns:
    Dictionary of extracted features with the keys 'mean', 'std', 'min',
    'max', 'median', 'range', 'skewness', 'kurtosis', 'mean_diff', 'std_diff',
    'dominant_freq', 'spectral_energy' and 'spectral_entropy'

    Raises:
    ValueError: if signal_data is empty or fs is not positive
    """
    if fs <= 0:
        raise ValueError("fs must be a positive sampling frequency")

    # Accept Series, lists and arrays alike and work on a flat float array
    samples = np.asarray(signal_data, dtype=float).ravel()
    if samples.size == 0:
        raise ValueError("signal_data must contain at least one sample")

    features = {}

    # Time-domain features
    features['mean'] = np.mean(samples)
    features['std'] = np.std(samples)
    features['min'] = np.min(samples)
    features['max'] = np.max(samples)
    features['median'] = np.median(samples)
    features['range'] = np.ptp(samples)
    features['skewness'] = skew(samples)
    features['kurtosis'] = kurtosis(samples)

    # First-order differences (discrete first derivative)
    first_diff = np.diff(samples)
    if first_diff.size:
        features['mean_diff'] = np.mean(first_diff)
        features['std_diff'] = np.std(first_diff)
    else:
        # A single sample has no differences; report 0 rather than NaN
        features['mean_diff'] = 0.0
        features['std_diff'] = 0.0

    # Frequency-domain features
    power_spectrum = np.abs(np.fft.fft(samples)) ** 2
    # Total energy over the full two-sided spectrum (DC included)
    features['spectral_energy'] = np.sum(power_spectrum)

    # The signal is real, so the non-negative half of the spectrum carries all
    # the information. The DC bin is excluded: it only reflects the signal's
    # mean and would otherwise win the argmax for any signal with an offset.
    freqs = np.fft.rfftfreq(samples.size, d=1.0 / fs)
    ac_freqs = freqs[1:]
    ac_power = power_spectrum[1:freqs.size]
    total_ac_power = np.sum(ac_power)

    if ac_power.size == 0 or total_ac_power == 0:
        # No oscillating component at all (single sample or constant signal)
        features['dominant_freq'] = 0.0
        features['spectral_entropy'] = 0.0
    else:
        features['dominant_freq'] = ac_freqs[np.argmax(ac_power)]
        # Shannon entropy is defined on a probability distribution, so the
        # power spectrum is normalised to sum to 1 first. Zero-power bins are
        # skipped because their contribution p*log2(p) tends to 0.
        probabilities = ac_power / total_ac_power
        probabilities = probabilities[probabilities != 0]
        features['spectral_entropy'] = -np.sum(probabilities * np.log2(probabilities))

    return features

# Turn every resampled signal (one row = one signal) into a feature dictionary
print("Extracting features from BVP signals...")
rows_in_frame = isinstance(X_resampled, pd.DataFrame)
feature_list = [
    extract_bvp_features(X_resampled.iloc[row] if rows_in_frame else X_resampled[row])
    for row in range(len(X_resampled))
]

# One row per signal, one column per extracted feature
X_features = pd.DataFrame(feature_list)

print(f"\nExtracted features shape: {X_features.shape}")
print("\nFeature names:")
print(X_features.columns.tolist())
print("\nFirst few rows of extracted features:")
print(X_features.head())

In [None]:
# Visualize feature distributions
# The grid is sized from the number of feature columns so that no feature is
# silently left out (a fixed 3x4 grid only has room for 12 histograms).
n_features = X_features.shape[1]
n_cols = 4
n_rows = max(1, (n_features + n_cols - 1) // n_cols)  # ceiling division
# Keep the original panel height (10 inches for 3 rows) for any row count
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 10 * n_rows / 3), squeeze=False)
axes = axes.ravel()

for idx, col in enumerate(X_features.columns):
    axes[idx].hist(X_features[col], bins=30, alpha=0.7, edgecolor='black')
    axes[idx].set_title(f'{col}')
    axes[idx].set_xlabel('Value')
    axes[idx].set_ylabel('Frequency')

# Hide the panels of the last row that have no feature to show
for unused_ax in axes[n_features:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.suptitle('Distribution of Extracted Features', y=1.001, fontsize=14)
plt.show()

## Split Data into Training and Testing Sets

Split the dataset into training and testing sets using scikit-learn's train_test_split function.

In [None]:
# Hold out 20% of the samples for testing; stratifying on the labels keeps the
# class ratio the same in both subsets.
split_options = dict(test_size=0.2, random_state=42, stratify=y_resampled)
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_resampled, **split_options
)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")
print(f"\nTraining set class distribution:")
print(pd.Series(y_train).value_counts())
print(f"\nTesting set class distribution:")
print(pd.Series(y_test).value_counts())

# Standardise the features. The scaler learns its mean and variance from the
# training set only and the same transformation is then applied to both sets.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nFeatures normalized using StandardScaler")

## Train Binary Classification Model

Train three binary classifiers (logistic regression, an RBF-kernel SVM, and a random forest) on the standardized training data.

In [None]:
# Candidate models, keyed by the display name used in all later reports
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Support Vector Machine': SVC(kernel='rbf', random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

# Fit every candidate on the scaled training data and keep the fitted
# estimator under the same display name
trained_models = {}

print("Training classifiers...\n")
for name in classifiers:
    print(f"Training {name}...")
    clf = classifiers[name]
    # fit() returns the estimator itself, so the fitted model can be stored directly
    trained_models[name] = clf.fit(X_train_scaled, y_train)
    print(f"{name} trained successfully!\n")

print("All models trained!")

## Evaluate Model Performance

Evaluate the model's performance using metrics such as accuracy, precision, recall, and F1-score.

In [None]:
# Score every trained model on the held-out test set and collect the metrics
# in `results` (model name -> metric name -> value)
results = {}

banner = "=" * 80
print(banner)
print("MODEL EVALUATION RESULTS")
print(banner)

for name, clf in trained_models.items():
    print(f"\n{name}:")
    print("-" * 80)

    # Predictions on the scaled test features
    y_pred = clf.predict(X_test_scaled)

    # Metrics for this model; precision, recall and F1 use scikit-learn's
    # default binary settings
    scores = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred)
    }
    results[name] = scores

    # Headline numbers
    print(f"Accuracy:  {scores['Accuracy']:.4f}")
    print(f"Precision: {scores['Precision']:.4f}")
    print(f"Recall:    {scores['Recall']:.4f}")
    print(f"F1-Score:  {scores['F1-Score']:.4f}")

    # Per-class breakdown
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Neutral', 'Emotional']))

    # Raw counts of true vs. predicted classes
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(cm)

print("\n" + banner)

In [None]:
# Visualize model comparison
# Rows are models, columns are metrics
results_df = pd.DataFrame(results).T

# One panel per metric: (column of results_df, bar colour). Driving all four
# panels from this table replaces four copy-pasted blocks and guarantees that
# they stay formatted identically.
metric_panels = [
    ('Accuracy', 'skyblue'),
    ('Precision', 'lightcoral'),
    ('Recall', 'lightgreen'),
    ('F1-Score', 'plum'),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# axes.ravel() walks the grid row by row: top-left, top-right, bottom-left,
# bottom-right -- the same panel order as the metric list above
for ax, (metric, color) in zip(axes.ravel(), metric_panels):
    ax.bar(results_df.index, results_df[metric], color=color, edgecolor='black')
    ax.set_title(f'Model {metric} Comparison', fontsize=12, fontweight='bold')
    ax.set_ylabel(metric)
    ax.set_ylim([0, 1])  # all four metrics live in [0, 1]
    ax.tick_params(axis='x', rotation=15)

plt.tight_layout()
plt.show()

In [None]:
# Visualize confusion matrices for all models
# Create one panel per trained model instead of assuming exactly three.
# squeeze=False keeps `axes` two-dimensional even for a single model.
n_models = len(trained_models)
fig, axes = plt.subplots(1, n_models, figsize=(5 * n_models, 4), squeeze=False)
tick_names = ['Neutral', 'Emotional']

for ax, (name, clf) in zip(axes[0], trained_models.items()):
    y_pred = clf.predict(X_test_scaled)
    # Fix the label order so the matrix is always 2x2 and its rows/columns
    # line up with tick_names, even if a class is absent from y_test or y_pred
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                xticklabels=tick_names,
                yticklabels=tick_names)
    ax.set_title(f'{name}\nConfusion Matrix')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')

plt.tight_layout()
plt.show()

In [None]:
# Print the full metric table (one row per model)
divider = "=" * 80
print("\nSUMMARY OF MODEL PERFORMANCE:")
print(divider)
print(results_df.to_string())
print(divider)

# The model with the highest F1-score is reported as the best one
f1_by_model = results_df['F1-Score']
best_model_name = f1_by_model.idxmax()
best_f1_score = f1_by_model[best_model_name]

print(f"\nBest Model: {best_model_name}")
print(f"Best F1-Score: {best_f1_score:.4f}")

## Conclusion

This notebook successfully implemented a binary classification system to distinguish between neutral and emotional states using BVP signals. The workflow included:

1. Data loading and exploration
2. Preprocessing and cleaning
3. Binary class creation (Neutral vs Emotional)
4. Class balancing through undersampling
5. Feature extraction (time-domain and frequency-domain)
6. Model training (Logistic Regression, SVM, Random Forest)
7. Performance evaluation and comparison

The best performing model can be selected based on the evaluation metrics and used for real-time emotion detection applications.