In [133]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



In [134]:

# Load the data
data = pd.read_csv("pd_speech_features.csv", header=1)

# Scale the features
scaler = StandardScaler()
data.iloc[:, 1:-1] = scaler.fit_transform(data.iloc[:, 1:-1])

# Identify binary/categorical columns and other features
binary_features = ['gender']  # Adjust if needed
other_features = [col for col in data.columns if col not in binary_features + ['id', 'class']]

# Aggregate only non-binary features with mean and std
aggregated_data = data.groupby('id')[other_features].agg(['mean', 'std']).reset_index()

# Flatten multi-level column names
aggregated_data.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in aggregated_data.columns]

# Add binary features back as-is
for feature in binary_features:
    aggregated_data[feature] = data.groupby('id')[feature].first().values

# Add the target variable
aggregated_data['class'] = data.groupby('id')['class'].first().values

# Print shapes to verify
print("Original data shape:", data.shape)
print("Aggregated data shape:", aggregated_data.shape)



  0.96874225  0.96874225  0.96874225 -1.03226633 -1.03226633 -1.03226633
 -1.03226633 -1.03226633 -1.03226633  0.96874225  0.96874225  0.96874225
  0.96874225  0.96874225  0.96874225  0.96874225  0.96874225  0.96874225
  0.96874225  0.96874225  0.96874225  0.96874225  0.96874225  0.96874225
  0.96874225  0.96874225  0.96874225  0.96874225  0.96874225  0.96874225
  0.96874225  0.96874225  0.96874225  0.96874225  0.96874225  0.96874225
 -1.03226633 -1.03226633 -1.03226633  0.96874225  0.96874225  0.96874225
 -1.03226633 -1.03226633 -1.03226633 -1.03226633 -1.03226633 -1.03226633
 -1.03226633 -1.03226633 -1.03226633  0.96874225  0.96874225  0.96874225
 -1.03226633 -1.03226633 -1.03226633  0.96874225  0.96874225  0.96874225
 -1.03226633 -1.03226633 -1.03226633  0.96874225  0.96874225  0.96874225
 -1.03226633 -1.03226633 -1.03226633 -1.03226633 -1.03226633 -1.03226633
  0.96874225  0.96874225  0.96874225 -1.03226633 -1.03226633 -1.03226633
 -1.03226633 -1.03226633 -1.03226633 -1.03226633 -1

Original data shape: (756, 755)
Aggregated data shape: (252, 1507)


In [135]:
from EFSA.main import FeatureSelector

# Feature matrix (X) and target vector (y)
X = aggregated_data.drop('class', axis=1)
y = aggregated_data['class']


selector = FeatureSelector(X=X, y=y)

selected_features = selector.comprehensive_feature_selection(k_features=200)

print(selected_features)


['tqwt_entropy_log_dec_2_std', 'tqwt_entropy_shannon_dec_15_mean', 'tqwt_meanValue_dec_25_std', 'tqwt_kurtosisValue_dec_15_std', 'tqwt_entropy_shannon_dec_25_mean', 'tqwt_entropy_shannon_dec_16_mean', 'tqwt_kurtosisValue_dec_7_mean', 'tqwt_kurtosisValue_dec_26_std', 'tqwt_entropy_shannon_dec_11_mean', 'tqwt_kurtosisValue_dec_22_mean', 'mean_MFCC_1st_coef_mean', 'tqwt_kurtosisValue_dec_14_mean', 'locDbShimmer_std', 'tqwt_kurtosisValue_dec_24_mean', 'rapJitter_std', 'tqwt_kurtosisValue_dec_9_std', 'tqwt_energy_dec_15_mean', 'std_11th_delta_mean', 'tqwt_maxValue_dec_25_mean', 'tqwt_entropy_log_dec_29_std', 'tqwt_maxValue_dec_12_mean', 'tqwt_medianValue_dec_12_std', 'tqwt_maxValue_dec_14_mean', 'tqwt_stdValue_dec_11_mean', 'tqwt_kurtosisValue_dec_10_std', 'tqwt_kurtosisValue_dec_15_mean', 'mean_MFCC_11th_coef_mean', 'tqwt_kurtosisValue_dec_18_mean', 'std_MFCC_8th_coef_mean', 'tqwt_medianValue_dec_31_std', 'mean_12th_delta_delta_mean', 'tqwt_kurtosisValue_dec_18_std', 'tqwt_kurtosisValue_de

In [136]:
from sklearn.decomposition import PCA

# Define PCA to retain 95% of the variance or select a fixed number of components
pca = PCA(n_components=250)  # Or n_components=0.95 for variance-based selection

# Fit and transform the data
X_reduced = pca.fit_transform(X)

# Check the explained variance ratio
print("Explained variance ratio:", sum(pca.explained_variance_ratio_))
print("Reduced shape:", X_reduced.shape)

# Full variance analysis
explained_variance = pca.explained_variance_ratio_

# Print individual component contributions
for i, var in enumerate(explained_variance[:10], 1):
    print(f"Component {i}: {var*100:.2f}% variance")

# Cumulative variance
print(f"\nCumulative variance (first 10 components): {np.sum(explained_variance[:10])*100:.2f}%")


Explained variance ratio: 0.9999905550765321
Reduced shape: (252, 250)
Component 1: 87.42% variance
Component 2: 1.69% variance
Component 3: 1.42% variance
Component 4: 1.26% variance
Component 5: 0.61% variance
Component 6: 0.48% variance
Component 7: 0.45% variance
Component 8: 0.36% variance
Component 9: 0.31% variance
Component 10: 0.30% variance

Cumulative variance (first 10 components): 94.31%


In [137]:
from acp import PCAFromScratch


pca_from_scratch = PCAFromScratch(n_components=250)

# Fit and transform the data
X_reduced2 = pca_from_scratch.fit_transform(X)

print("Original shape:", X.shape)
print("Reduced shape:", X_reduced2.shape)

Original shape: (252, 1506)
Reduced shape: (252, 250)


In [139]:
# Import necessary modules
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split

# Assuming X_reduced (e.g., PCA-transformed features) and y are ready
# Split the data into train and test sets
X_reduced = pd.DataFrame(X_reduced)
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)

# Train an SVM model on X_reduced (which is the PCA-reduced data)
model1 = SVC(kernel='linear', random_state=42)

# For X_reduced
model1.fit(X_train, y_train)  # Train on the reduced training data
y_pred_reduced = model1.predict(X_test)  # Make predictions on the test data

print(f"SVM on X_reduced - Accuracy:\n {classification_report(y_test, y_pred_reduced)}")


# Split the data into train and test sets
X_reduced2 = pd.DataFrame(X_reduced2)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_reduced2, y, test_size=0.2, random_state=42)
  
model2 = SVC(kernel='linear', random_state=42)
  
X_train2 = np.real(X_train2)  # Remove imaginary part if present
X_test2 = np.real(X_test2)

model2.fit(X_train2, y_train2)  # Train on the second reduced dataset
y_pred_reduced2 = model2.predict(X_test2)  # Make predictions on the test data


print(f"SVM on X_reduced2 - Accuracy:\n {classification_report(y_test2, y_pred_reduced2)}")


SVM on X_reduced - Accuracy:
               precision    recall  f1-score   support

           0       0.73      0.57      0.64        14
           1       0.85      0.92      0.88        37

    accuracy                           0.82        51
   macro avg       0.79      0.75      0.76        51
weighted avg       0.82      0.82      0.82        51

SVM on X_reduced2 - Accuracy:
               precision    recall  f1-score   support

           0       0.73      0.57      0.64        14
           1       0.85      0.92      0.88        37

    accuracy                           0.82        51
   macro avg       0.79      0.75      0.76        51
weighted avg       0.82      0.82      0.82        51

