In [173]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



In [174]:
# Load the data
data = pd.read_csv("pd_speech_features.csv", header=1)

# Scale the features
scaler = StandardScaler()

numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns[1:-1]
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])



# Identify binary/categorical columns and other features
binary_features = ['gender']  # Adjust if needed
other_features = [col for col in data.columns if col not in binary_features + ['id', 'class']]

# Apply PCA to non-binary features
pca = PCA(n_components=20)  # Retain 95% of the explained variance
pca_features = pca.fit_transform(data[other_features])
pca_feature_names = [f'PC{i+1}' for i in range(pca_features.shape[1])]

# Create a PCA DataFrame and add the ID column for grouping
pca_df = pd.DataFrame(pca_features, columns=pca_feature_names)
pca_df['id'] = data['id']

# Aggregate PCA-transformed features with mean and std
aggregated_pca_data = pca_df.groupby('id').agg(['mean', 'std']).reset_index()

# Flatten multi-level column names
aggregated_pca_data.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in aggregated_pca_data.columns]

# Add binary features back as-is
for feature in binary_features:
    aggregated_pca_data[feature] = data.groupby('id')[feature].first().values.astype(data[feature].dtype)


# Add the target variable
aggregated_pca_data['class'] = data.groupby('id')['class'].first().values

In [178]:
from EFSA.main import FeatureSelector

# Feature matrix (X) and target vector (y)
X = aggregated_pca_data.drop('class', axis=1)
y = aggregated_pca_data['class']


selector = FeatureSelector(X=X, y=y)

selected_features = selector.comprehensive_feature_selection(k_features=20)

print(selected_features)


['PC6_std', 'gender', 'PC18_std', 'PC1_mean', 'PC5_mean', 'PC19_std', 'PC19_mean', 'PC20_mean', 'PC20_std', 'PC6_mean', 'PC4_std', 'PC7_mean']


In [179]:
# Import necessary modules
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an SVM model on X_reduced (which is the PCA-reduced data)
model1 = SVC(kernel='linear', random_state=42)

# For X_reduced
model1.fit(X_train, y_train)  # Train on the reduced training data
y_pred_reduced = model1.predict(X_test)  # Make predictions on the test data

print(f"SVM on X_reduced - Accuracy:\n {classification_report(y_test, y_pred_reduced)}")


SVM on X_reduced - Accuracy:
               precision    recall  f1-score   support

           0       0.62      0.36      0.45        14
           1       0.79      0.92      0.85        37

    accuracy                           0.76        51
   macro avg       0.71      0.64      0.65        51
weighted avg       0.75      0.76      0.74        51

