In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, recall_score, f1_score
from sklearn.utils.class_weight import compute_class_weight

In [61]:
df = pd.read_csv("/Users/kevin_smith/Desktop/FSU_Relevant_Stuff/fall_2023/CAP5771/project/gtzan/Data/features_30_sec.csv")


In [62]:
# Split the data into features (X) and labels (y)
X = df.drop(['filename', 'label'], axis=1)
y = df['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Standardize the features (optional but often recommended for SVM)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train an SVM model
svm_model = SVC(kernel='linear', C=1.0)
svm_model.fit(X_train_scaled, y_train)

# Extract coefficients from the SVM model
coefficients = svm_model.coef_[0]

# Create a DataFrame to store feature names and their corresponding coefficients
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Coefficient': coefficients})

# Sort the DataFrame by the absolute values of coefficients in descending order
feature_importance_df = feature_importance_df.reindex(feature_importance_df['Coefficient'].abs().sort_values(ascending=False).index)

# Print the top N most important features and their coefficients
top_n = 10  # Change this value to print a different number of top features
print(f"Top {top_n} Most Important Features:")
print(feature_importance_df.head(top_n))


Top 10 Most Important Features:
            Feature  Coefficient
24       mfcc4_mean     0.249012
27        mfcc5_var     0.235593
28       mfcc6_mean     0.207043
52      mfcc18_mean    -0.197398
50      mfcc17_mean    -0.186486
22       mfcc3_mean     0.168322
2   chroma_stft_var     0.167185
29        mfcc6_var     0.164697
18       mfcc1_mean     0.145843
44      mfcc14_mean     0.133247


In [63]:
# Compute class weights to handle class imbalance
class_weights = compute_class_weight('balanced', classes=pd.unique(y_train), y=y_train)
class_weight_dict = dict(zip(pd.unique(y_train), class_weights))

# Initialize SVM model
svm_model = SVC(class_weight=class_weight_dict)

# Define scoring metrics (recall and F1 score)
scoring = {'recall': make_scorer(recall_score, average='weighted'),
           'f1_score': make_scorer(f1_score, average='weighted')}

# Hyperparameter tuning using grid search
param_grid = {'C': [0.1, 1, 10, 100],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'gamma': ['scale', 'auto', 0.1, 1, 10]}  # Adjust gamma for RBF kernel
grid_search = GridSearchCV(svm_model, param_grid, scoring=scoring, cv=5, n_jobs=-1, refit='f1_score')
grid_search.fit(X_train_scaled, y_train)

# Extract the best model
best_svm_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_svm_model.predict(X_test_scaled)

# Evaluate the model performance
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Display the results
print("Best Hyperparameters:", grid_search.best_params_)
print("Recall:", recall)
print("F1 Score:", f1)

Best Hyperparameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Recall: 0.695
F1 Score: 0.6927321363374466


In [5]:
df = pd.read_hdf("/Users/kevin_smith/Desktop/FSU_Relevant_Stuff/fall_2023/CAP5771/project/results/STFT_F1024_H520.hdf5")

ImportError: Missing optional dependency 'pytables'.  Use pip or conda to install pytables.