In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load datasets
df_songs = pd.read_csv('../data/features_30_sec.csv')
df_segments = pd.read_csv('../data/features_3_sec.csv')

# Prepare training and testing data
X = df_songs
y = df_songs['label']

X_train_songs, X_test_songs, y_train_songs, y_test_songs = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

train_filenames = X_train_songs['filename']
test_filenames = X_test_songs['filename']

train_mask = (df_segments['filename'].str.extract(r'(.*\.\d+)\.\d+\.wav')[0] + '.wav').isin(train_filenames)
test_mask = (df_segments['filename'].str.extract(r'(.*\.\d+)\.\d+\.wav')[0] + '.wav').isin(test_filenames)

df_train_segments = df_segments[train_mask].reset_index(drop=True)
df_test_segments = df_segments[test_mask].reset_index(drop=True)

X_train_segments = df_train_segments.drop(columns=['filename', 'length', 'label'])
y_train_segments = df_train_segments['label']

In [2]:
train_mask = (df_segments['filename'].str.extract(r'(.*\.\d+)\.\d+\.wav')[0]+ '.wav').isin(train_filenames)
test_mask = (df_segments['filename'].str.extract(r'(.*\.\d+)\.\d+\.wav')[0]+ '.wav').isin(test_filenames)
df_train_segments = df_segments[train_mask].reset_index(drop=True)
df_test_segments = df_segments[test_mask].reset_index(drop=True)

In [3]:
X_train_segments = df_train_segments.drop(columns=['filename', 'length', 'label'])
y_train_segments = df_train_segments['label']

In [4]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=10000))
])

In [5]:
# Define parameter grid for GridSearchCV
param_grid = {
    'logreg__C': [0.1, 1, 10, 100],
    'logreg__solver': ['liblinear', 'lbfgs', 'newton-cg']
}

# Perform Grid Search with Cross-Validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train_segments, y_train_segments)

# Print best parameters and best score
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_}")

Best parameters found: {'logreg__C': 1, 'logreg__solver': 'lbfgs'}
Best cross-validation accuracy: 0.5553191489361702


In [6]:
# Use the best estimator to predict on test set
best_clf = grid_search.best_estimator_
X_test_segments = df_test_segments.drop(columns=['filename', 'length', 'label'])
y_test_segments = df_test_segments['label']

y_pred = best_clf.predict(X_test_segments)

# Define majority vote function
def majority_vote(predictions):
    return predictions.mode().iloc[0]

df_test_segments['predicted_label'] = y_pred

df_test_segments['song'] = df_test_segments['filename'].str.extract(r'(.*\.\d{5})')[0] + '.wav'
song_predictions = df_test_segments.groupby('song')['predicted_label'].apply(majority_vote)

song_true_labels = X_test_songs.set_index('filename').loc[song_predictions.index, 'label']
song_pred_labels = song_predictions

# Print classification report and accuracy score
print("Classification Report:")
print(classification_report(song_true_labels, song_pred_labels))

print("Accuracy Score:")
print(accuracy_score(song_true_labels, song_pred_labels))

Classification Report:
              precision    recall  f1-score   support

       blues       0.70      0.80      0.74        20
   classical       0.91      1.00      0.95        20
     country       0.70      0.70      0.70        20
       disco       0.92      0.55      0.69        20
      hiphop       0.64      0.70      0.67        20
        jazz       0.82      0.90      0.86        20
       metal       0.90      0.90      0.90        20
         pop       0.74      0.85      0.79        20
      reggae       0.65      0.75      0.70        20
        rock       0.69      0.45      0.55        20

    accuracy                           0.76       200
   macro avg       0.77      0.76      0.75       200
weighted avg       0.77      0.76      0.75       200

Accuracy Score:
0.76
