# This notebook uses the CNN as feature extractor and runs traditional ML algorithms for prediction.

Install the following if not already on machine.

In [38]:
!pip install soundfile
!pip install tqdm
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.2.2-cp39-cp39-macosx_10_9_x86_64.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting joblib>=1.1.1
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting scipy>=1.3.2
  Downloading scipy-1.10.1-cp39-cp39-macosx_10_9_x86_64.whl (35.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.2/35.2 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.2.0 scikit-learn-1.2.2 scipy-1.10.1 threadpoolctl-3.1.0


In [1]:
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.multiclass import OneVsRestClassifier
import xgboost as xgb

In [3]:
import torch

# import the pre-trained vggish pytorch port.
model = torch.hub.load('harritaylor/torchvggish', 'vggish', postprocess=False)
model.eval()

Using cache found in C:\Users\SKPC/.cache\torch\hub\harritaylor_torchvggish_master


VGGish(
  (features): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU(inplace=True)
    (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (14): ReLU(inplace=True)
    (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False

In [4]:
# this removes the final ReLU activation layer to match original Tensorflow architecture
model.embeddings = torch.nn.Sequential(*list(model.embeddings.children())[:-1])

# Load Data

In [5]:
import os
import pandas as pd
import ast

def load(filepath):

    filename = os.path.basename(filepath)

    if 'features' in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if 'echonest' in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if 'genres' in filename:
        return pd.read_csv(filepath, index_col=0)

    if 'tracks' in filename:
        tracks = pd.read_csv(filepath, index_col=0, header=[0, 1])

        COLUMNS = [('track', 'tags'), ('album', 'tags'), ('artist', 'tags'),
                   ('track', 'genres'), ('track', 'genres_all')]
        for column in COLUMNS:
            tracks[column] = tracks[column].map(ast.literal_eval)

        COLUMNS = [('track', 'date_created'), ('track', 'date_recorded'),
                   ('album', 'date_created'), ('album', 'date_released'),
                   ('artist', 'date_created'), ('artist', 'active_year_begin'),
                   ('artist', 'active_year_end')]
        for column in COLUMNS:
            tracks[column] = pd.to_datetime(tracks[column])

        SUBSETS = ('small', 'medium', 'large')
        try:
            tracks['set', 'subset'] = tracks['set', 'subset'].astype(
                    'category', categories=SUBSETS, ordered=True)
        except (ValueError, TypeError):
            # the categories and ordered arguments were removed in pandas 0.25
            tracks['set', 'subset'] = tracks['set', 'subset'].astype(
                     pd.CategoricalDtype(categories=SUBSETS, ordered=True))

        COLUMNS = [('track', 'genre_top'), ('track', 'license'),
                   ('album', 'type'), ('album', 'information'),
                   ('artist', 'bio')]
        for column in COLUMNS:
            tracks[column] = tracks[column].astype('category')

        return tracks

tracks = load('../data/fma_metadata/tracks.csv')
genres = load('../data/fma_metadata/genres.csv')

In [6]:
# subset to just the small dataset
small = tracks[tracks['set', 'subset'] <= 'small']

In [7]:
def get_audio_path(audio_dir, track_id):
    """
    Return the path to the mp3 given the directory where the audio is stored
    and the track ID.
    Examples
    --------
    >>> import utils
    >>> AUDIO_DIR = os.environ.get('AUDIO_DIR')
    >>> utils.get_audio_path(AUDIO_DIR, 2)
    '../data/fma_small/000/000002.mp3'
    """
    tid_str = '{:06d}'.format(track_id)
    return os.path.join(audio_dir, tid_str[:3], tid_str + '.mp3')

In [8]:
# train, validation, test split
train = small.index[small['set', 'split'] == 'training']
val = small.index[small['set', 'split'] == 'validation']
test = small.index[small['set', 'split'] == 'test']

print('{} training examples, {} validation examples, {} testing examples'.format(*map(len, [train, val, test])))

6400 training examples, 800 validation examples, 800 testing examples


# Running the CNN to get embeddings

In [26]:
from tqdm import tqdm
import numpy as np

AUDIO_DIR = "../data/fma_small"

for i in tqdm(small.index):
    try:
        embed = model.forward(get_audio_path(AUDIO_DIR, i))
        with open(f'../data/small_embeds/{i}.npy', 'wb') as f:
                np.save(f, embed.detach().numpy())
    except:
        print("error with audio stream, continuing")

  6%|▌         | 490/8000 [06:11<1:33:14,  1.34it/s][src/libmpg123/layer3.c:INT123_do_layer3():1841] error: dequantization failed!
 11%|█▏        | 901/8000 [11:21<1:28:11,  1.34it/s][src/libmpg123/layer3.c:INT123_do_layer3():1801] error: dequantization failed!
 15%|█▍        | 1181/8000 [14:57<1:25:58,  1.32it/s][src/libmpg123/layer3.c:INT123_do_layer3():1801] error: dequantization failed!
 28%|██▊       | 2265/8000 [28:41<1:10:43,  1.35it/s][src/libmpg123/layer3.c:INT123_do_layer3():1773] error: part2_3_length (3360) too large for available bit count (3240)
 28%|██▊       | 2267/8000 [28:42<1:05:08,  1.47it/s][src/libmpg123/layer3.c:INT123_do_layer3():1773] error: part2_3_length (3328) too large for available bit count (3240)
 55%|█████▌    | 4423/8000 [56:07<43:54,  1.36it/s]  Note: Illegal Audio-MPEG-Header 0x00000000 at offset 33361.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1365] error: Giving up resync after 1024 bytes - your s

error with audio stream, continuing
error with audio stream, continuing
error with audio stream, continuing




error with audio stream, continuing




error with audio stream, continuing




error with audio stream, continuing


100%|██████████| 8000/8000 [1:40:48<00:00,  1.32it/s]


In [12]:
# this function flattens each embeds to a one dimensional tensor, because it returns a 31*128 tensor
def load_one(idx):
    try:
        embed = np.load(f'../data/small_embeds_final/{idx}.npy').flatten()
        return embed
    except:
        return np.zeros(31*128)
    
embed_series = pd.Series(small.index).apply(load_one)

In [13]:
# load the embeds into a dataframe
embed_df = pd.DataFrame.from_dict(dict(zip(embed_series.index, embed_series.values))).T

# set the index
embed_df = embed_df.set_index(small.index)

In [14]:
# filter out tracks that couldn't be read / embedded.

embed_df = embed_df[~(embed_df.sum(axis=1) == 0)]

In [15]:
# One-hot encoding for genre labels
labels_onehot = LabelBinarizer().fit_transform(small['track', 'genre_top'])
labels_onehot = pd.DataFrame(labels_onehot, index=small.index)
labels_onehot.columns = [f'label_{i}' for i in labels_onehot.columns]
labels_onehot

Unnamed: 0_level_0,label_0,label_1,label_2,label_3,label_4,label_5,label_6,label_7
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,0,0,0,1,0,0,0,0
5,0,0,0,1,0,0,0,0
10,0,0,0,0,0,0,1,0
140,0,0,1,0,0,0,0,0
141,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...
154308,0,0,0,1,0,0,0,0
154309,0,0,0,1,0,0,0,0
154413,0,0,0,0,0,0,1,0
154414,0,0,0,0,0,0,1,0


In [16]:
# labels_onehot = LabelBinarizer().fit_transform(small['track', 'genre_top'])
# labels_onehot

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [17]:
# encode labels
labels_encoded = LabelEncoder().fit_transform(small['track', 'genre_top'])

In [18]:
small['labels_encoded'] = labels_encoded

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small['labels_encoded'] = labels_encoded


In [19]:
small['labels_encoded']

track_id
2         3
5         3
10        6
140       2
141       2
         ..
154308    3
154309    3
154413    6
154414    6
155066    3
Name: labels_encoded, Length: 8000, dtype: int32

In [20]:
data_df = embed_df
data_df['labels_encoded'] = small['labels_encoded']
data_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,3959,3960,3961,3962,3963,3964,3965,3966,3967,labels_encoded
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,-0.698666,-0.066649,-0.000531,-0.280849,0.091604,-0.727752,0.643959,0.01736,-0.528933,-0.572499,...,-0.513161,-0.479931,-0.435369,-1.071859,-0.07052,-0.442723,-1.105817,-0.038214,-0.309881,3
5,-0.442708,0.052575,0.071074,-0.344115,0.261202,-0.417586,0.36362,0.284545,-0.176374,-0.622329,...,-0.41763,-0.482327,-0.182185,-0.879638,0.016094,-0.33677,-1.275318,-0.309379,-0.221681,3
10,-0.339631,0.127341,0.222564,-0.18403,0.290061,-0.120663,-0.083058,0.180418,0.218018,-0.356597,...,-0.171288,-0.110426,-0.048964,-0.350825,0.030792,0.134033,-0.561954,0.129264,0.175977,6
140,-0.132327,-0.095099,0.798407,-0.331704,0.295047,-0.441052,-0.265907,-0.015102,-0.52764,-0.500278,...,-0.018307,0.16693,-0.106614,-0.304236,0.107357,0.393789,-0.353353,0.011052,-0.156305,2
141,-0.266731,0.226023,0.360905,-0.33478,0.032314,-0.293675,-0.288571,0.106768,-0.164602,-0.539508,...,0.089232,-0.048423,-0.136854,-0.179497,0.151896,-0.140548,-0.45216,0.139479,0.198652,2


In [21]:
# train, validation, test split for embeds
train_df = data_df[data_df.index.isin(train)]
val_df = data_df[data_df.index.isin(val)]
test_df = data_df[data_df.index.isin(test)]

train_df.shape[0], val_df.shape[0], test_df.shape[0]

(6394, 800, 800)

In [22]:
DATA_COLS = [col for col in train_df.columns if type(col) == int]
LABEL_COLS = [col for col in train_df.columns if type(col) != int]

In [23]:
X_train = train_df[DATA_COLS].values
y_train = train_df['labels_encoded'].values

In [26]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [27]:
# run PCA with 128 principal components
pca = PCA(n_components=128)
X_pca = pca.fit_transform(X_train_scaled)

In [28]:
X_test = test_df[DATA_COLS].values
y_test = test_df['labels_encoded'].values

In [29]:
X_test_pca = pca.transform(scaler.transform(X_test))

# Then now we can finally run traditional ML algorithms on the embeds. We do this with PCA and without.

### SVM RBF

In [30]:
clf = SVC(C=1, kernel='rbf', gamma='scale')
clf.fit(X_pca, y_train)

In [31]:
rbf_svc_accuracy_pca = accuracy_score(y_test, clf.predict(X_test_pca))
rbf_svc_accuracy_pca

0.57

In [32]:
clf = SVC(C=1, kernel='rbf', gamma='scale')
clf.fit(X_train, y_train)

In [33]:
rbf_svc_accuracy = accuracy_score(y_test, clf.predict(X_test))
rbf_svc_accuracy

0.5475

### Random Forest

In [34]:
clf = RandomForestClassifier()
clf.fit(X_pca, y_train)

In [35]:
rf_accuracy_pca = accuracy_score(y_test, clf.predict(X_test_pca))
rf_accuracy_pca

0.515

In [38]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

In [45]:
rf_accuracy = accuracy_score(y_test, clf.predict(X_test))
rf_accuracy

0.50625

## XGBoost

In [39]:
# Create classification matrices
dtrain_clf_pca = xgb.DMatrix(X_pca, y_train)
dtest_clf_pca = xgb.DMatrix(X_test_pca, y_test)

In [40]:
params = {"objective": "multi:softmax", "num_class": 8, "eval_metric": "mlogloss"}

In [41]:
xgb_model_pca = xgb.train(params=params, dtrain=dtrain_clf_pca)

In [42]:
y_pred = xgb_model_pca.predict(dtest_clf_pca)

In [43]:
xgb_accuracy_pca = accuracy_score(y_test, y_pred)
xgb_accuracy_pca

0.5325

In [44]:
dtrain_clf = xgb.DMatrix(X_train, y_train)
dtest_clf = xgb.DMatrix(X_test, y_test)

In [45]:
xgb_model = xgb.train(params=params, dtrain=dtrain_clf)

In [46]:
y_pred = xgb_model.predict(dtest_clf)

In [47]:
xgb_accuracy = accuracy_score(y_test, y_pred)
xgb_accuracy

0.50625

### Logistic Regression

In [51]:
logreg = LogisticRegression()

In [52]:
logreg.fit(X_pca, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [50]:
y_pred = logreg.predict(X_test_pca)

In [53]:
logreg_accuracy_pca = accuracy_score(y_test, y_pred)
logreg_accuracy_pca

0.54125

In [54]:
logreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [55]:
y_pred = logreg.predict(X_test)
logreg_accuracy = accuracy_score(y_test, y_pred)
logreg_accuracy

0.47625

# KNN

In [57]:
knn = KNeighborsClassifier()
knn.fit(X_pca, y_train)
y_pred = knn.predict(X_test_pca)
knn_accuracy_pca = accuracy_score(y_test, y_pred)
knn_accuracy_pca

0.5025

In [58]:
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
knn_accuracy = accuracy_score(y_test, y_pred)
knn_accuracy

0.5

# SVC Poly

In [59]:
svc_poly = SVC(kernel='poly', degree=1)
svc_poly.fit(X_pca, y_train)
y_pred = svc_poly.predict(X_test_pca)
svc_poly_accuracy_pca = accuracy_score(y_test, y_pred)
svc_poly_accuracy_pca

0.54625

In [61]:
svc_poly.fit(X_train, y_train)
y_pred = svc_poly.predict(X_test)
svc_poly_accuracy = accuracy_score(y_test, y_pred)
svc_poly_accuracy

0.53625

# Lin SVC1

In [62]:
lin_svc = SVC(kernel='linear')
lin_svc.fit(X_pca, y_train)
y_pred = lin_svc.predict(X_test_pca)
lin_svc_accuracy_pca = accuracy_score(y_test, y_pred)
lin_svc_accuracy_pca

0.5225

In [63]:
lin_svc.fit(X_train, y_train)
y_pred = lin_svc.predict(X_test)
lin_svc_accuracy = accuracy_score(y_test, y_pred)
lin_svc_accuracy

0.47625

# Lin SVC2

In [64]:
lin_svc2 = LinearSVC()
lin_svc2.fit(X_pca, y_train)
y_pred = lin_svc2.predict(X_test_pca)
lin_svc2_accuracy_pca = accuracy_score(y_test, y_pred)
lin_svc2_accuracy_pca



0.4775

In [65]:
lin_svc2.fit(X_train, y_train)
y_pred = lin_svc2.predict(X_test)
lin_svc2_accuracy = accuracy_score(y_test, y_pred)
lin_svc2_accuracy



0.4375

# Decision Tree

In [66]:
dec_tree = DecisionTreeClassifier()
dec_tree.fit(X_pca, y_train)
y_pred = dec_tree.predict(X_test_pca)
dec_tree_accuracy_pca = accuracy_score(y_test, y_pred)
dec_tree_accuracy_pca

0.39625

In [67]:
dec_tree.fit(X_train, y_train)
y_pred = dec_tree.predict(X_test)
dec_tree_accuracy = accuracy_score(y_test, y_pred)
dec_tree_accuracy

0.35875

# AdaBoost

In [68]:
ada_boost = AdaBoostClassifier()
ada_boost.fit(X_pca, y_train)
y_pred = ada_boost.predict(X_test_pca)
ada_boost_accuracy_pca = accuracy_score(y_test, y_pred)
ada_boost_accuracy_pca

0.46625

In [69]:
ada_boost.fit(X_train, y_train)
y_pred = ada_boost.predict(X_test)
ada_boost_accuracy = accuracy_score(y_test, y_pred)
ada_boost_accuracy

0.45375

# MLP1

In [70]:
mlp1 = MLPClassifier(hidden_layer_sizes=(100,), max_iter=2000)
mlp1.fit(X_pca, y_train)
y_pred = mlp1.predict(X_test_pca)
mlp1_accuracy_pca = accuracy_score(y_test, y_pred)
mlp1_accuracy_pca

0.4825

In [71]:
mlp1.fit(X_train, y_train)
y_pred = mlp1.predict(X_test)
mlp1_accuracy = accuracy_score(y_test, y_pred)
mlp1_accuracy

0.5

# MLP2

In [72]:
mlp2 = MLPClassifier(hidden_layer_sizes=(200, 50), max_iter=2000)
mlp2.fit(X_pca, y_train)
y_pred = mlp2.predict(X_test_pca)
mlp2_accuracy_pca = accuracy_score(y_test, y_pred)
mlp2_accuracy_pca

0.51

In [73]:
mlp2.fit(X_train, y_train)
y_pred = mlp2.predict(X_test)
mlp2_accuracy = accuracy_score(y_test, y_pred)
mlp2_accuracy

0.49125

# NB

In [74]:
NB = GaussianNB()
NB.fit(X_pca, y_train)
y_pred = NB.predict(X_test_pca)
NB_accuracy_pca = accuracy_score(y_test, y_pred)
NB_accuracy_pca

0.42

In [75]:
NB.fit(X_train, y_train)
y_pred = NB.predict(X_test)
NB_accuracy = accuracy_score(y_test, y_pred)
NB_accuracy

0.50125

# QDA

In [76]:
QDA = QuadraticDiscriminantAnalysis()
QDA.fit(X_pca, y_train)
y_pred = QDA.predict(X_test_pca)
QDA_accuracy_pca = accuracy_score(y_test, y_pred)
QDA_accuracy_pca

0.47375

In [77]:
QDA.fit(X_train, y_train)
y_pred = QDA.predict(X_test)
QDA_accuracy = accuracy_score(y_test, y_pred)
QDA_accuracy



0.135

# Table of Results

In [1]:
import pandas as pd

# Define the dimensions of each feature set
feature_dims = {
    'extracted features from pretrained': 31*128,
    'extracted features from pretrained after PCA': 128
}

# Define the accuracies for each model and feature set
accuracies = {
    'extracted features from pretrained': {
        'LR': logreg_accuracy,
        'kNN': knn_accuracy,
        'SVCrbf': rbf_svc_accuracy,
        'SVCpoly1': svc_poly_accuracy,
        'linSVC1': lin_svc_accuracy,
        'linSVC2': lin_svc2_accuracy,
        'DT': dec_tree_accuracy,
        'RF': rf_accuracy,
        'AdaBoost': ada_boost_accuracy,
        'MLP1': mlp1_accuracy,
        'MLP2': mlp2_accuracy,
        'NB': NB_accuracy,
        'QDA': QDA_accuracy,
    },
    'extracted features from pretrained after PCA': {
        'LR': logreg_accuracy_pca,
        'kNN': knn_accuracy_pca,
        'SVCrbf': rbf_svc_accuracy_pca,
        'SVCpoly1': svc_poly_accuracy_pca,
        'linSVC1': lin_svc_accuracy_pca,
        'linSVC2': lin_svc2_accuracy_pca,
        'DT': dec_tree_accuracy_pca,
        'RF': rf_accuracy_pca,
        'AdaBoost': ada_boost_accuracy_pca,
        'MLP1': mlp1_accuracy_pca,
        'MLP2': mlp2_accuracy_pca,
        'NB': NB_accuracy_pca,
        'QDA': QDA_accuracy_pca,
    }
}

# Create a list of dictionaries for each row of the DataFrame
rows = []
for feature_set, model_accs in accuracies.items():
    row_dict = {'Feature Set': feature_set, 'Dimension': feature_dims[feature_set]}
    row_dict.update(model_accs)
    rows.append(row_dict)

# Create the DataFrame from the list of row dictionaries
df = pd.DataFrame(rows)

# Set the index to be the Feature Set column
df.set_index('Feature Set', inplace=True)

# Print the DataFrame
print(df)


NameError: name 'logreg_accuracy' is not defined

In [None]:
confusion_matrix()

In [16]:
accuracy_score(y_test, y_pred)

0.53875