In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd

In [9]:
#after running the feature extraction script, load the data
df = pd.read_csv('embeddings.csv')

#drop the filename column 
df = df.drop(columns=['filename'])

#use the label column as the y value
y = df['label']
#drop the label column
X = df.drop(columns=['label'])

In [10]:
#prepare the data for training using KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
#initialize the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
#initialize a list to store the results
results = []
metrics = ['precision', 'recall', 'f1-score', 'accuracy']
#initialize a list to store the metrics
metrics_list = []


#loop through the folds
for train_index, test_index in kf.split(X):
    #split the data into training and testing sets
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    #fit the model to the training data
    model.fit(X_train, y_train)
    
    #make predictions on the test data
    y_pred = model.predict(X_test)
    
    #calculate the metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')
    #append the metrics to the list
    metrics_list.append([accuracy, precision, recall, f1])
    #append the results to the list
    results.append({
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1-score': f1
    })



    
    


In [11]:
# get the mean of the metrics and standard deviation and print them
metrics_df = pd.DataFrame(metrics_list, columns=metrics)
mean_metrics = metrics_df.mean()
std_metrics = metrics_df.std()

for metric in metrics:
    print(f'{metric} {mean_metrics[metric]:.4f} ± {std_metrics[metric]:.4f}')
    
   


precision 0.9607 ± 0.0052
recall 0.9512 ± 0.0095
f1-score 0.9461 ± 0.0117
accuracy 0.9483 ± 0.0066
