In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
import os
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

In [2]:
# Read the MFCC feature CSVs, one file per data split (train, test, validation).
train_df, test_df, val_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "mfcc_features_buzz2/Train_mfcc_features.csv",
        "mfcc_features_buzz2/Test_mfcc_features.csv",
        "mfcc_features_buzz2/Val_mfcc_features.csv",
    )
)

In [3]:
# Remove the 'file_name' column from every split.
# The frames are rebound under the same names, so on a second run of this cell
# the column is already gone; errors='ignore' keeps the cell idempotent instead
# of raising KeyError.
train_df = train_df.drop(columns=['file_name'], errors='ignore')
test_df = test_df.drop(columns=['file_name'], errors='ignore')
val_df = val_df.drop(columns=['file_name'], errors='ignore')

In [4]:
# For each split: features are every column except 'label'; the target is 'label'.
X_train = train_df.drop(columns=['label'])
y_train = train_df['label']

X_test = test_df.drop(columns=['label'])
y_test = test_df['label']

X_val = val_df.drop(columns=['label'])
y_val = val_df['label']

In [5]:
# Report the (rows, columns) shape of each split's feature matrix on one line.
print(
    ", ".join(
        f"{split_name} shape: {features.shape}"
        for split_name, features in (("Train", X_train), ("Test", X_test), ("Val", X_val))
    )
)

Train shape: (19002, 180), Test shape: (11200, 180), Val shape: (4449, 180)


## K-Nearest Neighbors (KNN)

In [6]:
# Learn the standardisation statistics from the training features only,
# then apply that same fitted scaler to all three splits.
scaler = StandardScaler().fit(X_train)
X_train_Scaled = scaler.transform(X_train)
X_test_Scaled, X_val_Scaled = (scaler.transform(split) for split in (X_test, X_val))

In [7]:
# Odd neighbour counts from 3 to 99 are the only hyperparameter searched.
param_grid = {
    'n_neighbors': list(range(3, 100, 2))
}

knn = KNeighborsClassifier()

# 5-fold cross-validated accuracy on the scaled training features.
# n_jobs=-1 runs the candidate fits in parallel, matching the other grid
# searches in this notebook; it does not change the selected parameters.
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)

grid_search.fit(X_train_Scaled, y_train)

best_n_neighbors = grid_search.best_params_['n_neighbors']
best_score = grid_search.best_score_

print(f"Best n_neighbors: {best_n_neighbors} with Cross-Validation Accuracy: {best_score:.4f}")

Best n_neighbors: 7 with Cross-Validation Accuracy: 0.9989


In [9]:
# Refit KNN on the full scaled training set with the neighbour count selected by
# the grid search, rather than a hard-coded copy of an earlier run's result
# (a literal goes stale as soon as the search is re-run on different data).
knn = KNeighborsClassifier(n_neighbors=best_n_neighbors)
knn.fit(X_train_Scaled, y_train)

# Predict on the scaled test and validation features.
y_pred_test = knn.predict(X_test_Scaled)

y_pred_val = knn.predict(X_val_Scaled)

def evaluate_model(y_true, y_pred, dataset_name):
    """Print and return accuracy, F1 and the classification report for one split.

    Parameters
    ----------
    y_true : true labels for the split.
    y_pred : predicted labels for the same samples.
    dataset_name : text used as the heading of the printed summary.

    Returns
    -------
    dict with keys "accuracy", "f1_score" (both from the sklearn metric
    functions called with their default arguments) and "report" (the text
    produced by classification_report).
    """
    metrics = {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
        "report": classification_report(y_true, y_pred),
    }

    print(f"{dataset_name} Performance:")
    print(f"   Accuracy: {metrics['accuracy']:.4f}")
    print(f"   F1 Score: {metrics['f1_score']:.4f}\n")
    print("Classification Report:\n", metrics["report"])

    return metrics

# The KNN predictions were made on the standardised features
# (X_test_Scaled / X_val_Scaled), so the headings say "Scaled", not "Raw".
test_results = evaluate_model(y_test, y_pred_test, "Test Set (Scaled MFCC)")

val_results = evaluate_model(y_val, y_pred_val, "Validation Set (Scaled MFCC)")

Test Set (Raw MFCC) Performance:
   Accuracy: 0.9438
   F1 Score: 0.9468

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.89      0.94      5600
           1       0.90      1.00      0.95      5600

    accuracy                           0.94     11200
   macro avg       0.95      0.94      0.94     11200
weighted avg       0.95      0.94      0.94     11200

Validation Set (Raw MFCC) Performance:
   Accuracy: 0.9735
   F1 Score: 0.9744

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97      2200
           1       0.95      1.00      0.97      2249

    accuracy                           0.97      4449
   macro avg       0.97      0.97      0.97      4449
weighted avg       0.97      0.97      0.97      4449



## Support Vector Machine (SVM)

In [10]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

In [11]:
# Candidate regularisation strengths for the linear SVM.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

# Tune on the validation features as already standardised by the train-fitted
# `scaler` (X_val_Scaled). Fitting a second StandardScaler on the validation set
# here would (a) rebind `scaler` and silently discard the train-fitted one, and
# (b) select C on features scaled differently from those the final model is
# trained and evaluated on.
# NOTE(review): the validation split is used both to pick C and, in the next
# cell, to report validation performance, so that score is optimistic.
svm = SVC(kernel='linear')
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_val_Scaled, y_val)

best_C = grid_search.best_params_['C']
print(f"Best C found: {best_C}")

Best C found: 0.01


In [12]:
# Train the final linear SVM with the C selected by the grid search instead of a
# hard-coded copy of an earlier run's result, so the two cells cannot drift apart.
svm = SVC(kernel='linear', C=best_C, shrinking=False)
svm.fit(X_train_Scaled, y_train)

# Predict on the scaled test and validation features.
y_pred_test = svm.predict(X_test_Scaled)
y_pred_val = svm.predict(X_val_Scaled)

def evaluate_model(y_true, y_pred, dataset_name):
    """Summarise predictions for one split: print the metrics, then return them.

    Args:
        y_true: true labels.
        y_pred: predicted labels for the same samples.
        dataset_name: heading text for the printed block.

    Returns:
        A dict holding "accuracy", "f1_score" and "report" (the
        classification_report text).
    """
    acc_value = accuracy_score(y_true, y_pred)
    f1_value = f1_score(y_true, y_pred)
    report_text = classification_report(y_true, y_pred)

    print(f"{dataset_name} Performance:")
    print(f"   Accuracy: {acc_value:.4f}")
    print(f"   F1 Score: {f1_value:.4f}\n")
    print("Classification Report:\n", report_text)

    return dict(accuracy=acc_value, f1_score=f1_value, report=report_text)

# Score the SVM predictions on the test split, then on the validation split.
test_results = evaluate_model(
    y_true=y_test,
    y_pred=y_pred_test,
    dataset_name="Test Set (Scaled MFCC)",
)
val_results = evaluate_model(
    y_true=y_val,
    y_pred=y_pred_val,
    dataset_name="Validation Set (Scaled MFCC)",
)

Test Set (Scaled MFCC) Performance:
   Accuracy: 0.6466
   F1 Score: 0.7389

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.29      0.45      5600
           1       0.59      1.00      0.74      5600

    accuracy                           0.65     11200
   macro avg       0.79      0.65      0.60     11200
weighted avg       0.79      0.65      0.60     11200

Validation Set (Scaled MFCC) Performance:
   Accuracy: 0.9903
   F1 Score: 0.9905

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99      2200
           1       0.98      1.00      0.99      2249

    accuracy                           0.99      4449
   macro avg       0.99      0.99      0.99      4449
weighted avg       0.99      0.99      0.99      4449



## Naive Bayes (NB)

In [13]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, precision_score

# Gaussian Naive Bayes fitted on the unscaled training features
# (fit() returns the estimator itself, so construction and fitting are chained).
nb_raw = GaussianNB().fit(X_train, y_train)

# Predictions on the unscaled test and validation features.
y_test_pred_raw, y_val_pred_raw = nb_raw.predict(X_test), nb_raw.predict(X_val)

def evaluate_model(y_true, y_pred, model_type):
    """Print accuracy and F1 for one set of predictions and return both.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels for the same samples.
    model_type : text used in the printed heading.

    Returns
    -------
    (accuracy, f1) tuple; F1 uses f1_score's default arguments.
    """
    # The previous version also computed a binary precision score that was never
    # printed or returned; that dead computation has been removed.
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print(f"\n{model_type} Model Performance:")
    print(f"   Accuracy: {accuracy:.4f}")
    print(f"   F1 Score: {f1:.4f}")
    return accuracy, f1

# Heading, then scores for the test split followed by the validation split.
print("\nEvaluating Naïve Bayes on Raw MFCC Features:")
test_results_raw = evaluate_model(
    y_true=y_test,
    y_pred=y_test_pred_raw,
    model_type="NB Raw (Test)",
)
val_results_raw = evaluate_model(
    y_true=y_val,
    y_pred=y_val_pred_raw,
    model_type="NB Raw (Validation)",
)



Evaluating Naïve Bayes on Raw MFCC Features:

NB Raw (Test) Model Performance:
   Accuracy: 0.9724
   F1 Score: 0.9732

NB Raw (Validation) Model Performance:
   Accuracy: 0.9863
   F1 Score: 0.9866


## Random Forest (RF)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score
import pandas as pd
import numpy as np

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Search grid: only the number of trees varies (25 to 200 in steps of 5);
# random_state and max_depth are single-valued entries.
param_grid = dict(
    n_estimators=[*range(25, 201, 5)],
    random_state=[42],
    max_depth=[None],
)

rf = RandomForestClassifier()

# 5-fold cross-validated accuracy, all candidates fitted in parallel.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# The search is run on the (unscaled) validation split.
grid_search.fit(X_val, y_val)

best_params = grid_search.best_params_
print("Best n_estimators:", best_params['n_estimators'])
print("Best random_state:", best_params['random_state'])


Best n_estimators: 155
Best random_state: 42


In [15]:
# Random forest using the tree count chosen by the grid search, with a fixed seed.
rf_base = RandomForestClassifier(
    random_state=42,
    n_estimators=best_params['n_estimators'],
)
rf_base.fit(X_train, y_train)

# Predict the test split first, then the validation split.
y_test_pred_base, y_val_pred_base = (rf_base.predict(split) for split in (X_test, X_val))


def evaluate_model(y_true, y_pred, model_type):
    """Print accuracy, F1 and the classification report; return the three scores.

    Args:
        y_true: true labels.
        y_pred: predicted labels for the same samples.
        model_type: text used in the printed heading.

    Returns:
        Tuple (precision, accuracy, f1). Precision is computed with
        average="binary"; it is returned but not printed.
    """
    prec_value = precision_score(y_true, y_pred, average="binary")
    acc_value = accuracy_score(y_true, y_pred)
    f1_value = f1_score(y_true, y_pred)

    print(f"\n{model_type} Model Performance:")
    print(f"Accuracy: {acc_value:.4f}")
    print(f"F1 Score: {f1_value:.4f}")
    print(f"Classification Report:\n{classification_report(y_true, y_pred)}")

    return prec_value, acc_value, f1_value

# Heading, then scores for the test split followed by the validation split.
print("\nEvaluating Base Values:")
base_test_results = evaluate_model(
    y_true=y_test,
    y_pred=y_test_pred_base,
    model_type="rf Base (Test)",
)
base_val_results = evaluate_model(
    y_true=y_val,
    y_pred=y_val_pred_base,
    model_type="rf Base (Validation)",
)



Evaluating Base Values:

rf Base (Test) Model Performance:
Accuracy: 0.9804
F1 Score: 0.9807
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98      5600
           1       0.96      1.00      0.98      5600

    accuracy                           0.98     11200
   macro avg       0.98      0.98      0.98     11200
weighted avg       0.98      0.98      0.98     11200


rf Base (Validation) Model Performance:
Accuracy: 0.9960
F1 Score: 0.9960
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      2200
           1       0.99      1.00      1.00      2249

    accuracy                           1.00      4449
   macro avg       1.00      1.00      1.00      4449
weighted avg       1.00      1.00      1.00      4449



In [16]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, f1_score

In [17]:
# Gradient boosting classifier fitted on the unscaled training features.
gb_clf = GradientBoostingClassifier(
    random_state=50,
    learning_rate=0.1,
    n_estimators=100,
)
gb_clf.fit(X_train, y_train)

# Predict the validation split first, then the test split.
y_val_pred = gb_clf.predict(X_val)
y_test_pred = gb_clf.predict(X_test)


def _weighted_scores(y_true, y_pred):
    """Return (accuracy, weighted precision, weighted F1) for one split."""
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average="weighted"),
        f1_score(y_true, y_pred, average="weighted"),
    )


# Validation metrics are computed and printed before the test metrics.
val_accuracy, val_precision, val_f1 = _weighted_scores(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.4f}")
print(f"Validation Precision: {val_precision:.4f}")
print(f"Validation F1 Score: {val_f1:.4f}")

test_accuracy, test_precision, test_f1 = _weighted_scores(y_test, y_test_pred)

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

Validation Accuracy: 0.9926
Validation Precision: 0.9926
Validation F1 Score: 0.9926
Test Accuracy: 0.9258
Test Precision: 0.9354
Test F1 Score: 0.9254
