In [None]:
# Usual Libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random

# Libraries for feature extraction
import librosa as lr
import librosa.display

# Libraries for audio playing
import IPython.display as ipd

# Libraries for normalization and dimensionality reduction
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder

# Libraries for splitting data
from sklearn.model_selection import train_test_split

# Libraries for model building and training
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Libraries for model evaluation
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import tree

# Libraries for saving models
import pickle

### Data visualization


#### 2D representation of the soundwaves


In [None]:
def _show_waveform(genre_name, clip_path, waveform, sample_rate):
    """Plot one clip's waveform, print its caption and embed an audio player."""
    plt.figure(figsize=(14, 5))
    librosa.display.waveshow(waveform, sr=sample_rate)
    plt.title(f"Waveplot for {genre_name} Music 20")
    plt.ylabel("Amplitude")
    plt.xlabel("Time (s)")
    print(f"{genre_name} Music 20")
    display(ipd.Audio(clip_path))


# One reference clip (file number 00020) per genre. The decoded signals
# (*_sample) and the sampling rate `sr` are reused by the feature cells below.
jazz_sample_path = '../dataset/genres_original/jazz/jazz.00020.wav'
jazz_sample, sr = librosa.load(jazz_sample_path)
_show_waveform("Jazz", jazz_sample_path, jazz_sample, sr)

pop_sample_path = "../dataset/genres_original/pop/pop.00020.wav"
pop_sample, sr = librosa.load(pop_sample_path)
_show_waveform("Pop", pop_sample_path, pop_sample, sr)

rock_sample_path = "../dataset/genres_original/rock/rock.00020.wav"
rock_sample, sr = librosa.load(rock_sample_path)
_show_waveform("Rock", rock_sample_path, rock_sample, sr)

#### Mel-frequency cepstral coefficients (MFCCs)

In [None]:
def _plot_mfcc(mfcc_matrix, genre_name):
    """Render a normalized MFCC matrix against time; returns the y-label artist."""
    plt.figure(figsize=(14, 5))
    librosa.display.specshow(mfcc_matrix, x_axis='time')
    plt.colorbar()
    plt.title(f'MFCC for {genre_name} Music 20')
    return plt.ylabel('MFCC Coefficients')


# MFCCs for each reference clip, computed with the shared sampling rate `sr`.
jazz_mfccs = librosa.feature.mfcc(y=jazz_sample, sr=sr)
pop_mfccs = librosa.feature.mfcc(y=pop_sample, sr=sr)
rock_mfccs = librosa.feature.mfcc(y=rock_sample, sr=sr)

# Normalize along axis=1 so the three heatmaps share a comparable colour scale.
jazz_mfccs_normalized = librosa.util.normalize(jazz_mfccs, axis=1)
pop_mfccs_normalized = librosa.util.normalize(pop_mfccs, axis=1)
rock_mfccs_normalized = librosa.util.normalize(rock_mfccs, axis=1)

_plot_mfcc(jazz_mfccs_normalized, 'Jazz')
_plot_mfcc(pop_mfccs_normalized, 'Pop')
_plot_mfcc(rock_mfccs_normalized, 'Rock')

#### Spectral features

In [None]:
# Spectral centroid per frame for each clip, plotted against frame timestamps.
# The time axes (jazz_t / pop_t / rock_t) are reused by the bandwidth and
# rolloff cells that follow.
jazz_centroid = librosa.feature.spectral_centroid(y=jazz_sample, sr=sr)[0]
jazz_frames = range(jazz_centroid.shape[0])
jazz_t = librosa.frames_to_time(jazz_frames, sr=sr)
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(jazz_t, jazz_centroid, color='b')
ax.set_title('Spectral Centroid for Jazz Music 20')

pop_centroid = librosa.feature.spectral_centroid(y=pop_sample, sr=sr)[0]
pop_frames = range(pop_centroid.shape[0])
pop_t = librosa.frames_to_time(pop_frames, sr=sr)
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(pop_t, pop_centroid, color='b')
ax.set_title('Spectral Centroid for Pop Music 20')

rock_centroid = librosa.feature.spectral_centroid(y=rock_sample, sr=sr)[0]
rock_frames = range(rock_centroid.shape[0])
rock_t = librosa.frames_to_time(rock_frames, sr=sr)
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(rock_t, rock_centroid, color='b')
ax.set_title('Spectral Centroid for Rock Music 20')


In [None]:
def _plot_bandwidth(time_axis, bandwidth_curve, genre_name):
    """Plot one clip's spectral bandwidth over time; returns the title artist."""
    plt.figure(figsize=(14, 5))
    plt.plot(time_axis, bandwidth_curve, color='g')
    return plt.title(f'Spectral Bandwidth for {genre_name} Music 20')


# Per-frame spectral bandwidth, drawn on the time axes built in the centroid cell.
jazz_bandwidth = librosa.feature.spectral_bandwidth(y=jazz_sample, sr=sr)[0]
pop_bandwidth = librosa.feature.spectral_bandwidth(y=pop_sample, sr=sr)[0]
rock_bandwidth = librosa.feature.spectral_bandwidth(y=rock_sample, sr=sr)[0]

_plot_bandwidth(jazz_t, jazz_bandwidth, 'Jazz')
_plot_bandwidth(pop_t, pop_bandwidth, 'Pop')
_plot_bandwidth(rock_t, rock_bandwidth, 'Rock')

In [None]:
def _plot_rolloff(time_axis, rolloff_curve, genre_name):
    """Plot one clip's spectral rolloff over time; returns the title artist."""
    plt.figure(figsize=(14, 5))
    plt.plot(time_axis, rolloff_curve, color='r')
    return plt.title(f'Spectral Rolloff for {genre_name} Music 20')


# Per-frame spectral rolloff, drawn on the time axes built in the centroid cell.
jazz_rolloff = librosa.feature.spectral_rolloff(y=jazz_sample, sr=sr)[0]
_plot_rolloff(jazz_t, jazz_rolloff, 'Jazz')

pop_rolloff = librosa.feature.spectral_rolloff(y=pop_sample, sr=sr)[0]
_plot_rolloff(pop_t, pop_rolloff, 'Pop')

rock_rolloff = librosa.feature.spectral_rolloff(y=rock_sample, sr=sr)[0]
_plot_rolloff(rock_t, rock_rolloff, 'Rock')

In [None]:
# Frame-wise zero-crossing rate for each reference clip.
zero_crossings_jazz = librosa.feature.zero_crossing_rate(jazz_sample)[0]
zero_crossings_pop = librosa.feature.zero_crossing_rate(pop_sample)[0]
zero_crossings_rock = librosa.feature.zero_crossing_rate(rock_sample)[0]

# Collapse each clip to a single number by summing its per-frame rates.
total_crossings_jazz = sum(zero_crossings_jazz)
total_crossings_pop = sum(zero_crossings_pop)
total_crossings_rock = sum(zero_crossings_rock)

genres = ['Jazz', 'Pop', 'Rock']
total_crossings = [total_crossings_jazz, total_crossings_pop, total_crossings_rock]

# One bar per genre.
fig, ax = plt.subplots(figsize=(14, 5))
ax.bar(genres, total_crossings, color=['blue', 'green', 'red'])
ax.set_title('Total Zero Crossing Rate for Different Genres')
ax.set_xlabel('Genre')
ax.set_ylabel('Total Zero Crossing Rate')

#### Rhythmic features

In [None]:
# Estimate the tempo of each reference clip; the beat positions returned
# alongside the tempo are not needed here.
jazz_tempo, _ = librosa.beat.beat_track(y=jazz_sample, sr=sr)
pop_tempo, _ = librosa.beat.beat_track(y=pop_sample, sr=sr)
rock_tempo, _ = librosa.beat.beat_track(y=rock_sample, sr=sr)

# np.mean reduces the tempo estimate to one number whether it comes back as
# a scalar or as an array.
jazz_tempo_val, pop_tempo_val, rock_tempo_val = (
    np.mean(tempo) for tempo in (jazz_tempo, pop_tempo, rock_tempo)
)

for genre_name, bpm in (
    ("Jazz", jazz_tempo_val),
    ("Pop", pop_tempo_val),
    ("Rock", rock_tempo_val),
):
    print(f"{genre_name} Tempo: {round(bpm)} BPM")

#### Chroma features

In [None]:
def _plot_chromagram(chroma_matrix, genre_name):
    """Draw a chromagram (chroma bins vs. time) with a colour bar; returns the title artist."""
    plt.figure(figsize=(14, 5))
    librosa.display.specshow(chroma_matrix, y_axis='chroma', x_axis='time')
    plt.colorbar()
    return plt.title(f'Chromagram for {genre_name} Music 20')


# STFT-based chroma features for each reference clip.
jazz_chroma = librosa.feature.chroma_stft(y=jazz_sample, sr=sr)
pop_chroma = librosa.feature.chroma_stft(y=pop_sample, sr=sr)
rock_chroma = librosa.feature.chroma_stft(y=rock_sample, sr=sr)

_plot_chromagram(jazz_chroma, 'Jazz')
_plot_chromagram(pop_chroma, 'Pop')
_plot_chromagram(rock_chroma, 'Rock')

#### Harmonic and percussive features

In [None]:
def _plot_harmonic_percussive(harmonic_part, percussive_part, genre_name):
    """Overlay the harmonic (blue) and percussive (red) signals; returns the title artist."""
    plt.figure(figsize=(14, 5))
    plt.plot(harmonic_part, color='b')
    plt.plot(percussive_part, color='r')
    return plt.title(f'Harmonic and Percussive for {genre_name} Music 20')


# Extract the harmonic and percussive components of each reference clip.
jazz_harmony = librosa.effects.harmonic(y=jazz_sample)
jazz_percussive = librosa.effects.percussive(y=jazz_sample)
_plot_harmonic_percussive(jazz_harmony, jazz_percussive, 'Jazz')

pop_harmony = librosa.effects.harmonic(y=pop_sample)
pop_percussive = librosa.effects.percussive(y=pop_sample)
_plot_harmonic_percussive(pop_harmony, pop_percussive, 'Pop')

rock_harmony = librosa.effects.harmonic(y=rock_sample)
rock_percussive = librosa.effects.percussive(y=rock_sample)
_plot_harmonic_percussive(rock_harmony, rock_percussive, 'Rock')

### Data preprocessing

In [None]:
# Load the pre-extracted feature table and preview its first five rows.
features_csv_path = '../dataset/features_3_sec.csv'
music_data = pd.read_csv(features_csv_path)
music_data.head(5)

In [None]:
# Count missing values per column (isna is the canonical spelling of isnull).
music_data.isna().sum()

#### Correlation matrix

In [None]:
# Restrict the correlation matrix to the columns whose name contains 'mean'.
spike_cols = list(filter(lambda column_name: 'mean' in column_name, music_data.columns))
corr = music_data.loc[:, spike_cols].corr()

f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, ax=ax)
plt.title('Correlation between the features')

#### Principal component analysis (PCA)


In [None]:
# Split the table into numeric features and the genre label.
label = music_data['label']
raw_data = music_data.drop(columns=['filename', 'length', 'label'])

# Standardize every feature.
# NOTE(review): the scaler is fit on the full dataset here, before the
# train/validation/test split performed later, so test-set statistics leak
# into the scaling. Consider fitting on the training rows only.
standard_scaler = StandardScaler()
scaled_data = standard_scaler.fit_transform(raw_data)
df_scaled_data = pd.DataFrame(scaled_data, columns=raw_data.columns)

# Shared seed, reused by the splitting and modelling cells.
random_state = 42

# Project onto the first two principal components for a 2-D overview.
pca = PCA(n_components=2, random_state=random_state)
pca_scaled_data = pca.fit_transform(df_scaled_data)
print(pca.explained_variance_ratio_.sum())
print(pca.n_features_in_)

pca_df_scaled_data = pd.DataFrame(pca_scaled_data, columns=['PC1', 'PC2'])
final_df = pd.concat([pca_df_scaled_data, label], axis=1)

# Scatter the two components, coloured by genre.
plt.figure(figsize=(16, 9))
sns.scatterplot(data=final_df, x='PC1', y='PC2', hue='label')
plt.title('PCA on Standard Scaled Data')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')

#### Testing, training and validation sets

In [None]:
# 80% training; the remaining 20% is halved into test and validation sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    df_scaled_data,
    label,
    test_size=0.2,
    random_state=random_state,
)
X_test, X_val, y_test, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=random_state,
)

# Report the size of each split as a share of the whole dataset.
total_rows = len(df_scaled_data)
print(f"Dataset: {total_rows}")
for split_name, split_frame in (
    ("Training", X_train),
    ("Testing", X_test),
    ("Validation", X_val),
):
    print(f"{split_name} set: {len(split_frame)} ({round(len(split_frame)/total_rows*100)}%)")

# Encode genre names as integers; the encoder is fit on the training labels only.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
y_val_encoded = label_encoder.transform(y_val)

true_labels = label_encoder.classes_
print("True Labels: ", true_labels)

### Model training

#### Logistic Regression

In [None]:
max_epochs = 200

# Fit a multinomial logistic regression on the standardized training features.
logistic_regression = LogisticRegression(
    max_iter=max_epochs,
    random_state=random_state,
    solver='lbfgs',
)
logistic_regression.fit(X_train, y_train_encoded)

# Predictions on all three splits.
train_pred = logistic_regression.predict(X_train)
val_pred = logistic_regression.predict(X_val)
test_pred = logistic_regression.predict(X_test)

train_acc_logistic = accuracy_score(y_train_encoded, train_pred)
val_acc_logistic = accuracy_score(y_val_encoded, val_pred)
test_acc_logistic = accuracy_score(y_test_encoded, test_pred)

for split_name, split_accuracy in (
    ("Training", train_acc_logistic),
    ("Validation", val_acc_logistic),
    ("Testing", test_acc_logistic),
):
    print(f"Accuracy on {split_name} Set: {round(split_accuracy * 100, 2)}%")

# Number of solver iterations actually used (message text kept as in the original).
print(f"Aantal iteraties per klasse: {logistic_regression.n_iter_}")

# Confusion matrix on the test split, labelled with the genre names.
conf_matrix = confusion_matrix(y_test_encoded, test_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    xticklabels=true_labels,
    yticklabels=true_labels,
)
plt.title('Confusion Matrix Testing set')
plt.xlabel('Predicted')
plt.ylabel('Actual')

#### Stochastic Gradient Descent Classifier


In [None]:
max_epochs = 200
learning_rate = 0.01

# The model is trained with partial_fit() below, one pass over the training
# data per call, so the epoch loop (not max_iter) controls training length.
sgd_classifier = SGDClassifier(max_iter=max_epochs, random_state=random_state, learning_rate='constant', eta0=learning_rate)

# partial_fit needs the full class list; it is loop-invariant, so compute it once.
all_classes = np.unique(y_train_encoded)

training_scores = []
validation_scores = []
testing_scores = []
epochs = []

for epoch in range(1, max_epochs + 1):
    sgd_classifier.partial_fit(X_train, y_train_encoded, classes=all_classes)
    train_score = sgd_classifier.score(X_train, y_train_encoded)
    val_score = sgd_classifier.score(X_val, y_val_encoded)
    test_score = sgd_classifier.score(X_test, y_test_encoded)
    training_scores.append(train_score)
    validation_scores.append(val_score)
    testing_scores.append(test_score)
    epochs.append(epoch)

    # Log every 10th epoch to keep the output short.
    if epoch % 10 == 0:
        print(f"Epoch {epoch}: Train accuracy = {train_score:.4f}, Validation accuracy = {val_score:.4f}, Test accuracy = {test_score:.4f}")

# Final accuracies after the last epoch.
sgd_test_pred = sgd_classifier.predict(X_test)
train_acc_sgd = accuracy_score(y_train_encoded, sgd_classifier.predict(X_train))
val_acc_sgd = accuracy_score(y_val_encoded, sgd_classifier.predict(X_val))
test_acc_sgd = accuracy_score(y_test_encoded, sgd_test_pred)

print(f"Accuracy on Training Set: {round(train_acc_sgd * 100, 2)}%")
print(f"Accuracy on Validation Set: {round(val_acc_sgd * 100, 2)}%")
print(f"Accuracy on Testing Set: {round(test_acc_sgd * 100, 2)}%")

conf_matrix = confusion_matrix(y_test_encoded, sgd_test_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', xticklabels=true_labels, yticklabels=true_labels)
plt.title('Confusion Matrix Testing set')
plt.xlabel('Predicted')
plt.ylabel('Actual')

# Learning curves. The legend call was missing, so the two labelled lines
# could not be told apart.
plt.figure(figsize=(14, 7))
plt.plot(epochs, training_scores, label='Training Accuracy')
plt.plot(epochs, validation_scores, label='Validation Accuracy')
plt.title('Accuracy over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

#### Random Forest Classifier


In [None]:
max_depth = 10
n_estimators = 100
# Candidate forest sizes: 1, 6, 11, ..., 96.
tree_count = list(range(1, n_estimators + 1, 5))
training_scores = []
val_scores = []
testing_scores = []

for trees in tree_count:
    # Use the same seed as the final refit below. The sweep previously used
    # random_state=0, so the refit model was not the one that was selected.
    random_forest = RandomForestClassifier(n_estimators=trees, max_depth=max_depth, random_state=random_state)
    random_forest.fit(X_train, y_train_encoded)
    train_score = random_forest.score(X_train, y_train_encoded)
    val_score = random_forest.score(X_val, y_val_encoded)
    test_score = random_forest.score(X_test, y_test_encoded)

    training_scores.append(train_score)
    val_scores.append(val_score)
    testing_scores.append(test_score)

    # Log every other candidate (1, 11, 21, ...).
    if trees % 10 == 1:
        print(f" {trees} trees: Train accuracy = {training_scores[-1]:.4f}, Validation accuracy = {val_scores[-1]:.4f}, Test accuracy = {testing_scores[-1]:.4f}")

# Pick the forest size by validation accuracy only; the test set stays untouched.
max_estimators = tree_count[int(np.argmax(val_scores))]
print(f"Optimal number of trees: {max_estimators}")

random_forest = RandomForestClassifier(n_estimators=max_estimators, max_depth=max_depth, random_state=random_state)
random_forest.fit(X_train, y_train_encoded)

rf_test_pred = random_forest.predict(X_test)
train_acc_random = accuracy_score(y_train_encoded, random_forest.predict(X_train))
val_acc_random = accuracy_score(y_val_encoded, random_forest.predict(X_val))
test_acc_random = accuracy_score(y_test_encoded, rf_test_pred)

print(f"Accuracy on Training Set: {round(train_acc_random * 100, 2)}%")
print(f"Accuracy on Validation Set: {round(val_acc_random * 100, 2)}%")
print(f"Accuracy on Testing Set: {round(test_acc_random * 100, 2)}%")

conf_matrix = confusion_matrix(y_test_encoded, rf_test_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', xticklabels=true_labels, yticklabels=true_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')

# Accuracy as a function of forest size.
plt.figure(figsize=(14, 7))
plt.plot(tree_count, training_scores, label="Training Accuracy")
plt.plot(tree_count, val_scores, label="Validation Accuracy")
plt.title("Accuracy vs Number of Trees in Random Forest")
plt.xlabel("Number of Trees")
plt.ylabel("Accuracy")
plt.legend()

#### Support Vector Machine Classifier

In [None]:
# Hyper-parameter grid explored with 10-fold cross-validation on the training set.
param_grid = {
    'C': [0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
}

grid_search = GridSearchCV(SVC(random_state=random_state), param_grid, cv=10)
grid_search.fit(X_train, y_train_encoded)

best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")
print(f"Best score: {grid_search.best_score_}")

# best_estimator_ is already refit on the whole training set by GridSearchCV.
svm_model = grid_search.best_estimator_

svm_test_pred = svm_model.predict(X_test)
train_acc_svm = accuracy_score(y_train_encoded, svm_model.predict(X_train))
val_acc_svm = accuracy_score(y_val_encoded, svm_model.predict(X_val))
test_acc_svm = accuracy_score(y_test_encoded, svm_test_pred)

print(f"The final model is a SVM model with C = {best_params['C']} and kernel = {best_params['kernel']}")
print(f"Accuracy on Training Set: {round(train_acc_svm * 100, 2)}%")
print(f"Accuracy on Validation Set: {round(val_acc_svm * 100, 2)}%")
print(f"Accuracy on Testing Set: {round(test_acc_svm * 100, 2)}%")

conf_matrix = confusion_matrix(y_test_encoded, svm_test_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix, annot=True, fmt="d", xticklabels=true_labels, yticklabels=true_labels
)
plt.title("Confusion Matrix for SVM")
plt.xlabel("Predicted")
plt.ylabel("Actual")

# Sensitivity to C with the other tuned hyper-parameters held fixed. The
# tuned gamma is passed as well; the sweep previously dropped it and so
# could evaluate a different model family than the one that was selected.
training_scores = []
val_scores = []

for C in param_grid['C']:
    svm = SVC(C=C, kernel=best_params['kernel'], gamma=best_params['gamma'], random_state=random_state)
    svm.fit(X_train, y_train_encoded)
    training_scores.append(svm.score(X_train, y_train_encoded))
    val_scores.append(svm.score(X_val, y_val_encoded))

plt.figure(figsize=(14, 7))
plt.plot(param_grid['C'], training_scores, label="Training Accuracy")
plt.plot(param_grid['C'], val_scores, label="Validation Accuracy")
plt.title("Accuracy vs C Parameter in SVM")
plt.xlabel("C Parameter")
plt.ylabel("Accuracy")
plt.legend()

# Per-fold RMSE. The scorer returns negated MSE, so flip the sign and take
# the square root; the values were previously reported as RMSE without it.
# NOTE(review): RMSE over label-encoded class ids depends on the arbitrary
# ordering of the classes. Accuracy or F1 is the more meaningful CV metric.
svm_mse = -cross_val_score(svm_model, X_train, y_train_encoded, cv=10, scoring='neg_mean_squared_error')
svm_rmse = np.sqrt(svm_mse)
# Explicit display(): a bare expression in the middle of a cell is discarded.
display(pd.Series(svm_rmse).describe())
print(f"Mean RMSE: {svm_rmse.mean()}")
print(f"Standard Deviation of RMSE: {svm_rmse.std()}")
print(f"Minimum RMSE: {svm_rmse.min()}")
print(f"Maximum RMSE: {svm_rmse.max()}")
print(f"RMSE for each fold: {svm_rmse}")

#### K-Nearest Neighbors

In [None]:
max_neighbors = 20
k_values = list(range(1, max_neighbors + 1))
# Closer neighbours get a larger vote.
weights = 'distance'

training_scores = []
val_scores = []
testing_scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k, weights=weights)
    knn.fit(X_train, y_train_encoded)
    train_score = knn.score(X_train, y_train_encoded)
    val_score = knn.score(X_val, y_val_encoded)
    test_score = knn.score(X_test, y_test_encoded)

    training_scores.append(train_score)
    val_scores.append(val_score)
    testing_scores.append(test_score)

    # Log the even values of k only, to keep the output short.
    if k % 2 == 0:
        print(f"K = {k}: Train accuracy = {train_score:.4f}, Validation accuracy = {val_score:.4f}, Test accuracy = {test_score:.4f}")

# Select k from the validation sweep instead of a hard-coded value, matching
# the random-forest and decision-tree cells. argmax returns the first, i.e.
# smallest, k among ties.
best_k = k_values[int(np.argmax(val_scores))]
print(f"Best K: {best_k}")

knn = KNeighborsClassifier(n_neighbors=best_k, weights=weights)
knn.fit(X_train, y_train_encoded)

knn_test_pred = knn.predict(X_test)
train_acc_knn = accuracy_score(y_train_encoded, knn.predict(X_train))
val_acc_knn = accuracy_score(y_val_encoded, knn.predict(X_val))
test_acc_knn = accuracy_score(y_test_encoded, knn_test_pred)

print(f"Accuracy on Training Set: {round(train_acc_knn * 100, 2)}%")
print(f"Accuracy on Validation Set: {round(val_acc_knn * 100, 2)}%")
print(f"Accuracy on Testing Set: {round(test_acc_knn * 100, 2)}%")

conf_matrix = confusion_matrix(y_test_encoded, knn_test_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', xticklabels=true_labels, yticklabels=true_labels)
plt.title('Confusion Matrix for KNN')
plt.xlabel('Predicted')
plt.ylabel('Actual')

# Validation accuracy as a function of k.
plt.figure(figsize=(14, 7))
plt.plot(k_values, val_scores, marker='o')
plt.title('Validation Accuracy vs Number of Neighbors')
plt.xlabel('Number of Neighbors')
plt.ylabel('Validation Accuracy')
plt.grid(True)

#### Decision Tree

In [None]:
criteria = "entropy"
max_depth = [5, 10, 15, 20, 25]
class_weight = 'balanced'


def _build_tree(depth_limit):
    """Create an unfitted decision tree with the cell's shared settings and the given depth."""
    return DecisionTreeClassifier(
        criterion=criteria,
        max_depth=depth_limit,
        class_weight=class_weight,
        random_state=random_state,
    )


# Sweep the candidate depths, recording accuracy on every split.
training_scores, val_scores, testing_scores = [], [], []

for depth in max_depth:
    decision_tree = _build_tree(depth)
    decision_tree.fit(X_train, y_train_encoded)

    train_score = decision_tree.score(X_train, y_train_encoded)
    val_score = decision_tree.score(X_val, y_val_encoded)
    test_score = decision_tree.score(X_test, y_test_encoded)

    training_scores.append(train_score)
    val_scores.append(val_score)
    testing_scores.append(test_score)

    print(f"Max Depth = {depth}: Train accuracy = {train_score:.4f}, Validation accuracy = {val_score:.4f}, Test accuracy = {test_score:.4f}")

# Keep the depth with the highest validation accuracy and refit with it.
best_depth = max_depth[np.argmax(val_scores)]
print(f"Best Max Depth: {best_depth}")

decision_tree = _build_tree(best_depth)
decision_tree.fit(X_train, y_train_encoded)

tree_test_pred = decision_tree.predict(X_test)
train_acc_decission = accuracy_score(y_train_encoded, decision_tree.predict(X_train))
val_acc_decission = accuracy_score(y_val_encoded, decision_tree.predict(X_val))
test_acc_decission = accuracy_score(y_test_encoded, tree_test_pred)

for split_name, split_accuracy in (
    ("Training", train_acc_decission),
    ("Validation", val_acc_decission),
    ("Testing", test_acc_decission),
):
    print(f"Accuracy on {split_name} Set: {round(split_accuracy * 100, 2)}%")

conf_matrix = confusion_matrix(y_test_encoded, tree_test_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    xticklabels=true_labels,
    yticklabels=true_labels,
)
plt.title('Confusion Matrix for Decision Tree')
plt.xlabel('Predicted')
plt.ylabel('Actual')


#### Gradient Boosting

In [None]:
loss = "log_loss"
learning_rate = 0.2
n_estimators = 300

# Fit the boosted ensemble on the training split.
gradient_boosting = GradientBoostingClassifier(
    loss=loss,
    learning_rate=learning_rate,
    n_estimators=n_estimators,
    random_state=random_state,
)
gradient_boosting.fit(X_train, y_train_encoded)

gb_test_pred = gradient_boosting.predict(X_test)
train_acc_gradient = accuracy_score(y_train_encoded, gradient_boosting.predict(X_train))
val_acc_gradient = accuracy_score(y_val_encoded, gradient_boosting.predict(X_val))
test_acc_gradient = accuracy_score(y_test_encoded, gb_test_pred)

for split_name, split_accuracy in (
    ("Training", train_acc_gradient),
    ("Validation", val_acc_gradient),
    ("Testing", test_acc_gradient),
):
    print(f"Accuracy on {split_name} Set: {round(split_accuracy * 100, 2)}%")

conf_matrix = confusion_matrix(y_test_encoded, gb_test_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    xticklabels=true_labels,
    yticklabels=true_labels,
)
plt.title('Confusion Matrix for Gradient Boosting')
plt.xlabel('Predicted')
plt.ylabel('Actual')

# train_score_ holds one training-loss value per boosting iteration.
plt.figure(figsize=(14, 7))
plt.plot(gradient_boosting.train_score_)
plt.title('Loss over Iterations')
plt.xlabel('Iteration')
plt.ylabel('Loss')


### Model evaluation

In [None]:
models = ['logistic_regression', 'sgd_classifier', 'random_forest', 'svm_model', 'knn', 'decision_tree', 'gradient_boosting']

# Accuracies in the same order as `models`.
train_accuracies = [train_acc_logistic, train_acc_sgd, train_acc_random, train_acc_svm, train_acc_knn, train_acc_decission, train_acc_gradient]
val_accuracies = [val_acc_logistic, val_acc_sgd, val_acc_random, val_acc_svm, val_acc_knn, val_acc_decission, val_acc_gradient]
test_accuracies = [test_acc_logistic, test_acc_sgd, test_acc_random, test_acc_svm, test_acc_knn, test_acc_decission, test_acc_gradient]

df_comparison = pd.DataFrame({
    'Model': models,
    'Training Accuracy': train_accuracies,
    'Validation Accuracy': val_accuracies,
    'Testing Accuracy': test_accuracies
})

# Best test accuracy first; reset the index so row i matches bar position i.
df_comparison = df_comparison.sort_values(by='Testing Accuracy', ascending=False).reset_index(drop=True)

# --- Grouped bar chart: training / validation / test accuracy per model ---
plt.figure(figsize=(14, 10))
sns.set_style("whitegrid")
bar_width = 0.25
x = np.arange(len(models))

# The column is named 'Testing Accuracy'. Reading it as 'Test Accuracy'
# raised a KeyError and stopped the cell.
plt.bar(x - bar_width, df_comparison['Training Accuracy'], width=bar_width, label='Training', color='#3498db', alpha=0.8)
plt.bar(x, df_comparison['Validation Accuracy'], width=bar_width, label='Validation', color='#2ecc71', alpha=0.8)
plt.bar(x + bar_width, df_comparison['Testing Accuracy'], width=bar_width, label='Test', color='#e74c3c', alpha=0.8)

plt.xlabel('Models', fontsize=14)
plt.ylabel('Accuracy', fontsize=14)
plt.title('Model Performance Comparison', fontsize=16)
plt.xticks(x, df_comparison['Model'], rotation=45, ha='right', fontsize=12)
plt.legend(fontsize=12)
plt.tight_layout()
plt.grid(True, linestyle='--', alpha=0.7)
# Horizontal reference line at 80% accuracy.
plt.axhline(y=0.80, color='gray', linestyle='--', alpha=0.7)
plt.text(len(models)-1, 0.81, 'Benchmark (80%)', fontsize=10)

# Annotate every model group with its test accuracy.
for i, test_accuracy in enumerate(df_comparison['Testing Accuracy']):
    plt.text(i, test_accuracy + 0.01, f"{test_accuracy:.4f}",
             ha='center', va='bottom', fontsize=10)

plt.ylim(0.6, 1.0)
plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')

# --- Heatmap of all metrics plus the train/test gap (overfitting indicator) ---
plt.figure(figsize=(12, 8))
df_heatmap = df_comparison.copy()
df_heatmap['Train-Test Gap'] = df_heatmap['Training Accuracy'] - df_heatmap['Testing Accuracy']
df_heatmap = df_heatmap[['Model', 'Training Accuracy', 'Validation Accuracy', 'Testing Accuracy', 'Train-Test Gap']]
df_heatmap = df_heatmap.set_index('Model')

sns.heatmap(df_heatmap, annot=True, cmap='YlGnBu', fmt='.4f', linewidths=0.5)
plt.title('Model Performance Metrics', fontsize=16)
plt.tight_layout()



### Choosing the model based on performance



In [None]:
# Hyper-parameters of the chosen model.
# NOTE(review): these are hard-coded rather than read from
# grid_search.best_params_ — confirm they match the tuned values.
kernel = 'rbf'
C_value = 10
gamma = 'scale'
final_svm_model = SVC(C=C_value, kernel=kernel, gamma=gamma, random_state=random_state)
final_model = final_svm_model.fit(X_train, y_train_encoded)

print(f"The final model is a SVM model with C = {C_value} and kernel = {kernel}")

print(f"The accuracy of the model on the test data is {round(final_model.score(X_test, y_test_encoded)*100,2)}%")

y_true = y_test_encoded
y_pred = final_model.predict(X_test)
conf_matrix = confusion_matrix(y_true, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', xticklabels=true_labels, yticklabels=true_labels)
plt.title('Confusion Matrix for Final SVM Model')
plt.xlabel('Predicted')
plt.ylabel('Actual')

# Persist the model with the encoder and scaler needed to reproduce its inputs.
# exist_ok=True avoids the check-then-create race of a separate exists() test.
MODEL_DIR = os.path.join('..', 'models')
os.makedirs(MODEL_DIR, exist_ok=True)

# Each artifact is written inside a `with` block so the file handle is
# flushed and closed deterministically. The handles were previously leaked.
model_filename = os.path.join(MODEL_DIR, 'svm_final_model.sav')
with open(model_filename, 'wb') as model_file:
    pickle.dump(final_model, model_file)
print(f"Model saved as {model_filename}")

label_filename = os.path.join(MODEL_DIR, 'label_encoder.sav')
with open(label_filename, 'wb') as label_file:
    pickle.dump(label_encoder, label_file)
print(f"Label Encoder saved as {label_filename}")

scaler_filename = os.path.join(MODEL_DIR, 'standard_scaler.sav')
with open(scaler_filename, 'wb') as scaler_file:
    pickle.dump(standard_scaler, scaler_file)
print(f"Standard Scaler saved as {scaler_filename}")
