In [None]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [15]:
def load_pickle_data(file_path):
    with open(file_path, 'rb') as f:
        data = pickle.load(f)
    return data

In [16]:
train_data = load_pickle_data("Train_stft_features.pkl")
val_data = load_pickle_data("Val_stft_features.pkl")
test_data = load_pickle_data("Test_stft_features.pkl")

# SVM (Support vector machine)

In [17]:
from scipy.stats import kurtosis, skew
from sklearn.svm import SVC
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report

def extract_features(stft_matrix):
    mean_stft = np.mean(stft_matrix, axis=1)
    var_stft = np.var(stft_matrix, axis=1)
    max_stft = np.max(stft_matrix, axis=1)

    return np.concatenate([mean_stft, var_stft, max_stft])

X_train, y_train = [], []
for item in train_data:
    stft = item['stft']
    label = item['category']
    X_train.append(extract_features(stft))
    y_train.append(label)

X_train = np.array(X_train)
y_train = np.array(y_train)

# Xử lý dữ liệu Validation
X_val, y_val = [], []
for item in val_data:
    stft = item['stft']
    label = item['category']
    X_val.append(extract_features(stft))
    y_val.append(label)

X_val = np.array(X_val)
y_val = np.array(y_val)

# Huấn luyện SVM với kernel tuyến tính
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Dự đoán trên tập Validation
y_pred = svm_model.predict(X_val)

# Đánh giá mô hình
accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred, average='weighted')

print("Classification report for Validation Set:")
print(classification_report(y_val, y_pred))
print(f"Accuracy: {accuracy}")
print(f"F1-Score: {f1}")

# Xử lý dữ liệu Test
X_test, y_test = [], []
for item in test_data:
    stft = item['stft']
    label = item['category']
    X_test.append(extract_features(stft))
    y_test.append(label)

X_test = np.array(X_test)
y_test = np.array(y_test)

# Dự đoán trên tập Test
y_test_pred = svm_model.predict(X_test)

# Đánh giá mô hình trên tập Test
accuracy_test = accuracy_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred, average='weighted')

print("\nClassification report for Test Set:")
print(classification_report(y_test, y_test_pred))
print(f"Accuracy (Test): {accuracy_test}")
print(f"F1-Score (Test): {f1_test}")


Classification report for Validation Set:
               precision    recall  f1-score   support

None_swarming       1.00      0.50      0.67      1800
     Swarming       0.67      1.00      0.80      1800

     accuracy                           0.75      3600
    macro avg       0.83      0.75      0.73      3600
 weighted avg       0.83      0.75      0.73      3600

Accuracy: 0.75
F1-Score: 0.7333333333333333

Classification report for Test Set:
               precision    recall  f1-score   support

None_swarming       1.00      0.33      0.50      2400
     Swarming       0.60      1.00      0.75      2441

     accuracy                           0.67      4841
    macro avg       0.80      0.67      0.63      4841
 weighted avg       0.80      0.67      0.63      4841

Accuracy (Test): 0.6701094815120843
F1-Score (Test): 0.6285257241154586


In [8]:
print(f"Shape sau khi PCA (Train): {X_train.shape}")
print(f"Shape sau khi PCA (Validation): {X_val.shape}")
print(f"Shape sau khi PCA (Test): {X_test.shape}")


Shape sau khi PCA (Train): (12600, 771)
Shape sau khi PCA (Validation): (3600, 771)
Shape sau khi PCA (Test): (4841, 771)


In [13]:
import matplotlib.pyplot as plt
import numpy as np

explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.bar(range(1, len(explained_variance) + 1), explained_variance, alpha=0.7, label="Phương sai từng thành phần")
plt.xlabel("Thành phần chính")
plt.ylabel("Tỷ lệ phương sai")
plt.title("Tỷ lệ phương sai từng thành phần chính")
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o', linestyle='--', color='b', label="Phương sai tích lũy")
plt.axhline(y=0.95, color='r', linestyle='--', label="Ngưỡng 95% phương sai")
plt.xlabel("Số thành phần chính")
plt.ylabel("Tỷ lệ phương sai tích lũy")
plt.title("Phương sai tích lũy theo số thành phần chính")
plt.legend()

plt.tight_layout()
plt.show()


NameError: name 'pca' is not defined

In [None]:
print("Số lượng đặc trưng của mỗi mẫu trong tập huấn luyện:", X_train.shape[1])
print("Số lượng đặc trưng của mỗi mẫu trong tập validation:", X_val.shape[1])
print("Số lượng đặc trưng của mỗi mẫu trong tập test:", X_test.shape[1])


In [None]:
n_components = 400
pca = PCA(n_components=n_components)

X_train_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)
X_test_pca = pca.transform(X_test)

print(f"Shape sau khi PCA (Train): {X_train_pca.shape}")
print(f"Shape sau khi PCA (Validation): {X_val_pca.shape}")
print(f"Shape sau khi PCA (Test): {X_test_pca.shape}")


Shape sau khi PCA (Train): (12600, 400)
Shape sau khi PCA (Validation): (3600, 400)
Shape sau khi PCA (Test): (4841, 400)


In [76]:
# Huấn luyện SVM với dữ liệu đã giảm chiều
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_pca, y_train)

# Đánh giá trên tập Validation
y_val_pred = svm_model.predict(X_val_pca)
accuracy_val = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy_val}")

# Đánh giá trên tập Test
y_test_pred = svm_model.predict(X_test_pca)
accuracy_test = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {accuracy_test}")


Validation Accuracy: 0.75
Test Accuracy: 0.6701094815120843


In [77]:
print("Số lượng đặc trưng của mỗi mẫu trong tập huấn luyện:", X_train.shape[1])
print("Số lượng đặc trưng của mỗi mẫu trong tập validation:", X_val.shape[1])
print("Số lượng đặc trưng của mỗi mẫu trong tập test:", X_test.shape[1])


Số lượng đặc trưng của mỗi mẫu trong tập huấn luyện: 771
Số lượng đặc trưng của mỗi mẫu trong tập validation: 771
Số lượng đặc trưng của mỗi mẫu trong tập test: 771


# KNN (K-Nearest Neighbors)

In [68]:
best_k = None
best_f1 = 0
results = []

for k in range(1, 149, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    y_val_pred = knn.predict(X_val)
    f1_val = f1_score(y_val, y_val_pred, average='weighted')

    results.append((k, f1_val))
    print(k)
    if f1_val > best_f1:
        best_f1 = f1_val
        best_k = k

print(f"Best k: {best_k} with F1-score: {best_f1}")

1
3
5
7
9
11
13
15
17
19
21
23
25
27
29
31
33
35
37
39
41
43
45
47
49
51
53
55
57
59
61
63
65
67
69
71
73
75
77
79
81
83
85
87
89
91
93
95
97
99
101
103
105
107
109
111
113
115
117
119
121
123
125
127
129
131
133
135
137
139
141
143
145
147
Best k: 117 with F1-score: 0.6914219487324557


In [78]:
knn_model = KNeighborsClassifier(n_neighbors=117)
knn_model.fit(X_train, y_train)

y_val_pred_knn = knn_model.predict(X_val)

print("Classification report for Validation Set:")
print(classification_report(y_val, y_val_pred_knn))

accuracy_val_knn = accuracy_score(y_val, y_val_pred_knn)
f1_val_knn = f1_score(y_val, y_val_pred_knn, average='weighted')

print(f"Accuracy on Validation Set (KNN): {accuracy_val_knn}")
print(f"F1-Score on Validation Set (KNN): {f1_val_knn}")

y_test_pred_knn = knn_model.predict(X_test)

print("Classification report for Test Set:")
print(classification_report(y_test, y_test_pred_knn))

accuracy_test_knn = accuracy_score(y_test, y_test_pred_knn)
f1_test_knn = f1_score(y_test, y_test_pred_knn, average='weighted')

print(f"Accuracy on Test Set (KNN): {accuracy_test_knn}")
print(f"F1-Score on Test Set (KNN): {f1_test_knn}")

Classification report for Validation Set:
               precision    recall  f1-score   support

None_swarming       0.83      0.51      0.63      1800
     Swarming       0.65      0.90      0.75      1800

     accuracy                           0.70      3600
    macro avg       0.74      0.70      0.69      3600
 weighted avg       0.74      0.70      0.69      3600

Accuracy on Validation Set (KNN): 0.7030555555555555
F1-Score on Validation Set (KNN): 0.6914219487324557
Classification report for Test Set:
               precision    recall  f1-score   support

None_swarming       0.95      0.33      0.49      2400
     Swarming       0.60      0.98      0.74      2441

     accuracy                           0.66      4841
    macro avg       0.77      0.66      0.62      4841
 weighted avg       0.77      0.66      0.62      4841

Accuracy on Test Set (KNN): 0.6610204503201818
F1-Score on Test Set (KNN): 0.6208596432922614


In [81]:
from sklearn.neighbors import KNeighborsClassifier

# Áp dụng KNN sau PCA
knn_model_pca = KNeighborsClassifier(n_neighbors=51)
knn_model_pca.fit(X_train_pca, y_train)

# Dự đoán trên tập Validation
y_val_pred_knn_pca = knn_model_pca.predict(X_val_pca)

print("Classification report for Validation Set (KNN - PCA):")
print(classification_report(y_val, y_val_pred_knn_pca))

accuracy_val_knn_pca = accuracy_score(y_val, y_val_pred_knn_pca)
f1_val_knn_pca = f1_score(y_val, y_val_pred_knn_pca, average='weighted')

print(f"Accuracy on Validation Set (KNN - PCA): {accuracy_val_knn_pca}")
print(f"F1-Score on Validation Set (KNN - PCA): {f1_val_knn_pca}")

# Dự đoán trên tập Test
y_test_pred_knn_pca = knn_model_pca.predict(X_test_pca)

print("Classification report for Test Set (KNN - PCA):")
print(classification_report(y_test, y_test_pred_knn_pca))

accuracy_test_knn_pca = accuracy_score(y_test, y_test_pred_knn_pca)
f1_test_knn_pca = f1_score(y_test, y_test_pred_knn_pca, average='weighted')

print(f"Accuracy on Test Set (KNN - PCA): {accuracy_test_knn_pca}")
print(f"F1-Score on Test Set (KNN - PCA): {f1_test_knn_pca}")


Classification report for Validation Set (KNN - PCA):
               precision    recall  f1-score   support

None_swarming       0.80      0.52      0.63      1800
     Swarming       0.64      0.87      0.74      1800

     accuracy                           0.70      3600
    macro avg       0.72      0.70      0.69      3600
 weighted avg       0.72      0.70      0.69      3600

Accuracy on Validation Set (KNN - PCA): 0.6955555555555556
F1-Score on Validation Set (KNN - PCA): 0.685747382183941
Classification report for Test Set (KNN - PCA):
               precision    recall  f1-score   support

None_swarming       0.92      0.33      0.49      2400
     Swarming       0.60      0.97      0.74      2441

     accuracy                           0.66      4841
    macro avg       0.76      0.65      0.61      4841
 weighted avg       0.76      0.66      0.62      4841

Accuracy on Test Set (KNN - PCA): 0.6550299524891552
F1-Score on Test Set (KNN - PCA): 0.6158241931260705


# NB (Naive Bayes)

In [None]:
from sklearn.naive_bayes import GaussianNB

X_train = []
y_train = []

for item in train_data:
    stft = item['stft'].flatten()
    label = item['category']

    X_train.append(stft)
    y_train.append(label)

X_train = np.array(X_train)
y_train = np.array(y_train)

X_val = []
y_val = []

for item in val_data:
    stft = item['stft'].flatten()
    label = item['category']

    X_val.append(stft)
    y_val.append(label)

X_val = np.array(X_val)
y_val = np.array(y_val)

nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

y_pred = nb_model.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred, average='weighted')

print("Classification report for Validation Set:")
print(classification_report(y_val, y_pred))

print(f"Accuracy: {accuracy}")
print(f"F1-Score: {f1}")

X_test = []
y_test = []

for item in test_data:
    stft = item['stft'].flatten()
    label = item['category']

    X_test.append(stft)
    y_test.append(label)

X_test = np.array(X_test)
y_test = np.array(y_test)

y_test_pred = nb_model.predict(X_test)

accuracy_test = accuracy_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred, average='weighted')

print("\nClassification report for Test Set:")
print(classification_report(y_test, y_test_pred))

print(f"Accuracy (Test): {accuracy_test}")
print(f"F1-Score (Test): {f1_test}")


Classification report for Validation Set:
               precision    recall  f1-score   support

None_swarming       1.00      0.25      0.40      1800
     Swarming       0.57      1.00      0.73      1800

     accuracy                           0.62      3600
    macro avg       0.79      0.62      0.56      3600
 weighted avg       0.79      0.62      0.56      3600

Accuracy: 0.6241666666666666
F1-Score: 0.5623480206598905

Classification report for Test Set:
               precision    recall  f1-score   support

None_swarming       1.00      0.33      0.50      2400
     Swarming       0.60      1.00      0.75      2441

     accuracy                           0.67      4841
    macro avg       0.80      0.67      0.63      4841
 weighted avg       0.80      0.67      0.63      4841

Accuracy (Test): 0.6701094815120843
F1-Score (Test): 0.6285257241154586


# Random Forest(RF)

In [None]:
from sklearn.ensemble import RandomForestClassifier

X_train = []
y_train = []

for item in train_data:
    stft = item['stft'].flatten()
    label = item['category']

    X_train.append(stft)
    y_train.append(label)

X_train = np.array(X_train)
y_train = np.array(y_train)

X_val = []
y_val = []

for item in val_data:
    stft = item['stft'].flatten()
    label = item['category']

    X_val.append(stft)
    y_val.append(label)

X_val = np.array(X_val)
y_val = np.array(y_val)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred, average='weighted')

print("Classification report for Validation Set:")
print(classification_report(y_val, y_pred))

print(f"Accuracy: {accuracy}")
print(f"F1-Score: {f1}")

X_test = []
y_test = []

for item in test_data:
    stft = item['stft'].flatten()
    label = item['category']

    X_test.append(stft)
    y_test.append(label)

X_test = np.array(X_test)
y_test = np.array(y_test)

y_test_pred = rf_model.predict(X_test)

accuracy_test = accuracy_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred, average='weighted')

print("\nClassification report for Test Set:")
print(classification_report(y_test, y_test_pred))

print(f"Accuracy (Test): {accuracy_test}")
print(f"F1-Score (Test): {f1_test}")


Classification report for Validation Set:
               precision    recall  f1-score   support

None_swarming       1.00      0.41      0.59      1800
     Swarming       0.63      1.00      0.77      1800

     accuracy                           0.71      3600
    macro avg       0.82      0.71      0.68      3600
 weighted avg       0.82      0.71      0.68      3600

Accuracy: 0.7072222222222222
F1-Score: 0.6797727149204108

Classification report for Test Set:
               precision    recall  f1-score   support

None_swarming       1.00      0.33      0.50      2400
     Swarming       0.60      1.00      0.75      2441

     accuracy                           0.67      4841
    macro avg       0.80      0.67      0.63      4841
 weighted avg       0.80      0.67      0.63      4841

Accuracy (Test): 0.6701094815120843
F1-Score (Test): 0.6285257241154586


# Gradient Boosting, GB

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

X_train = []
y_train = []

for item in train_data:
    stft = item['stft'].flatten()
    label = item['category']

    X_train.append(stft)
    y_train.append(label)

X_train = np.array(X_train)
y_train = np.array(y_train)

X_val = []
y_val = []

for item in val_data:
    stft = item['stft'].flatten()
    label = item['category']

    X_val.append(stft)
    y_val.append(label)

X_val = np.array(X_val)
y_val = np.array(y_val)

gb_model = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=3,
    max_features="sqrt",
    random_state=42
)
gb_model.fit(X_train, y_train)

y_pred = gb_model.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred, average='weighted')

print("Classification report for Validation Set:")
print(classification_report(y_val, y_pred))

print(f"Accuracy: {accuracy}")
print(f"F1-Score: {f1}")

X_test = []
y_test = []

for item in test_data:
    stft = item['stft'].flatten()
    label = item['category']

    X_test.append(stft)
    y_test.append(label)

X_test = np.array(X_test)
y_test = np.array(y_test)

y_test_pred = gb_model.predict(X_test)

accuracy_test = accuracy_score(y_test, y_test_pred)
f1_test = f1_score(y_test, y_test_pred, average='weighted')

print("\nClassification report for Test Set:")
print(classification_report(y_test, y_test_pred))

print(f"Accuracy (Test): {accuracy_test}")
print(f"F1-Score (Test): {f1_test}")