### Google Drive Mounting and File Paths Setup in Colab

In [None]:
"""
Mounting the Google drive
"""
# `google.colab` is the helper package of the Google Colab runtime.
from google.colab import drive
# Mount Google Drive at /content/drive; every path defined in the next cell
# is located under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Project folder on the mounted Drive. The three split folders below are
# derived from it by appending the sub-folder name.
root_path = "/content/drive/MyDrive/Colab Notebooks/COEN240_TA/"
train_path, validate_path, test_path = (
    root_path + subdir for subdir in ("train/", "validate/", "test/")
)
# Top level of the mounted Drive.
new_path = "/content/drive/MyDrive/"

In [None]:
# numpy was never imported anywhere in the notebook, so the np.load calls
# below raised NameError on a fresh kernel.
import numpy as np

# Each .npy file is read with two consecutive np.load calls on the same
# handle: the first array is stored as the features (X), the second as the
# labels (Y).
with open(train_path + 'train_cleaned_final.npy', 'rb') as f:
    trainX = np.load(f)
    trainY = np.load(f)

# NOTE(review): the *validation* file is loaded into testX / testY, and every
# score reported later in the notebook is computed on it. The names are kept
# because the following cells refer to them.
with open(validate_path + 'validate_cleaned_final.npy', 'rb') as f:
    testX = np.load(f)
    testY = np.load(f)

print(trainX.shape, trainY.shape, testX.shape, testY.shape)

#### Applying PCA

In [None]:
from sklearn.decomposition import PCA
from skimage.exposure import rescale_intensity
from imutils import build_montages

import time

# NOTE(review): rescale_intensity and build_montages are imported but not
# used in this cell.

print("[INFO] creating eigenfaces...")
# The randomized SVD solver draws random numbers, so the projection differs
# slightly from run to run unless it is seeded. A fixed random_state makes
# the PCA features (and everything trained on them) reproducible.
pca = PCA(
    svd_solver="randomized",
    n_components=100,
    whiten=True,
    random_state=42)
start = time.time()
trainX_PCA = pca.fit_transform(trainX)
end = time.time()
# The elapsed time was measured but never reported.
print("[INFO] computing eigenfaces took {:.4f} seconds".format(end - start))

[INFO] creating eigenfaces...


In [None]:
# Project testX with the PCA that was fitted on trainX; transform() applies
# the already-learned components and does not re-fit.
testX_PCA = pca.transform(testX)

In [None]:
# Shape of the raw training matrix, followed by the shapes of the PCA
# projections of the training and test data.
for matrix in (trainX, trainX_PCA, testX_PCA):
    print(matrix.shape)

(10289, 40000)
(10289, 100)
(1781, 100)


##### LDA on PCA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import matplotlib.pyplot as plt

# Supervised reduction on top of the PCA features: learn the LDA projection
# from the labelled training data, then project that same data.
lda = LinearDiscriminantAnalysis()
lda.fit(trainX_PCA, trainY)
trainX_LDA = lda.transform(trainX_PCA)

In [None]:
# Project the PCA-reduced test data with the LDA fitted on the training data
# (no re-fitting, no test labels involved).
testX_LDA = lda.transform(testX_PCA)

In [None]:
# Summary of the array shapes at each stage of the reduction pipeline.
shape_report = [
    ("Original Dimensions:", trainX),
    ("Reduced Dimensions PCA:", trainX_PCA),
    ("Reduced Dimensions LDA on PCA:", trainX_LDA),
    ("labels dim:", trainY),
    ("testX:", testX),
]
for label, array in shape_report:
    print(label, array.shape)

Original Dimensions: (10289, 40000)
Reduced Dimensions PCA: (10289, 100)
Reduced Dimensions LDA on PCA: (10289, 31)
labels dim: (10289,)
testX: (1781, 40000)


## Applying models

### KNN

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()

# Fit and transform the training data
trainX_scaled = scaler.fit_transform(trainX_LDA)

# Transform the test data using the same scaler
testX_scaled = scaler.transform(testX_LDA)

# NOTE(review): trainX_scaled / testX_scaled are computed but never used.
# Every model in this notebook is trained and evaluated on the unscaled LDA
# features. Either switch the fits below to the scaled arrays or drop the
# scaler; the variables are kept here so the reported numbers do not change.


# Define the parameter grid
param_grid = {'n_neighbors': [3, 5, 7, 9],
              'weights': ['uniform', 'distance'],
              'p': [1, 2]}

# Create the KNN model
knn_model = KNeighborsClassifier()

# Use GridSearchCV to find the best hyperparameters (5-fold CV on accuracy)
grid_search = GridSearchCV(knn_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(trainX_LDA, trainY)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# With the default refit=True, GridSearchCV has already retrained the winning
# configuration on the full training set, so reuse that estimator instead of
# constructing and fitting an identical model a second time.
best_knn_model = grid_search.best_estimator_

# Make predictions and calculate accuracy
best_knn_predictions = best_knn_model.predict(testX_LDA)
best_knn_accuracy = accuracy_score(testY, best_knn_predictions)
print("Best KNN Accuracy:", best_knn_accuracy)

Best Hyperparameters: {'n_neighbors': 3, 'p': 2, 'weights': 'distance'}
Best KNN Accuracy: 0.9320606400898371


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

# The previous cell already ran this exact grid search (same estimator, grid,
# cv and scoring on the same data). Reuse its fitted `grid_search` rather than
# repeating the whole cross-validated search and another refit.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Estimator refitted by GridSearchCV on the full training set (refit=True).
best_knn_model = grid_search.best_estimator_

# Training Score
# KNN with weights='distance' scores 1.0 on its own training points by
# construction, so this number says nothing about generalization.
train_score = best_knn_model.score(trainX_LDA, trainY)
print("Training Score:", train_score)

# Make predictions and calculate accuracy
best_knn_predictions = best_knn_model.predict(testX_LDA)
best_knn_accuracy = accuracy_score(testY, best_knn_predictions)
print("Best KNN Accuracy:", best_knn_accuracy)

# Calculate precision, recall, and f1-score (averaged over classes, weighted
# by the number of true samples per class)
precision = precision_score(testY, best_knn_predictions, average='weighted')
recall = recall_score(testY, best_knn_predictions, average='weighted')
f1 = f1_score(testY, best_knn_predictions, average='weighted')

print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Best Hyperparameters: {'n_neighbors': 3, 'p': 2, 'weights': 'distance'}
Training Score: 1.0
Best KNN Accuracy: 0.9320606400898371
Precision: 0.9408369455468092
Recall: 0.9320606400898371
F1-Score: 0.9301185530221724


### SVM RBF

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
# precision/recall/f1 were previously only in scope because an earlier cell
# imported them; import them here so the cell states its own dependencies.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler


# Define the parameter grid for SVM with RBF kernel
param_grid_svm = {'C': [0.1, 1, 10],
                  'gamma': [0.001, 0.01, 0.1, 1],
                  'kernel': ['rbf']}

# Create the SVM model with RBF kernel
svm_model = SVC()

# Use GridSearchCV to find the best hyperparameters (5-fold CV on accuracy)
grid_search_svm = GridSearchCV(svm_model, param_grid_svm, cv=5, scoring='accuracy')
grid_search_svm.fit(trainX_LDA, trainY)

# Get the best parameters for SVM
best_params_svm = grid_search_svm.best_params_
print("Best Hyperparameters for SVM:", best_params_svm)

# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the full training set; reuse it instead of fitting an
# identical SVC a second time.
best_svm_model = grid_search_svm.best_estimator_

# Make predictions and calculate accuracy for SVM
best_svm_predictions = best_svm_model.predict(testX_LDA)
best_svm_accuracy = accuracy_score(testY, best_svm_predictions)
print("Best SVM Accuracy:", best_svm_accuracy)

# Training Score
train_score = best_svm_model.score(trainX_LDA, trainY)
print("Training Score for SVM:", train_score)

# Precision, Recall, and F1-Score (class-weighted averages)
precision = precision_score(testY, best_svm_predictions, average='weighted')
recall = recall_score(testY, best_svm_predictions, average='weighted')
f1 = f1_score(testY, best_svm_predictions, average='weighted')

print("Precision for SVM:", precision)
print("Recall for SVM:", recall)
print("F1-Score for SVM:", f1)

Best Hyperparameters for SVM: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
Best SVM Accuracy: 0.9371139809096013
Training Score for SVM: 0.9999028088249587
Precision for SVM: 0.9421626100929031
Recall for SVM: 0.9371139809096013
F1-Score for SVM: 0.9352613333660087


### Logistic Regression





In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# precision/recall/f1 were previously only in scope because an earlier cell
# imported them; import them here so the cell states its own dependencies.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler

# Define the parameter grid for Logistic Regression.
# max_iter is deliberately NOT part of the grid: it is the optimizer's
# iteration budget, not a model hyperparameter. Searching over 100/200/300
# only compared differently-unconverged fits and produced the
# "TOTAL NO. of ITERATIONS REACHED LIMIT" warnings.
param_grid_logreg = {'C': [0.1, 1, 10],
                     'penalty': ['l2']}  # Use 'l2' penalty for lbfgs solver

# Create the Logistic Regression model with a generous, fixed iteration
# budget so lbfgs can run to convergence. Raise it further (or scale the
# features) if the convergence warning still appears.
logreg_model = LogisticRegression(solver='lbfgs', max_iter=1000)

# Use GridSearchCV to find the best hyperparameters (5-fold CV on accuracy)
grid_search_logreg = GridSearchCV(logreg_model, param_grid_logreg, cv=5, scoring='accuracy')
grid_search_logreg.fit(trainX_LDA, trainY)

# Get the best parameters for Logistic Regression
best_params_logreg = grid_search_logreg.best_params_
print("Best Hyperparameters for Logistic Regression:", best_params_logreg)

# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the full training set; reuse it instead of fitting an
# identical model a second time.
best_logreg_model = grid_search_logreg.best_estimator_

# Make predictions and calculate accuracy for Logistic Regression
best_logreg_predictions = best_logreg_model.predict(testX_LDA)
best_logreg_accuracy = accuracy_score(testY, best_logreg_predictions)
print("Best Logistic Regression Accuracy:", best_logreg_accuracy)

# Calculate training score (displayed by the next cell)
train_logreg_score = best_logreg_model.score(trainX_LDA, trainY)


# Calculate precision, recall, and F1-score for test set (class-weighted averages)
precision_test = precision_score(testY, best_logreg_predictions, average='weighted')
recall_test = recall_score(testY, best_logreg_predictions, average='weighted')
f1_test = f1_score(testY, best_logreg_predictions, average='weighted')

# Print the results
print("Logistic Regression Precision (Test):", precision_test)
print("Logistic Regression Recall (Test):", recall_test)
print("Logistic Regression F1-Score (Test):", f1_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Hyperparameters for Logistic Regression: {'C': 1, 'max_iter': 200, 'penalty': 'l2'}
Best Logistic Regression Accuracy: 0.914093206064009
Logistic Regression Precision (Test): 0.9163647550262675
Logistic Regression Recall (Test): 0.914093206064009
Logistic Regression F1-Score (Test): 0.9122309936030805


In [None]:
# Display the training-set accuracy of the tuned logistic regression, computed
# in the previous cell via best_logreg_model.score(trainX_LDA, trainY).
train_logreg_score

0.9705510739624842

### Ensemble - KNN, SVM

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Create KNN model (same settings the KNN grid search selected)
knn_model = KNeighborsClassifier(n_neighbors=3, p=2, weights='distance')

# Create SVM model.
# probability=True makes SVC calibrate class probabilities with an internal
# cross-validation that shuffles the data. Without a fixed random_state the
# probabilities, and therefore the soft-voting result, change from run to run.
# NOTE(review): the SVM grid search above selected C=10, but C=1 is used here;
# confirm which value is intended.
svm_model = SVC(C=1, gamma=0.01, kernel='rbf', probability=True, random_state=42)

# Create an ensemble using a soft voting strategy (predicted class
# probabilities of the members are averaged)
ensemble_model1 = VotingClassifier(estimators=[('knn', knn_model), ('svm', svm_model)], voting='soft')

# Fit the ensemble model on the training data
ensemble_model1.fit(trainX_LDA, trainY)

# Make predictions and calculate accuracy for the ensemble
ensemble_predictions = ensemble_model1.predict(testX_LDA)
ensemble_accuracy = accuracy_score(testY, ensemble_predictions)
print("Ensemble Accuracy:", ensemble_accuracy)

# Calculate and print ensemble training score
ensemble_train_score = ensemble_model1.score(trainX_LDA, trainY)
print("Ensemble Training Score:", ensemble_train_score)

Ensemble Accuracy: 0.9460976979225154
Ensemble Training Score: 1.0


### Ensemble - KNN, SVM, Logistic Regression

In [None]:
import pickle

from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Requires trainX_LDA, testX_LDA, trainY, testY and root_path from earlier cells.
# NOTE(review): the features used here are the unscaled LDA projections; no
# StandardScaler output is fed to these models.

# Create KNN model (same settings the KNN grid search selected)
knn_model = KNeighborsClassifier(n_neighbors=3, p=2, weights='distance')

# Create SVM model.
# probability=True calibrates probabilities with an internal, shuffled
# cross-validation; random_state makes the soft-voting result reproducible.
# NOTE(review): the SVM grid search above selected C=10, but C=1 is used here.
svm_model = SVC(C=1, gamma=0.01, kernel='rbf', probability=True, random_state=42)

# Create Logistic Regression model.
# max_iter=100 stopped lbfgs before convergence ("TOTAL NO. of ITERATIONS
# REACHED LIMIT"). The regularization (C, penalty) is unchanged; only the
# iteration budget is raised so the solver can finish.
logreg_model = LogisticRegression(C=0.1, max_iter=1000, penalty='l2')

# Create an ensemble using a soft voting strategy
ensemble_model = VotingClassifier(estimators=[('knn', knn_model), ('svm', svm_model), ('logreg', logreg_model)], voting='soft')

# Fit the ensemble model on the training data
ensemble_model.fit(trainX_LDA, trainY)

# Make predictions and calculate accuracy for the ensemble
ensemble_predictions = ensemble_model.predict(testX_LDA)
ensemble_accuracy = accuracy_score(testY, ensemble_predictions)
print("Ensemble Accuracy:", ensemble_accuracy)

# Calculate and print ensemble training score
ensemble_train_score = ensemble_model.score(trainX_LDA, trainY)
print("Ensemble Training Score:", ensemble_train_score)

# Specify the full path for saving the model
file_path = root_path+"ensemble_model2.pkl"

# Save the trained ensemble model to the specified location.
# The original dumped `ensemble_model2`, a name that is never defined in this
# notebook (NameError on a fresh kernel); the fitted model is `ensemble_model`.
with open(file_path, 'wb') as file:
    pickle.dump(ensemble_model, file)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Ensemble Accuracy: 0.9466591802358225
Ensemble Training Score: 0.9970842647487608


### F1 Score, Precision, Recall

In [None]:
# from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Make predictions on the test set
# ensemble_predictions1 = ensemble_model1.predict(testX_LDA)

# # Define class names
# class_names = [
#     "amarisian", "banmingkai", "chenziang", "chientingwei", "gowdarachandrashekarappasrivarsha",
#     "huangjiaoyan", "kodipunzulanandini", "lishumeng", "liuhongji", "lozanoroberto",
#     "manglaniroshanlakhi", "mendonakshay", "negiparth", "oraisisaac", "perambuduruvishnu",
#     "pereiranerissagodfrey", "ravijayanthidhanasekar", "sampagaonrahul", "selinayu", "shahmanali",
#     "sivarajusairevanth", "somaniachal", "upadhyevaishnavi", "vanderlindenilona",
#     "vennavellirajashekarreddy", "virvadianisargjyotin", "wukaiyue", "yashasvi", "zhangyuanzhen",
#     "zhouchuandi", "zotaharsh", "zuluagagonzalezisabel"
# ]

# # Compute precision, recall, and f1 score
# precision = precision_score(testY, ensemble_predictions1, average='weighted')
# recall = recall_score(testY, ensemble_predictions1, average='weighted')
# f1 = f1_score(testY, ensemble_predictions1, average='weighted')

# print("Precision:", precision)
# print("Recall:", recall)
# print("F1 Score:", f1)

# # # Create a confusion matrix heatmap
# # cm = confusion_matrix(testY, ensemble_predictions)
# # plt.figure(figsize=(10, 8))
# # sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
# # plt.title("Confusion Matrix")
# # plt.xlabel("Predicted")
# # plt.ylabel("True")
# # plt.show()

# # # Display the classification report with class names
# # print("Classification Report:")
# # print(classification_report(testY, ensemble_predictions, target_names=class_names))

Precision: 0.9522166833269612
Recall: 0.9466591802358225
F1 Score: 0.9453336951180759


In [None]:
# import pickle

# # Save the trained SVM model to a Pickle file
# with open(new_path + 'model1.pkl', 'wb') as file:
#     pickle.dump(ensemble_model1, file)
