In [None]:
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
# from memory_profiler import memory_usage
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier, StackingClassifier
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.applications import EfficientNetB7
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
import psutil
# import resource
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve, auc
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import validation_curve, learning_curve


# Define the image size
# NOTE(review): IMG_SIZE is not referenced again in the visible code;
# load_and_preprocess_image() resizes to a hard-coded 224x224 instead.
import os
IMG_SIZE = 50

# Mount Google Drive into the Colab runtime, then move the working directory
# to the CSV folder so the relative "./..." read_csv paths resolve.
# `%cd` is an IPython magic, so this cell only runs inside a notebook kernel.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab-Notebooks/Anomaly-Detection/CSV


# Load the train/test feature tables from the current working directory.
train_df = pd.read_csv("./train_feature_index.csv")
test_df = pd.read_csv("./test_feature_index.csv")
# train_df = pd.read_csv("./train_final_index.csv")
# test_df = pd.read_csv("./test_final_index.csv")

# Report whether the image directory on the mounted drive is visible.
path = "/content/drive/MyDrive/Colab-Notebooks/Anomaly-Detection/fiveTwelve"
print("found", os.path.exists(path))

# Targets come from the "label" column; every column other than "label" and
# "path" is kept as a model feature.
y_train = train_df["label"]
X_train = train_df.drop(["label", "path"], axis=1)

y_test = test_df["label"]
X_test = test_df.drop(["label", "path"], axis=1)

# plotCorrelationMatrix(train_df, 8)
# plotScatterMatrix(train_df, 20, 10)

def load_and_preprocess_image(path):
    """Read an image file and return it as a rescaled RGB array.

    The file is loaded with ``cv2.imread``, converted from BGR to RGB channel
    order, resized to 224x224 and divided by 255.0.

    Args:
        path: Location of the image file, passed straight to ``cv2.imread``.

    Returns:
        The processed image array, or ``None`` when ``cv2.imread`` returned
        ``None`` (i.e. the file could not be loaded).
    """
    bgr = cv2.imread(path)
    if bgr is None:
        return None

    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    resized = cv2.resize(rgb, (224, 224))
    return resized / 255.0



# Initialize lists to store accuracy and loss for each n_neighbors value
# NOTE(review): neither list is filled anywhere in the visible code.
accuracy_history = []
loss_history = []

# Shuffle the training rows using ONE index permutation for both features and
# labels. Permuting X on its own (np.random.permutation(X_train)) breaks the
# row-to-label pairing and also turns the DataFrame into a bare ndarray,
# dropping its column names.
knn_shuffle_idx = np.random.permutation(len(X_train))
X_train_reorder = X_train.iloc[knn_shuffle_idx]
y_train_reorder = y_train.iloc[knn_shuffle_idx]

knn = KNeighborsClassifier()

# Candidate neighbourhood sizes: 1..49.
params_knn = {"n_neighbors": np.arange(1, 50)}

# 20-fold cross-validated search; n_jobs=-1 evaluates candidates in parallel,
# matching the validation_curve / learning_curve calls later in this file.
knn_gs = GridSearchCV(knn, params_knn, cv=20, n_jobs=-1)

# Fit once on the aligned, shuffled data. Calling fit() twice only keeps the
# result of the last call, so an extra fit on the unshuffled data is wasted.
knn_gs.fit(X_train_reorder, y_train_reorder)

best_params = knn_gs.best_params_
best_score = knn_gs.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

# `algorithm` constructor setting of the (unfitted) template estimator.
used_algorithm = knn.algorithm

# Estimator refitted by GridSearchCV with the winning n_neighbors.
knn_best = knn_gs.best_estimator_
print(knn_gs.best_params_)


# Shuffle the training rows using ONE index permutation for both features and
# labels. Permuting X on its own (np.random.permutation(X_train)) breaks the
# row-to-label pairing and also turns the DataFrame into a bare ndarray,
# dropping its column names.
rf_shuffle_idx = np.random.permutation(len(X_train))
X_train_reorder = X_train.iloc[rf_shuffle_idx]
y_train_reorder = y_train.iloc[rf_shuffle_idx]

# params_rf = {"n_estimators": [10, 20, 40, 60, 80, 100, 140, 180, 200]}
rf = RandomForestClassifier()
params_rf = {"n_estimators": [50, 100, 150, 200, 250, 300, 350]}

# 20-fold cross-validated search; n_jobs=-1 evaluates candidates in parallel,
# matching the validation_curve / learning_curve calls later in this file.
rf_gs = GridSearchCV(rf, params_rf, cv=20, n_jobs=-1)

# Fit once on the aligned, shuffled data. Calling fit() twice only keeps the
# result of the last call, so an extra fit on the unshuffled data is wasted.
rf_gs.fit(X_train_reorder, y_train_reorder)

# Estimator refitted by GridSearchCV with the winning n_estimators.
rf_best = rf_gs.best_estimator_
print(rf_gs.best_params_)
rf_best_params = rf_gs.best_params_
rf_best_score = rf_gs.best_score_

print("Best Parameters:", rf_best_params)
print("Best Score:", rf_best_score)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

# Mean accuracy of each tuned model on the held-out test split.
print("knn: {}".format(knn_best.score(X_test, y_test)))
print("rf: {}".format(rf_best.score(X_test, y_test)))


# ROC analysis of both tuned models on the test split.
# Column 1 of predict_proba is used as the score
# (assumes a binary label whose positive class is listed second - TODO confirm).

# KNN
knn_probs = knn_best.predict_proba(X_test)[:, 1]
fpr_knn, tpr_knn, _ = roc_curve(y_test, knn_probs)
roc_auc_knn = auc(fpr_knn, tpr_knn)

# Random Forest
rf_probs = rf_best.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, rf_probs)
roc_auc_rf = auc(fpr_rf, tpr_rf)

# Draw both curves plus the diagonal reference line.
plt.figure(figsize=(10, 6))
plt.plot(
    fpr_knn, tpr_knn, color='blue', lw=2,
    label=f'KNN ROC curve (area = {roc_auc_knn:.2f})',
)
plt.plot(
    fpr_rf, tpr_rf, color='green', lw=2,
    label=f'Random Forest ROC curve (area = {roc_auc_rf:.2f})',
)
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()

# Validation curve for KNN: accuracy as n_neighbors sweeps 1..24,
# scored with 10-fold cross-validation on the training set.
param_range = np.arange(1, 25)
train_scores, test_scores = validation_curve(
    knn_best,
    X_train,
    y_train,
    param_name="n_neighbors",
    param_range=param_range,
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
)

# Plot the per-parameter mean over the CV folds.
plt.figure(figsize=(10, 6))
plt.plot(param_range, train_scores.mean(axis=1), label="Training score")
plt.plot(param_range, test_scores.mean(axis=1), label="Cross-validation score")
plt.title("Validation Curve for KNN")
plt.xlabel("Number of Neighbors")
plt.ylabel("Score")
plt.legend(loc="best")
plt.show()

# Learning curve for KNN: accuracy at ten training-set fractions
# spaced evenly from 10% to 100%.
train_sizes, train_scores, test_scores = learning_curve(
    knn_best,
    X_train,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
)

# Plot the per-size mean over the CV folds.
plt.figure(figsize=(10, 6))
plt.plot(train_sizes, train_scores.mean(axis=1), label="Training score")
plt.plot(train_sizes, test_scores.mean(axis=1), label="Cross-validation score")
plt.title("Learning Curve for KNN")
plt.xlabel("Number of Training Examples")
plt.ylabel("Score")
plt.legend(loc="best")
plt.show()

# Recompute the Random Forest ROC points
# (column 1 of predict_proba is used as the score).
rf_probs = rf_best.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, rf_probs)
roc_auc_rf = auc(fpr_rf, tpr_rf)

# Draw the KNN and Random Forest ROC curves together on a 12x8 canvas,
# with the diagonal as a reference line.
plt.figure(figsize=(12, 8))
plt.plot(
    fpr_knn, tpr_knn, color='blue', lw=2,
    label=f'KNN ROC curve (area = {roc_auc_knn:.2f})',
)
plt.plot(
    fpr_rf, tpr_rf, color='green', lw=2,
    label=f'Random Forest ROC curve (area = {roc_auc_rf:.2f})',
)
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()

# Validation curve for Random Forest: accuracy as n_estimators sweeps
# 50, 100, ..., 350, scored with 10-fold cross-validation on the training set.
param_range_rf = np.arange(50, 351, 50)
train_scores_rf, test_scores_rf = validation_curve(
    rf_best,
    X_train,
    y_train,
    param_name="n_estimators",
    param_range=param_range_rf,
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
)

# Plot the per-parameter mean over the CV folds.
plt.figure(figsize=(10, 6))
plt.plot(param_range_rf, train_scores_rf.mean(axis=1), label="Training score")
plt.plot(param_range_rf, test_scores_rf.mean(axis=1), label="Cross-validation score")
plt.title("Validation Curve for Random Forest")
plt.xlabel("Number of Estimators")
plt.ylabel("Score")
plt.legend(loc="best")
plt.show()

# Learning curve for Random Forest: accuracy at ten training-set fractions
# spaced evenly from 10% to 100%.
train_sizes_rf, train_scores_rf, test_scores_rf = learning_curve(
    rf_best,
    X_train,
    y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    scoring="accuracy",
    cv=10,
    n_jobs=-1,
)

# Plot the per-size mean over the CV folds.
plt.figure(figsize=(10, 6))
plt.plot(train_sizes_rf, train_scores_rf.mean(axis=1), label="Training score")
plt.plot(train_sizes_rf, test_scores_rf.mean(axis=1), label="Cross-validation score")
plt.title("Learning Curve for Random Forest")
plt.xlabel("Number of Training Examples")
plt.ylabel("Score")
plt.legend(loc="best")
plt.show()



Mounted at /content/drive
/content/drive/MyDrive/Colab-Notebooks/Anomaly-Detection/CSV
found True
Best Parameters: {'n_neighbors': 6}
Best Score: 0.9788868869488164
{'n_neighbors': 6}


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-1-3984a5988d87>", line 94, in <cell line: 94>
    rf_gs.fit(X_train, y_train)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py", line 874, in fit
    self._run_search(evaluate_candidates)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py", line 1388, in _run_search
    evaluate_candidates(ParameterGrid(self.param_grid))
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_search.py", line 821, in evaluate_candidates
    out = parallel(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/parallel.py", line 63, in __call__
    return super().__call__(iterable_with_config)
  File "/usr/local/lib/python3.10/dist-packages/joblib/parallel.py", line 1863, in __call__
    return outpu

TypeError: ignored