# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score


# Load the e-mail table and discard the 'Email No.' column before modelling.
data = pd.read_csv('emails.csv').drop(columns=['Email No.'])

# Ordered (unshuffled) split: the first 80% of rows train, the remainder tests.
train_size = int(0.8 * len(data))
train_set, test_set = data.iloc[:train_size], data.iloc[train_size:]

# 'Prediction' holds the target; every other remaining column is a feature.
y_train = train_set['Prediction'].values
y_test = test_set['Prediction'].values
X_train = train_set.drop(columns=['Prediction']).values
X_test = test_set.drop(columns=['Prediction']).values

def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between ``a`` and ``b``.

    The element-wise differences are squared, summed over every element,
    and the square root of that total is returned.
    """
    delta = a - b
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)

def knn(X_train, y_train, X_test, k=5):
    """Score test samples with a brute-force k-nearest-neighbours vote.

    For every row of ``X_test`` the Euclidean distance to every row of
    ``X_train`` is computed, the ``k`` closest training rows are selected and
    the fraction of those neighbours whose label equals 1 is recorded.

    Args:
        X_train: 2-D array-like of training features, one sample per row.
        y_train: 1-D array-like of training labels aligned with ``X_train``.
            Label ``1`` is the class whose share is reported.
        X_test: 2-D array-like of samples to score, with the same number of
            columns as ``X_train``.
        k: Number of neighbours to consult; must be at least 1.

    Returns:
        A list with one entry per test row: the share (between 0 and 1) of
        the consulted neighbours labelled 1. These are scores, not hard class
        labels.

    Raises:
        ValueError: If ``k`` is smaller than 1 or ``X_train`` has no rows.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Accept lists as well as arrays; fancy indexing below needs ndarrays.
    train_features = np.asarray(X_train)
    train_labels = np.asarray(y_train)
    if len(train_features) == 0:
        raise ValueError("X_train must contain at least one sample")

    predictions = []
    for test_instance in np.asarray(X_test):
        # Broadcasting measures the distance to every training row at once.
        squared_diffs = (train_features - test_instance) ** 2
        distances = np.sqrt(np.sum(squared_diffs, axis=1))
        k_nearest_labels = train_labels[np.argsort(distances)[:k]]
        # Divide by the neighbours actually found: there are fewer than k of
        # them when the training set is smaller than k, and dividing by k
        # would understate the positive share in that case.
        positive_share = np.sum(k_nearest_labels == 1) / len(k_nearest_labels)
        predictions.append(positive_share)

    return predictions

# Logistic Regression Algorithm
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` without overflow.

    The textbook formula overflows inside ``exp`` when ``z`` is a large
    negative number. This version only ever exponentiates non-positive
    values and picks the algebraically equivalent form for each sign:
    ``1 / (1 + exp(-z))`` for ``z >= 0`` and ``exp(z) / (1 + exp(z))``
    otherwise.

    Args:
        z: A scalar or array-like of real numbers.

    Returns:
        The element-wise sigmoid of ``z``: a NumPy scalar for scalar input,
        otherwise an array with the same shape as ``z``.
    """
    z = np.asarray(z)
    # exp(-|z|) lies in (0, 1]; it may underflow to 0 but can never overflow.
    decay = np.exp(-np.abs(z))
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with an empty tuple unwraps a 0-d result into a scalar and
    # leaves arrays of one or more dimensions as they are.
    return result[()]

def logistic_regression(X_train, y_train, X_test, lr=0.01, iterations=3000):
    """Fit logistic-regression weights by gradient descent and score X_test.

    The weight vector starts at zero with one entry per column of
    ``X_train``; no separate intercept term is added. Each iteration moves
    the weights against the average, over training rows, of
    ``(sigmoid(X_train @ w) - y_train) * x``.

    Args:
        X_train: 2-D feature array used for fitting.
        y_train: 1-D array of targets aligned with the rows of ``X_train``.
        X_test: 2-D feature array to score with the fitted weights.
        lr: Step size applied to every gradient update.
        iterations: Number of full-batch updates to perform.

    Returns:
        ``sigmoid(X_test @ w)`` for the final weights, one score per row of
        ``X_test``.
    """
    n_features = X_train.shape[1]
    weights = np.zeros(n_features)

    for _ in range(iterations):
        probabilities = sigmoid(np.dot(X_train, weights))
        residuals = probabilities - y_train
        # Scale the residuals by the sample count before projecting them
        # onto the features, so the step is the mean gradient over all rows.
        mean_gradient = np.dot(X_train.T, residuals / len(probabilities))
        weights -= lr * mean_gradient

    test_scores = sigmoid(np.dot(X_test, weights))
    return test_scores

# Score the held-out rows with both models.
knn_preds = knn(X_train, y_train, X_test, k=5)
logistic_preds = logistic_regression(X_train, y_train, X_test)

# kNN: area under the ROC curve, then the curve's coordinates.
roc_auc_knn = roc_auc_score(y_test, knn_preds)
fpr_knn, tpr_knn, _ = roc_curve(y_test, knn_preds)

# Logistic regression: the same two metrics.
roc_auc_logistic = roc_auc_score(y_test, logistic_preds)
fpr_logistic, tpr_logistic, _ = roc_curve(y_test, logistic_preds)

# Draw both ROC curves on one figure, then save it and display it.
plt.figure()
for _fpr, _tpr, _label in (
    (fpr_knn, tpr_knn, f'kNN (AUC = {roc_auc_knn:.2f})'),
    (fpr_logistic, tpr_logistic, f'Logistic Regression (AUC = {roc_auc_logistic:.2f})'),
):
    plt.plot(_fpr, _tpr, label=_label)
plt.title('Logistic vs KNN')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.savefig('ROC&AUC')
plt.show()