In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
hamspam_df = pd.read_csv("hamspam.csv")

# Trim any extra spaces from column names.
# This runs BEFORE the 'ID' check and the renames so that padded headers
# (e.g. ' ID' or 'Length ') are matched by the exact-name lookups below.
hamspam_df.columns = hamspam_df.columns.str.strip()

# Drop the ID column if it exists
if 'ID' in hamspam_df.columns:
    hamspam_df = hamspam_df.drop(columns=['ID'])

# Rename columns if necessary.
# DataFrame.rename ignores mapping keys that are not present, so no
# per-column existence check (or in-place mutation) is needed.
column_rename_map = {'Contains Money Words': 'Contains Money', 'Length': 'Words Length'}
hamspam_df = hamspam_df.rename(columns=column_rename_map)

# Encode categorical variables.
# Each feature gets its own throwaway encoder; fit_transform refits from
# scratch, so the resulting integer codes do not depend on any other column.
for feature_name in ('Contains Link', 'Contains Money', 'Words Length'):
    hamspam_df[feature_name] = LabelEncoder().fit_transform(hamspam_df[feature_name])

# `le` ends up fitted on the target, so le.classes_ / le.inverse_transform
# can be used to map predicted codes back to the original labels.
# LabelEncoder numbers labels in sorted order; the original note
# "Spam=1, Ham=0" holds only if the raw ham label sorts before the spam
# label -- confirm with le.classes_.
le = LabelEncoder()
hamspam_df['Class'] = le.fit_transform(hamspam_df['Class'])

# Separate the target column from the predictors, then hold out 20% of the
# rows for testing (fixed seed so the split is reproducible).
y = hamspam_df['Class']
X = hamspam_df.drop(columns=['Class'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Naïve Bayes: fit on the training split, predict the held-out rows.
# (fit returns the estimator itself, so construction and fitting are chained.)
nb_model = MultinomialNB().fit(X_train, y_train)
nb_predictions = nb_model.predict(X_test)

# KNN with K=2 neighbours and Euclidean distance
knn_model = KNeighborsClassifier(metric='euclidean', n_neighbors=2).fit(X_train, y_train)
knn_predictions = knn_model.predict(X_test)

# Score both classifiers against the held-out labels.
# Evaluation order is unchanged: Naïve Bayes first, then KNN.
nb_accuracy, knn_accuracy = (
    accuracy_score(y_test, predicted)
    for predicted in (nb_predictions, knn_predictions)
)

# Per-class precision / recall / F1 text tables
nb_report, knn_report = (
    classification_report(y_test, predicted)
    for predicted in (nb_predictions, knn_predictions)
)

# Print results: accuracies first (one per line), then the detailed reports
print(
    f"Naïve Bayes Accuracy: {nb_accuracy:.2f}",
    f"KNN Accuracy: {knn_accuracy:.2f}",
    sep="\n",
)
print("\nNaïve Bayes Classification Report:\n", nb_report)
print("\nKNN Classification Report:\n", knn_report)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score, classification_report

# Load dataset (this cell reloads the CSV so it starts from the raw file)
hamspam_df = pd.read_csv("hamspam.csv")

# Trim any extra spaces from column names.
# This runs BEFORE the 'ID' check and the renames so that padded headers
# (e.g. ' ID' or 'Length ') are matched by the exact-name lookups below.
hamspam_df.columns = hamspam_df.columns.str.strip()

# Drop the ID column if it exists
if 'ID' in hamspam_df.columns:
    hamspam_df = hamspam_df.drop(columns=['ID'])

# Rename columns if necessary.
# DataFrame.rename ignores mapping keys that are not present, so no
# per-column existence check (or in-place mutation) is needed.
column_rename_map = {'Contains Money Words': 'Contains Money', 'Length': 'Words Length'}
hamspam_df = hamspam_df.rename(columns=column_rename_map)

# Encode categorical variables.
# Each feature gets its own throwaway encoder; fit_transform refits from
# scratch, so the resulting integer codes do not depend on any other column.
for feature_name in ('Contains Link', 'Contains Money', 'Words Length'):
    hamspam_df[feature_name] = LabelEncoder().fit_transform(hamspam_df[feature_name])

# `le` ends up fitted on the target, so le.classes_ / le.inverse_transform
# can be used to map predicted codes back to the original labels.
# LabelEncoder numbers labels in sorted order; the original note
# "Spam=1, Ham=0" holds only if the raw ham label sorts before the spam
# label -- confirm with le.classes_.
le = LabelEncoder()
hamspam_df['Class'] = le.fit_transform(hamspam_df['Class'])

# Separate the target column from the predictors, then hold out 20% of the
# rows for testing (fixed seed so the split is reproducible).
y = hamspam_df['Class']
X = hamspam_df.drop(columns=['Class'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Decision tree capped at depth 5; fit returns the estimator, so the
# construction and training are chained.
dt_model = DecisionTreeClassifier(random_state=42, max_depth=5).fit(X_train, y_train)

# Keep only the second probability column, i.e. the score for the class
# listed second in dt_model.classes_ (presumably the encoded spam label --
# confirm against the label encoding).
dt_probs = dt_model.predict_proba(X_test)[:, 1]

# ROC curve points (thresholds are discarded) and the area under the curve
fpr_dt, tpr_dt, _ = roc_curve(y_test, dt_probs)
roc_auc_dt = auc(fpr_dt, tpr_dt)

# Plot ROC Curve using an explicit figure/axes pair
fig, ax = plt.subplots()

# Model curve, plus the diagonal that a random classifier would trace
ax.plot(fpr_dt, tpr_dt, color='blue', label=f'Decision Tree (AUC = {roc_auc_dt:.2f})')
ax.plot([0, 1], [0, 1], color='gray', linestyle='--', label='Random Guess')

# Labels, legend and grid so the figure stands on its own
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
ax.grid()
plt.show()

# Output results
print(f"Decision Tree AUC: {roc_auc_dt:.2f}")