In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load Dataset
# Only the label column (v1) and the message column (v2) are used downstream,
# so read just those two and rename them by name rather than by position.
df = pd.read_csv('spam.csv', encoding='latin-1', usecols=['v1', 'v2'])
df = df.rename(columns={'v1': 'label', 'v2': 'text'})[['label', 'text']]

# Data Preprocessing
# Encode the target as integers: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
encoded_labels = df['label'].map(label_codes)
# `Series.map` turns every value missing from the mapping into NaN; stop here
# with a clear message instead of letting NaN targets reach the classifiers.
if encoded_labels.isna().any():
    unknown = sorted(df.loc[encoded_labels.isna(), 'label'].astype(str).unique())
    raise ValueError(f'Unexpected label value(s) in dataset: {unknown}')
df['label'] = encoded_labels

# Hold out 20% of the messages for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

# Feature Extraction
# The TF-IDF vocabulary is fitted on the training messages only and then
# reused, unchanged, to transform the held-out messages.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Models to evaluate
# Estimators that draw random numbers while fitting get the same fixed seed as
# the train/test split, so repeated runs of this comparison give the same scores.
models = {
    'Naive Bayes': MultinomialNB(),
    'SVM': LinearSVC(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
}

# Dictionary to store results
# Filled by the evaluation loop: model name -> dict of metric name -> score.
results = {}

# Evaluate each model
for model_name, model in models.items():
    # Train on the vectorized training split, then label the held-out split.
    model.fit(X_train_vec, y_train)
    y_pred = model.predict(X_test_vec)

    # Score the predictions against the true test labels.
    accuracy, precision, recall, f1 = (
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
    )

    # Keep the scores so the models can be compared once the loop finishes.
    results[model_name] = dict(zip(
        ('Accuracy', 'Precision', 'Recall', 'F1 Score'),
        (accuracy, precision, recall, f1),
    ))

    # Report the headline metrics, the per-class breakdown, then a divider.
    print(
        f'{model_name} Performance:',
        f'Accuracy: {accuracy}',
        f'Precision: {precision}',
        f'Recall: {recall}',
        f'F1 Score: {f1}',
        'Classification Report:',
        classification_report(y_test, y_pred),
        '-'*60,
        sep='\n',
    )

# Find the best model
# The test split is imbalanced (far more ham than spam), so a model can reach
# high accuracy while missing a large share of spam. Rank by F1 on the spam
# class instead and fall back to accuracy only to break ties.
# NOTE(review): the winner is chosen on the same split the scores are reported
# on, so its reported score is an optimistic estimate.
best_model_name = max(
    results,
    key=lambda k: (results[k]['F1 Score'], results[k]['Accuracy']),
)
best_model = models[best_model_name]
best_scores = results[best_model_name]
print(
    f'The best model is: {best_model_name} '
    f'with F1 Score: {best_scores["F1 Score"]} '
    f'(Accuracy: {best_scores["Accuracy"]})'
)

# Function to predict new email
def predict_email(email_text):
    """Classify one email body as ``'Spam'`` or ``'Ham'``.

    The text is transformed with the module-level ``vectorizer`` and passed to
    the module-level ``best_model``; a predicted label of 1 is reported as
    ``'Spam'`` and any other label as ``'Ham'``.
    """
    # The vectorizer works on a collection of documents, so wrap the single text.
    features = vectorizer.transform([email_text])
    predicted_label = best_model.predict(features)[0]
    if predicted_label == 1:
        return 'Spam'
    return 'Ham'

# User input prediction
# An all-blank entry has no content to classify; say so instead of printing a
# label for it. `prediction` is still bound (to None) in that case so later code
# can rely on the name existing.
user_input = input("Enter the email content for spam prediction: ").strip()
if user_input:
    prediction = predict_email(user_input)
    print(f'The email is predicted to be: {prediction}')
else:
    prediction = None
    print('No email content was entered, so no prediction was made.')


Naive Bayes Performance:
Accuracy: 0.9623318385650225
Precision: 1.0
Recall: 0.72
F1 Score: 0.8372093023255813
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.72      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115

------------------------------------------------------------
SVM Performance:
Accuracy: 0.9829596412556054
Precision: 0.9851851851851852
Recall: 0.8866666666666667
F1 Score: 0.9333333333333333
Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.89      0.93       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      11