In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load dataset (You can replace this with your dataset)
data = pd.read_csv("spam.csv", encoding='latin-1')
data = data[['label', 'text']]
data.columns = ['label', 'message']  # Rename for consistency

data['label'] = data['label'].map({'ham': 0, 'spam': 1})

# Text preprocessing
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(data['message'])
y = data['label']

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Predictions
y_pred = svm_model.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')
print(classification_report(y_test, y_pred))

def predict_email(email_text):
    email_vector = vectorizer.transform([email_text])
    prediction = svm_model.predict(email_vector)
    return "Spam" if prediction[0] == 1 else "Ham"

test_email = "Congratulations! You won a lottery. Claim now!"
print(test_email)
print(predict_email(test_email))
test_email2 = "Heloo, we have a meeting tonight"
print(test_email2)
print(predict_email(test_email2))

Accuracy: 99.03%
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       742
           1       0.98      0.98      0.98       293

    accuracy                           0.99      1035
   macro avg       0.99      0.99      0.99      1035
weighted avg       0.99      0.99      0.99      1035

Congratulations! You won a lottery. Claim now!
Spam
Heloo, we have a meeting tonight
Ham
