# Import useful libraries

In [None]:
import re
import nltk
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix


In [None]:
# Load the raw dataset. pandas decodes as UTF-8 by default; if the file
# contains bytes that are not valid UTF-8, retry as Latin-1, which maps every
# byte to a character and therefore cannot fail on decoding.
try:
    data = pd.read_csv('spam.csv')
except UnicodeDecodeError:
    data = pd.read_csv('spam.csv', encoding='latin-1')

In [None]:
# Keep only the 'v1' and 'v2' columns; every other column is dropped.
data = data.loc[:, ['v1', 'v2']]

In [None]:
# Replace the column names, in order, with descriptive ones.
column_names = ['label', 'text']
data.columns = column_names

In [None]:
# Count the rows carrying each label and print the spam total, then the ham total.
spam_count = data['label'].eq('spam').sum()
print(spam_count)
ham_count = data['label'].eq('ham').sum()
print(ham_count)

In [None]:
# Preview the first five rows of the relabelled frame.
data.head(n=5)

In [None]:
# Download NLTK resources, one package name at a time.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Data cleaning

In [None]:
# Stopword sets keyed by language, filled on first use so the NLTK corpus is
# read once instead of once per cleaned message.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the English stopword set, loading it from NLTK only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(text: str) -> str:
    """Normalize a message before vectorization.

    The text is lowercased, every character that is neither a word
    character nor whitespace is removed, the result is tokenized with
    ``nltk.word_tokenize``, and English stopwords are dropped.

    Args:
        text: The raw message.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Remove stopwords (the set is cached across calls)
    stop_words = _english_stop_words()
    words = nltk.word_tokenize(text)
    return ' '.join(word for word in words if word not in stop_words)

In [None]:
# Target values for the classifiers: the 'label' column of the frame.
labels = data.loc[:, 'label']

In [None]:
# Run every message in the 'text' column through clean_text, then print the
# complete list of cleaned messages.
emails_cleaned = list(map(clean_text, data['text']))
print("Cleaned Emails:")
print(emails_cleaned)

In [None]:
# 1. Train-Test Split
# Hold out 20% of the messages for testing. Stratifying on the labels keeps
# the class proportions the same in the training and test portions, so the
# held-out metrics are not distorted if one class is much rarer than the other.
X_train, X_test, y_train, y_test = train_test_split(
    emails_cleaned,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Vectorization

In [None]:
# Initialize the vectorizer
vectorizer = TfidfVectorizer()

# Transform the text data: learn the vocabulary and IDF weights from the
# training texts only, then apply that fitted transform to the test texts.
# The results stay as the sparse matrices the vectorizer returns. A dense copy
# would store every zero explicitly (documents x vocabulary size), and
# scikit-learn's SVC accepts sparse input directly.
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed = vectorizer.transform(X_test)

# Build model pipeline

In [None]:
# 2. Define Models
# Candidate classifiers keyed by the display name used when reporting results;
# insertion order is the order in which they are evaluated.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=500)
models["Naive Bayes"] = MultinomialNB()
models["SVM"] = SVC(kernel='linear', probability=True)
models["Random Forest"] = RandomForestClassifier(n_estimators=100, random_state=42)

# Evaluation

In [None]:
# 3. Build and Evaluate Pipelines
for model_name, model in models.items():
    print(f"Evaluating {model_name}...")

    # Chain a fresh TF-IDF vectorizer with the classifier so both are fitted
    # together on the cleaned training texts.
    steps = [
        ('tfidf', TfidfVectorizer()),
        ('classifier', model),
    ]
    pipeline = Pipeline(steps=steps)
    pipeline.fit(X_train, y_train)

    # Score the fitted pipeline on the held-out texts.
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("\n")

# Fine Tuning

In [None]:
# Define the parameter grid.
# 'gamma' is a coefficient of the non-linear kernels and has no effect on
# 'linear', so it is searched for 'rbf' only. Crossing it with 'linear' would
# fit the same linear model once per gamma value for every C.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]

# Create the SVM model
svm_model = SVC()

# Perform Grid Search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2)

# Fit the Grid Search on the training data
grid_search.fit(X_train_transformed, y_train)

# Display the best parameters and the best cross-validated score
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Evaluation of Accuracy Metrics

In [None]:
# Score the fitted grid search on the held-out, vectorized test data.
test_accuracy = grid_search.score(X=X_test_transformed, y=y_test)
print("Test Accuracy with Best Parameters:", test_accuracy)

# Random Example

In [None]:
# Sample message to classify; replace the text to try a different one.
input_your_mail = ["PayPal Your access has been limited Dear Client, Our technical support and customer department has recently suspected activities in your account. Your Paypal account has been limited because we've noticed significant changes in your account activity. As Your payment processor, we need to understand these change better. We're always concerned about our customers security so please help us recover your account by following the link below. Restore Payment To PayPal Copyright © 1999-2020 PayPal. All rights reserved"]

# Join the list entries into one string and clean it with clean_text.
mail = " ".join(input_your_mail)
mail = clean_text(mail)

# Vectorize with the already-fitted vectorizer (it expects an iterable of
# documents, hence the one-element list) and convert to a dense array.
mail_transformed = vectorizer.transform([mail])
mail_transformed = mail_transformed.toarray()

# Make predictions
y_pred = grid_search.predict(mail_transformed)
print(y_pred)