# import useful libraries

In [30]:
import re
import nltk
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix


In [3]:
data=pd.read_csv('spam.csv')

In [4]:
data = data[['v1', 'v2']]

In [5]:
data.columns = ['label', 'text']

In [6]:
spam_count = data['label'].where(data['label'] == 'spam').count()
print(spam_count)
ham_count = data['label'].where(data['label'] == 'ham').count()
print(ham_count)

747
4825


In [7]:
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /home/suhail/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/suhail/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/suhail/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

# Data cleaning

In [9]:
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(text)
    text = ' '.join([word for word in words if word not in stop_words])
    return text

In [10]:
labels = data['label']

In [11]:
emails_cleaned = [clean_text(email) for email in data['text']]
print("Cleaned Emails:")
print(emails_cleaned)

Cleaned Emails:


In [12]:
# 1. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(emails_cleaned, labels, test_size=0.2, random_state=42)

# Vectorization

In [38]:
# Initialize the vectorizer
vectorizer = TfidfVectorizer()

# Transform the text data
X_train_transformed = vectorizer.fit_transform(X_train).toarray()
X_test_transformed= vectorizer.transform(X_test).toarray()

# Build model pipeline

In [14]:
# 2. Define Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Naive Bayes": MultinomialNB(),
    "SVM": SVC(kernel='linear', probability=True),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# Evaluation

In [15]:
# 3. Build and Evaluate Pipelines
for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),  # Text embedding
        ('classifier', model)         # Classifier
    ])
    
    # Train the pipeline
    pipeline.fit(X_train, y_train)
    
    # Predict and evaluate
    y_pred = pipeline.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("\n")

Evaluating Logistic Regression...
Accuracy: 0.95
Classification Report:
              precision    recall  f1-score   support

         ham       0.95      1.00      0.97       965
        spam       0.96      0.67      0.79       150

    accuracy                           0.95      1115
   macro avg       0.96      0.83      0.88      1115
weighted avg       0.95      0.95      0.95      1115



Evaluating Naive Bayes...
Accuracy: 0.97
Classification Report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.76      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



Evaluating SVM...
Accuracy: 0.98
Classification Report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       965
        spam       0.98      0.87      0.

# Fine Tuning

In [16]:
# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto'],
}

# Create the SVM model
svm_model = SVC()

# Perform Grid Search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, cv=5, scoring='accuracy', verbose=2)

# Fit the Grid Search on  training data
grid_search.fit(X_train_transformed, y_train)

# Display the best parameters and the best score
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  35.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  34.7s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  35.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  34.1s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  36.4s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 1.3min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 1.1min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 1.1min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  59.5s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 1.0min
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time=  35.3s
[CV] END ...................C=0.1, gamma=auto, k

ValueError: could not convert string to float: 'funny fact nobody teaches volcanoes 2 erupt tsunamis 2 arise hurricanes 2 sway aroundn 1 teaches hw 2 choose wife natural disasters happens'

# Evaluation of Accuracy Metrics

In [32]:
# Evaluate on test data
test_accuracy = grid_search.score(X_test_transformed, y_test)
print("Test Accuracy with Best Parameters:", test_accuracy)

Test Accuracy with Best Parameters: 0.9775784753363229


# Random Example

In [41]:
input_your_mail = ["PayPal Your access has been limited Dear Client, Our technical support and customer department has recently suspected activities in your account. Your Paypal account has been limited because we've noticed significant changes in your account activity. As Your payment processor, we need to understand these change better. We're always concerned about our customers security so please help us recover your account by following the link below. Restore Payment To PayPal Copyright © 1999-2020 PayPal. All rights reserved"]
mail = clean_text(" ".join(input_your_mail))
mail_transformed = vectorizer.transform([mail]).toarray()
# Make predictions
y_pred = grid_search.predict(mail_transformed)
print(y_pred)

['ham']
