In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import joblib

data = pd.read_csv('/content/spam.csv', encoding='ISO-8859-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:

data = data[['v1', 'v2']]
data.columns = ['label', 'message']

# Handle missing values
data.dropna(inplace=True)

# Encode labels
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

# Text Vectorization using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
X = tfidf.fit_transform(data['message'])
y = data['label']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['label'] = data['label'].map({'ham': 0, 'spam': 1})


In [7]:
# Naive Bayes Classifier
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_predictions = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_predictions)
nb_report = classification_report(y_test, nb_predictions)

# Logistic Regression Classifier
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_predictions)
lr_report = classification_report(y_test, lr_predictions)

# Support Vector Machine Classifier
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_report = classification_report(y_test, svm_predictions)

# Hyperparameter Tuning (example for Naive Bayes)
from sklearn.model_selection import GridSearchCV

param_grid = {'alpha': [0.1, 0.5, 1.0]}
grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_nb_model = grid_search.best_estimator_

# Save the best model
joblib.dump(best_nb_model, 'nb_model.joblib')

# Evaluation results
print("Naive Bayes Accuracy:", nb_accuracy)
print("Logistic Regression Accuracy:", lr_accuracy)
print("SVM Accuracy:", svm_accuracy)

print("Naive Bayes Classification Report:\n", nb_report)
print("Logistic Regression Classification Report:\n", lr_report)
print("SVM Classification Report:\n", svm_report)


Naive Bayes Accuracy: 0.968609865470852
Logistic Regression Accuracy: 0.9443946188340807
SVM Accuracy: 0.9721973094170404
Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.93      1115
weighted avg       0.97      0.97      0.97      1115

Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       965
           1       0.97      0.61      0.75       150

    accuracy                           0.94      1115
   macro avg       0.96      0.80      0.86      1115
weighted avg       0.95      0.94      0.94      1115

SVM Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
       

Based on the results provided, the Support Vector Machine (SVM) model has better performance metrics than the Logistic Regression model. Therefore, we'll use the SVM model to detect whether an SMS is spam or not.

In [10]:

svm = SVC()
svm.fit(X_train, y_train)

def classify_sms(sms, model, vectorizer):
    sms_tfidf = vectorizer.transform([sms])
    prediction = model.predict(sms_tfidf)
    return "spam" if prediction[0] == 1 else "ham"

# Example
new_sms_messages = [
    "Congratulations! You've won a $1,000 Walmart gift card. Go to http://bit.ly/12345 to claim now.",
    "Hey, are we still on for dinner tonight?",
    "Important information regarding your account has been updated. Please login to verify."
]

for sms in new_sms_messages:
    print(f"SMS: '{sms}' is classified as {classify_sms(sms, svm, tfidf)}")

SMS: 'Congratulations! You've won a $1,000 Walmart gift card. Go to http://bit.ly/12345 to claim now.' is classified as spam
SMS: 'Hey, are we still on for dinner tonight?' is classified as ham
SMS: 'Important information regarding your account has been updated. Please login to verify.' is classified as ham
