SMS_SPAM_COLLECTION_USING_NAIVE_BAYES

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [2]:
# File name of the SMS Spam Collection dataset.
file_path = "SMSSpamCollection"

In [3]:
# Names given to the two tab-separated fields of the dataset
# (passed as `names=` when the file is read with pd.read_csv).
columns = ["Label", "Message"]

In [4]:
# Load the tab-separated dataset into a two-column frame; the column names
# come from `columns`.
# The path uses a forward slash and the `file_path` constant: the previous
# literal 'data\SMSSpamCollection' relied on the invalid escape sequence "\S"
# (a SyntaxWarning on recent Python versions) and only resolved to
# data/SMSSpamCollection on Windows. A forward slash works on every platform.
# NOTE(review): read_csv's default quote handling can merge lines when a
# message contains an unbalanced double quote -- consider passing
# quoting=csv.QUOTE_NONE and confirm the resulting row count.
sms_data = pd.read_csv(f"data/{file_path}", sep='\t', names=columns)

In [5]:
# Show the first rows of the loaded frame as a quick sanity check of the
# Label / Message columns.
preview_rows = sms_data.head()
print(preview_rows)

  Label                                            Message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [6]:
# Encode the text labels as integers: spam -> 1, ham -> 0.
# The lookup passes through any value that is not a key of the mapping, so
# labels already encoded by an earlier run of this cell stay as they are.
# With the plain dict form of .map(), re-running the cell replaced every
# label with NaN because 0 and 1 are not keys of the dict.
label_codes = {"spam": 1, "ham": 0}
sms_data["Label"] = sms_data["Label"].map(lambda label: label_codes.get(label, label))

In [7]:
# Features are the raw message texts; targets are the encoded labels.
X, y = sms_data["Message"], sms_data["Label"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Turn the message texts into token-count matrices. The vocabulary is learned
# from the training messages only and then re-used to transform the test
# messages.
# NOTE: X_train / X_test are rebound from text to count matrices here, so this
# cell is meant to run once after the train/test split cell.
vectorizer = CountVectorizer()
X_train, X_test = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)

In [9]:
# 1. Naive Bayes Classifier
# Fit a multinomial Naive Bayes model with its default settings on the
# vectorized training messages and their encoded labels.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

In [10]:
# Fit a linear-kernel support vector classifier (C=1.0) on the same
# vectorized training data used for the Naive Bayes model.
svm_classifier = SVC(kernel='linear', C=1.0)
svm_classifier.fit(X_train, y_train)

In [11]:
# Predict labels for the vectorized test messages with the fitted SVM.
y_pred_svm = svm_classifier.predict(X_test)

In [12]:
# Compare the SVM predictions with the true test labels: overall accuracy
# (printed with two decimals) followed by the per-class report.
accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
print("SVM Classifier Results:", f"Accuracy: {accuracy_svm:.2f}", sep="\n")
print(classification_report(y_test, y_pred_svm))

SVM Classifier Results:
Accuracy: 0.99
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       1.00      0.92      0.96       149

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [13]:
# Evaluate the Naive Bayes model on the test messages: predict, then report
# the overall accuracy and the per-class precision / recall / f1 table.
y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")
nb_report = classification_report(y_test, y_pred)
print(nb_report)


Accuracy: 0.9919282511210762
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       966
           1       1.00      0.94      0.97       149

    accuracy                           0.99      1115
   macro avg       1.00      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [14]:
# 5-fold cross-validated accuracy of the Naive Bayes model on the training
# matrix; the mean over the five folds is reported.
# NOTE(review): X_train was vectorized before the folds are formed, so each
# fold's vocabulary also includes tokens from its validation part.
cross_val_scores = cross_val_score(
    estimator=nb_classifier,
    X=X_train,
    y=y_train,
    cv=5,
    scoring="accuracy",
)
mean_cv_accuracy = cross_val_scores.mean()
print(f"Cross-Validation Mean Accuracy: {mean_cv_accuracy}")

Cross-Validation Mean Accuracy: 0.9800307509575074


In [15]:
# Tune the Naive Bayes smoothing parameter `alpha` with a 5-fold grid search
# over four candidate values, scored by accuracy on the training matrix.
param_grid = {"alpha": [0.1, 0.5, 1.0, 2.0]}
grid_search = GridSearchCV(
    estimator=nb_classifier,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
).fit(X_train, y_train)

# Keep the refitted best model and its score. `best_score_` is the mean
# cross-validated accuracy of the winning setting, not an accuracy measured
# on the held-out test set.
best_nb_classifier, best_nb_accuracy = grid_search.best_estimator_, grid_search.best_score_
print(f"Best Naive Bayes Model Accuracy (with Hyperparameter Tuning): {best_nb_accuracy}")

Best Naive Bayes Model Accuracy (with Hyperparameter Tuning): 0.9800307509575074
