<a href="https://colab.research.google.com/github/Prem386/Machine-Learning/blob/main/Spam_Detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")

print("Path to dataset files:", path)


Using Colab cache for faster access to the 'sms-spam-collection-dataset' dataset.
Path to dataset files: /kaggle/input/sms-spam-collection-dataset


Converting the text data into numerical data to better train the model

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

df = pd.read_csv(path + "/spam.csv", encoding="latin-1")
# Drop unnecessary columns if present (some spam datasets have extra columns)
df = df[['v1', 'v2']]

# Features and target
X = df['v2']  # Text messages
y = df['v1']  # Labels (spam or ham)

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)  # You can adjust max_features
X_tfidf = vectorizer.fit_transform(X)


# Split dataset *after* vectorization
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

Training the model using the logistic regression model

In [13]:
# Logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

Training the model using the Naive Byes Classifier

In [14]:
# Naive Bayes Classifier
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Predict using Naive Bayes
y_pred_nb = nb_model.predict(X_test_tfidf)

Training the model using the Support Vector Machine Model

In [15]:
# Support Vector Machine (SVM) Classifier
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_tfidf, y_train)

# Predict using SVM
y_pred_svm = svm_model.predict(X_test_tfidf)

Evaluation of the models

In [18]:
# Evaluate Logistic Regression Model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n")

# Evaluate Naive Bayes Model
print("Naive Bayes Classifier Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_nb)}")
print("Classification Report:")
print(classification_report(y_test, y_pred_nb))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_nb))
print("\n")

# Evaluate SVM Model
print("\nSupport Vector Machine Classifier Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_svm)}")
print("Classification Report:")
print(classification_report(y_test, y_pred_svm))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_svm))

Accuracy: 0.9775784753363229
Classification Report:
               precision    recall  f1-score   support

         ham       0.98      1.00      0.99       965
        spam       0.99      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Confusion Matrix:
 [[964   1]
 [ 24 126]]


Naive Bayes Classifier Evaluation:
Accuracy: 0.9748878923766816
Classification Report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       965
        spam       1.00      0.81      0.90       150

    accuracy                           0.97      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115

Confusion Matrix:
[[965   0]
 [ 28 122]]



Support Vector Machine Classifier Evaluation:
Accuracy: 0.9757847533632287
Classification Report:
              precis