<a href="https://colab.research.google.com/github/Sans7349/CODESOFT/blob/My-tasks/TASK_4_SPAM_SMS_DETETCTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# DATA PREPROCESSING

import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset (decoded as latin-1, as the loader has always done)
file_path = '/content/spam.csv'
df = pd.read_csv(file_path, encoding='latin-1')

# Keep only the label/text columns and give them descriptive names.
# rename() returns a new frame, so nothing is mutated through a slice.
df = df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})

# Convert labels to binary: spam=1, ham=0.
# Series.map yields NaN for any value missing from the dict, so check the
# result explicitly instead of letting NaN labels reach the estimators.
label_codes = df['label'].map({'ham': 0, 'spam': 1})
unmapped_labels = df.loc[label_codes.isna(), 'label']
if not unmapped_labels.empty:
    raise ValueError(
        "Unexpected label values (expected only 'ham' or 'spam'): "
        f"{unmapped_labels.unique().tolist()}"
    )
df['label'] = label_codes.astype('int64')

# Split the data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# FEATURE ENGINEERING

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with English stop words removed.
# The vocabulary and IDF weights are learned from the training messages only;
# the test messages are projected onto that already-fitted vocabulary.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),  # evaluated first: fits, then transforms
    vectorizer.transform(X_test),       # evaluated second: reuses the fitted state
)

# MODEL TRAINING

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Create the three candidate models
nb_classifier = MultinomialNB()
lr_classifier = LogisticRegression(max_iter=1000)
svm_classifier = SVC(kernel='linear')

# Fit every model on the same TF-IDF training matrix, in the order
# Naive Bayes -> Logistic Regression -> SVM
for estimator in (nb_classifier, lr_classifier, svm_classifier):
    estimator.fit(X_train_tfidf, y_train)

# MODEL EVALUATION

from sklearn.metrics import classification_report, accuracy_score

# Define a function to map binary predictions to labels
def map_labels(predictions):
    """Translate binary class predictions into readable names.

    Args:
        predictions: Iterable of predicted class values.

    Returns:
        list[str]: 'spam' for every value equal to 1 and 'legitimate' for
        anything else, in the same order as the input.
    """
    readable = []
    for predicted in predictions:
        if predicted == 1:
            readable.append('spam')
        else:
            readable.append('legitimate')
    return readable

def _evaluate_classifier(classifier, features, true_labels):
    """Score one fitted classifier on the held-out data.

    Args:
        classifier: Fitted estimator exposing ``predict``.
        features: Feature matrix to predict on.
        true_labels: Ground-truth binary labels (0 = legitimate, 1 = spam).

    Returns:
        tuple: ``(predictions, accuracy, report, prediction_labels)`` where
        ``predictions`` is the raw ``predict`` output, ``accuracy`` the
        accuracy score, ``report`` the text classification report and
        ``prediction_labels`` the predictions mapped through ``map_labels``.
    """
    predictions = classifier.predict(features)
    accuracy = accuracy_score(true_labels, predictions)
    # Pin labels=[0, 1] so target_names always line up with the classes, even
    # if one class is missing from both the test labels and the predictions.
    report = classification_report(
        true_labels,
        predictions,
        labels=[0, 1],
        target_names=['legitimate', 'spam'],
    )
    return predictions, accuracy, report, map_labels(predictions)


# Evaluate Naive Bayes classifier
nb_predictions, nb_accuracy, nb_report, nb_predictions_labels = _evaluate_classifier(
    nb_classifier, X_test_tfidf, y_test
)

# Evaluate Logistic Regression classifier
lr_predictions, lr_accuracy, lr_report, lr_predictions_labels = _evaluate_classifier(
    lr_classifier, X_test_tfidf, y_test
)

# Evaluate SVM classifier
svm_predictions, svm_accuracy, svm_report, svm_predictions_labels = _evaluate_classifier(
    svm_classifier, X_test_tfidf, y_test
)

# Report each model's metrics in a fixed order: heading, accuracy, full
# classification report, then the first 10 predictions for brevity.
# Each section ends with the same horizontal rule.
_section_rule = "___________________________________________________________________________________________________________________________________________________"
_model_sections = (
    ("Naive Bayes Classifier:\n", nb_accuracy, nb_report, nb_predictions_labels),
    ("\nLogistic Regression Classifier:\n", lr_accuracy, lr_report, lr_predictions_labels),
    ("\nSupport Vector Machine (SVM) Classifier:\n", svm_accuracy, svm_report, svm_predictions_labels),
)

for _heading, _accuracy, _report, _labels in _model_sections:
    print(_heading)
    print(f"Accuracy: {_accuracy}\n")
    print(_report)
    print("Predictions:", _labels[:10])
    print(_section_rule)


Naive Bayes Classifier:

Accuracy: 0.9668161434977578

              precision    recall  f1-score   support

  legitimate       0.96      1.00      0.98       965
        spam       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

Predictions: ['legitimate', 'legitimate', 'legitimate', 'legitimate', 'spam', 'legitimate', 'legitimate', 'legitimate', 'legitimate', 'legitimate']
___________________________________________________________________________________________________________________________________________________

Logistic Regression Classifier:

Accuracy: 0.9524663677130045

              precision    recall  f1-score   support

  legitimate       0.95      1.00      0.97       965
        spam       0.97      0.67      0.79       150

    accuracy                           0.95      1115
   macro avg       0.96      0.83      0