<a href="https://colab.research.google.com/github/SAYEDASHRAF1218/CODSOFT.4/blob/main/Spam_Sms_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Spam SMS Detection (for spam.csv dataset from Kaggle)

import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# -----------------------------
# 1. Load Dataset
# -----------------------------
# The Kaggle spam.csv dataset has extra unnamed columns -> we drop them
df = pd.read_csv("spam.csv", encoding='latin-1')

# Keep only the first two columns: v1 = label, v2 = message
df = df[['v1', 'v2']]
df.columns = ['label', 'message']

print("Dataset shape:", df.shape)
print("Sample data:\n", df.head())

# -----------------------------
# 2. Encode Labels & Preprocess
# -----------------------------
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)  # remove special characters
    return text

df['message'] = df['message'].apply(clean_text)

# -----------------------------
# 3. Train-Test Split
# -----------------------------
X = df['message']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------
# 4. Feature Extraction (TF-IDF)
# -----------------------------
tfidf = TfidfVectorizer(max_features=3000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# -----------------------------
# 5. Train Models
# -----------------------------
# Naive Bayes
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred_nb = nb.predict(X_test_tfidf)

# Logistic Regression
lr = LogisticRegression(max_iter=200)
lr.fit(X_train_tfidf, y_train)
y_pred_lr = lr.predict(X_test_tfidf)

# -----------------------------
# 6. Evaluation
# -----------------------------
print("\n=== Naive Bayes Results ===")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))
print("Classification Report:\n", classification_report(y_test, y_pred_nb))

print("\n=== Logistic Regression Results ===")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))

# -----------------------------
# 7. Test on Custom Messages
# -----------------------------
messages = [
    "Congratulations! You won a $1000 Walmart gift card. Call now to claim your prize!",
    "Hey, are we still meeting for lunch today?",
    "URGENT! Your account has been compromised. Visit http://fakebank.com to verify details."
]

print("\n=== Custom Message Predictions ===")
for msg in messages:
    msg_clean = clean_text(msg)
    msg_tfidf = tfidf.transform([msg_clean])
    pred = nb.predict(msg_tfidf)[0]
    print(f"Message: {msg}\nPrediction: {'SPAM' if pred==1 else 'HAM'}\n")


Dataset shape: (5572, 2)
Sample data:
   label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

=== Naive Bayes Results ===
Accuracy: 0.9695067264573991
Confusion Matrix:
 [[965   0]
 [ 34 116]]
Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       1.00      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115


=== Logistic Regression Results ===
Accuracy: 0.967713004484305
Confusion Matrix:
 [[965   0]
 [ 36 114]]
Classification Report:
               precision  