In [1]:
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
# 1. Load the dataset
# Read the CSV, keep only columns v1 and v2, and expose them under the
# names the rest of the notebook uses ("label" and "message").
data = (
    pd.read_csv("spam.csv", encoding="latin-1")
    .loc[:, ["v1", "v2"]]
    .rename(columns={"v1": "label", "v2": "message"})
)

FileNotFoundError: [Errno 2] No such file or directory: 'spam.csv'

In [3]:
# 2. Convert labels (ham = 0, spam = 1)
# The identity entries for 0 and 1 make this cell safe to run more than once:
# with only the "ham"/"spam" keys, a second run would map the already-encoded
# integers to NaN and silently wipe out every label.
label_codes = {"ham": 0, "spam": 1, 0: 0, 1: 1}
encoded_labels = data["label"].map(label_codes)

# Fail here, with the offending values, instead of letting NaN labels flow
# into the split and only blow up later when the model is fitted.
if encoded_labels.isna().any():
    unknown_labels = sorted(
        data.loc[encoded_labels.isna(), "label"].astype(str).unique()
    )
    raise ValueError(f"Unexpected label values: {unknown_labels}")

data["label"] = encoded_labels.astype("int64")

NameError: name 'data' is not defined

In [5]:
# 3. Preprocess messages
# Translation table that deletes every ASCII punctuation character.
# Built once here instead of on every call, since clean_text runs once per row.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(msg: str) -> str:
    """Normalize a message for vectorization.

    Lowercases the text and removes every character found in
    ``string.punctuation``. Whitespace and digits are left untouched.

    Args:
        msg: Raw message text.

    Returns:
        The lowercased message with ASCII punctuation removed.
    """
    return msg.lower().translate(_PUNCTUATION_TABLE)

# Normalize every message with clean_text before any feature extraction.
data["message"] = data["message"].map(clean_text)

In [6]:
# 🔹 3.1 Add custom Indian scam-style spam messages
# Hand-written examples that are all treated as spam.
extra_spam = [
    "you won 10 lakh rupees",
    "congratulations you won 1 crore lottery",
    "claim your free prize of 25 lakh now",
    "dear user you have won 50 lakh lottery",
    "get 1 cr cashback instantly",
    "lottery offer 10 lakh free entry",
    "urgent call now to win 5 lakh",
    "you have won 1 crore lucky draw",
    "lottery",
]

# One (label, message) record per example: label 1 = spam, and the text goes
# through the same clean_text normalization as the main dataset.
extra_data = pd.DataFrame(
    [(1, clean_text(text)) for text in extra_spam],
    columns=["label", "message"],
)

# Append extra spam messages to dataset.
# NOTE: `data` is rebound to the enlarged frame, so running this cell again
# appends the same rows a second time. The rows are also added before the
# train/test split, so some of them can land in the test set.
data = pd.concat([data, extra_data], ignore_index=True)

In [7]:
# 4. Split into training and testing
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    data["message"],
    data["label"],
    random_state=42,
    test_size=0.2,
)

In [8]:
# 5. Vectorization (TF-IDF)
# Unigrams and bigrams, English stop words removed, vocabulary capped at 3000 terms.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=3000,
    stop_words="english",
)

# The vectorizer is fitted on the training messages only; the test messages
# are transformed with that already-fitted vectorizer.
# NOTE: X_train / X_test are rebound from raw text to the vectorized output,
# so re-run the split cell before re-running this one.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [9]:
# 6. Train Logistic Regression (balanced)
# class_weight="balanced" reweights the classes during fitting; fit() returns
# the estimator itself, so construction and training can be chained.
model = LogisticRegression(class_weight="balanced", max_iter=200).fit(X_train, y_train)

# Accuracy on the held-out test set, using the model's default predict() decision.
print("✅ Model Accuracy:", accuracy_score(y_true=y_test, y_pred=model.predict(X_test)))

✅ Model Accuracy: 0.9803043867502238


In [None]:
# 7. User input prediction with adjusted threshold + probability printing
# Spam probability above which a message is reported as spam. Deliberately
# lowered so that borderline messages are flagged rather than let through.
SPAM_THRESHOLD = 0.35

while True:
    try:
        # Strip surrounding whitespace so inputs such as "exit " still quit.
        msg = input("\nEnter a message (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # End of input or an interrupt at the prompt: leave the loop cleanly
        # instead of surfacing a traceback.
        print("👋 Exiting Spam Detector.")
        break

    if msg.lower() == "exit":
        print("👋 Exiting Spam Detector.")
        break

    if not msg:
        # Nothing to classify; ask again rather than scoring an empty message.
        continue

    # Apply the same cleaning and the already-fitted vectorizer used for training.
    msg_clean = clean_text(msg)
    msg_vec = vectorizer.transform([msg_clean])
    prob = model.predict_proba(msg_vec)[0][1]  # spam probability (class 1)

    print(f"📊 Spam Probability: {prob:.2f}")  # Always show probability

    if prob > SPAM_THRESHOLD:
        print("Prediction: 🚨 Spam")
    else:
        print("Prediction: ✅ Not Spam")