#### **Train**

In [10]:
import re
import string
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import LinearSVC

In [11]:
# Directory holding the pre-split CSV files. Change this one constant when
# running outside Colab instead of editing every path.
DATA_DIR = Path("/content")

train_df = pd.read_csv(DATA_DIR / "train.csv")
val_df = pd.read_csv(DATA_DIR / "validation.csv")
test_df = pd.read_csv(DATA_DIR / "test.csv")

# Fail fast if a split lacks a column the later cells index by name.
for _split_name, _split_df in (
    ("train", train_df),
    ("validation", val_df),
    ("test", test_df),
):
    _missing = {"message", "label"} - set(_split_df.columns)
    assert not _missing, f"{_split_name} split is missing columns: {sorted(_missing)}"


In [12]:
def clean_text(text):
    """Normalise a raw message before it is tokenised.

    Lower-cases the text, removes runs of digits and ASCII punctuation
    (``string.punctuation``), then collapses every whitespace run to a single
    space and trims the ends.

    Parameters
    ----------
    text : str or None or float
        The raw message. ``None`` and float NaN (how pandas represents an
        empty CSV cell) are treated as an empty message; any other non-string
        value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned message.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        is_missing = text is None or (isinstance(text, float) and text != text)
        text = "" if is_missing else str(text)

    text = text.lower()
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Collapse whitespace last: dropping digits and punctuation can leave
    # runs of spaces behind (e.g. "a - b" -> "a  b").
    return re.sub(r"\s+", " ", text).strip()


In [13]:
def extract_features(X):
    """Compute five hand-crafted numeric features for each message.

    Columns, in order:

    0. character count
    1. whitespace-separated word count
    2. number of ``!`` characters
    3. number of digit characters
    4. share of upper-case characters, ``upper / (length + 1)`` (the ``+ 1``
       keeps empty messages at 0 instead of dividing by zero)

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series, str, or array-like of str
        The messages. For a DataFrame or a 2-D array only the first column is
        used. A bare string is treated as a single message. Missing values
        are treated as empty messages.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_messages, 5)``.
    """
    if isinstance(X, pd.DataFrame):
        text = X.iloc[:, 0]
    elif isinstance(X, pd.Series):
        text = X
    elif isinstance(X, str):
        text = pd.Series([X])
    else:
        # Lists / arrays hold one message per row; wrapping them in another
        # list would turn the whole collection into a single bogus "message".
        values = np.atleast_1d(np.asarray(X, dtype=object))
        if values.ndim > 1:
            values = values.reshape(values.shape[0], -1)[:, 0]
        text = pd.Series(values)

    # Missing cells would otherwise break the per-character lambda below.
    text = text.fillna("").astype(str)

    columns = [
        text.str.len(),
        text.str.split().str.len(),
        text.str.count("!"),
        text.str.count(r"\d"),
        text.apply(lambda x: sum(c.isupper() for c in x) / (len(x) + 1)),
    ]
    return np.column_stack([np.asarray(col, dtype=float) for col in columns])

In [14]:
# Feature extraction applied to the "message" column in two parallel branches:
#   * "tfidf" - uni/bi-gram TF-IDF over text normalised by `clean_text`
#   * "extra" - the numeric columns produced by `extract_features`
# The TF-IDF branch selects the column by name (1-D input), while the
# FunctionTransformer branch selects it as a one-column frame.
preprocessor = ColumnTransformer([
    (
        "tfidf",
        TfidfVectorizer(
            max_features=3000,
            ngram_range=(1, 2),
            stop_words="english",
            preprocessor=clean_text,
        ),
        "message",
    ),
    (
        "extra",
        FunctionTransformer(func=extract_features, validate=False),
        ["message"],
    ),
])


In [15]:
def fit_model(model, X, y):
    """Fit a feature-extraction + classifier pipeline and return it.

    Parameters
    ----------
    model : estimator
        The classifier used as the final pipeline step. It is fitted in place.
    X : pandas.DataFrame
        Input frame; the feature step reads its "message" column.
    y : array-like
        Target labels.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline with steps "features" and "model".
    """
    pipeline = Pipeline([
        # Pipeline does not copy its steps, so passing the shared module-level
        # `preprocessor` would refit that one object on every call and leave
        # every returned pipeline pointing at the same vectoriser state.
        ("features", clone(preprocessor)),
        ("model", model),
    ])
    pipeline.fit(X, y)
    return pipeline
def score_model(model, X, y):
    """Return the accuracy of ``model``'s predictions for ``X`` against ``y``."""
    predictions = model.predict(X)
    return accuracy_score(y, predictions)
def evaluate_model(model, X, y):
    """Print the classification report of ``model``'s predictions on ``X``."""
    predictions = model.predict(X)
    report = classification_report(y, predictions)
    print(report)


In [None]:
### We use three baseline models here, each with default hyper-parameters.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "NaiveBayes": MultinomialNB(),
    # LinearSVC shuffles the data in its dual solver; pin the seed so a
    # Restart & Run All reproduces the same scores.
    "LinearSVM": LinearSVC(random_state=42)
}
# name -> (fitted pipeline, validation accuracy)
results = {}
for name, model in models.items():
    print("-"*100)
    print(f"\nTRAINING- {name}")
    trained= fit_model(model, train_df[["message"]], train_df["label"])
    train_acc = score_model(trained, train_df[["message"]], train_df["label"])
    val_acc   = score_model(trained, val_df[["message"]], val_df["label"])
    results[name] = (trained, val_acc)
    print("Train Accuracy:", train_acc)
    print("Validation Accuracy:", val_acc)



Training LogisticRegression
Train Accuracy: 0.9792703150912107
Validation Accuracy: 0.9690322580645161

Training NaiveBayes
Train Accuracy: 0.9709784411276948
Validation Accuracy: 0.9625806451612903

Training LinearSVM
Train Accuracy: 0.9983416252072969
Validation Accuracy: 0.9806451612903225


In [16]:
# Tune the regularisation strength C of logistic regression with 5-fold
# cross-validation on the training split, scored by accuracy.
param_grid = {
    "model__C": [0.01, 0.1, 1, 10],
    "model__solver": ["liblinear"]
}

logreg_pipeline = Pipeline([
    ("features", preprocessor),
    # The liblinear solver shuffles the data; pin the seed so the search
    # gives the same result on every run.
    ("model", LogisticRegression(max_iter=1000, random_state=42))
])

# GridSearchCV clones `logreg_pipeline` for every fit, so the shared
# `preprocessor` object itself is not fitted here.
grid = GridSearchCV(
    logreg_pipeline,
    param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

grid.fit(train_df[["message"]], train_df["label"])


In [19]:
### Choosing the best model: the pipeline refit by the grid search.
# Note: the baseline pipelines stored in `results` are not compared here.
best_model = grid.best_estimator_

print("Best Parameters:", grid.best_params_)
print(
    "Validation Accuracy (Tuned):",
    score_model(best_model, val_df[["message"]], val_df["label"]),
)


Best Parameters: {'model__C': 10, 'model__solver': 'liblinear'}
Validation Accuracy (Tuned): 0.9793548387096774


In [20]:
# Final check of the tuned pipeline on the test split: overall accuracy
# first, then the per-class precision / recall / F1 report.
print(
    "Test Accuracy:",
    score_model(best_model, test_df[["message"]], test_df["label"]),
)

evaluate_model(best_model, test_df[["message"]], test_df["label"])


Test Accuracy: 0.9871134020618557
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       678
           1       0.98      0.92      0.95        98

    accuracy                           0.99       776
   macro avg       0.98      0.96      0.97       776
weighted avg       0.99      0.99      0.99       776



In [21]:
def predict_sms(model, message):
    """Classify a single SMS message.

    Parameters
    ----------
    model : fitted estimator or pipeline
        Must expose ``predict`` on a DataFrame with a "message" column.
        ``predict_proba`` is used when the model offers it.
    message : str
        The raw message text.

    Returns
    -------
    tuple
        ``(pred, prob)`` where ``pred`` is the predicted label for the message
        and ``prob`` is the row returned by ``predict_proba`` for it, or
        ``None`` when the model has no ``predict_proba``.
    """
    df = pd.DataFrame({"message": [message]})
    pred = model.predict(df)[0]
    prob = None

    #### probability only if model supports it
    # Ask the model itself rather than `named_steps["model"]`: a Pipeline only
    # exposes `predict_proba` when its final step has it, and this check also
    # works for estimators that are not pipelines.
    if hasattr(model, "predict_proba"):
        prob = model.predict_proba(df)[0]

    return pred, prob


In [22]:
# A few hand-written messages to sanity-check the tuned model.
examples = [
    "Congratulations! You have won ₹50,000 cash prize. Call now!",
    "Hey bro, are we meeting at 6 pm today?",
    "URGENT! Your mobile number has won a free recharge",
    "Don't forget to bring the documents tomorrow",
    "WINNER!! Claim your FREE vacation now!!!",
]
for msg in examples:
    label, prob = predict_sms(best_model, msg)
    print("-" * 80)
    print("Message:", msg)
    # Label 1 is reported as spam; anything else as ham.
    if label == 1:
        print("Prediction:", "SPAM")
    else:
        print("Prediction:", "HAM")
    # Probabilities are only shown when the model could provide them.
    if prob is not None:
        print("Probability [HAM, SPAM]:", prob)


--------------------------------------------------------------------------------
Message: Congratulations! You have won ₹50,000 cash prize. Call now!
Prediction: SPAM
Probability [HAM, SPAM]: [0.37885106 0.62114894]
--------------------------------------------------------------------------------
Message: Hey bro, are we meeting at 6 pm today?
Prediction: HAM
Probability [HAM, SPAM]: [0.99819449 0.00180551]
--------------------------------------------------------------------------------
Message: URGENT! Your mobile number has won a free recharge
Prediction: HAM
Probability [HAM, SPAM]: [0.84655413 0.15344587]
--------------------------------------------------------------------------------
Message: Don't forget to bring the documents tomorrow
Prediction: HAM
Probability [HAM, SPAM]: [0.99465208 0.00534792]
--------------------------------------------------------------------------------
Message: WINNER!! Claim your FREE vacation now!!!
Prediction: SPAM
Probability [HAM, SPAM]: [0.43049103