In [2]:
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# --- Sample dataset ---
# One record per sample: (message text, SPF header result, label).
# Label: 1 = spam, 0 = ham.
_SAMPLES = [
    ("Free gift! Click here to claim your prize", "pass", 1),
    ("Hi John, the meeting is at 3pm tomorrow", "pass", 0),
    ("URGENT: update your payment details now", "fail", 1),
    ("Your invoice is attached, please review", "pass", 0),
    ("Click here to verify your account and login", "fail", 1),
]

# Transpose the records into the column-oriented dict the DataFrame is built from.
_messages, _spf_results, _labels = (list(column) for column in zip(*_SAMPLES))
data = {
    "message": _messages,
    "spf": _spf_results,  # header info
    "label": _labels,  # 1 = spam, 0 = ham
}

df = pd.DataFrame(data=data)

# --- Rule engine using pandas ---
# Rule 1 phrases; matched as substrings of the lowercased message text.
_BAD_PHRASES = (
    "verify your account",
    "click here",
    "act now",
    "free gift",
    "update your payment",
)

# Rule 2 pattern: an http(s) link whose host is a dotted run of digits (IP-style).
# Compiled once here because the scorer is called once per row.
_IP_URL_RE = re.compile(r"https?://\d+\.\d+\.\d+\.\d+")


def compute_rule_score(row):
    """Return a rule-based spam score in the range [0.0, 1.0] for one row.

    Three independent rules each add a fixed weight; the sum is capped at 1.0:

    * 0.6 if the message contains any of the known bad phrases,
    * 0.8 if the message contains an http(s) URL pointing at an IP address,
    * 0.7 if the SPF result is "fail".

    Args:
        row: A mapping-like object (e.g. a dict or a pandas Series produced by
            ``DataFrame.apply(..., axis=1)``) indexable by ``"message"`` and
            ``"spf"``.

    Returns:
        The capped score as a float.

    Raises:
        KeyError: If ``"message"`` or ``"spf"`` is missing from a dict or
            pandas Series row.
    """
    score = 0.0

    # A missing message (None / NaN) is not a string; treat it as empty text so
    # the text rules simply do not fire instead of raising AttributeError.
    message = row["message"]
    text = message.lower() if isinstance(message, str) else ""

    # Rule 1: bad phrases
    if any(phrase in text for phrase in _BAD_PHRASES):
        score += 0.6

    # Rule 2: suspicious URL (IP address in link)
    if _IP_URL_RE.search(text):
        score += 0.8

    # Rule 3: SPF failed. Compared case-insensitively and ignoring surrounding
    # whitespace, consistent with the case-insensitive text rules above.
    spf = row["spf"]
    if isinstance(spf, str) and spf.strip().lower() == "fail":
        score += 0.7

    return min(1.0, score)

# Score every row with the rule engine (one call per row).
df["rule_score"] = df.apply(compute_rule_score, axis="columns")

# --- ML model (TF-IDF + Naive Bayes) ---
# Unigram + bigram TF-IDF features feeding a multinomial Naive Bayes classifier.
clf = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2), min_df=1)),
        ("nb", MultinomialNB()),
    ]
)
# NOTE: the model is fitted on, and below scored against, the same rows.
clf.fit(df["message"], df["label"])

# --- Predict with both ML + rules ---
# Column 1 is used as the spam probability; assumes clf.classes_ is [0, 1].
class_probabilities = clf.predict_proba(df["message"])
df["ml_prob"] = class_probabilities[:, 1]

# Weighted combination
w_rule, w_ml = 0.6, 0.4
weighted_rules = df["rule_score"] * w_rule
weighted_ml = df["ml_prob"] * w_ml
df["final_score"] = weighted_rules + weighted_ml

# --- Classification based on threshold ---
# Rows whose blended score reaches 0.6 are labelled 1, all others 0.
is_spam = df["final_score"].ge(0.6)
df["predicted_label"] = is_spam.astype(int)

report_columns = [
    "message",
    "rule_score",
    "ml_prob",
    "final_score",
    "predicted_label",
    "label",
]
print(df[report_columns])


                                       message  rule_score   ml_prob  \
0    Free gift! Click here to claim your prize         0.6  0.782913   
1      Hi John, the meeting is at 3pm tomorrow         0.0  0.322181   
2      URGENT: update your payment details now         1.0  0.751030   
3      Your invoice is attached, please review         0.0  0.349578   
4  Click here to verify your account and login         1.0  0.782913   

   final_score  predicted_label  label  
0     0.673165                1      1  
1     0.128872                0      0  
2     0.900412                1      1  
3     0.139831                0      0  
4     0.913165                1      1  
