In [104]:
!pip install imbalanced-learn openpyxl



In [105]:
import pandas as pd
import numpy as np
import re
import random
import pickle

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_recall_curve, recall_score, precision_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import VotingClassifier
from sklearn.utils import shuffle

In [106]:
# Load spam.csv
df1 = pd.read_csv("spam.csv", encoding="latin-1")
df1 = df1[['v1', 'v2']]
df1.columns = ['label', 'text']

# Load revisedindiandataset.xls
df2 = pd.read_excel("revisedindiandataset.xls")

# Drop code column safely
if 'code' in df2.columns:
    df2 = df2.drop(columns=['code'])

df2 = df2[['label', 'msg']]
df2.columns = ['label', 'text']

# Merge
df = pd.concat([df1, df2], ignore_index=True)
df = shuffle(df, random_state=42)

# Clean labels
df['label'] = df['label'].astype(str).str.lower().str.strip()
df = df[df['label'].isin(['ham', 'spam'])]
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

print("Original Dataset Size:", len(df))
print(df['label'].value_counts())

Original Dataset Size: 10139
label
0    8181
1    1958
Name: count, dtype: int64


In [107]:
bank_terms = ["bank", "upi", "account", "wallet", "card"]
actions = ["blocked", "suspended", "restricted", "locked", "expired"]
urgency = ["urgent", "important", "security alert", "attention required"]
cta = ["click link", "verify now", "update immediately", "login now", "confirm details"]
threat = [
    "to avoid permanent suspension",
    "to prevent account closure",
    "or your account will be disabled",
    "or services will stop",
    "to restore access"
]

phishing_samples = []

for _ in range(500):
    sample = f"{random.choice(urgency).upper()}: Your {random.choice(bank_terms)} has been {random.choice(actions)}. {random.choice(cta)} {random.choice(threat)}."
    phishing_samples.append(sample)

phishing_df = pd.DataFrame({
    "text": phishing_samples,
    "label": 1
})

# Append
df = pd.concat([df[['text','label']], phishing_df], ignore_index=True)
df = shuffle(df, random_state=42)

print("Augmented Dataset Size:", len(df))

Augmented Dataset Size: 10639


In [108]:
def advanced_clean(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+', ' URLTOKEN ', text)
    text = re.sub(r'\b\d{10}\b', ' PHONETOKEN ', text)
    text = re.sub(r'\b\d{4,6}\b', ' OTPTOKEN ', text)
    text = re.sub(r'₹|\$', ' MONEYTOKEN ', text)
    text = re.sub(r'[^\w\s!]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['clean_text'] = df['text'].apply(advanced_clean)

In [109]:
fraud_keywords = [
    "urgent", "account", "blocked", "verify", "click", "link",
    "otp", "bank", "update", "suspend", "limited",
    "login", "reset", "kyc", "reward", "winner",
    "free", "claim", "prize", "expire"
]

def keyword_score(text):
    text = text.lower()
    return sum(word in text for word in fraud_keywords)

def phishing_pattern(text):
    text = text.lower()
    if "click" in text and ("account" in text or "bank" in text):
        return 1
    return 0

df['keyword_score'] = df['text'].apply(keyword_score)
df['phishing_pattern'] = df['text'].apply(phishing_pattern)

In [110]:
df['length'] = df['text'].apply(len)
df['num_digits'] = df['text'].apply(lambda x: sum(c.isdigit() for c in str(x)))
df['num_exclaim'] = df['text'].apply(lambda x: str(x).count('!'))
df['num_upper'] = df['text'].apply(lambda x: sum(c.isupper() for c in str(x)))
df['num_urls'] = df['text'].apply(lambda x: len(re.findall(r'http\S+|www\S+', str(x))))

In [111]:
X = df[['clean_text','length','num_digits','num_exclaim','num_upper','num_urls','keyword_score','phishing_pattern']]
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [112]:
text_features = 'clean_text'
numeric_features = ['length','num_digits','num_exclaim','num_upper','num_urls','keyword_score','phishing_pattern']

preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(
            max_features=30000,
            ngram_range=(1,3),
            min_df=2,
            max_df=0.9,
            sublinear_tf=True
        ), text_features),
        ('num', StandardScaler(), numeric_features)
    ]
)

In [113]:
lr = LogisticRegression(C=2, class_weight='balanced', max_iter=500)
svm = LinearSVC(class_weight='balanced')
svm_calibrated = CalibratedClassifierCV(svm)

ensemble = VotingClassifier(
    estimators=[
        ('lr', lr),
        ('svm', svm_calibrated)
    ],
    voting='soft'
)

In [114]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', ensemble)
])

In [115]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='f1')

print("Cross Validation F1:", scores.mean())



Cross Validation F1: 0.9343881730243815




In [116]:
pipeline.fit(X_train, y_train)



In [117]:
probs = pipeline.predict_proba(X_test)[:,1]

precision, recall, thresholds = precision_recall_curve(y_test, probs)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)

best_threshold = thresholds[np.argmax(f1_scores)]
print("Best Threshold:", best_threshold)

Best Threshold: 0.4052312960713096


In [118]:
final_preds = (probs >= best_threshold).astype(int)

print(classification_report(y_test, final_preds))
print("Spam Recall:", recall_score(y_test, final_preds))
print("Spam Precision:", precision_score(y_test, final_preds))
print("Final F1:", f1_score(y_test, final_preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, final_preds))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98      1636
           1       0.93      0.95      0.94       492

    accuracy                           0.97      2128
   macro avg       0.96      0.97      0.96      2128
weighted avg       0.97      0.97      0.97      2128

Spam Recall: 0.9532520325203252
Spam Precision: 0.9324055666003976
Final F1: 0.942713567839196
Confusion Matrix:
 [[1602   34]
 [  23  469]]


In [119]:
def calculate_statistical_score(msg):
    score = 0

    if len(msg) > 120:
        score += 1
    if msg.count('!') >= 2:
        score += 1
    if sum(c.isdigit() for c in msg) > 5:
        score += 1
    if len(re.findall(r'http\S+|www\S+', msg)) > 0:
        score += 2

    return score

In [120]:
# Extract TF-IDF vectorizer
vectorizer = pipeline.named_steps['preprocessor'].transformers_[0][1]

# Extract LogisticRegression from ensemble
ensemble_model = pipeline.named_steps['classifier']
lr_model = ensemble_model.estimators_[0]  # first estimator is LR

feature_names = vectorizer.get_feature_names_out()
coefficients = lr_model.coef_[0]

# Top Spam Words
top_spam = sorted(zip(feature_names, coefficients), key=lambda x: x[1], reverse=True)[:20]

# Top Ham Words
top_ham = sorted(zip(feature_names, coefficients), key=lambda x: x[1])[:20]

print("Top Spam Indicators:")
for word, weight in top_spam:
    print(f"{word}: {round(weight,3)}")

print("\nTop Ham Indicators:")
for word, weight in top_ham:
    print(f"{word}: {round(weight,3)}")

Top Spam Indicators:
to otptoken: 4.701
call: 4.476
offer: 4.459
urltoken: 3.907
now: 3.883
text: 3.791
txt: 3.46
get: 3.25
free: 2.903
556789999: 2.815
vodafone: 2.815
stop: 2.726
ringtone: 2.687
apply: 2.652
win: 2.644
कर: 2.581
new: 2.57
dial: 2.563
akash: 2.446
content: 2.404

Top Ham Indicators:
id: -3.453
my: -3.228
is: -3.227
otp: -2.751
me: -2.679
ltgt: -2.559
jio: -2.425
im: -2.343
ac: -2.305
no: -2.26
not: -2.211
was: -2.159
ok: -2.086
speed: -1.88
that: -1.877
thank you: -1.819
you: -1.818
hrs: -1.808
sbi: -1.748
ill: -1.744


In [121]:
import numpy as np

def explain_prediction(msg):

    cleaned = advanced_clean(msg)

    # Get TF-IDF vector
    tfidf_vector = vectorizer.transform([cleaned])

    # Get LR model
    lr_model = pipeline.named_steps['classifier'].estimators_[0]

    # Get feature names
    feature_names = vectorizer.get_feature_names_out()

    # Convert sparse to dense
    tfidf_array = tfidf_vector.toarray()[0]

    # Multiply by coefficients
    contributions = tfidf_array * lr_model.coef_[0][:len(feature_names)]

    # Get top contributing words
    top_indices = np.argsort(contributions)[-10:]

    explanation = []

    for idx in reversed(top_indices):
        if contributions[idx] > 0:
            explanation.append({
                "word": feature_names[idx],
                "impact": round(float(contributions[idx]), 4)
            })

    return explanation

In [122]:
def predict_with_scores(msg):

    cleaned = advanced_clean(msg)

    temp_df = pd.DataFrame([{
        "clean_text": cleaned,
        "length": len(msg),
        "num_digits": sum(c.isdigit() for c in msg),
        "num_exclaim": msg.count('!'),
        "num_upper": sum(c.isupper() for c in msg),
        "num_urls": len(re.findall(r'http\S+|www\S+', msg)),
        "keyword_score": keyword_score(msg),
        "phishing_pattern": phishing_pattern(msg)
    }])

    ml_prob = pipeline.predict_proba(temp_df)[0][1]

    kw_score = keyword_score(msg)
    pattern_score = phishing_pattern(msg)
    stat_score = calculate_statistical_score(msg)

    rule_score = min((kw_score * 0.1 + pattern_score * 0.2 + stat_score * 0.1), 1)

    final_score = 0.7 * ml_prob + 0.3 * rule_score

    if final_score >= 0.75:
        risk_level = "HIGH RISK"
    elif final_score >= 0.45:
        risk_level = "MEDIUM RISK"
    else:
        risk_level = "LOW RISK"

    explanation = explain_prediction(msg)

    return {
        "ml_probability": round(float(ml_prob), 3),
        "final_risk_score": round(float(final_score), 3),
        "risk_level": risk_level,
        "top_contributing_words": explanation
    }

In [123]:
predict_with_scores("You have received a refund of ₹12,500. To claim the amount, verify your UPI PIN immediately. Click the link below to proceed.")

{'ml_probability': 0.463,
 'final_risk_score': 0.474,
 'risk_level': 'MEDIUM RISK',
 'top_contributing_words': [{'word': 'claim', 'impact': 0.2795},
  {'word': 'to claim', 'impact': 0.2078},
  {'word': 'you have received', 'impact': 0.1462},
  {'word': 'your upi', 'impact': 0.1298},
  {'word': 'to', 'impact': 0.105},
  {'word': 'you have', 'impact': 0.0958},
  {'word': 'link', 'impact': 0.0944},
  {'word': 'immediately', 'impact': 0.0635},
  {'word': 'your', 'impact': 0.0546},
  {'word': 'have received', 'impact': 0.0219}]}

In [124]:
predict_with_scores("Congratulations! You have won ₹50,000 in our lucky draw.Claim your prize now by sharing your bank details.Offer valid for today only.")

{'ml_probability': 0.921,
 'final_risk_score': 0.765,
 'risk_level': 'HIGH RISK',
 'top_contributing_words': [{'word': 'now', 'impact': 0.3712},
  {'word': 'won', 'impact': 0.3517},
  {'word': 'have won', 'impact': 0.3011},
  {'word': 'prize', 'impact': 0.2513},
  {'word': 'our', 'impact': 0.2049},
  {'word': 'congratulations', 'impact': 0.1969},
  {'word': 'only', 'impact': 0.134},
  {'word': 'you have won', 'impact': 0.1093},
  {'word': 'you have', 'impact': 0.1013},
  {'word': 'your', 'impact': 0.0977}]}

In [125]:
predict_with_scores("Dear customer, please update your account details to continue using our servicesFailure to comply may result in temporary suspension.")

{'ml_probability': 0.191,
 'final_risk_score': 0.224,
 'risk_level': 'LOW RISK',
 'top_contributing_words': [{'word': 'our', 'impact': 0.2447},
  {'word': 'suspension', 'impact': 0.2084},
  {'word': 'to', 'impact': 0.1326},
  {'word': 'details to', 'impact': 0.1325},
  {'word': 'your account', 'impact': 0.0952},
  {'word': 'customer', 'impact': 0.0729},
  {'word': 'your', 'impact': 0.0689},
  {'word': 'details', 'impact': 0.0374},
  {'word': 'update', 'impact': 0.0241},
  {'word': 'using', 'impact': 0.0145}]}

In [126]:
predict_with_scores("Your SBI account has been credited with ₹5,000 on 21-Feb-2026.If you did not authorize this transaction, contact customer care at 1800-123-456.")

{'ml_probability': 0.299,
 'final_risk_score': 0.299,
 'risk_level': 'LOW RISK',
 'top_contributing_words': [{'word': 'otptoken', 'impact': 0.266},
  {'word': 'has been', 'impact': 0.174},
  {'word': 'account has been', 'impact': 0.1509},
  {'word': 'account has', 'impact': 0.1509},
  {'word': 'been', 'impact': 0.1436},
  {'word': 'with', 'impact': 0.1419},
  {'word': 'has', 'impact': 0.1273},
  {'word': 'been credited with', 'impact': 0.0879},
  {'word': 'credited with', 'impact': 0.0688},
  {'word': 'customer care', 'impact': 0.0639}]}

In [127]:
predict_with_scores("Your OTP for login is 482931. Do not share this OTP with anyone.")

{'ml_probability': 0.023,
 'final_risk_score': 0.106,
 'risk_level': 'LOW RISK',
 'top_contributing_words': [{'word': 'otptoken', 'impact': 0.1717},
  {'word': 'with', 'impact': 0.155},
  {'word': 'your', 'impact': 0.058},
  {'word': 'for', 'impact': 0.0506},
  {'word': 'this', 'impact': 0.0175}]}

In [128]:
predict_with_scores("प्रिय ग्राहक, आपका बैंक खाता बंद किया जाएगा। तुरंत अपना OTP साझा करें और KYC अपडेट करें।")

{'ml_probability': 0.887,
 'final_risk_score': 0.681,
 'risk_level': 'MEDIUM RISK',
 'top_contributing_words': [{'word': 'कर', 'impact': 1.6226},
  {'word': 'और', 'impact': 0.6684},
  {'word': 'आपक', 'impact': 0.5979},
  {'word': 'kyc', 'impact': 0.193}]}

In [129]:
predict_with_scores("మీ బ్యాంక్ ఖాతా నిలిపివేయబడుతుంది.దయచేసి వెంటనే మీ OTP ను పంపండి.")

{'ml_probability': 0.031,
 'final_risk_score': 0.052,
 'risk_level': 'LOW RISK',
 'top_contributing_words': []}

In [130]:
predict_with_scores("Dear SBI user, your account will be blocked today. Share your OTP immediately to avoid suspension. Click http://sbi-verify-kyc.com to update KYC now.")

{'ml_probability': 0.951,
 'final_risk_score': 0.966,
 'risk_level': 'HIGH RISK',
 'top_contributing_words': [{'word': 'now', 'impact': 0.362},
  {'word': 'urltoken', 'impact': 0.3386},
  {'word': 'suspension', 'impact': 0.1702},
  {'word': 'your account will', 'impact': 0.1461},
  {'word': 'account will be', 'impact': 0.1461},
  {'word': 'account will', 'impact': 0.1461},
  {'word': 'blocked', 'impact': 0.1279},
  {'word': 'to', 'impact': 0.1083},
  {'word': 'kyc', 'impact': 0.1038},
  {'word': 'immediately to', 'impact': 0.0989}]}

In [131]:
import pickle

with open("model.pkl", "wb") as f:
    pickle.dump(pipeline, f)

print("Model saved successfully as model.pkl")

Model saved successfully as model.pkl
