Generating Fraud Data

In [4]:
import random

# Define themes and templates
bank_templates = [
    "Your {bank} account has been suspended. Please verify immediately.",
    "Unusual activity detected in your {bank} account. Confirm your identity.",
    "{bank} Security Alert: Immediate verification needed.",
    "Your {bank} online banking access is locked. Reset now.",
]

package_templates = [
    "{carrier} attempted delivery. Pay small fee to reschedule.",
    "{carrier} parcel undeliverable. Update address to receive package.",
    "Delivery failed. {carrier} needs confirmation of your details.",
]

prize_templates = [
    "Congratulations! You've won a {prize}. Claim your reward now.",
    "{prize} awaits you! Confirm your winning entry.",
    "Official notice: You are the lucky winner of a {prize}.",
]

password_templates = [
    "Reset your {service} password now due to suspicious login.",
    "{service} account security compromised. Change password immediately.",
    "Important: Update your {service} credentials today.",
]

ceo_templates = [
    "URGENT: CEO requests immediate payment to new vendor.",
    "Confidential: Process wire transfer as instructed by CFO.",
    "Emergency: Transfer funds to supplier before end of day.",
]

service_templates = [
    "{service} subscription expired. Update billing information.",
    "Your {service} account will be terminated. Verify payment.",
    "{service} service interrupted. Reactivate now.",
]

health_templates = [
    "Healthcare update: Confirm enrollment today.",
    "Urgent: Final notice for health benefits renewal.",
    "Medical insurance requires immediate verification.",
]

# Value pools
banks = ["Bank of America", "Chase", "Wells Fargo", "Citibank", "HSBC"]
carriers = ["FedEx", "UPS", "DHL", "USPS"]
prizes = ["$500 gift card", "$1000 cash prize", "new iPhone", "Amazon voucher"]
services = ["Netflix", "PayPal", "Amazon", "Apple", "Microsoft"]

# Combine all templates
templates = (
    bank_templates +
    package_templates +
    prize_templates +
    password_templates +
    ceo_templates +
    service_templates +
    health_templates
)

# Generate 500 spam messages
spam_messages = []

for _ in range(500):
    template = random.choice(templates)
    filled = template.format(
        bank=random.choice(banks),
        carrier=random.choice(carriers),
        prize=random.choice(prizes),
        service=random.choice(services)
    )
    spam_messages.append(filled)

# Example output
for i in range(10):
    print(f"{i+1}. {spam_messages[i]}")

# Save to text file (optional)
with open("generated_spam_messages.txt", "w") as f:
    for msg in spam_messages:
        f.write(msg + "\n")


1. URGENT: CEO requests immediate payment to new vendor.
2. Unusual activity detected in your Bank of America account. Confirm your identity.
3. Official notice: You are the lucky winner of a $500 gift card.
4. Congratulations! You've won a Amazon voucher. Claim your reward now.
5. Official notice: You are the lucky winner of a $1000 cash prize.
6. Medical insurance requires immediate verification.
7. Official notice: You are the lucky winner of a $500 gift card.
8. Important: Update your Netflix credentials today.
9. Your Wells Fargo online banking access is locked. Reset now.
10. Medical insurance requires immediate verification.


In [39]:
import pandas as pd


# Load original dataset
df_original = pd.read_csv("dataset/SMSSpamCollection", sep='\t', names=['label', 'text'])


# Load generated spam messages
with open("dataset/generated_spam_messages.txt", "r") as f:
    generated_spam = f.readlines()

# Clean newlines
generated_spam = [msg.strip() for msg in generated_spam]

# Create DataFrame
df_generated = pd.DataFrame({
    'label': ['spam'] * len(generated_spam),
    'text': generated_spam
})

# Combine original + generated
df_full = pd.concat([df_original, df_generated], ignore_index=True)

# Shuffle to mix generated + original
df_full = df_full.sample(frac=1, random_state=42).reset_index(drop=True)

df_full["label"] = df_full["label"].map({"ham": 0, "spam": 1})

print(df_full['label'].value_counts())


label
0    4825
1    1139
Name: count, dtype: int64


Cleaning the text

In [40]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text

df_full['text'] = df_full['text'].apply(clean_text)


Split to traint Test

In [41]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import numpy as np
import joblib
from sklearn.utils import class_weight

# Tokenize text
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df_full["text"])

sequences = tokenizer.texts_to_sequences(df_full["text"])
padded = pad_sequences(sequences, maxlen=50)

joblib.dump(tokenizer, "tokenizer.pkl")

X_train, X_test, y_train, y_test = train_test_split(
    padded, df_full["label"].values, test_size=0.2, random_state=42
)

# Step 6: Handle class imbalance
class_weights = class_weight.compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_train),
    y=y_train
)
class_weights_dict = dict(enumerate(class_weights))

# Step 7: Build model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=10000, output_dim=16, input_length=50),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')  
])

model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc")]
)



Model Training

In [42]:
history = model.fit(
    X_train, y_train,
    epochs=10,
    validation_data=(X_test, y_test),
    class_weight=class_weights_dict
)

Epoch 1/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7965 - auc: 0.6502 - loss: 0.6745 - val_accuracy: 0.9765 - val_auc: 0.9894 - val_loss: 0.6162
Epoch 2/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9283 - auc: 0.9498 - loss: 0.5374 - val_accuracy: 0.9774 - val_auc: 0.9901 - val_loss: 0.2973
Epoch 3/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9791 - auc: 0.9879 - loss: 0.2483 - val_accuracy: 0.9824 - val_auc: 0.9916 - val_loss: 0.1356
Epoch 4/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9869 - auc: 0.9920 - loss: 0.1297 - val_accuracy: 0.9673 - val_auc: 0.9922 - val_loss: 0.1661
Epoch 5/10
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9829 - auc: 0.9944 - loss: 0.0994 - val_accuracy: 0.9849 - val_auc: 0.9946 - val_loss: 0.0644
Epoch 6/10
[1m150/150[0m [3

Save model

In [43]:
model.save("smishing_model.h5")




model Evaluation

In [44]:
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")


[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 831us/step - accuracy: 0.8143 - auc: 0.6913 - loss: 0.5599


ValueError: too many values to unpack (expected 2)

In [45]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Predict
y_pred_probs = model.predict(X_test_pad)
y_pred = (y_pred_probs > 0.5).astype(int)

# Classification Report
print(classification_report(y_test, y_pred, target_names=['ham', 'spam']))

# Confusion Matrix
print(confusion_matrix(y_test, y_pred))


[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
              precision    recall  f1-score   support

         ham       0.85      0.95      0.90       985
        spam       0.47      0.19      0.27       208

    accuracy                           0.82      1193
   macro avg       0.66      0.57      0.58      1193
weighted avg       0.78      0.82      0.79      1193

[[939  46]
 [168  40]]
