Generating Fraud Data

In [4]:
import random

# Define themes and templates
bank_templates = [
    "Your {bank} account has been suspended. Please verify immediately.",
    "Unusual activity detected in your {bank} account. Confirm your identity.",
    "{bank} Security Alert: Immediate verification needed.",
    "Your {bank} online banking access is locked. Reset now.",
]

package_templates = [
    "{carrier} attempted delivery. Pay small fee to reschedule.",
    "{carrier} parcel undeliverable. Update address to receive package.",
    "Delivery failed. {carrier} needs confirmation of your details.",
]

prize_templates = [
    "Congratulations! You've won a {prize}. Claim your reward now.",
    "{prize} awaits you! Confirm your winning entry.",
    "Official notice: You are the lucky winner of a {prize}.",
]

password_templates = [
    "Reset your {service} password now due to suspicious login.",
    "{service} account security compromised. Change password immediately.",
    "Important: Update your {service} credentials today.",
]

ceo_templates = [
    "URGENT: CEO requests immediate payment to new vendor.",
    "Confidential: Process wire transfer as instructed by CFO.",
    "Emergency: Transfer funds to supplier before end of day.",
]

service_templates = [
    "{service} subscription expired. Update billing information.",
    "Your {service} account will be terminated. Verify payment.",
    "{service} service interrupted. Reactivate now.",
]

health_templates = [
    "Healthcare update: Confirm enrollment today.",
    "Urgent: Final notice for health benefits renewal.",
    "Medical insurance requires immediate verification.",
]

# Value pools
banks = ["Bank of America", "Chase", "Wells Fargo", "Citibank", "HSBC"]
carriers = ["FedEx", "UPS", "DHL", "USPS"]
prizes = ["$500 gift card", "$1000 cash prize", "new iPhone", "Amazon voucher"]
services = ["Netflix", "PayPal", "Amazon", "Apple", "Microsoft"]

# Combine all templates
templates = (
    bank_templates +
    package_templates +
    prize_templates +
    password_templates +
    ceo_templates +
    service_templates +
    health_templates
)

# Generate 500 spam messages
spam_messages = []

for _ in range(500):
    template = random.choice(templates)
    filled = template.format(
        bank=random.choice(banks),
        carrier=random.choice(carriers),
        prize=random.choice(prizes),
        service=random.choice(services)
    )
    spam_messages.append(filled)

# Example output
for i in range(10):
    print(f"{i+1}. {spam_messages[i]}")

# Save to text file (optional)
with open("generated_spam_messages.txt", "w") as f:
    for msg in spam_messages:
        f.write(msg + "\n")


1. URGENT: CEO requests immediate payment to new vendor.
2. Unusual activity detected in your Bank of America account. Confirm your identity.
3. Official notice: You are the lucky winner of a $500 gift card.
4. Congratulations! You've won a Amazon voucher. Claim your reward now.
5. Official notice: You are the lucky winner of a $1000 cash prize.
6. Medical insurance requires immediate verification.
7. Official notice: You are the lucky winner of a $500 gift card.
8. Important: Update your Netflix credentials today.
9. Your Wells Fargo online banking access is locked. Reset now.
10. Medical insurance requires immediate verification.


In [7]:
import pandas as pd

# Load original dataset
df_original = pd.read_csv("SMSSpamCollection", sep='\t', names=['label', 'text'])


# Load generated spam messages
with open("generated_spam_messages.txt", "r") as f:
    generated_spam = f.readlines()

# Clean newlines
generated_spam = [msg.strip() for msg in generated_spam]

# Create DataFrame
df_generated = pd.DataFrame({
    'label': ['spam'] * len(generated_spam),
    'text': generated_spam
})


# Combine original + generated
df_full = pd.concat([df_original, df_generated], ignore_index=True)

# Shuffle to mix generated + original
df_full = df_full.sample(frac=1, random_state=42).reset_index(drop=True)

print(df_full['label'].value_counts())

label
ham     4825
spam    1139
Name: count, dtype: int64


Cleaning the text

In [8]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    return text

df_full['text'] = df_full['text'].apply(clean_text)


Split to traint Test

In [9]:
from sklearn.model_selection import train_test_split

X = df_full['text'].values
y = df_full['label'].map({'ham': 0, 'spam': 1}).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout,Input
# Tokenize text
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to same length
max_length = 50  # SMS are usually short
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

model = Sequential([
    Input(shape=(max_length,)),   
    Embedding(input_dim=5000, output_dim=64),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model Training

In [13]:
history = model.fit(
    X_train_pad, y_train,
    validation_data=(X_test_pad, y_test),
    epochs=6,
    batch_size=32
)


Epoch 1/6
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.8023 - loss: 0.5488 - val_accuracy: 0.8256 - val_loss: 0.4672
Epoch 2/6
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8420 - loss: 0.4321 - val_accuracy: 0.9573 - val_loss: 0.1791
Epoch 3/6
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9690 - loss: 0.1512 - val_accuracy: 0.9606 - val_loss: 0.1672
Epoch 4/6
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9765 - loss: 0.1228 - val_accuracy: 0.9698 - val_loss: 0.1459
Epoch 5/6
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9842 - loss: 0.0883 - val_accuracy: 0.9757 - val_loss: 0.1297
Epoch 6/6
[1m150/150[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9884 - loss: 0.0721 - val_accuracy: 0.9782 - val_loss: 0.1131


Save model

In [19]:
model.save("social_engineering_sms_model.keras")

model Evaluation

In [17]:
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")


[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9757 - loss: 0.1245
Test Accuracy: 97.82%


In [16]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Predict
y_pred_probs = model.predict(X_test_pad)
y_pred = (y_pred_probs > 0.5).astype(int)

# Classification Report
print(classification_report(y_test, y_pred, target_names=['ham', 'spam']))

# Confusion Matrix
print(confusion_matrix(y_test, y_pred))


[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
              precision    recall  f1-score   support

         ham       0.98      0.99      0.99       985
        spam       0.97      0.90      0.94       208

    accuracy                           0.98      1193
   macro avg       0.97      0.95      0.96      1193
weighted avg       0.98      0.98      0.98      1193

[[979   6]
 [ 20 188]]
