In [None]:
import random
import csv

# Vocabulary of scam-style keywords and phrases used when building synthetic
# messages. Entries are grouped by theme for readability only.
scam_phrases = [
    # Urgency / pressure
    "urgent",
    "immediately",
    "act fast",
    "limited time",
    "your account will be blocked",
    "final warning",
    # Money movement
    "transfer money",
    "wire funds",
    "send payment",
    "Western Union",
    "cryptocurrency wallet",
    "gift cards",
    # Requests for credentials or personal data
    "share your OTP",
    "provide your password",
    "confirm your card number",
    "bank account details",
    "Social Security number",
    # Impersonation of institutions / tech support
    "bank security team",
    "tax department",
    "your computer is hacked",
    "tech support from Microsoft",
    # Investment and prize bait
    "guaranteed returns",
    "double your money",
    "risk-free investment",
    "congratulations you won",
    "lottery",
    "claim your prize",
    # Account-compromise alarms
    "your account is compromised",
    "suspicious login detected",
    # Secrecy / verification
    "keep this secret",
    "verify identity here",
]

# Benign message templates; each has exactly one positional "{}" placeholder.
neutral_templates = [
    "Your order #{} has shipped and will arrive by Monday.",
    "Reminder: Your appointment is scheduled for {}.",
    "Thanks for your purchase! Your receipt is attached for {}.",
    "Join our {} event this weekend!",
    "Please confirm your subscription to {}.",
]

# Scam message templates; each has exactly two positional "{}" placeholders,
# which the generator fills with entries from scam_phrases.
scam_templates = [
    "{}! {} to avoid losing access to your account.",
    "{}: {} now to secure your funds.",
    "{} You need to {} to claim your prize.",
    "{} from {}: Act immediately.",
    "Alert: {}. Please {} to resolve this issue.",
]

# Load real data from spam.csv with encoding fallback
def load_real_data(file_path="spam.csv"):
    """Load labelled messages from a tab-separated text file.

    The file is decoded as UTF-8 first; if that fails anywhere in the file,
    the whole file is re-read as latin-1.

    The first line is always discarded as a header.
    NOTE(review): if the input file has no header row, its first record is
    lost here -- confirm against the actual data file.

    Args:
        file_path: Path of the tab-separated file to read. Each usable row
            has exactly two fields: a label and the message text.

    Returns:
        A list of ``[row_number, text, label]`` lists. ``row_number`` is the
        1-based position of the row after the discarded first line (rows
        skipped for not having exactly two fields still consume a number).
        ``label`` is 1 when the first field equals "spam" ignoring case,
        otherwise 0. An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # latin-1 maps every byte to a character, so the second attempt cannot
    # raise UnicodeDecodeError and acts as the catch-all.
    encodings = ["utf-8", "latin-1"]
    for encoding in encodings:
        # Start from an empty list on every attempt: decoding is incremental,
        # so a failed attempt may already have produced rows before hitting
        # the first undecodable byte. Keeping them would duplicate records.
        rows = []
        try:
            # newline="" lets the csv module do its own newline handling.
            with open(file_path, encoding=encoding, newline="") as f:
                reader = csv.reader(f, delimiter="\t")  # Assuming tab-separated
                next(reader, None)  # Skip first line; tolerate an empty file
                for i, row in enumerate(reader, 1):
                    if len(row) == 2:
                        label = 1 if row[0].lower() == "spam" else 0
                        rows.append([i, row[1], label])
        except UnicodeDecodeError:
            continue  # Discard partial results and try the next encoding
        return rows
    return []

# Generate synthetic dataset with noise
def generate_synthetic_data(num_rows, noise_prob=0.1, id_offset=5000, scam_ratio=0.5):
    """Generate synthetic scam / non-scam messages with optional label noise.

    Each row is independently a scam message with probability ``scam_ratio``.
    Scam messages fill a two-placeholder template from ``scam_templates`` with
    two phrases from ``scam_phrases``; neutral messages fill a one-placeholder
    template from ``neutral_templates``. With probability ``noise_prob`` a
    fragment of the opposite class is appended to the message while the label
    is left unchanged.

    Args:
        num_rows: Number of rows to generate. Values <= 0 yield an empty list.
        noise_prob: Probability of appending an opposite-class fragment.
        id_offset: Value added to the 1-based row index to form the message
            ID. Callers should pick it at or above the largest ID already in
            use so the IDs do not collide.
        scam_ratio: Probability that a generated row is a scam message.

    Returns:
        A list of ``[message_id, message, label]`` lists, where ``label`` is
        1 for scam rows and 0 for neutral rows.
    """
    synthetic_data = []
    for i in range(1, num_rows + 1):
        if random.random() < scam_ratio:  # Scam message
            template = random.choice(scam_templates)
            lead = random.choice(scam_phrases)
            # Upper-case only the first character. str.capitalize() would also
            # lower-case the rest and mangle "Western Union", "OTP", etc.
            lead = lead[:1].upper() + lead[1:]
            action = random.choice(scam_phrases)
            message = template.format(lead, action)
            # Noise: occasionally append a neutral sentence
            if random.random() < noise_prob:
                message += " " + random.choice(neutral_templates).format(random.choice(["Monday", "service"]))
            label = 1
        else:  # Non-scam message
            template = random.choice(neutral_templates)
            placeholder = random.choice(["Monday", "Friday", "next week", "newsletter", "service"])
            message = template.format(placeholder)
            # Noise: occasionally append a scam phrase
            if random.random() < noise_prob:
                message += " " + random.choice(scam_phrases)
            label = 0
        synthetic_data.append([i + id_offset, message, label])
    return synthetic_data

# Combine real and synthetic data
real_data = load_real_data()
if len(real_data) >= 2000:
    real_sample = random.sample(real_data, 2000)  # Use 2,000 real messages
else:
    real_sample = real_data  # Use all available real messages if less than 2000

synthetic_data = generate_synthetic_data(2000)  # Generate 2,000 synthetic messages

# Real IDs are row numbers from the input file and can be larger than the
# first synthetic ID. Shift the synthetic IDs past the largest sampled real ID
# when they would otherwise overlap, so MessageID stays unique.
highest_real_id = max((row[0] for row in real_sample), default=0)
lowest_synthetic_id = min((row[0] for row in synthetic_data), default=0)
if synthetic_data and highest_real_id >= lowest_synthetic_id:
    shift = highest_real_id - lowest_synthetic_id + 1
    for row in synthetic_data:
        row[0] += shift

dataset = real_sample + synthetic_data

# Write to CSV. The encoding is explicit so that non-ASCII message text does
# not depend on (or fail under) the platform's default encoding.
with open("scam_detection_dataset_with_real_data.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["MessageID", "Message", "ScamLabel"])
    writer.writerows(dataset)

print("Dataset generation complete. Check 'scam_detection_dataset_with_real_data.csv'.")

Dataset generation complete. Check 'scam_detection_dataset_with_real_data.csv'.
