# 🔥 Comprehensive Threat Intelligence Training
## Optimized for overnight runs on older hardware

In [None]:
# Setup
# Clones the project repository, changes into its ai-training directory and
# installs the third-party packages listed on the pip line.
# NOTE(review): `git clone` normally refuses to clone into an existing,
# non-empty directory, so re-running this cell is expected to print an error
# on that line — confirm this is acceptable for repeated runs.
!git clone https://github.com/Pretty-Good-OSINT-Protocol/Have-I-Been-Rekt.git
%cd Have-I-Been-Rekt/ai-training
# -q reduces pip's output.
!pip install -q transformers torch datasets accelerate scikit-learn pandas numpy
# NOTE(review): this message is printed unconditionally; it does not check
# whether the shell commands above succeeded.
print("✅ Setup complete!")

In [None]:
# Generate threat intelligence data
# Runs the collection script with python3 in a subshell.
# NOTE(review): the data-loading cell reads
# datasets/comprehensive_threat_intelligence.json — presumably this script
# writes that file; verify against the script.
!python3 collect_comprehensive_intelligence.py
# NOTE(review): printed unconditionally; the script's exit status is not checked.
print("✅ Data generated!")

In [None]:
# Start training
import torch
import json
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

# Choose the compute device once, then report which kind was selected.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"🔥 Using: {'GPU' if device.type == 'cuda' else 'CPU'}")

In [None]:
# Load and prepare data
# Decode explicitly as UTF-8: without an encoding argument open() falls back
# to the platform's locale encoding, which can fail on non-ASCII content.
with open('datasets/comprehensive_threat_intelligence.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

print(f"📊 Loaded {len(data)} threat intelligence records")

# Prepare training data
# Each record becomes one short text plus an integer label:
#   2 -> at least one scam report / phishing indicator
#   0 -> none, or a record type not handled below
# NOTE(review): label 1 is never produced here, and the count that decides the
# label is also embedded in the text — confirm both are intended.
texts = []
labels = []

for record in data:
    data_type = record.get('type', '')
    # `or {}` / `or []` also cover keys that are present but set to null,
    # which a plain .get(key, default) would pass through as None.
    content = record.get('data') or {}

    # Create text representation
    if data_type == 'username_intelligence':
        username = content.get('username', '')
        scams = len(content.get('scam_reports') or [])
        text = f"Username: {username} Scam reports: {scams}"
        label = 2 if scams > 0 else 0
    elif data_type == 'domain_intelligence':
        domain = content.get('domain', '')
        phishing = len(content.get('phishing_indicators') or [])
        text = f"Domain: {domain} Phishing indicators: {phishing}"
        label = 2 if phishing > 0 else 0
    else:
        text = f"Type: {data_type}"
        label = 0

    texts.append(text)
    labels.append(label)

print(f"✅ Prepared {len(texts)} training samples")

In [None]:
# Create dataset
class ThreatDataset(Dataset):
    """Dataset that tokenizes one text sample per lookup.

    Every item is a dict with the keys ``input_ids``, ``attention_mask``
    and ``labels``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        """Keep references to the samples, the tokenizer and the length limit.

        Args:
            texts: Indexable collection of samples (each is passed through
                ``str`` before tokenizing).
            labels: Indexable collection of labels, aligned with ``texts``.
            tokenizer: Callable invoked with the text and the keyword
                arguments ``truncation``, ``padding``, ``max_length`` and
                ``return_tensors``.
            max_len: Value forwarded to the tokenizer as ``max_length``.
        """
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of text samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it with its label tensor."""
        sample = str(self.texts[idx])
        target = self.labels[idx]

        encoded = self.tokenizer(
            sample,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )

        # The tokenizer output is flattened so each item is a 1-D tensor.
        item = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item

# Setup model
# Load the tokenizer and a sequence-classification model from the same
# checkpoint name; the classification model is configured with three labels.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
)

print("✅ Model and tokenizer loaded")

In [None]:
# Split data
# Hold out 20% of the samples for validation; the fixed random_state makes
# the shuffle, and therefore the split, repeatable across runs.
(train_texts, val_texts,
 train_labels, val_labels) = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Wrap each split so samples are tokenized when they are indexed.
train_dataset = ThreatDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = ThreatDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

print(f"📊 Training: {len(train_dataset)}, Validation: {len(val_dataset)}")

In [None]:
# Training configuration (optimized for overnight run)
import inspect

# fp16 mixed precision needs a GPU: requesting it on a CPU-only machine makes
# TrainingArguments raise, so enable it only when CUDA is available.
use_fp16 = torch.cuda.is_available()

# transformers renamed `evaluation_strategy` to `eval_strategy` and later
# removed the old name. The install cell does not pin a version, so use
# whichever keyword the installed TrainingArguments accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_arg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./models/threat-intelligence',
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=50,
    weight_decay=0.01,
    logging_steps=10,
    # Evaluation and saving must use the same schedule for
    # load_best_model_at_end to work.
    save_strategy="epoch",
    # Keep only the newest checkpoints instead of one per epoch, so a long run
    # does not fill the disk; the best checkpoint is retained for the reload.
    save_total_limit=2,
    load_best_model_at_end=True,
    fp16=use_fp16,
    # The string "none" disables logging integrations; Python None does not.
    report_to="none",
    **{eval_strategy_arg: "epoch"},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

print("🚀 Starting training - perfect for overnight run!")
trainer.train()
print("✅ Training complete!")

In [None]:
# Save and test
# Write the trained weights and the tokenizer files to the same directory so
# the model can be reloaded from ./final-model alone.
trainer.save_model('./final-model')
tokenizer.save_pretrained('./final-model')

# Test prediction
test_text = "Username: @crypto_king_2024 Scam reports: 1"

# Switch to inference mode so dropout is disabled and the output is
# deterministic.
model.eval()

# The tokenizer returns CPU tensors, while the trained model may be on the
# GPU; move the inputs to the model's device to avoid a device-mismatch error.
inputs = tokenizer(test_text, return_tensors='pt').to(model.device)
with torch.no_grad():
    outputs = model(**inputs)
    # Convert the raw logits into per-class probabilities.
    prediction = torch.nn.functional.softmax(outputs.logits, dim=-1)

print(f"🧪 Test prediction: {prediction}")
print("🎉 Your threat intelligence model is ready!")
print("💾 Model saved to ./final-model")