# 🔥 Have I Been Rekt - Threat Intelligence Training
## Safe overnight training for your 6-year-old GPU

In [None]:
# Clean setup
import os
if os.path.exists('Have-I-Been-Rekt'):
    !rm -rf Have-I-Been-Rekt
    
!git clone https://github.com/Pretty-Good-OSINT-Protocol/Have-I-Been-Rekt.git
%cd Have-I-Been-Rekt/ai-training
!ls -la datasets/
print('Repository cloned successfully!')

In [None]:
# Install dependencies
!pip install -q transformers torch datasets scikit-learn pandas numpy
import torch
print('GPU available:', torch.cuda.is_available())
if torch.cuda.is_available():
    print('GPU:', torch.cuda.get_device_name(0))
print('Ready for training!')

In [None]:
# Load existing threat data
import json
import pandas as pd

# JSON is UTF-8 by specification; pass the encoding explicitly so the read does
# not depend on the platform's default locale encoding.
with open('datasets/comprehensive_threat_intelligence.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

print(f'Loaded {len(data)} threat intelligence records')

# Show sample data
# Use the same tolerant `.get` access as the data-preparation cell so a record
# with a missing or null "type"/"data" field does not abort the preview.
for i, record in enumerate(data[:3]):
    record_type = record.get('type', '')
    record_fields = list((record.get('data') or {}).keys())
    print(f'Sample {i+1}: {record_type} - {record_fields}')

In [None]:
# Prepare training data
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

def _record_to_example(record):
    """Turn one threat-intelligence record into a ``(text, label)`` pair.

    The record's ``type`` string selects the template: types containing
    ``'username'`` count ``scam_reports``, types containing ``'domain'`` count
    ``phishing_indicators``, and anything else becomes a negative example.
    The label is 1 when the relevant count is greater than zero, else 0.

    Missing keys and explicit ``None`` values (JSON ``null``) for ``data``,
    ``type`` and the two list fields are treated as empty rather than raising.
    """
    content = record.get('data') or {}
    data_type = record.get('type') or ''

    if 'username' in data_type:
        username = content.get('username', '')
        scams = len(content.get('scam_reports') or [])
        return f'Username {username} has {scams} scam reports', 1 if scams > 0 else 0

    if 'domain' in data_type:
        domain = content.get('domain', '')
        phishing = len(content.get('phishing_indicators') or [])
        return f'Domain {domain} has {phishing} phishing indicators', 1 if phishing > 0 else 0

    return f'Data type {data_type}', 0


# Parallel lists consumed by the later cells: texts[i] is labelled by labels[i].
examples = [_record_to_example(record) for record in data]
texts = [text for text, _ in examples]
labels = [label for _, label in examples]

print(f'Prepared {len(texts)} training samples')
print('Sample texts:')
for i, text in enumerate(texts[:3]):
    print(f'  {i+1}. {text} (label: {labels[i]})')

In [None]:
# Create dataset class
class ThreatDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Sequence of input texts (each item is passed through ``str``).
        labels: Sequence of integer class labels, parallel to ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` and
            expected to return a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Length every example is truncated/padded to. Defaults to
            128, the value that was previously hard-coded.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Fail at construction instead of with an IndexError mid-training.
        if len(texts) != len(labels):
            raise ValueError(
                f'texts and labels must have the same length '
                f'(got {len(texts)} and {len(labels)})'
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx`` as a dict of tensors.

        The dict has the keys ``input_ids``, ``attention_mask`` and ``labels``.
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch dimension for the single
        # text; flatten it away so each item is a 1-D tensor.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Load the DistilBERT checkpoint and its tokenizer; the classification head is
# created with two output labels.
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)
print('Model and tokenizer loaded')

# Total number of scalar parameters across all model tensors.
param_count = sum(weights.numel() for weights in model.parameters())
print(f'Model parameters: {param_count:,}')

In [None]:
# Hand-written (text, label) examples appended to the records loaded from disk.
# Label 1 marks a threat, label 0 a benign entry.
synthetic_examples = [
    ('Username admin_help has 1 scam reports', 1),
    ('Username normal_user has 0 scam reports', 0),
    ('Domain legitimate-site.com has 0 phishing indicators', 0),
    ('Domain fake-exchange.tk has 2 phishing indicators', 1),
    ('Username crypto_scammer has 1 scam reports', 1),
    ('Domain binance-help.org has 1 phishing indicators', 1),
]
synthetic_texts = [sample for sample, _ in synthetic_examples]
synthetic_labels = [flag for _, flag in synthetic_examples]

# Real records first, synthetic examples after them.
all_texts = texts + synthetic_texts
all_labels = labels + synthetic_labels

# Labels are 0/1, so their sum is the number of positive samples.
positive_count = sum(all_labels)
negative_count = len(all_labels) - positive_count

print(f'Total training data: {len(all_texts)} samples')
print(f'Positive samples: {positive_count}')
print(f'Negative samples: {negative_count}')

In [None]:
# Hold out 30% of the combined samples for validation; the fixed seed keeps the
# split reproducible between runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.3,
    random_state=42,
)

# Wrap each split in a ThreatDataset that shares the same tokenizer.
train_dataset, val_dataset = (
    ThreatDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
    )
)

print(f'Training: {len(train_dataset)} samples')
print(f'Validation: {len(val_dataset)} samples')

In [None]:
# Training configuration optimized for overnight runs
import inspect

# fp16 mixed precision is only usable on a CUDA device; requesting it on a
# CPU-only machine makes TrainingArguments raise. Enable it only when the GPU
# check from the install cell would succeed.
use_fp16 = torch.cuda.is_available()

# transformers renamed `evaluation_strategy` to `eval_strategy`, and the install
# cell does not pin a version. Pass whichever name the installed
# TrainingArguments accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
if 'eval_strategy' in _training_arg_names:
    eval_strategy_key = 'eval_strategy'
else:
    eval_strategy_key = 'evaluation_strategy'

training_args = TrainingArguments(
    output_dir='./threat-model',
    num_train_epochs=20,  # Perfect for overnight
    per_device_train_batch_size=4,  # Small batch for older GPUs
    per_device_eval_batch_size=4,
    warmup_steps=10,
    weight_decay=0.01,
    logging_steps=5,
    save_strategy='epoch',  # must match the eval strategy for load_best_model_at_end
    save_total_limit=2,
    load_best_model_at_end=True,
    fp16=use_fp16,  # Mixed precision for speed (GPU only)
    # The string 'none' disables logging integrations; Python None does not.
    report_to='none',
    **{eval_strategy_key: 'epoch'},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

print('Trainer configured for overnight run!')
print('Configuration:')
print(f'  - Epochs: {training_args.num_train_epochs}')
print(f'  - Batch size: {training_args.per_device_train_batch_size}')
print(f'  - Mixed precision: {training_args.fp16}')

In [None]:
# START TRAINING - Perfect for overnight runs!
from datetime import datetime
import time

print('🚀 STARTING THREAT INTELLIGENCE TRAINING')
print('=' * 50)
print(f'Start time: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
print('Optimized for overnight training on older GPUs')
print('=' * 50)

start_time = time.time()

try:
    # Train the model
    result = trainer.train()
except Exception as e:
    # Keep whatever weights exist, then re-raise: swallowing the error would let
    # the following cell save and announce a "complete" model after a failure.
    print(f'Training error: {e}')
    print('Saving current progress...')
    trainer.save_model('./backup-model')
    raise
else:
    # Success path only: report wall-clock duration and the final training loss.
    end_time = time.time()
    duration = (end_time - start_time) / 3600  # hours

    print('\n' + '=' * 50)
    print('🎉 TRAINING COMPLETED!')
    print('=' * 50)
    print(f'Duration: {duration:.2f} hours')
    print(f'Final loss: {result.training_loss:.4f}')
    print(f'End time: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')

In [None]:
# Test the trained model
trainer.save_model('./final-threat-model')
tokenizer.save_pretrained('./final-threat-model')

# Quick test
test_cases = [
    'Username crypto_king_official has 1 scam reports',
    'Username john_doe has 0 scam reports',
    'Domain fake-metamask.org has 3 phishing indicators',
    'Domain google.com has 0 phishing indicators'
]

print('🧪 TESTING TRAINED MODEL')
print('=' * 40)

model.eval()
# The tokenizer returns CPU tensors, but the model may sit on the GPU after
# training. Move the inputs to the model's device to avoid a device-mismatch
# error in the forward pass.
device = model.device
for i, test_text in enumerate(test_cases):
    inputs = tokenizer(test_text, return_tensors='pt')
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        prediction = torch.nn.functional.softmax(outputs.logits, dim=-1)
        threat_score = prediction[0][1].item()  # Probability of being a threat

    # Bucket the class-1 probability: > 0.7 HIGH, > 0.3 MEDIUM, otherwise LOW.
    threat_level = 'HIGH' if threat_score > 0.7 else 'MEDIUM' if threat_score > 0.3 else 'LOW'
    print(f'{i+1}. {test_text[:40]}...')
    print(f'   Threat: {threat_level} ({threat_score:.3f})')
    print()

print('✅ MODEL TRAINING COMPLETE!')
print('💾 Model saved to ./final-threat-model')
print('🎯 Ready for integration with Have I Been Rekt!')