In [None]:
# Setup
!git clone https://github.com/Pretty-Good-OSINT-Protocol/Have-I-Been-Rekt.git
%cd Have-I-Been-Rekt/ai-training
!pip install -q transformers torch datasets scikit-learn pandas
print('Setup complete')

In [None]:
# Generate data
# Runs the repository's collection script with the system python3 in a subshell.
# NOTE(review): the training cell reads
# datasets/comprehensive_threat_intelligence.json; presumably this script is
# what writes that file — confirm against the script.
!python3 collect_comprehensive_intelligence.py

In [None]:
# Train model
import torch
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

# Report whether PyTorch can see a CUDA device.
print('GPU available:', torch.cuda.is_available())

# Load data
# The encoding is stated explicitly so the file is decoded as UTF-8 regardless
# of the platform's default text encoding.
with open('datasets/comprehensive_threat_intelligence.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

print(f'Loaded {len(data)} records')

# Prepare data
def _record_to_sample(record):
    """Convert one loaded record into a ``(text, label)`` training pair.

    The record's ``type`` string selects the template:

    * contains ``'username'``: text reports the number of ``scam_reports``;
      label is 1 when that number is greater than zero, else 0.
    * contains ``'domain'``: text reports the number of
      ``phishing_indicators``; label is 1 when greater than zero, else 0.
    * anything else: text is ``'Data type <type>'`` and label is 0.

    Missing or ``null`` values for ``data``, ``type``, ``scam_reports`` and
    ``phishing_indicators`` are treated as empty instead of raising.

    Args:
        record: A dict with optional ``'type'`` and ``'data'`` keys.

    Returns:
        Tuple ``(text, label)`` where ``text`` is a str and ``label`` is 0 or 1.
    """
    # `or` (rather than a .get default) also covers keys that are present but null.
    content = record.get('data') or {}
    data_type = record.get('type') or ''

    if 'username' in data_type:
        username = content.get('username', '')
        scams = len(content.get('scam_reports') or [])
        return f'Username {username} has {scams} scam reports', 1 if scams > 0 else 0

    if 'domain' in data_type:
        domain = content.get('domain', '')
        phishing = len(content.get('phishing_indicators') or [])
        return f'Domain {domain} has {phishing} phishing indicators', 1 if phishing > 0 else 0

    return f'Data type {data_type}', 0


# NOTE(review): each label is derived from the same count that is written into
# the text, so the classifier can read the answer directly from its input.
# Confirm this is intended before trusting validation metrics.
texts = []
labels = []

for record in data:
    text, label = _record_to_sample(record)
    texts.append(text)
    labels.append(label)

print(f'Prepared {len(texts)} samples')

In [None]:
# Create dataset class
class SimpleDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Sequence of inputs; each item is converted with ``str()``
            before tokenization.
        labels: Sequence of integer class labels, one per entry in ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_length: Value forwarded to the tokenizer as ``max_length``.
            Defaults to 128, the value that was previously hard-coded.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Fail early: a mismatch would otherwise surface as an IndexError in
        # the middle of training, or go unnoticed when labels is the longer one.
        if len(texts) != len(labels):
            raise ValueError(
                f'texts and labels must have the same length '
                f'(got {len(texts)} and {len(labels)})'
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors.

        Returns:
            Dict with flattened ``'input_ids'`` and ``'attention_mask'``
            tensors and a ``torch.long`` scalar tensor under ``'labels'``.
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            # flatten() collapses the tokenizer output to one dimension per sample.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Initialize model
SEED = 42
model_name = 'distilbert-base-uncased'

# Seed PyTorch before the model is built so that any weights created with
# random initialisation (the sequence-classification head added on top of the
# base checkpoint) are the same on every run.
torch.manual_seed(SEED)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Split data
# 80/20 train/validation split with a fixed seed so the partition is repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=SEED
)

train_dataset = SimpleDataset(train_texts, train_labels, tokenizer)
val_dataset = SimpleDataset(val_texts, val_labels, tokenizer)

print(f'Train: {len(train_dataset)}, Val: {len(val_dataset)}')

In [None]:
# Train
import inspect

# The per-epoch evaluation argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. The install above is
# unpinned, so pick whichever name this version of TrainingArguments accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    logging_steps=10,
    save_strategy='epoch',
    # fp16 mixed precision requires a CUDA device; use it only when one is
    # available so the cell also runs on CPU-only runtimes.
    fp16=torch.cuda.is_available(),
    # The string 'none' disables all logging integrations. Passing the Python
    # value None does not disable them in transformers 4.x.
    report_to='none',
    **{_eval_strategy_key: 'epoch'},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

print('Starting training')
trainer.train()
print('Training complete')

In [None]:
# Save model
# Write the trained weights and the tokenizer files to the same directory so
# the pair can be reloaded together with from_pretrained('./threat-model').
trainer.save_model('./threat-model')
tokenizer.save_pretrained('./threat-model')
print('Model saved')

# Test
test_input = 'Username crypto_king_2024 has 1 scam reports'
# Switch off dropout so the smoke-test prediction is deterministic.
model.eval()
# After training the model may live on the GPU; move the inputs to the same
# device to avoid a device-mismatch error.
inputs = tokenizer(test_input, return_tensors='pt').to(model.device)
# No gradients are needed for a single forward pass.
with torch.no_grad():
    outputs = model(**inputs)
# Softmax over the class dimension turns logits into class probabilities.
prediction = torch.nn.functional.softmax(outputs.logits, dim=-1)
print('Test prediction:', prediction.tolist())
print('Done!')