# 🔐 Simple Crypto-BERT for Colab
## Optimized for Google Colab Free Tier

This version avoids mixed-precision issues by training in the default float32 precision, and it uses a simplified Transformer architecture built from standard Keras layers.

In [None]:
# Install packages
!pip install -q tensorflow==2.13.0
!pip install -q scikit-learn matplotlib seaborn tqdm

# Simple setup without mixed precision
import tensorflow as tf

# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all
# up front. The setting is applied to every visible GPU; a RuntimeError from
# TensorFlow is reported rather than aborting the cell.
gpus = tf.config.list_physical_devices('GPU')
try:
    for device in gpus:
        tf.config.experimental.set_memory_growth(device, True)
except RuntimeError as e:
    print(e)
else:
    # Only announce success when there was at least one GPU to configure.
    if gpus:
        print("🚀 GPU memory growth enabled")

print(f"TensorFlow: {tf.__version__}")
print(f"GPU available: {len(tf.config.list_physical_devices('GPU')) > 0}")

In [None]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import json
import random
import gc
import warnings
warnings.filterwarnings('ignore')

# Seed every random number generator the notebook uses (NumPy, TensorFlow and
# Python's `random`) with the same value so runs are repeatable.
SEED = 42
for seed_fn in (np.random.seed, tf.random.set_seed, random.seed):
    seed_fn(SEED)

print("✅ Setup complete")

In [None]:
# Simple configuration
class SimpleConfig:
    """Hyperparameters and label set for the simplified Crypto-BERT notebook.

    All values are class attributes, so they can be read either from the class
    itself or from an instance.
    """

    # Class names; the position in this list is the integer label.
    class_labels = [
        'STRONG_ENCRYPTION', 'WEAK_CIPHER_SUITE', 'CERTIFICATE_INVALID',
        'KEY_REUSE', 'DOWNGRADE_ATTACK', 'MAN_IN_MIDDLE', 'REPLAY_ATTACK',
        'TIMING_ATTACK', 'QUANTUM_VULNERABLE', 'ENTROPY_WEAKNESS',
        'HASH_COLLISION', 'PADDING_ORACLE', 'LENGTH_EXTENSION',
        'PROTOCOL_CONFUSION', 'CRYPTO_AGILITY_LACK'
    ]

    # --- Tokenisation ---
    vocab_size = 10000          # upper bound on vocabulary size (incl. special tokens)
    max_length = 128            # every sequence is truncated/padded to this many tokens

    # --- Model ---
    embedding_dim = 256         # width of token/position embeddings
    num_heads = 8               # attention heads per transformer block
    ff_dim = 512                # hidden width of the feed-forward sub-layer
    num_transformer_blocks = 4  # number of stacked transformer blocks
    dropout = 0.1               # dropout rate used throughout the model
    # Derived from the label list so the output layer can never disagree with it.
    num_classes = len(class_labels)

    # --- Training / data ---
    batch_size = 32
    epochs = 5
    learning_rate = 1e-3
    samples_per_class = 600     # synthetic samples generated for each label

# Single shared configuration instance used by every later cell.
config = SimpleConfig()
# Total dataset size = samples generated per class x number of classes.
print(f"📋 Config loaded: {config.samples_per_class * config.num_classes:,} total samples")

In [None]:
# Simple data generator
def generate_crypto_data(config):
    """Build a synthetic, labelled corpus of short protocol descriptions.

    For every name in ``config.class_labels`` this draws
    ``config.samples_per_class`` phrases from that class's template list. With
    probability 0.3 a phrase gets a ``" session <4-digit number>"`` suffix.
    Randomness comes from the module-level ``random`` generator.

    Args:
        config: object exposing ``class_labels`` and ``samples_per_class``.

    Returns:
        ``(sequences, labels)``: two parallel lists, where ``labels[i]`` is the
        index of the class in ``config.class_labels`` that produced
        ``sequences[i]``.

    Note:
        Each class has only two base templates, so the corpus contains many
        exact duplicates.
    """

    templates = {
        'STRONG_ENCRYPTION': ['tls13 aes256 gcm secure', 'wpa3 chacha20 strong'],
        'WEAK_CIPHER_SUITE': ['tls10 rc4 weak', 'wpa des vulnerable'],
        'CERTIFICATE_INVALID': ['cert expired invalid', 'ssl chain broken'],
        'KEY_REUSE': ['key reuse detected', 'nonce repeated'],
        'DOWNGRADE_ATTACK': ['tls downgrade detected', 'version rollback'],
        'MAN_IN_MIDDLE': ['cert mismatch mitm', 'dns spoofing'],
        'REPLAY_ATTACK': ['message replay detected', 'timestamp invalid'],
        'TIMING_ATTACK': ['timing variation high', 'side channel'],
        'QUANTUM_VULNERABLE': ['rsa quantum weak', 'post quantum needed'],
        'ENTROPY_WEAKNESS': ['entropy low weak', 'predictable random'],
        'HASH_COLLISION': ['hash collision detected', 'md5 vulnerable'],
        'PADDING_ORACLE': ['padding oracle detected', 'cbc padding'],
        'LENGTH_EXTENSION': ['length extension possible', 'hash extend'],
        'PROTOCOL_CONFUSION': ['protocol confusion', 'mixed implementation'],
        'CRYPTO_AGILITY_LACK': ['crypto agility limited', 'single cipher']
    }

    sequences, labels = [], []

    print("🔄 Generating data...")
    for class_idx, class_name in enumerate(tqdm(config.class_labels)):
        produced = 0
        while produced < config.samples_per_class:
            text = random.choice(templates[class_name])
            # Occasionally tack on a random session id for a bit of variety.
            add_session = random.random() < 0.3
            if add_session:
                text = text + f" session {random.randint(1000, 9999)}"
            sequences.append(text)
            labels.append(class_idx)
            produced += 1

    return sequences, labels

# Generate the full synthetic corpus: parallel lists of text and integer labels.
sequences, labels = generate_crypto_data(config)
print(f"✅ Generated {len(sequences):,} sequences")
# Show the first generated sequence as a quick sanity check.
print(f"Sample: {sequences[0]}")

In [None]:
# Simple tokenizer
def create_tokenizer(sequences, vocab_size, max_length):
    """Build a word-level vocabulary and encode sequences as fixed-length id rows.

    Text is lower-cased and split on whitespace. Id 0 is reserved for
    ``<PAD>`` and id 1 for ``<UNK>``; the remaining ids go to the most frequent
    words (ties keep first-seen order), up to ``vocab_size`` entries in total.
    Each sequence is truncated to ``max_length`` tokens and right-padded with
    ``<PAD>``.

    Args:
        sequences: iterable of strings. It is materialised into a list because
            it is traversed twice (counting, then encoding).
        vocab_size: maximum vocabulary size including the two special tokens.
            Values below 2 yield a vocabulary of just the special tokens.
        max_length: number of token ids per encoded row.

    Returns:
        ``(encoded, word_to_id)``: ``encoded`` is an integer array of shape
        ``(len(sequences), max_length)`` (also for empty input), and
        ``word_to_id`` is the token-to-id dict.
    """

    # Two passes are needed, so a one-shot iterable must be stored first.
    sequences = list(sequences)

    # Special tokens
    pad_id, unk_id = 0, 1
    word_to_id = {'<PAD>': pad_id, '<UNK>': unk_id}

    # Count words
    word_freq = {}
    for seq in sequences:
        for word in seq.lower().split():
            word_freq[word] = word_freq.get(word, 0) + 1

    # Add most frequent words. sorted() is stable, so equally frequent words
    # keep their first-seen order.
    sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)

    # Clamp at zero: a negative stop index would slice from the end instead of
    # selecting nothing.
    num_regular_words = max(vocab_size - len(word_to_id), 0)
    for word, _ in sorted_words[:num_regular_words]:
        word_to_id[word] = len(word_to_id)

    # Encode sequences into a matrix pre-filled with <PAD>; only the leading
    # real tokens of each row are overwritten.
    encoded = np.full((len(sequences), max_length), pad_id, dtype=int)
    for row, seq in enumerate(tqdm(sequences, desc="Encoding")):
        ids = [word_to_id.get(word, unk_id) for word in seq.lower().split()[:max_length]]
        if ids:
            encoded[row, :len(ids)] = ids

    return encoded, word_to_id

# Build the vocabulary from the generated text and encode it into an id matrix.
# `tokenizer` is the plain word -> id dict returned by create_tokenizer.
X, tokenizer = create_tokenizer(sequences, config.vocab_size, config.max_length)
# Integer class indices, in the same order as the rows of X.
y = np.array(labels)

print(f"📝 Encoded shape: {X.shape}")
print(f"🔤 Vocabulary size: {len(tokenizer)}")

# Clean up: the raw text and label lists are no longer needed once encoded.
del sequences, labels
gc.collect()

In [None]:
# Split data: hold out 20% as the test set, then carve 20% of what remains off
# as the validation set. Both splits are stratified by label and use SEED.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
    stratify=y,
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=SEED,
    stratify=y_train,
)

# One-hot encode the integer labels for the categorical cross-entropy loss.
y_train_cat, y_val_cat, y_test_cat = (
    tf.keras.utils.to_categorical(split_labels, config.num_classes)
    for split_labels in (y_train, y_val, y_test)
)

print(f"📊 Train: {X_train.shape[0]:,}, Val: {X_val.shape[0]:,}, Test: {X_test.shape[0]:,}")

# The unsplit arrays are no longer needed.
del X, y
gc.collect()

In [None]:
# Simple transformer model using Keras built-in layers
def create_simple_bert(config):
    """Create a small BERT-style classifier from built-in Keras layers.

    Architecture: token embedding + learned position embedding, then
    ``config.num_transformer_blocks`` post-norm transformer blocks
    (self-attention and a GELU feed-forward sub-layer, each followed by a
    residual connection and LayerNormalization), global average pooling over
    the sequence axis, and a dense softmax classification head.

    Args:
        config: object exposing ``max_length``, ``vocab_size``,
            ``embedding_dim``, ``num_heads``, ``ff_dim``,
            ``num_transformer_blocks``, ``dropout`` and ``num_classes``.

    Returns:
        An uncompiled ``tf.keras.Model`` named ``SimpleCryptoBERT`` that maps
        ``(batch, max_length)`` token ids to ``(batch, num_classes)``
        probabilities.
    """

    inputs = tf.keras.Input(shape=(config.max_length,))

    # Token embedding
    token_embeddings = tf.keras.layers.Embedding(
        config.vocab_size, config.embedding_dim, mask_zero=True
    )(inputs)

    # Positional encoding (learned). The position ids are derived from the
    # symbolic `inputs` tensor (0, 1, ..., max_length-1 on every row) so that
    # the position Embedding layer is part of the functional graph. Calling it
    # on an eager `tf.range` tensor would leave its weights outside the model:
    # never trained and baked in as a random constant.
    position_ids = tf.cumsum(
        tf.ones_like(inputs, dtype='int32'), axis=-1, exclusive=True
    )
    position_embeddings = tf.keras.layers.Embedding(
        config.max_length, config.embedding_dim
    )(position_ids)

    # NOTE(review): whether the padding mask produced by mask_zero=True
    # survives this `+` and reaches attention/pooling is not established
    # here. Confirm if masking matters.
    x = token_embeddings + position_embeddings

    # Transformer blocks
    for _ in range(config.num_transformer_blocks):
        # Multi-head self-attention (built-in Keras layer)
        attn_out = tf.keras.layers.MultiHeadAttention(
            num_heads=config.num_heads,
            key_dim=config.embedding_dim // config.num_heads,
            dropout=config.dropout
        )(x, x)

        x = tf.keras.layers.LayerNormalization()(x + attn_out)

        # Feed forward: expand to ff_dim, project back, dropout, residual
        ffn_out = tf.keras.layers.Dense(config.ff_dim, activation='gelu')(x)
        ffn_out = tf.keras.layers.Dense(config.embedding_dim)(ffn_out)
        ffn_out = tf.keras.layers.Dropout(config.dropout)(ffn_out)

        x = tf.keras.layers.LayerNormalization()(x + ffn_out)

    # Global average pooling instead of CLS token
    x = tf.keras.layers.GlobalAveragePooling1D()(x)

    # Classification head
    x = tf.keras.layers.Dropout(config.dropout)(x)
    x = tf.keras.layers.Dense(128, activation='gelu')(x)
    x = tf.keras.layers.Dropout(config.dropout)(x)

    outputs = tf.keras.layers.Dense(config.num_classes, activation='softmax')(x)

    model = tf.keras.Model(inputs, outputs, name='SimpleCryptoBERT')
    return model

# Build the model from the shared config and print its layer summary.
print("🏗️ Creating model...")
model = create_simple_bert(config)

model.summary()

# total_params is reused later when the run metadata is saved.
total_params = model.count_params()
print(f"\n📊 Parameters: {total_params:,}")
# Size estimate uses 4 bytes per parameter (i.e. assumes float32 weights).
print(f"📏 Estimated size: {total_params * 4 / (1024*1024):.1f} MB")

In [None]:
# Compile and train: Adam optimizer, categorical cross-entropy on the one-hot
# labels, accuracy as the reported metric.
model.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(learning_rate=config.learning_rate),
)

# Callbacks:
#  - stop when validation accuracy has not improved for 2 epochs and restore
#    the best weights seen;
#  - halve the learning rate after 1 epoch without validation-loss
#    improvement, with a floor of 1e-6.
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_accuracy',
        patience=2,
        restore_best_weights=True,
    ),
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=1,
        min_lr=1e-6,
    ),
]

print("🚀 Training...")
history = model.fit(
    x=X_train,
    y=y_train_cat,
    validation_data=(X_val, y_val_cat),
    epochs=config.epochs,
    batch_size=config.batch_size,
    callbacks=callbacks,
    verbose=1,
)

print("✅ Training complete!")

In [None]:
# Evaluate on the held-out test split
test_loss, test_acc = model.evaluate(X_test, y_test_cat, verbose=0)

print(f"📊 Test Results:")
print(f"   Accuracy: {test_acc:.4f} ({test_acc*100:.2f}%)")
print(f"   Loss: {test_loss:.4f}")

# Plot training curves: one panel per metric, train vs. validation
plt.figure(figsize=(12, 4))

curve_panels = [
    ('accuracy', 'val_accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Loss'),
]
for panel_no, (train_key, val_key, panel_title) in enumerate(curve_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.plot(history.history[train_key], label='Train')
    plt.plot(history.history[val_key], label='Val')
    plt.title(panel_title)
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.show()

# Per-class accuracy for the first few classes (at most five)
y_pred = model.predict(X_test, verbose=0)
y_pred_classes = y_pred.argmax(axis=1)
y_true_classes = y_test_cat.argmax(axis=1)

print(f"\n🎯 Sample Class Accuracies:")
for i in range(min(5, config.num_classes)):
    class_mask = y_true_classes == i
    # Skip classes that have no examples in the test split.
    if not class_mask.any():
        continue
    class_acc = (y_pred_classes[class_mask] == i).mean()
    print(f"   {config.class_labels[i][:20]}: {class_acc:.3f}")

In [None]:
# Save model (HDF5 format)
model.save('crypto_bert_simple.h5')
print("💾 Model saved: crypto_bert_simple.h5")

# Save tokenizer: the word -> id mapping plus the sizes needed to encode new text
tokenizer_payload = {
    'word_to_id': tokenizer,
    'vocab_size': len(tokenizer),
    'max_length': config.max_length
}
with open('tokenizer_simple.json', 'w') as f:
    json.dump(tokenizer_payload, f)
print("💾 Tokenizer saved: tokenizer_simple.json")

# Save config: architecture hyperparameters copied from `config`, followed by
# the results of this run
config_payload = {
    field: getattr(config, field)
    for field in (
        'vocab_size',
        'max_length',
        'embedding_dim',
        'num_heads',
        'ff_dim',
        'num_transformer_blocks',
        'num_classes',
        'class_labels',
    )
}
config_payload['test_accuracy'] = float(test_acc)
config_payload['total_parameters'] = int(total_params)
with open('config_simple.json', 'w') as f:
    json.dump(config_payload, f, indent=2)
print("💾 Config saved: config_simple.json")

# Final stats
import os
bytes_per_mb = 1024 * 1024
file_size = os.path.getsize('crypto_bert_simple.h5') / bytes_per_mb

print(f"\n🎉 Simple Crypto-BERT Complete!")
print(f"   📏 File size: {file_size:.1f} MB")
print(f"   🎯 Test accuracy: {test_acc:.4f}")
print(f"   ⚡ Fast training: ✅")
print(f"   🧠 Parameters: {total_params:,}")

print(f"\n📁 Files created:")
print(f"   • crypto_bert_simple.h5")
print(f"   • tokenizer_simple.json")
print(f"   • config_simple.json")

In [None]:
# Download files (if in Colab). Outside Colab the `google.colab` import fails
# and the files simply stay on local disk.
try:
    from google.colab import files
    print("📥 Downloading files...")

    for artifact in (
        'crypto_bert_simple.h5',
        'tokenizer_simple.json',
        'config_simple.json',
    ):
        files.download(artifact)

    print("✅ Download complete!")
except ImportError:
    print("ℹ️ Files saved locally (not in Colab)")

print("\n🎊 Ready for integration with your CNN, GNN, and LSTM models!")

## 🔧 Usage Example

```python
# Load the model
import tensorflow as tf
import json

model = tf.keras.models.load_model('crypto_bert_simple.h5')

# Load tokenizer
with open('tokenizer_simple.json', 'r') as f:
    tokenizer_data = json.load(f)

# Predict
def predict_vulnerability(text):
    """Return the model's class-probability vector for one piece of text.

    The text is tokenised the same way as at training time: lower-cased, split
    on whitespace, mapped through the saved vocabulary, then truncated and
    right-padded to the saved ``max_length``.

    Relies on the module-level ``model`` and ``tokenizer_data`` loaded above.
    """
    vocab = tokenizer_data['word_to_id']
    # Read the sequence length saved with the tokenizer rather than repeating
    # it here, so the example stays in sync with the trained model.
    max_length = tokenizer_data['max_length']

    # Tokenize
    words = text.lower().split()[:max_length]
    ids = [vocab.get(w, 1) for w in words]  # 1 = <UNK>

    # Pad
    ids += [0] * (max_length - len(ids))  # 0 = <PAD>

    # Predict on a batch of one and return that single row
    pred = model.predict([ids])
    return pred[0]
```

This simplified version should train successfully on Google Colab! 🚀