## 1. Setup and Imports

In [None]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from src.grammar import HierarchicalGrammar, encode_dataset, prepare_tensors
from configs.grammars import BINARY_3_LEVEL_GRAMMAR, get_leaf_alphabet

## 2. Define a Hierarchical Grammar

We'll use a simple 3-level binary grammar with 4 root symbols.

In [None]:
# Inspect the grammar definition: root symbols, terminal symbols, and a
# short preview of the production rules.
root_symbols = BINARY_3_LEVEL_GRAMMAR['root_symbols']
print("Root symbols:", root_symbols)
terminal_symbols = BINARY_3_LEVEL_GRAMMAR['terminal_symbols']
print("Terminal symbols:", terminal_symbols)
print("\nExample production rules:")
# Only the first three rules are shown to keep the output short.
rule_preview = list(BINARY_3_LEVEL_GRAMMAR['rules'].items())[:3]
for lhs, expansions in rule_preview:
    print(f"  {lhs} -> {expansions}")

## 3. Generate Samples

Create a grammar instance and generate hierarchical sequences.

In [None]:
# Instantiate the grammar from its dictionary definition.
grammar = HierarchicalGrammar(BINARY_3_LEVEL_GRAMMAR)

# Generate the dataset. The sample count lives in its own variable so it is
# easy to change when re-running the cell.
n_samples = 1000
df = grammar.generate_dataset(n_samples=n_samples)

print(f"Generated {len(df)} samples")
# Plain string literal: nothing is interpolated, so no f-prefix is needed.
print("\nFirst 5 samples:")
print(df.head())

## 4. Analyze the Data

In [None]:
# Label distribution
print("Label distribution:")
# Count the samples per label once, ordered by label, and reuse the same
# Series for the printed table and the bar chart so both always agree.
label_counts = df['label'].value_counts().sort_index()
print(label_counts)

# Visualize the counts as a bar chart (one bar per label).
label_counts.plot(kind='bar')
plt.title('Distribution of Root Symbols')
plt.xlabel('Root Symbol')
plt.ylabel('Count')
plt.show()

## 5. Encode Data as One-Hot

Convert string sequences to one-hot encoded matrices.

In [None]:
# Look up the leaf-level alphabet for this grammar; it is passed to the
# encoder below.
leaf_alphabet = get_leaf_alphabet(BINARY_3_LEVEL_GRAMMAR)
print("Leaf alphabet:", leaf_alphabet)

# Encode a copy so the original string sequences in `df` stay available
# for the comparison printed below.
df_encoded = encode_dataset(df.copy(), leaf_alphabet)

# Show the first sample before and after encoding.
first_raw_sequence = df.iloc[0]['sequence']
print(f"\nOriginal sequence: {first_raw_sequence}")
first_encoded_shape = df_encoded.iloc[0]['sequence'].shape
print(f"Encoded shape: {first_encoded_shape}")
print(f"(vocabulary_size, sequence_length) = {first_encoded_shape}")

## 6. Prepare PyTorch Tensors

In [None]:
# Convert the encoded DataFrame into a (data, labels) pair of tensors.
data_tensor, label_tensor = prepare_tensors(df_encoded)

# Report both shapes, then repeat the data shape next to its axis names.
data_shape = data_tensor.shape
print("Data tensor shape:", data_shape)
print("Label tensor shape:", label_tensor.shape)
print(f"\n(batch_size, 1, vocab_size, seq_length) = {data_shape}")

## 7. Visualize a Sample

In [None]:
# Plot the one-hot matrix of a single sample as a heatmap.
sample_idx = 0
# Select the sample and its single channel; what remains is the 2-D
# (vocabulary, position) matrix, converted to NumPy for matplotlib.
sample_matrix = data_tensor[sample_idx, 0].numpy()

fig, ax = plt.subplots(figsize=(10, 4))
heatmap = ax.imshow(sample_matrix, cmap='Blues', aspect='auto')
fig.colorbar(heatmap, ax=ax, label='Activation')
ax.set_xlabel('Sequence Position')
ax.set_ylabel('Vocabulary Index')
sample_label = label_tensor[sample_idx].item()
ax.set_title(f'One-Hot Encoded Sequence (Label: {sample_label})')
# Label each row with the alphabet symbol it encodes.
ax.set_yticks(range(len(leaf_alphabet)))
ax.set_yticklabels(leaf_alphabet)
plt.show()

## Next Steps

Now that you have hierarchical data, you can:
1. Apply forward diffusion to add noise
2. Train a denoising model
3. Evaluate reconstruction accuracy

See the other notebooks for these steps!