In [None]:
# Install core ML libraries
!pip install -q transformers accelerate bitsandbytes

# Install tree-sitter (specific version for stability)
!pip install -q tree-sitter==0.21.3

# Clone the repository you found
!rm -rf tree-sitter-tcl
!git clone https://github.com/tree-sitter-grammars/tree-sitter-tcl

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hCloning into 'tree-sitter-tcl'...
remote: Enumerating objects: 587, done.[K
remote: Counting objects: 100% (250/250), done.[K
remote: Compressing objects: 100% (121/121), done.[K
remote: Total 587 (delta 163), reused 164 (delta 126), pack-reused 337 (from 1)[K
Receiving objects: 100% (587/587), 748.94 KiB | 12.69 MiB/s, done.
Resolving deltas: 100% (265/265), done.


In [None]:
# Compile the TCL grammar using gcc
# We include both parser.c and scanner.c (standard for Tree-sitter grammars)
!mkdir -p build
!gcc -shared -o build/tcl.so -fPIC \
  -I./tree-sitter-tcl/src \
  ./tree-sitter-tcl/src/parser.c \
  ./tree-sitter-tcl/src/scanner.c

In [None]:
import ctypes
from tree_sitter import Language, Parser

# 1. Load the compiled library
lib = ctypes.cdll.LoadLibrary('build/tcl.so')
getattr(lib, "tree_sitter_tcl").restype = ctypes.c_void_p
language_ptr = getattr(lib, "tree_sitter_tcl")()

# 2. Initialize Language and Parser (Ignoring the DeprecationWarning)
TCL_LANGUAGE = Language(language_ptr)
parser = Parser(TCL_LANGUAGE)

# 3. Test with a standard TCL command
test_code = 'set x [expr {$a + $b}]'
tree = parser.parse(bytes(test_code, "utf8"))

# 4. Corrected AST Printing for 0.25.x
def print_node_recursive(node, depth=0):
    indent = "  " * depth
    print(f"{indent}({node.type} [{node.start_byte}, {node.end_byte}])")
    for child in node.children:
        print_node_recursive(child, depth + 1)

print("✅ SUCCESS: Parser logic verified.")
print(f"Root node type: {tree.root_node.type}")
print("--- Visual AST Structure ---")
print_node_recursive(tree.root_node)

✅ SUCCESS: Parser logic verified.
Root node type: source_file
--- Visual AST Structure ---
(source_file [0, 22])
  (ERROR [0, 22])
    (set [0, 3])
    (id [4, 5])
    (command_substitution [6, 22])
      ([ [6, 7])
      (expr_cmd [7, 21])
        (expr [7, 11])
        (expr [12, 21])
          ({ [12, 13])
          (binop_expr [13, 20])
            (variable_substitution [13, 15])
              ($ [13, 14])
              (id [14, 15])
            (+ [16, 17])
            (variable_substitution [18, 20])
              ($ [18, 19])
              (id [19, 20])
          (} [20, 21])
      (] [21, 22])


  TCL_LANGUAGE = Language(language_ptr)


In [None]:
import torch
import torch.nn as nn
from transformers import LlamaForCausalLM, AutoTokenizer

class ASTEmbeddingLayer(nn.Module):
    def __init__(self, hidden_size, num_types=256, max_depth=64, max_siblings=128):
        super().__init__()
        # Learned lookup tables for structural context
        self.type_embedding = nn.Embedding(num_types, hidden_size)
        self.depth_embedding = nn.Embedding(max_depth, hidden_size)
        self.sibling_embedding = nn.Embedding(max_siblings, hidden_size)

        # Fusion layer to combine features and match variance
        self.proj = nn.Linear(hidden_size, hidden_size)
        self.layer_norm = nn.LayerNorm(hidden_size)

    def forward(self, node_types, depths, siblings):
        # Sum the structural components
        ast_vec = (self.type_embedding(node_types) +
                   self.depth_embedding(depths) +
                   self.sibling_embedding(siblings))

        # Project and normalize for observability injection
        return self.layer_norm(self.proj(ast_vec))



In [None]:
from transformers import LlamaForCausalLM, AutoConfig
import torch.nn.functional as F

class ASTAugmentedLlama(nn.Module):
    def __init__(self, model_id="codellama/CodeLlama-7b-hf"):
        super().__init__()

        # 1. Load the Base Model (using 4-bit for Colab efficiency)
        # The 'device_map="auto"' handles memory across the GPU
        self.base_model = LlamaForCausalLM.from_pretrained(
            model_id,
            load_in_4bit=True,
            device_map="auto",
            torch_dtype=torch.float16
        )

        # 2. Attach the AST Injection Module from Step 1
        # We ensure the hidden_size matches the Llama config (e.g., 4096)
        self.hidden_size = self.base_model.config.hidden_size
        self.ast_injector = ASTEmbeddingLayer(self.hidden_size).to(torch.float16)

    def forward(self, input_ids, node_types, depths, siblings, labels=None, **kwargs):
        """
        Structural Observability Forward Pass
        e_final = e_token + e_position + e_AST
        """
        # Get standard token embeddings from the base Llama model
        # Shape: (batch, seq_len, hidden_size)
        inputs_embeds = self.base_model.get_input_embeddings()(input_ids)

        # Generate the AST structural embeddings
        # Shape: (batch, seq_len, hidden_size)
        ast_embeds = self.ast_injector(node_types, depths, siblings)

        # Injection Logic: Additive structural signal
        # This acts as a constraint before the first attention layer
        combined_embeds = inputs_embeds + ast_embeds

        # Pass the augmented embeddings to the transformer blocks
        return self.base_model(
            inputs_embeds=combined_embeds,
            labels=labels,
            **kwargs
        )

# Initialize the model (This might take a few minutes to download/load)
print("Loading model and injecting AST layer...")
model = ASTAugmentedLlama()
print("✅ SUCCESS: AST-augmented Llama Wrapper is ready.")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading model and injecting AST layer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

✅ SUCCESS: AST-augmented Llama Wrapper is ready.


In [10]:
import torch

def get_ast_metadata(code_string, tokenizer, max_length=512):
    # 1. Parse the TCL code into a tree
    tree = parser.parse(bytes(code_string, "utf8"))

    # 2. Tokenize with offset mapping to link tokens to character positions
    enc = tokenizer(code_string,
                    truncation=True,
                    max_length=max_length,
                    padding="max_length",
                    return_offsets_mapping=True)

    offsets = enc['offset_mapping']
    input_ids = enc['input_ids']

    # Structural feature lists
    node_types, depths, siblings = [], [], []

    for start, end in offsets:
        # Handle special/padding tokens (where start and end are 0)
        if start == end:
            node_types.append(0)
            depths.append(0)
            siblings.append(0)
            continue

        # 3. Find the specific AST node for this token's byte range
        node = tree.root_node.descendant_for_byte_range(start, end)

        # 4. Extract features for the embedding layer
        node_types.append(hash(node.type) % 256)

        # Calculate Depth: steps from node to root
        depth = 0
        p = node
        while p.parent:
            depth += 1
            p = p.parent
        depths.append(min(depth, 63))

        # Sibling index: position among child nodes
        idx = node.child_index if hasattr(node, 'child_index') else 0
        siblings.append(min(idx, 127))

    return {
        "input_ids": torch.tensor([input_ids]),
        "node_types": torch.tensor([node_types]),
        "depths": torch.tensor([depths]),
        "siblings": torch.tensor([siblings])
    }

print("✅ Success: get_ast_metadata is now defined.")

✅ Success: get_ast_metadata is now defined.


In [11]:
# 1. Prepare a complex TCL script
tcl_script = 'proc factorial {n} { if {$n <= 1} { return 1 } { return [expr {$n * [factorial [expr {$n-1}]]}] } }'

# 2. Generate AST Metadata Tensors
inputs = get_ast_metadata(tcl_script, tokenizer)

# 3. Move tensors to GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
inputs = {k: v.to(device) for k, v in inputs.items()}

# 4. Run a Forward Pass
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

print(f"✅ VERIFIED: Tensors successfully reached the Llama backbone.")
print(f"Logits Shape: {outputs.logits.shape} (Batch, Seq_Len, Vocab_Size)")

✅ VERIFIED: Tensors successfully reached the Llama backbone.
Logits Shape: torch.Size([1, 512, 32016]) (Batch, Seq_Len, Vocab_Size)


In [12]:
import torch.nn.functional as F

def calculate_nll(model, inputs):
    model.eval()
    with torch.no_grad():
        # Forward pass
        outputs = model(**inputs, labels=inputs["input_ids"])
        return outputs.loss.item()

# 1. Complex TCL snippet that usually confuses models
test_script = """
proc check_status {device_id} {
    set status [get_hardware_status $device_id]
    if {$status == "READY"} {
        return [list 1 "Device is operational"]
    } else {
        return [list 0 "Device error: $status"]
    }
}
"""

# 2. Get Metadata
inputs = get_ast_metadata(test_script, tokenizer)
inputs = {k: v.to(device) for k, v in inputs.items()}

# 3. Measure Augmented Loss
augmented_loss = calculate_nll(model, inputs)

# 4. Measure Baseline Loss (Simulated by zeroing out the AST signal)
# We temporarily zero the AST embeddings to see how the base model performs
zero_inputs = inputs.copy()
zero_inputs["node_types"] = torch.zeros_like(inputs["node_types"])
zero_inputs["depths"] = torch.zeros_like(inputs["depths"])
zero_inputs["siblings"] = torch.zeros_like(inputs["siblings"])
baseline_loss = calculate_nll(model, zero_inputs)

print("--- RESULTS FOR PAPER ---")
print(f"Baseline Loss (No AST): {baseline_loss:.4f}")
print(f"Augmented Loss (With AST): {augmented_loss:.4f}")
print(f"Improvement: {((baseline_loss - augmented_loss) / baseline_loss) * 100:.2f}%")

--- RESULTS FOR PAPER ---
Baseline Loss (No AST): 12.1280
Augmented Loss (With AST): 12.1703
Improvement: -0.35%
