# Demonstration of `extract_protected_spans` and `segment_tokens`
This notebook demonstrates how to use the `extract_protected_spans` and `segment_tokens` functions from the `code_segmentation.py` module.

In [1]:
import ast
import json
import re
from code_segmentation import pretokenize, tokenize_pretokenized_string, extract_protected_spans, segment_tokens, pretty_print_tokens, pretty_print_spans

## Step 1: Define, parse, and tokenize a code example

In [2]:
# Small sample: one free function plus a class with a single method.
code_example = '''
def add(a, b):
    return a + b

class Calculator:
    def subtract(self, x, y):
        return x - y
'''

# Parse to an AST first, then hand the tree to the project's pretokenizer.
tree = ast.parse(code_example)
parsed = pretokenize(tree, _use_dedent=True, _use_semantics=True)

# Split the pretokenized form into the token list used by the later cells.
tokens = tokenize_pretokenized_string(parsed)
pretty_print_tokens(tokens)

[DEF] add [DELIMIT_1_L] [SEMANTIC_START] a [SEMANTIC_END] [COMMA] [SEMANTIC_START] b [SEMANTIC_END] [DELIMIT_1_R] [BLOCK]
[INDENT]
    [RETURN]
    [SEMANTIC_START] a [SEMANTIC_END] [ADD] [SEMANTIC_START] b [SEMANTIC_END]
[DEDENT]
[CLASS] Calculator [BLOCK]
[INDENT]
    [DEF] subtract [DELIMIT_1_L] [SEMANTIC_START] self [SEMANTIC_END] [COMMA] [SEMANTIC_START] x [SEMANTIC_END] [COMMA] [SEMANTIC_START] y [SEMANTIC_END] [DELIMIT_1_R] [BLOCK]
    [INDENT]
        [RETURN]
        [SEMANTIC_START] x [SEMANTIC_END] [SUB] [SEMANTIC_START] y [SEMANTIC_END]
    [DEDENT]
[DEDENT]


## Step 2: Extract protected spans using `extract_protected_spans`

In [3]:
# Collect the protected spans for the token list.
# NOTE(review): `all_options=True` presumably enables every span category;
# confirm against code_segmentation.py.
spans = extract_protected_spans(
    tokens,
    all_options=True,
)
pretty_print_spans(tokens, spans)


=== Span (0, 21) ===
[DEF] add [DELIMIT_1_L] [SEMANTIC_START] a [SEMANTIC_END] [COMMA] [SEMANTIC_START] b [SEMANTIC_END] [DELIMIT_1_R] [BLOCK]
[INDENT]
    [RETURN]
    [SEMANTIC_START] a [SEMANTIC_END] [ADD] [SEMANTIC_START] b [SEMANTIC_END]
[DEDENT]

=== Span (2, 10) ===
[DELIMIT_1_L] [SEMANTIC_START] a [SEMANTIC_END] [COMMA] [SEMANTIC_START] b [SEMANTIC_END] [DELIMIT_1_R]

=== Span (12, 20) ===
[INDENT]
    [RETURN]
    [SEMANTIC_START] a [SEMANTIC_END] [ADD] [SEMANTIC_START] b [SEMANTIC_END]

=== Span (12, 21) ===
[INDENT]
    [RETURN]
    [SEMANTIC_START] a [SEMANTIC_END] [ADD] [SEMANTIC_START] b [SEMANTIC_END]
[DEDENT]

=== Span (13, 20) ===
[RETURN]
[SEMANTIC_START] a [SEMANTIC_END] [ADD] [SEMANTIC_START] b [SEMANTIC_END]

=== Span (21, 24) ===
[DEDENT]
[CLASS] Calculator [BLOCK]

=== Span (22, 52) ===
[CLASS] Calculator [BLOCK]
[INDENT]
    [DEF] subtract [DELIMIT_1_L] [SEMANTIC_START] self [SEMANTIC_END] [COMMA] [SEMANTIC_START] x [SEMANTIC_END] [COMMA] [SEMANTIC_START] y [SE

## Step 3: Segment tokens using `segment_tokens`
We split the token sequence into segments of at most 40 tokens, avoiding cuts within protected spans.

In [7]:
# Value passed as `max_len` to segment_tokens.
MAX_SEGMENT_LEN = 40

segments = segment_tokens(
    tokens,
    max_len=MAX_SEGMENT_LEN,
    protected_spans=spans,
)

# Each (start, end) pair is used as a half-open slice into `tokens`.
for start, end in segments:
    print(f"Segment [{start}:{end}]:", tokens[start:end])

Segment [0:22]: ['[DEF]', 'add', '[DELIMIT_1_L]', '[SEMANTIC_START]', 'a', '[SEMANTIC_END]', '[COMMA]', '[SEMANTIC_START]', 'b', '[SEMANTIC_END]', '[DELIMIT_1_R]', '[BLOCK]', '[INDENT]', '[RETURN]', '[SEMANTIC_START]', 'a', '[SEMANTIC_END]', '[ADD]', '[SEMANTIC_START]', 'b', '[SEMANTIC_END]', '[DEDENT]']
Segment [22:53]: ['[CLASS]', 'Calculator', '[BLOCK]', '[INDENT]', '[DEF]', 'subtract', '[DELIMIT_1_L]', '[SEMANTIC_START]', 'self', '[SEMANTIC_END]', '[COMMA]', '[SEMANTIC_START]', 'x', '[SEMANTIC_END]', '[COMMA]', '[SEMANTIC_START]', 'y', '[SEMANTIC_END]', '[DELIMIT_1_R]', '[BLOCK]', '[INDENT]', '[RETURN]', '[SEMANTIC_START]', 'x', '[SEMANTIC_END]', '[SUB]', '[SEMANTIC_START]', 'y', '[SEMANTIC_END]', '[DEDENT]', '[DEDENT]']
