# Demonstration of `extract_protected_spans` and `segment_tokens`
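This notebook demonstrates how to use `extract_protected_spans` and `segment_tokens`, called here as methods of the `UltimateSegmentator` class from the `segmentators.ultimatesegmentator` module, on tokens produced by `FirstPretokenizer`.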

In [None]:
import ast
import sys

from pathlib import Path

# Make the project's top-level packages importable: take the parent of the
# current working directory and put it at the front of the module search path.
# NOTE(review): assumes the kernel's working directory sits one level below the
# directory holding pretokenizers/, segmentators/ and utils/ — confirm.
root = Path().resolve().parent
sys.path.insert(0, str(root))

from pretokenizers.firstpretokenizer import FirstPretokenizer 
from segmentators.ultimatesegmentator import UltimateSegmentator
from utils.pretty_printer import pretty_print_tokens, pretty_print_spans

In [2]:
# Build the two objects used by every later cell; the segmentator is
# constructed around the pretokenizer instance.
# NOTE(review): `_use_dedent` / `_use_semantics` presumably control the
# [DEDENT] and [SEMANTIC_START]/[SEMANTIC_END] markers seen in the token
# output of Step 1 — confirm against FirstPretokenizer.
pretokenizer = FirstPretokenizer(_use_dedent=True, _use_semantics=True)
segmentator = UltimateSegmentator(pretokenizer)

## Step 1: Define and parse a code example

In [4]:
# Small sample program: one free function and one class with a method.
code_example = '''
def add(a, b):
    return a + b

class Calculator:
    def subtract(self, x, y):
        return x - y
'''

# Run the pipeline one named stage at a time:
# source text -> Python AST -> pretokenized form -> token list.
syntax_tree = ast.parse(code_example)
parsed = pretokenizer.pretokenize(syntax_tree)
tokens = segmentator.tokenize_pretokenized_string(parsed)

# Show the resulting tokens in indented, human-readable form.
pretty_print_tokens(tokens)

[DEF] add [DELIMIT_1_L] [SEMANTIC_START] a [SEMANTIC_END] [COMMA] [SEMANTIC_START] b [SEMANTIC_END] [DELIMIT_1_R] [BLOCK]
[INDENT]
    [RETURN]
    [SEMANTIC_START] a [SEMANTIC_END] [ADD] [SEMANTIC_START] b [SEMANTIC_END]
[DEDENT]
[CLASS] Calculator [BLOCK]
[INDENT]
    [DEF] subtract [DELIMIT_1_L] [SEMANTIC_START] self [SEMANTIC_END] [COMMA] [SEMANTIC_START] x [SEMANTIC_END] [COMMA] [SEMANTIC_START] y [SEMANTIC_END] [DELIMIT_1_R] [BLOCK]
    [INDENT]
        [RETURN]
        [SEMANTIC_START] x [SEMANTIC_END] [SUB] [SEMANTIC_START] y [SEMANTIC_END]
    [DEDENT]
[DEDENT]


## Step 2: Extract protected spans using `extract_protected_spans`

In [5]:
# Collect the protected spans for the token list and display each one.
# The saved output labels every span with a pair of token indices.
# NOTE(review): the meaning of `all_options=True` is not visible here, and the
# printed spans look end-inclusive (e.g. Span (0, 21) shows the token at
# index 21) — confirm both against UltimateSegmentator.extract_protected_spans.
spans = segmentator.extract_protected_spans(tokens, all_options=True)
pretty_print_spans(tokens, spans)

x

=== Span (0, 21) ===
[DEF] add [DELIMIT_1_L] [SEMANTIC_START] a [SEMANTIC_END] [COMMA] [SEMANTIC_START] b [SEMANTIC_END] [DELIMIT_1_R] [BLOCK]
[INDENT]
    [RETURN]
    [SEMANTIC_START] a [SEMANTIC_END] [ADD] [SEMANTIC_START] b [SEMANTIC_END]
[DEDENT]

=== Span (2, 10) ===
[DELIMIT_1_L] [SEMANTIC_START] a [SEMANTIC_END] [COMMA] [SEMANTIC_START] b [SEMANTIC_END] [DELIMIT_1_R]

=== Span (12, 20) ===
[INDENT]
    [RETURN]
    [SEMANTIC_START] a [SEMANTIC_END] [ADD] [SEMANTIC_START] b [SEMANTIC_END]

=== Span (21, 24) ===
[DEDENT]
[CLASS] Calculator [BLOCK]

=== Span (22, 52) ===
[CLASS] Calculator [BLOCK]
[INDENT]
    [DEF] subtract [DELIMIT_1_L] [SEMANTIC_START] self [SEMANTIC_END] [COMMA] [SEMANTIC_START] x [SEMANTIC_END] [COMMA] [SEMANTIC_START] y [SEMANTIC_END] [DELIMIT_1_R] [BLOCK]
    [INDENT]
        [RETURN]
        [SEMANTIC_START] x [SEMANTIC_END] [SUB] [SEMANTIC_START] y [SEMANTIC_END]
    [DEDENT]
[DEDENT]

=== Span (25, 41) ===
[INDENT]
    [DEF] subtract [DELIMIT_1_L] [SE

## Step 3: Segment tokens using `segment_tokens`
We split the token sequence into segments of at most 40 tokens (`max_len=40`), avoiding cuts within protected spans.

In [6]:
# Ask the segmentator for segment boundaries, handing it the spans from Step 2
# as the regions to protect.
segments = segmentator.segment_tokens(
    tokens,
    max_len=40,
    protected_spans=spans,
)

# Each segment is a (start, end) pair used as a slice into the token list.
for start, end in segments:
    segment_slice = tokens[start:end]
    print(f"Segment [{start}:{end}]:", segment_slice)

Segment [0:21]: ['[DEF]', 'add', '[DELIMIT_1_L]', '[SEMANTIC_START]', 'a', '[SEMANTIC_END]', '[COMMA]', '[SEMANTIC_START]', 'b', '[SEMANTIC_END]', '[DELIMIT_1_R]', '[BLOCK]', '[INDENT]', '[RETURN]', '[SEMANTIC_START]', 'a', '[SEMANTIC_END]', '[ADD]', '[SEMANTIC_START]', 'b', '[SEMANTIC_END]']
Segment [21:53]: ['[DEDENT]', '[CLASS]', 'Calculator', '[BLOCK]', '[INDENT]', '[DEF]', 'subtract', '[DELIMIT_1_L]', '[SEMANTIC_START]', 'self', '[SEMANTIC_END]', '[COMMA]', '[SEMANTIC_START]', 'x', '[SEMANTIC_END]', '[COMMA]', '[SEMANTIC_START]', 'y', '[SEMANTIC_END]', '[DELIMIT_1_R]', '[BLOCK]', '[INDENT]', '[RETURN]', '[SEMANTIC_START]', 'x', '[SEMANTIC_END]', '[SUB]', '[SEMANTIC_START]', 'y', '[SEMANTIC_END]', '[DEDENT]', '[DEDENT]']
