In [3]:
import ast
import sys
from pathlib import Path

# The project root is taken to be two directories above the current working
# directory; it goes first on sys.path ahead of the project-local import below.
root = Path.cwd().resolve().parent.parent
sys.path.insert(0, f"{root}")

from pretokenizers.firstpretokenizer import FirstPretokenizer

## Pretokenization

In [2]:
# Pretokenizer under demonstration, with `_use_dedent` disabled and
# `_use_semantics` enabled.
pretokenizer = FirstPretokenizer(_use_dedent=False, _use_semantics=True)

# Load the example snippets. `read_text()` opens, reads and closes the file,
# so no file handle is left dangling (the previous bare `open(...).read()`
# never closed it). Snippets in the file are separated by four consecutive
# newlines, i.e. three blank lines.
examples = (root / "utils" / "code_examples").read_text().split("\n\n\n\n")

In [3]:
# Pretokenize every snippet and print the result beneath the original source.
for example in examples:
    tree = ast.parse(example)  # pretokenize() is handed an AST, not raw text
    parsed = pretokenizer.pretokenize(tree)
    print(f"example:\n{example}\n\nparsed:\n{parsed}\n\n")

example:
123

parsed:
[SEMANTIC_START]123[SEMANTIC_END]


example:
f"sin({a}) is {sin(a):.3}"

parsed:
[F][QUOT_2][SEMANTIC_START]sin([SEMANTIC_END][DELIMIT_3_L][SEMANTIC_START]a[SEMANTIC_END][DELIMIT_3_R][SEMANTIC_START]) is [SEMANTIC_END][DELIMIT_3_L][SEMANTIC_START]sin[SEMANTIC_END][DELIMIT_1_L][SEMANTIC_START]a[SEMANTIC_END][DELIMIT_1_R][DELIMIT_3_R][QUOT_2]


example:
[1, 2, 3]

parsed:
[DELIMIT_2_L][SEMANTIC_START]1[SEMANTIC_END][COMMA][SEMANTIC_START]2[SEMANTIC_END][COMMA][SEMANTIC_START]3[SEMANTIC_END][DELIMIT_2_R]


example:
(1, 2, 3)

parsed:
[TUPLE_L][SEMANTIC_START]1[SEMANTIC_END][COMMA][SEMANTIC_START]2[SEMANTIC_END][COMMA][SEMANTIC_START]3[SEMANTIC_END][TUPLE_R]


example:
{1, 2, 3}

parsed:
[DELIMIT_3_L][SEMANTIC_START]1[SEMANTIC_END][COMMA][SEMANTIC_START]2[SEMANTIC_END][COMMA][SEMANTIC_START]3[SEMANTIC_END][DELIMIT_3_R]


example:
{"a":1, **d}

parsed:
[DELIMIT_3_L][QUOT_1][SEMANTIC_START]a[SEMANTIC_END][QUOT_1][DICT_COLON][SEMANTIC_START]1[SEMANTIC_END][COMMA][UNPACK]

## Reverse pretokenization

### Without dedent (`_use_dedent=False`)

In [4]:
# Fresh pretokenizer for the round-trip demo: `_use_dedent` off, `_use_semantics` on.
pretokenizer = FirstPretokenizer(
    _use_semantics=True,
    _use_dedent=False,
)

In [5]:
# Round trip: pretokenize each snippet, then feed the result to reverse().
# Note: the "parsed:" label in the printout shows the output of reverse(),
# not the pretokenized token string.
for example in examples:
    tree = ast.parse(example)
    parsed = pretokenizer.pretokenize(tree)
    restored = pretokenizer.reverse(parsed)
    print(f"example:\n{example}\n\nparsed:\n{restored}\n\n")

example:
123

parsed:
123


example:
f"sin({a}) is {sin(a):.3}"

parsed:
f"sin({a}) is {sin(a)}"


example:
[1, 2, 3]

parsed:
[1, 2, 3]


example:
(1, 2, 3)

parsed:
(1, 2, 3)


example:
{1, 2, 3}

parsed:
{1, 2, 3}


example:
{"a":1, **d}

parsed:
{'a': 1, **d}


example:
a

parsed:
a


example:
a = 1

parsed:
a = 1


example:
del a

parsed:
del a


example:
a, *b = it

parsed:
a, *b = it


example:
-a

parsed:
-a


example:
not x

parsed:
not x


example:
x + y

parsed:
x + y


example:
x or y

parsed:
x or y


example:
1 <= a < 10

parsed:
1 <= a < 10


example:
func(a, b=c, *d, **e)

parsed:
func(a, *d, b = c, **e)


example:
a if b else c

parsed:
a if b else c


example:
snake.colour

parsed:
snake.colour


example:
(x := 4)

parsed:
(x := 4)


example:
l[1:2, 3]

parsed:
l[1:2, 3]


example:
l[1:2]

parsed:
l[1:2]


example:
[x for x in numbers]

parsed:
[x for x in numbers]


example:
{x: x**2 for x in numbers}

parsed:
{x: x**2 for x in numbers}


example:
{x for x in numbers

### With dedent (`_use_dedent=True`)

In [8]:
# Same round-trip demo, this time with `_use_dedent` switched on
# (`_use_semantics` stays enabled).
pretokenizer = FirstPretokenizer(
    _use_semantics=True,
    _use_dedent=True,
)

In [9]:
# Round trip again with the current pretokenizer: pretokenize, then reverse().
# Note: the "parsed:" label in the printout shows the output of reverse(),
# not the pretokenized token string.
for example in examples:
    tree = ast.parse(example)
    parsed = pretokenizer.pretokenize(tree)
    restored = pretokenizer.reverse(parsed)
    print(f"example:\n{example}\n\nparsed:\n{restored}\n\n")

example:
123

parsed:
123


example:
f"sin({a}) is {sin(a):.3}"

parsed:
f"sin({a}) is {sin(a)}"


example:
[1, 2, 3]

parsed:
[1, 2, 3]


example:
(1, 2, 3)

parsed:
(1, 2, 3)


example:
{1, 2, 3}

parsed:
{1, 2, 3}


example:
{"a":1, **d}

parsed:
{'a': 1, **d}


example:
a

parsed:
a


example:
a = 1

parsed:
a = 1


example:
del a

parsed:
del a


example:
a, *b = it

parsed:
a, *b = it


example:
-a

parsed:
-a


example:
not x

parsed:
not x


example:
x + y

parsed:
x + y


example:
x or y

parsed:
x or y


example:
1 <= a < 10

parsed:
1 <= a < 10


example:
func(a, b=c, *d, **e)

parsed:
func(a, *d, b = c, **e)


example:
a if b else c

parsed:
a if b else c


example:
snake.colour

parsed:
snake.colour


example:
(x := 4)

parsed:
(x := 4)


example:
l[1:2, 3]

parsed:
l[1:2, 3]


example:
l[1:2]

parsed:
l[1:2]


example:
[x for x in numbers]

parsed:
[x for x in numbers]


example:
{x: x**2 for x in numbers}

parsed:
{x: x**2 for x in numbers}


example:
{x for x in numbers