Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Update contributing.md (#201) * Create the lexer - Add the TokenType class - Add initial documentation * Add tokenise function - Add initial documentation and type hints * Add token class - Change return type of tokenise to list[Token] * Basics of the Lexer - Does strings (but not the two char ones) - Does not do numbers - But does do escapes * Finished the lexer * Made the digraphs do good And made the `\` in the non standard string do good too * added some extra type annotation stuff * Because we don't do if name is main in files no more * added test cases * Update test_lexer.py
- Loading branch information
Showing
4 changed files
with
233 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
import os | ||
import sys | ||
|
||
THIS_FOLDER = os.path.dirname(os.path.abspath(__file__)) + "/.." | ||
sys.path.insert(1, THIS_FOLDER) | ||
|
||
from vyxal.lexer import * | ||
|
||
|
||
def token_equal(source: str, expected: list[Token]) -> bool:
    """
    Check that tokenising ``source`` yields exactly ``expected``.

    Tokens are compared field-by-field (name and value) rather than
    with ``==`` because Token objects compare by identity, not content.

    Parameters
    ----------
    source : str
        The test program to tokenise
    expected : list[Token]
        The expected token list

    Returns
    -------
    bool
        True iff the tokenised source has the same number of tokens as
        expected AND corresponding tokens have the same name and value
    """

    actual = tokenise(source)
    # zip() stops at the shorter of the two sequences, so without this
    # explicit length check, extra or missing tokens in the tokenised
    # output would silently go unnoticed and the comparison would pass.
    if len(actual) != len(expected):
        return False
    return all(
        lhs.name == rhs.name and lhs.value == rhs.value
        for lhs, rhs in zip(actual, expected)
    )
|
||
|
||
def test_single_token():
    """A lone digit lexes to a single NUMBER token."""
    expected = [Token(TokenType.NUMBER, "1")]
    assert token_equal("1", expected)
|
||
|
||
def test_one_plus_one():
    """``1 1+`` lexes to number, space, number, plus."""
    expected_tokens = [
        Token(TokenType.NUMBER, "1"),
        Token(TokenType.GENERAL, " "),
        Token(TokenType.NUMBER, "1"),
        Token(TokenType.GENERAL, "+"),
    ]
    assert token_equal("1 1+", expected_tokens)
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,179 @@ | ||
""" | ||
File: lexer.py | ||
Description: Before Vyxal programs can be grouped into appropriate | ||
structures, they need to be turned into tokens representing the | ||
different components of a program. For the full specification on token | ||
types, go to documents/specs/Lexer.md | ||
""" | ||
|
||
import collections | ||
import string | ||
|
||
|
||
class TokenType:
    """
    A class providing a namespace for token type constants. Do not
    create any instances of this class.

    Attributes
    ----------
    LITERAL : str
        Used to denote that a token is a literal. In this case, this is
        defined as numbers and strings. Lists are NOT considered
        to be literal tokens.
    STRING : str
        Used to denote that a token is a backtick-delimited string.
    NUMBER : str
        Used to denote that a token is a number literal.
    NAME : str
        Used to denote that a token is a name, meaning that it belongs
        to a structure such as a function definition/call or a variable
        get/set.
    GENERAL : str
        Used to denote that a token does not have a specific type. This
        kind of token can be anything - a digraph, a structure delimiter
        or just a simple element.
    COMPRESSED_NUMBER : str
        Used to denote a compressed number literal (» delimited).
    COMPRESSED_STRING : str
        Used to denote a compressed string literal (« delimited).
    """

    # LITERAL was documented but never defined, making
    # `TokenType.LITERAL` raise AttributeError wherever it was used.
    LITERAL: str = "literal"
    STRING: str = "string"
    NUMBER: str = "number"
    NAME: str = "name"
    GENERAL: str = "general"
    COMPRESSED_NUMBER: str = "compressed_number"
    COMPRESSED_STRING: str = "compressed_string"
|
||
|
||
class Token:
    """
    A class representing a single lexical token of code.

    Attributes
    ----------
    name : str
        The name of the token. Usually a TokenType constant.
    value : str
        The value of the token.

    Parameters
    ----------
    token_name : str
        The value to use as the name of the token.
    token_value : str
        The value to use as the value of the token.
    """

    def __init__(self, token_name: str, token_value: str):
        self.name: str = token_name
        self.value: str = token_value

    def __eq__(self, other: object) -> bool:
        """
        Compare tokens by content: two tokens are equal when both
        their names and their values match. Without this, tokens only
        compared by identity, forcing field-by-field comparison in
        test helpers.
        """
        if not isinstance(other, Token):
            return NotImplemented
        return self.name == other.name and self.value == other.value

    def __hash__(self) -> int:
        """
        Hash consistently with __eq__ so tokens stay usable as dict
        keys / set members.
        """
        return hash((self.name, self.value))

    def __str__(self) -> str:
        """
        Return a nicely formatted representation of the token.

        Returns
        -------
        str
            {name}: {value}
        """

        return f"{self.name}: {self.value}"

    def __repr__(self) -> str:
        """
        Returns the token as a stringified list version of name, value.

        Returns
        -------
        str
            [name, value]
        """

        return str([self.name, self.value])
|
||
|
||
def tokenise(source: str) -> list[Token]:
    """
    Transform a Vyxal program into a list of tokens.

    Parameters
    ----------
    source : str
        The Vyxal program to turn into tokens. This will have a utf-8
        encoding.

    Returns
    -------
    list[Token]
        Each token is represented as a Token object.
    """

    tokens: list[Token] = []
    # Treat the program as a queue: characters are dequeued from the
    # front until each branch's predicate is satisfied, so groupings
    # fall out naturally in left-to-right order.
    queue: collections.deque = collections.deque(source)

    contextual_token_value: str = ""

    while queue:
        head: str = queue.popleft()
        if head == "\\":  # Need to escape the next character
            if queue:
                # This has the consequence of making backslashes at the
                # end of a program not error.
                tokens.append(Token(TokenType.STRING, queue.popleft()))
        elif head in "`»«":  # String / compressed literals
            # Dequeue characters until the same delimiter character is
            # reached again.
            contextual_token_value = ""
            while queue and queue[0] != head:
                character: str = queue.popleft()
                if head == "`" and character == "\\":
                    # Handle the escape by just dequeueing the next
                    # character along with the backslash.
                    if queue:
                        contextual_token_value += "\\" + queue.popleft()
                else:
                    contextual_token_value += character
            if head == "`":
                token_type: str = TokenType.STRING
            elif head == "»":
                token_type = TokenType.COMPRESSED_NUMBER
            else:  # head == "«"
                token_type = TokenType.COMPRESSED_STRING
            tokens.append(Token(token_type, contextual_token_value))
            if queue:
                queue.popleft()  # Discard the closing delimiter
        elif head in string.digits + ".":  # Number literal
            contextual_token_value = head
            while queue and queue[0] in string.digits + ".":
                contextual_token_value += queue.popleft()
            tokens.append(Token(TokenType.NUMBER, contextual_token_value))
        elif head == "‛":  # Two-character string
            contextual_token_value = ""
            while queue and len(contextual_token_value) != 2:
                contextual_token_value += queue.popleft()
            tokens.append(Token(TokenType.STRING, contextual_token_value))
        elif head in "@→←°":  # Structures that are followed by a name
            tokens.append(Token(TokenType.GENERAL, head))
            contextual_token_value = ""
            while queue and queue[0] in string.ascii_letters + "_":
                contextual_token_value += queue.popleft()

            tokens.append(Token(TokenType.NAME, contextual_token_value))
        elif head in "k∆øÞ¨":  # Digraphs: consume one extra character
            if queue:
                tokens.append(
                    Token(TokenType.GENERAL, head + queue.popleft())
                )
            else:
                tokens.append(Token(TokenType.GENERAL, head))
        else:
            # Any other single character is a general element token.
            # Without this branch, ordinary characters such as "+" or
            # " " produced no token at all and were silently dropped
            # (test_one_plus_one in the test suite exercises this).
            tokens.append(Token(TokenType.GENERAL, head))
    return tokens