
Commit

Create the lexer (#202)
* Update contributing.md (#201)

* Create the lexer

- Add the TokenType class
- Add initial documentation

* Add tokenise function

- Add initial documentation and type hints

* Add token class

- Change return type of tokenise to list[Token]

* Basics of the Lexer

- Does strings (but not the two char ones)
- Does not do numbers
- But does do escapes

* Finished the lexer

* Made the digraphs do good

And made the `\` in the non standard string do good too

* added some extra type annotation stuff

* Because we don't do if name is main in files no more

* added test cases

* Update test_lexer.py
lyxal committed Jul 28, 2021
1 parent e46392a commit 700c523
Showing 4 changed files with 233 additions and 0 deletions.
Empty file added tests/__init__.py
Empty file.
54 changes: 54 additions & 0 deletions tests/test_lexer.py
@@ -0,0 +1,54 @@
import os
import sys

THIS_FOLDER = os.path.dirname(os.path.abspath(__file__)) + "/.."
sys.path.insert(1, THIS_FOLDER)

from vyxal.lexer import *


def token_equal(source: str, expected: list[Token]) -> bool:
    """
    Vectorises equality over the tokenised version of the program and
    the expected token list. This is needed because Token does not
    define __eq__, so comparing the two lists directly would compare
    object identities rather than token contents.

    Parameters
    ----------
    source : str
        The test program to tokenise
    expected : list[Token]
        The expected token list

    Returns
    -------
    bool
        True iff corresponding tokens in the tokenised source and the
        expected list have the same name and value
    """

    return all(
        map(
            lambda x: x[0].name == x[1].name and x[0].value == x[1].value,
            zip(tokenise(source), expected),
        )
    )


def test_single_token():
    assert token_equal("1", [Token(TokenType.NUMBER, "1")])


def test_one_plus_one():
    assert token_equal(
        "1 1+",
        [
            Token(TokenType.NUMBER, "1"),
            Token(TokenType.GENERAL, " "),
            Token(TokenType.NUMBER, "1"),
            Token(TokenType.GENERAL, "+"),
        ],
    )
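
Because Token does not implement __eq__, the helper above compares tokens field by field; note that zip also stops silently at the shorter of the two lists, so a missing trailing token would not be caught. Below is a minimal sketch of a stricter, length-checking variant plus one extra test, assuming the same imports as tests/test_lexer.py above (hypothetical, not part of this commit):

def tokens_match(source: str, expected: list[Token]) -> bool:
    # Tokenise once so the lengths can be compared as well.
    actual = tokenise(source)
    if len(actual) != len(expected):
        return False
    # Field-by-field comparison, mirroring token_equal above.
    return all(
        a.name == e.name and a.value == e.value
        for a, e in zip(actual, expected)
    )


def test_name_token():
    # "°" emits a GENERAL token and then collects letters into a NAME token.
    assert tokens_match(
        "°abc",
        [Token(TokenType.GENERAL, "°"), Token(TokenType.NAME, "abc")],
    )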
Empty file added vyxal/__init__.py
Empty file.
179 changes: 179 additions & 0 deletions vyxal/lexer.py
@@ -0,0 +1,179 @@
"""
File: lexer.py
Description: Before Vyxal programs can be grouped into appropriate
structures, they need to be turned into tokens representing the
different components of a program. For the full specification on token
types, go to documents/specs/Lexer.md
"""

import collections
import string


class TokenType:
    """
    A class providing a namespace for token type constants. Do not
    create any instances of this class.

    Attributes
    ----------
    LITERAL : str
        Used to denote that a token is a literal. In this case, this is
        defined as numbers and strings. Lists are NOT considered
        to be literal tokens.
    NAME : str
        Used to denote that a token is a name, meaning that it belongs
        to a structure such as a function definition/call or a variable
        get/set.
    GENERAL : str
        Used to denote that a token does not have a specific type. This
        kind of token can be anything - a digraph, a structure delimiter
        or just a simple element.
    """

    LITERAL: str = "literal"
    STRING: str = "string"
    NUMBER: str = "number"
    NAME: str = "name"
    GENERAL: str = "general"
    COMPRESSED_NUMBER: str = "compressed_number"
    COMPRESSED_STRING: str = "compressed_string"


class Token:
    """
    A class representing tokens of code

    Attributes
    ----------
    name : str
        The name of the token. Usually a TokenType literal
    value : str
        The value of the token

    Parameters
    ----------
    token_name : str
        The value to use as the name of the token
    token_value : str
        The value to use as the value of the token
    """

    def __init__(self, token_name: str, token_value: str):
        self.name: str = token_name
        self.value: str = token_value

    def __str__(self) -> str:
        """
        Return a nicely formatted representation of the token

        Returns
        -------
        str
            {name}: {value}
        """

        return f"{self.name}: {self.value}"

    def __repr__(self) -> str:
        """
        Returns the token as a stringified list version of name, value

        Returns
        -------
        str
            [name, value]
        """

        return str([self.name, self.value])


def tokenise(source: str) -> list[Token]:
    """
    Transform a Vyxal program into a list of tokens

    Parameters
    ----------
    source : str
        The Vyxal program to turn into tokens. This will have a utf-8
        encoding.

    Returns
    -------
    list[Token]
        Each token is represented as a Token object.
    """

    tokens: list[Token] = []
    source: collections.deque = collections.deque(source)

    contextual_token_value: str = ""

    while source:
        # By treating the program as a queue, we can dequeue elements
        # until a certain predicate is satisfied. In simple terms, this
        # means it's easier to group things based on order...you don't
        # have to worry about what you group first.

        head: str = source.popleft()
        if head == "\\":  # Need to escape the next character
            if source:
                # This has the consequence of making backslashes at the
                # end of a program not error.
                tokens.append(Token(TokenType.LITERAL, source.popleft()))

        elif head in "`»«":  # String
            # Dequeue characters until the same string character is
            # reached.
            contextual_token_value = ""
            while source and source[0] != head:
                character: str = source.popleft()
                if head == "`" and character == "\\":
                    # Handle the escape by just dequeueing the next
                    # character
                    if source:
                        contextual_token_value += "\\" + source.popleft()
                else:
                    contextual_token_value += character
            token_type: str = ""
            if head == "`":
                token_type = TokenType.STRING
            elif head == "»":
                token_type = TokenType.COMPRESSED_NUMBER
            elif head == "«":
                token_type = TokenType.COMPRESSED_STRING
            tokens.append(Token(token_type, contextual_token_value))
            if source:
                source.popleft()
        elif head in string.digits + ".":
            contextual_token_value = head
            while source and source[0] in string.digits + ".":
                contextual_token_value += source.popleft()
            tokens.append(Token(TokenType.NUMBER, contextual_token_value))
        elif head == "‛":
            contextual_token_value = ""
            while source and len(contextual_token_value) != 2:
                contextual_token_value += source.popleft()
            tokens.append(Token(TokenType.LITERAL, contextual_token_value))
        elif head in "@→←°":
            tokens.append(Token(TokenType.GENERAL, head))
            contextual_token_value = ""
            while source and source[0] in string.ascii_letters + "_":
                contextual_token_value += source.popleft()

            tokens.append(Token(TokenType.NAME, contextual_token_value))
        elif head in "k∆øÞ¨":
            if source:
                tokens.append(Token(TokenType.GENERAL, head + source.popleft()))
            else:
                tokens.append(Token(TokenType.GENERAL, head))
        else:
            # Any other character is a simple element on its own, as the
            # GENERAL token type and the tests expect.
            tokens.append(Token(TokenType.GENERAL, head))
    return tokens
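
To illustrate how the branches above combine, here is a minimal usage sketch (hypothetical, not part of this commit) that prints the token stream for a short program exercising the backtick string, number, digraph, and two-character literal rules; the expected output assumes the LITERAL constant is the string "literal" as defined above:

from vyxal.lexer import tokenise

for token in tokenise("`hello`42kA‛hi"):
    print(repr(token))

# Expected output, following the rules above:
# ['string', 'hello']
# ['number', '42']
# ['general', 'kA']
# ['literal', 'hi']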
