In [3]:
def is_letter(char):
    """Report whether ``char`` is alphabetic, as judged by ``str.isalpha``."""
    alphabetic = char.isalpha()
    return alphabetic

def is_digit(char):
    """Return True if ``char`` is a decimal digit usable in a base-10 number.

    ``str.isdecimal`` is used instead of ``str.isdigit`` because ``isdigit``
    also accepts characters such as superscript digits ('²'). Those would be
    lexed as part of a NUMBER token even though ``int()`` cannot parse them.

    Args:
        char: The string (normally a single character) to classify.

    Returns:
        bool: True when every character in ``char`` is a decimal digit and
        ``char`` is non-empty; False otherwise.
    """
    return char.isdecimal()

def is_whitespace(char):
    """Report whether ``char`` is whitespace, as judged by ``str.isspace``."""
    blank = char.isspace()
    return blank


In [4]:
# Lexical analyzer to tokenize the input string.

def lexer(input_string):
    """Split ``input_string`` into a list of ``(kind, text)`` token tuples.

    Token kinds produced:
        IDENTIFIER  - a letter followed by any run of letters or digits
        NUMBER      - a run of digits
        OPERATOR    - one of ``+ - * /``
        ASSIGNMENT  - ``=``
        PARENTHESIS - ``(`` or ``)``
        SEMICOLON   - ``;``

    Whitespace between tokens is skipped and produces no token. Character
    classes are decided by ``is_whitespace``, ``is_letter`` and ``is_digit``.

    Raises:
        ValueError: at the first character that cannot start any token.
    """
    # Every one-character token, keyed by the character itself.
    single_char_kinds = {
        '+': 'OPERATOR',
        '-': 'OPERATOR',
        '*': 'OPERATOR',
        '/': 'OPERATOR',
        '=': 'ASSIGNMENT',
        '(': 'PARENTHESIS',
        ')': 'PARENTHESIS',
        ';': 'SEMICOLON',
    }

    length = len(input_string)
    result = []
    pos = 0

    while pos < length:
        current = input_string[pos]

        if is_whitespace(current):
            # Separators carry no meaning; just step over them.
            pos += 1

        elif is_letter(current):
            # Scan to the end of the identifier, then slice it out in one go.
            start = pos
            pos += 1
            while pos < length and (
                is_letter(input_string[pos]) or is_digit(input_string[pos])
            ):
                pos += 1
            result.append(('IDENTIFIER', input_string[start:pos]))

        elif is_digit(current):
            # Same scan-then-slice approach for a run of digits.
            start = pos
            pos += 1
            while pos < length and is_digit(input_string[pos]):
                pos += 1
            result.append(('NUMBER', input_string[start:pos]))

        elif current in single_char_kinds:
            result.append((single_char_kinds[current], current))
            pos += 1

        else:
            raise ValueError(f"Unexpected character: {current}")

    return result

In [73]:
if __name__ == "__main__":
    # Demo: tokenize one sample statement and list its tokens, one per line.
    input_string = "x = 10 + 20 * (30 / y);"

    # Alternative sample input; uncomment to try it instead.
    #input_string = "value = (5 * 4) + 3 / (2 - 1);"

    print(f"Input String: {input_string}")
    tokens = lexer(input_string)

    print("Tokens:")
    for token in tokens:
        print(token)

Input String: x = 10 + 20 * (30 / y);
Tokens:
('IDENTIFIER', 'x')
('ASSIGNMENT', '=')
('NUMBER', '10')
('OPERATOR', '+')
('NUMBER', '20')
('OPERATOR', '*')
('PARENTHESIS', '(')
('NUMBER', '30')
('OPERATOR', '/')
('IDENTIFIER', 'y')
('PARENTHESIS', ')')
('SEMICOLON', ';')


In [62]:
def run_tests(tokenizer=None):
    """Run a fixed series of test cases against a lexer and print the results.

    For every case the input is printed, followed by ``PASS`` or ``FAIL``
    (with either the expected/actual token lists or the error message), and
    then a separator line, so each case is visually delimited whatever its
    outcome.

    Args:
        tokenizer: Callable taking the input string and returning a list of
            ``(kind, text)`` tuples. Defaults to the notebook's ``lexer``,
            looked up when this function is called.

    Returns:
        None. Results are reported on standard output only.
    """
    if tokenizer is None:
        tokenizer = lexer

    test_cases = {
        # Test Case 1: Simple assignment
        "x = 10;": [
            ('IDENTIFIER', 'x'),
            ('ASSIGNMENT', '='),
            ('NUMBER', '10'),
            ('SEMICOLON', ';')
        ],

        # Test Case 2: Complex arithmetic
        "x = 10 + 20 * (30 / y);": [
            ('IDENTIFIER', 'x'),
            ('ASSIGNMENT', '='),
            ('NUMBER', '10'),
            ('OPERATOR', '+'),
            ('NUMBER', '20'),
            ('OPERATOR', '*'),
            ('PARENTHESIS', '('),
            ('NUMBER', '30'),
            ('OPERATOR', '/'),
            ('IDENTIFIER', 'y'),
            ('PARENTHESIS', ')'),
            ('SEMICOLON', ';')
        ],

        # Test Case 3: Variables and operators
        "result = a - b;": [
            ('IDENTIFIER', 'result'),
            ('ASSIGNMENT', '='),
            ('IDENTIFIER', 'a'),
            ('OPERATOR', '-'),
            ('IDENTIFIER', 'b'),
            ('SEMICOLON', ';')
        ],

        # Test Case 4: Multiple statements
        "x1 = 42; y2 = x1 + 8 * z;": [
            ('IDENTIFIER', 'x1'),
            ('ASSIGNMENT', '='),
            ('NUMBER', '42'),
            ('SEMICOLON', ';'),
            ('IDENTIFIER', 'y2'),
            ('ASSIGNMENT', '='),
            ('IDENTIFIER', 'x1'),
            ('OPERATOR', '+'),
            ('NUMBER', '8'),
            ('OPERATOR', '*'),
            ('IDENTIFIER', 'z'),
            ('SEMICOLON', ';')
        ],

        # Test Case 5: Irregular runs of whitespace between tokens
        "value   = (5 +   3) * (  2   / 1);": [
            ('IDENTIFIER', 'value'),
            ('ASSIGNMENT', '='),
            ('PARENTHESIS', '('),
            ('NUMBER', '5'),
            ('OPERATOR', '+'),
            ('NUMBER', '3'),
            ('PARENTHESIS', ')'),
            ('OPERATOR', '*'),
            ('PARENTHESIS', '('),
            ('NUMBER', '2'),
            ('OPERATOR', '/'),
            ('NUMBER', '1'),
            ('PARENTHESIS', ')'),
            ('SEMICOLON', ';')
        ],

        # Failing test case: the expectation describes an 'UNKNOWN' token for
        # '@', but the lexer raises ValueError on unexpected characters, so
        # this case is reported as FAIL.
        "value = @123;": [
            ('IDENTIFIER', 'value'),
            ('ASSIGNMENT', '='),
            ('UNKNOWN', '@'),  # Expected behavior, unknown character
            ('NUMBER', '123'),
            ('SEMICOLON', ';')
        ]
    }

    separator = "-" * 40

    # Iterate through test cases
    for input_string, expected_tokens in test_cases.items():
        print(f"Testing: {input_string}")

        try:
            tokens = tokenizer(input_string)
        except ValueError as e:
            # The tokenizer rejected the input outright.
            print("FAIL")
            print(f"Error: {e}")  # Print the error message only
        else:
            if tokens == expected_tokens:
                print("PASS\n")
            else:
                print("FAIL")
                print(f"Expected: {expected_tokens}")
                print(f"Got: {tokens}")

        # Delimit every case, regardless of how it ended.
        print(separator)

In [63]:
if __name__ == "__main__":
    # Execute the lexer test cases; results are printed to standard output.
    run_tests()

Testing: x = 10;
PASS

Testing: x = 10 + 20 * (30 / y);
PASS

Testing: result = a - b;
PASS

Testing: x1 = 42; y2 = x1 + 8 * z;
PASS

Testing: value   = (5 +   3) * (  2   / 1);
PASS

Testing: value = @123;
FAIL
Error: Unexpected character: @
----------------------------------------
