In [None]:
# Install PLY (only needed in fresh Colab runtimes)
try:
    import ply.lex as lex
    import ply.yacc as yacc
    print('PLY already installed')
except Exception:
    import sys
    !{sys.executable} -m pip install ply --quiet
    import ply.lex as lex
    import ply.yacc as yacc
    print('Installed PLY')

In [None]:
# PLY Lexer for a subset of C
import re
import os
from collections import defaultdict
import ply.lex as lex

# Reserved keywords mapping to token names (the 32 keywords of C89).
# NOTE(review): the keyword rows that were missing and the named-token list
# below were reconstructed from the rules in this file -- confirm against the
# original notebook.
reserved = {
    'auto': 'AUTO', 'break': 'BREAK', 'case': 'CASE', 'char': 'CHAR', 'const': 'CONST',
    'continue': 'CONTINUE', 'default': 'DEFAULT', 'do': 'DO', 'double': 'DOUBLE', 'else': 'ELSE',
    'enum': 'ENUM', 'extern': 'EXTERN', 'float': 'FLOAT', 'for': 'FOR', 'goto': 'GOTO',
    'if': 'IF', 'int': 'INT', 'long': 'LONG', 'register': 'REGISTER', 'return': 'RETURN',
    'short': 'SHORT', 'signed': 'SIGNED', 'sizeof': 'SIZEOF', 'static': 'STATIC', 'struct': 'STRUCT',
    'switch': 'SWITCH', 'typedef': 'TYPEDEF', 'union': 'UNION', 'unsigned': 'UNSIGNED', 'void': 'VOID',
    'volatile': 'VOLATILE', 'while': 'WHILE',
}

# Token names required by PLY: every named (non-literal) token produced by
# the rules below, plus one token type per reserved keyword.
tokens = [
    'ID', 'NUMBER', 'STRING', 'PREPROCESSOR',
    'PLUSPLUS', 'MINUSMINUS', 'EQ', 'NE', 'LE', 'GE',
    'ANDAND', 'OROR', 'LSHIFT', 'RSHIFT',
] + list(set(reserved.values()))

# One-character operators and separators are declared through PLY's
# `literals`, so none of them needs a rule of its own.
literals = list('+-*/%=<>!&|^~,;(){}[]')

# Multi-character operator tokens, grouped in related pairs.
t_PLUSPLUS, t_MINUSMINUS = r'\+\+', r'--'
t_EQ, t_NE = r'==', r'!='
t_LE, t_GE = r'<=', r'>='
t_ANDAND, t_OROR = r'&&', r'\|\|'
t_LSHIFT, t_RSHIFT = r'<<', r'>>'

# Preprocessor directive (e.g., #include <...>): a '#' and the rest of the line.
# The docstring is the token regex. The '#' must be escaped because PLY
# compiles rules in verbose mode, where a bare '#' starts a regex comment.
# Anchoring on '#' also keeps this rule from matching the empty string or
# swallowing ordinary source lines.
def t_PREPROCESSOR(t):
    r'\#.*'
    # The whole directive is kept as the token value so that callers can see
    # special characters such as < and > inside it.
    return t

# String literal (handles simple escaped chars).
# The docstring is the token regex: an opening quote, then any mix of
# characters other than backslash/newline or backslash-escaped characters,
# then the closing quote. It must stay on one line as a raw string; a real
# line break inside it is a syntax error.
def t_STRING(t):
    r'"([^\\\n]|(\\.))*?"'
    return t

# Integer constant. The docstring is the token regex; the token value is
# converted from the matched text to a Python int.
def t_NUMBER(t):
    r'\b\d+\b'
    lexeme = t.value
    t.value = int(lexeme)
    return t

# Identifier. The docstring is the token regex; a lexeme found in `reserved`
# is retagged with its keyword token type, anything else keeps its type.
def t_ID(t):
    r'\b[_a-zA-Z][_a-zA-Z0-9]*\b'
    keyword_type = reserved.get(t.value)
    if keyword_type is not None:
        t.type = keyword_type
    return t

# Ignore spaces and tabs. The tab is written as an escape so it stays visible
# and cannot be silently turned into spaces by an editor or formatter.
t_ignore = ' \t'

# Track line numbers. The docstring is the token regex (one or more
# newlines); nothing is returned, so no token is produced for the match.
def t_newline(t):
    r'\n+'
    # Advance the lexer's line counter by the number of newlines consumed.
    t.lexer.lineno += t.value.count('\n')

# Illegal characters are collected instead of raised: each entry is a
# (character, line number) pair.
lexical_errors = []
def t_error(t):
    offending_char = t.value[0]
    lexical_errors.append((offending_char, t.lineno))
    # Step over the offending character so scanning can continue.
    t.lexer.skip(1)

# Build the lexer. The call takes no arguments, so PLY presumably picks up
# the token definitions (`tokens`, `literals`, the `t_*` rules) from this
# module's namespace -- see the PLY documentation for lex().
lexer = lex.lex()

# Character classes used to bucket one-character tokens into
# separators, operators and special characters.
SEPARATORS = set(',;(){}[]')
OPERATOR_CHARS = set('+-*/%=<>!&|^~')
SPECIAL_CHARS = set('#<>')

def analyze_code_with_lexer(code):
    """Tokenize *code* and bucket its tokens into a simple symbol table.

    Args:
        code: Source text to scan with the module-level ``lexer``.

    Returns:
        A 4-tuple ``(token_list, token_count, symbol_table, lexical_errors)``:
        the tokens in source order, how many there are, a ``defaultdict(set)``
        keyed by category ('Keyword', 'Identifier', 'Constant',
        'Special Character', 'Operator', 'Separator'), and the list of
        ``(character, line)`` pairs recorded by ``t_error`` during this run.
    """
    global lexical_errors
    # Rebind rather than clear, so lists returned by earlier calls stay intact.
    lexical_errors = []
    # The module-level lexer is reused across calls; without this reset the
    # line numbers would continue from where the previous input ended.
    lexer.lineno = 1
    lexer.input(code)

    keyword_types = set(reserved.values())  # built once: O(1) lookup per token
    multi_char_operators = {
        'PLUSPLUS', 'MINUSMINUS', 'EQ', 'NE', 'LE', 'GE',
        'ANDAND', 'OROR', 'LSHIFT', 'RSHIFT',
    }
    symbol_table = defaultdict(set)
    token_list = []

    # lexer.token() returns None once the input is exhausted.
    for tok in iter(lexer.token, None):
        token_list.append(tok)
        kind = tok.type
        value = tok.value
        if kind in keyword_types:
            symbol_table['Keyword'].add(value)
        elif kind == 'ID':
            symbol_table['Identifier'].add(value)
        elif kind == 'NUMBER':
            symbol_table['Constant'].add(str(value))
        elif kind == 'STRING':
            symbol_table['Constant'].add(value)
        elif kind == 'PREPROCESSOR':
            specials = symbol_table['Special Character']
            specials.add('#')
            # Also capture < and > when present (e.g. in #include <...>).
            specials.update(ch for ch in value if ch in SPECIAL_CHARS)
        elif kind in multi_char_operators:
            symbol_table['Operator'].add(value)
        else:
            # Single-character literal tokens: classify by the character.
            # Both sets hold only one-character strings, so membership alone
            # implies the length check. Anything else is left unclassified.
            text = str(value)
            if text in SEPARATORS:
                symbol_table['Separator'].add(text)
            elif text in OPERATOR_CHARS:
                symbol_table['Operator'].add(text)

    return token_list, len(token_list), symbol_table, lexical_errors

In [None]:
# Simple PLY Yacc grammar for a small subset of C (function definitions, declarations, basic statements)
import ply.yacc as yacc

# precedence (a small subset)
precedence = (
,
,

def p_program(p):
    '''program : external_declaration_list'''
    # Root node: tag the list of top-level declarations.
    declarations = p[1]
    p[0] = ('program', declarations)

def p_external_declaration_list(p):
    '''external_declaration_list : external_declaration_list external_declaration
                               | external_declaration'''
    # Both alternatives produce a list, so the recursive case can extend it
    # directly (same shape as p_parameter_list and p_init_declarator_list).
    if len(p) == 3:
        p[0] = p[1] + [p[2]]
    else:
        p[0] = [p[1]]

def p_external_declaration(p):
    '''external_declaration : function_definition
                          | declaration'''
    # Pass the child node through unchanged, whichever alternative matched.
    node = p[1]
    p[0] = node

def p_function_definition(p):
    '''function_definition : type_specifier ID '(' parameter_list_opt ')' compound_statement'''
    # Keep the semantic parts; the parentheses (p[3], p[5]) are dropped.
    return_type, name, params, body = p[1], p[2], p[4], p[6]
    p[0] = ('func', return_type, name, params, body)

def p_parameter_list_opt(p):
    '''parameter_list_opt : parameter_list
                         | empty'''
    # Forward whatever the matched alternative produced.
    params = p[1]
    p[0] = params

def p_parameter_list(p):
    '''parameter_list : parameter_list ',' parameter
                     | parameter'''
    # Four symbols means the recursive alternative: extend the list so far.
    # Otherwise start a new one-element list.
    p[0] = p[1] + [p[3]] if len(p) == 4 else [p[1]]

def p_parameter(p):
    'parameter : type_specifier ID'
    # A parameter is represented as a (type, name) pair.
    param_type, param_name = p[1], p[2]
    p[0] = (param_type, param_name)

def p_declaration(p):
    '''declaration : type_specifier init_declarator_list ';'
                   | type_specifier ';' '''
    # Four symbols means declarators are present; a bare `type ;` gets an
    # empty declarator list.
    declarators = p[2] if len(p) == 4 else []
    p[0] = ('decl', p[1], declarators)

def p_init_declarator_list(p):
    '''init_declarator_list : init_declarator_list ',' init_declarator
                            | init_declarator'''
    # Recursive alternative (four symbols): extend the list built so far.
    # Base alternative: start a new one-element list.
    p[0] = p[1] + [p[3]] if len(p) == 4 else [p[1]]

def p_init_declarator(p):
    '''init_declarator : ID
                    | ID '=' expression'''
    # Result is (name, initializer); the initializer is None when the
    # declarator is a bare identifier.
    initializer = None if len(p) == 2 else p[3]
    p[0] = (p[1], initializer)

def p_type_specifier(p):
    '''type_specifier : INT
                      | CHAR
                      | VOID
                      | FLOAT
                      | DOUBLE
                      | UNSIGNED
                      | LONG
                      | SHORT'''
    # The type is represented by the matched keyword token's value.
    type_name = p[1]
    p[0] = type_name

def p_compound_statement(p):
    'compound_statement : '{' statement_list_opt '}' '
,

2
,
,