<a href="https://colab.research.google.com/github/ShashankShorya0211/MIMDPU/blob/main/260824.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import re
from collections import namedtuple

class BasicCompiler:
    def __init__(self):
        self.variables = {}
        self.registers = ['eax', 'ebx', 'ecx', 'edx']
        self.register_map = {}
        self.free_registers = self.registers[:]
        self.temp_var_count = 0

    def lexical_analysis(self, source_code):
        tokens = []
        token_specification = [
            ('NUMBER',     r'\d+(\.\d*)?'),           # Integer or decimal number
            ('KEYWORD',    r'\b(if|else|while|return|int|float|char|void)\b'),  # Keywords
            ('IDENT',      r'[A-Za-z_]\w*'),          # Identifiers
            ('OP',         r'[+\-*/%]'),              # Arithmetic operators
            ('ASSIGN',     r'='),                     # Assignment operator
            ('EQ',         r'=='),                    # Equal operator
            ('NEQ',        r'!='),                    # Not equal operator
            ('LT',         r'<'),                     # Less than operator
            ('GT',         r'>'),                     # Greater than operator
            ('LTE',        r'<='),                    # Less than or equal to
            ('GTE',        r'>='),                    # Greater than or equal to
            ('AND',        r'&&'),                    # Logical AND
            ('OR',         r'\|\|'),                  # Logical OR
            ('NOT',        r'!'),                     # Logical NOT
            ('SEMI',       r';'),                     # Statement terminator
            ('LPAREN',     r'\('),                    # Left parenthesis
            ('RPAREN',     r'\)'),                    # Right parenthesis
            ('LBRACE',     r'\{'),                    # Left brace
            ('RBRACE',     r'\}'),                    # Right brace
            ('COMMA',      r','),                     # Comma
            ('WHITESPACE', r'[ \t]+'),                # Skip over spaces and tabs
            ('NEWLINE',    r'\n'),                    # Line endings
            ('MISMATCH',   r'.'),                     # Any other character
        ]
        tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
        for mo in re.finditer(tok_regex, source_code):
            kind = mo.lastgroup
            value = mo.group()
            if kind == 'NUMBER':
                value = float(value) if '.' in value else int(value)
            elif kind == 'WHITESPACE' or kind == 'NEWLINE':
                continue
            elif kind == 'MISMATCH':
                raise RuntimeError(f'{value!r} unexpected')
            tokens.append((kind, value))
        return tokens

    def syntax_analysis(self, tokens):
        ASTNode = namedtuple('ASTNode', ['type', 'value', 'children'])

        def parse_expression(tokens):
            if tokens[0][0] == 'NUMBER':
                return ASTNode(type='Literal', value=tokens[0][1], children=[]), tokens[1:]
            elif tokens[0][0] == 'IDENT':
                return ASTNode(type='Variable', value=tokens[0][1], children=[]), tokens[1:]
            elif tokens[0][0] == 'LPAREN':
                expr, tokens = parse_expression(tokens[1:])
                if tokens[0][0] != 'RPAREN':
                    raise SyntaxError("Expected ')'")
                return expr, tokens[1:]
            raise SyntaxError("Invalid expression")

        def parse_comparison(tokens):
            left, tokens = parse_expression(tokens)
            if tokens[0][0] in ['EQ', 'NEQ', 'LT', 'GT', 'LTE', 'GTE']:   #'==', '!=', '<', '>', '<=', '>='
                op = tokens[0][0]
                right, tokens = parse_expression(tokens[1:])
                return ASTNode(type='Comparison', value=op, children=[left, right]), tokens
            return left, tokens

        def parse_statement(tokens):
            if tokens[0][0] == 'KEYWORD' and tokens[0][1] in ['int', 'float', 'char', 'void']:
              var_type = tokens[0][1]
              tokens = tokens[1:]
              if tokens[1][0] == 'IDENT':
                  raise SyntaxError(f"Expected identifier after {var_type}")
              var_name = tokens[0][1]
              tokens = tokens[1:]
              if tokens[0][0] =='ASSIGN':
                  op = tokens[0][1]
                  right, tokens = parse_expression(tokens[1:])
                  if tokens and tokens[0][0] == 'OP':
                      op2 = tokens[0][1]
                      right2, tokens = parse_expression(tokens[1:])
                      right = ASTNode(type='BinaryOp', value=op2, children=[right, right2])
                  if tokens[0][0] != 'SEMI':
                      raise SyntaxError("Expected ';'")
                  return ASTNode(type='Declaration', value=(var_type, var_name), children=[right]), tokens[1:]
              elif tokens[0][0] == 'SEMI':
                  return ASTNode(type='Declaration', value=(var_type, var_name), children=[]), tokens[1:]
            elif tokens[0][0] == 'IDENT' and tokens[1][0] == 'ASSIGN':
                left = ASTNode(type='Variable', value=tokens[0][1], children=[])
                op = tokens[1][1]
                right, tokens = parse_expression(tokens[2:])
                if tokens and tokens[0][0] == 'OP':
                    op2 = tokens[0][1]
                    right2, tokens = parse_expression(tokens[1:])
                    right = ASTNode(type='BinaryOp', value=op2, children=[right, right2])
                if tokens[0][0] != 'SEMI':
                    raise SyntaxError("Expected ';'")
                return ASTNode(type='Assignment', value=op, children=[left, right]), tokens[1:]
            elif tokens[0][0] == 'KEYWORD' and tokens[0][1] == 'if':
                tokens = tokens[1:]
                if tokens[0][0] != 'LPAREN':
                    raise SyntaxError("Expected '(' after 'if'")
                Condition, tokens = parse_comparison(tokens[1:])
                if tokens[0][0] != 'RPAREN':
                    raise SyntaxError("Expected ')' after condition")
                tokens = tokens[1:]
                if tokens[0][0] != 'LBRACE':
                    raise SyntaxError("Expected '{' after condition")
                body = []
                tokens = tokens[1:]
                while tokens[0][0] != 'RBRACE':
                    smt, tokens = parse_statement(tokens)
                    body.append(smt)
                return ASTNode(type='IfStatement', value=None, children=[Condition, ASTNode(type='Block', value=None, children=body)]), tokens[1:]
            elif tokens[0][0] == 'KEYWORD' and tokens[0][1] == 'return':
                expr, tokens = parse_expression(tokens[1:])
                if tokens[0][0] != 'SEMI':
                    raise SyntaxError("Expected ';' after return statement")
                return ASTNode(type='ReturnStatement', value=None, children=[expr]), tokens[1:]
            raise SyntaxError(f"Unknown statement starting with {tokens[0]}")



        def parse_program(tokens):
            statements = []
            while tokens:
                stmt, tokens = parse_statement(tokens)
                statements.append(stmt)
            return ASTNode(type='Program', value=None, children=statements)

        ast = parse_program(tokens)
        return ast

    def semantic_analysis(self, ast):
        def check_node(node):
            if node.type == 'BinaryOp':
                left_type = check_node(node.children[0])
                right_type = check_node(node.children[1])
                if left_type != right_type:
                    raise TypeError("Type mismatch in binary operation")
                return left_type
            elif node.type == 'Literal':
                return 'int'
            elif node.type == 'Variable':
                if node.value not in self.variables:
                    raise NameError(f"Variable {node.value} not declared")
                return self.variables[node.value]
            elif node.type == 'Declaration':
                var_type, var_name = node.value
                self.variables[var_name] = var_type
                if node.children:
                    init_type = check_node(node.children[0])
                    if init_type != var_type:
                        raise TypeError(f"Type mismatch in initialization of variable {var_name}")
                return var_type
            elif node.type == 'Assignment':
                var_name = node.children[0].value
                if var_name not in self.variables:
                    raise NameError(f"Variable {var_name} not declared")
                var_type = self.variables[var_name]
                expr_type = check_node(node.children[1])
                if var_type != expr_type:
                    raise TypeError(f"Type mismatch in assignment to variable {var_name}")
                return var_type
            elif node.type == 'Comparison':
                left_type = check_node(node.children[0])
                right_type = check_node(node.children[1])
                if left_type != right_type:
                    raise TypeError("Type mismatch in comparison")
                return 'bool'
            elif node.type == 'IfStatement':
                condition_type = check_node(node.children[0])
                if condition_type != 'bool':
                    raise TypeError("Condition in if statement must be of type bool")
                check_node(node.children[1])
            elif node.type == 'Block':
                for child in node.children:
                    check_node(child)
            elif node.type == 'ReturnStatement':
                return check_node(node.children[0])
            elif node.type == 'Program':
                for child in node.children:
                    check_node(child)

        check_node(ast)
        return ast

    def generate_ir(self, ast):
        ir = []

        def generate_node_ir(node):
            def get_temp_var():
                nonlocal self
                temp_var = f"t{self.temp_var_count}"
                self.temp_var_count += 1
                return temp_var

            if node.type == 'Program':
                for child in node.children:
                    generate_node_ir(child)
            elif node.type == 'Declaration':
                var_type, var_name = node.value
                if node.children:
                    value_ir = generate_node_ir(node.children[0])
                    ir.append(('DECLARE', var_type, var_name))
                    ir.append(('STORE', var_name, value_ir))
                else:
                    ir.append(('DECLARE', var_type, var_name))
            elif node.type == 'Assignment':
                var_name = node.children[0].value
                value_ir = generate_node_ir(node.children[1])
                ir.append(('STORE', var_name, value_ir))
            elif node.type == 'BinaryOp':
                left_ir = generate_node_ir(node.children[0])
                right_ir = generate_node_ir(node.children[1])
                result_reg = get_temp_var()
                ir.append(('BIN_OP', node.value, left_ir, right_ir, result_reg))
                return result_reg
            elif node.type == 'Literal':
                temp_var = get_temp_var()
                ir.append(('LOAD_CONST', node.value, temp_var))
                return temp_var
            elif node.type == 'Variable':
                return node.value

        generate_node_ir(ast)
        return ir

    def optimize_ir(self, ir):
        # Placeholder for optimization
        return ir

    def code_generation(self, optimized_ir):
        assembly_code = []
        live_variables = set()
        variable_to_register = {}
        register_to_variable = {reg: None for reg in self.registers}

        def allocate_register(var):
            if var in variable_to_register:
                return variable_to_register[var]
            for reg in self.registers:
                if register_to_variable[reg] is None:
                    variable_to_register[var] = reg
                    register_to_variable[reg] = var
                    return reg
            spill_reg = min(register_to_variable, key=lambda r: list(live_variables).index(register_to_variable[r]) if register_to_variable[r] in live_variables else float('inf'))
            spill_var = register_to_variable[spill_reg]
            if spill_var is not None:
                assembly_code.append(f"MOV [{spill_var}], {spill_reg}")
                live_variables.remove(spill_var)
                del variable_to_register[spill_var]
            variable_to_register[var] = spill_reg
            register_to_variable[spill_reg] = var
            return spill_reg

        def get_operand(op):
            if isinstance(op, str) and op.startswith('t'):
                return allocate_register(op)
            elif op in self.variables:
                return allocate_register(op)
            else:
                return op

        for instruction in optimized_ir:
            if instruction[0] == 'LOAD_CONST':
                dest = instruction[2]
                value = instruction[1]
                reg = allocate_register(dest)
                assembly_code.append(f"MOV {reg}, {value}")
                live_variables.add(dest)

            elif instruction[0] == 'STORE':
                src = instruction[2]
                dest = instruction[1]
                src_reg = get_operand(src)
                if dest in variable_to_register:
                    dest_reg = variable_to_register[dest]
                    if src_reg != dest_reg:
                        assembly_code.append(f"MOV {dest_reg}, {src_reg}")
                else:
                    assembly_code.append(f"MOV [{dest}], {src_reg}")
                live_variables.add(dest)

            elif instruction[0] == 'BIN_OP':
                op = instruction[1]
                left = instruction[2]
                right = instruction[3]
                result = instruction[4]

                left_reg = get_operand(left)
                right_reg = get_operand(right)
                result_reg = allocate_register(result)

                if op == '+':
                    if result_reg != left_reg:
                        assembly_code.append(f"MOV {result_reg}, {left_reg}")
                    assembly_code.append(f"ADD {result_reg}, {right_reg}")
                elif op == '-':
                    if result_reg != left_reg:
                        assembly_code.append(f"MOV {result_reg}, {left_reg}")
                    assembly_code.append(f"SUB {result_reg}, {right_reg}")
                elif op == '*':
                    if result_reg != left_reg:
                        assembly_code.append(f"MOV {result_reg}, {left_reg}")
                    assembly_code.append(f"IMUL {result_reg}, {right_reg}")
                elif op == '/':
                    assembly_code.append(f"MOV eax, {left_reg}")
                    assembly_code.append("CWD")
                    assembly_code.append(f"IDIV {right_reg}")
                    if result_reg != 'eax':
                        assembly_code.append(f"MOV {result_reg}, eax")

                live_variables.add(result)
                live_variables.discard(left)
                live_variables.discard(right)

        for var in live_variables:
            if var in variable_to_register:
                reg = variable_to_register[var]
                assembly_code.append(f"MOV [{var}], {reg}")

        return '\n'.join(assembly_code)

    def assemble(self, assembly_code):
        machine_code = []
        instructions = assembly_code.split('\n')
        for instruction in instructions:
            parts = instruction.split()
            if not parts:
                continue
            if parts[0] == 'MOV':
                if len(parts) >= 3:
                    machine_code.append(f"0001 {parts[1]}, {parts[2]}")
            elif parts[0] == 'ADD':
                if len(parts) >= 3:
                    machine_code.append(f"0010 {parts[1]}, {parts[2]}")
            elif parts[0] == 'SUB':
                if len(parts) >= 3:
                    machine_code.append(f"0011 {parts[1]}, {parts[2]}")
            elif parts[0] == 'IMUL':
                if len(parts) >= 3:
                    machine_code.append(f"0100 {parts[1]}, {parts[2]}")
            elif parts[0] == 'IDIV':
                if len(parts) >= 2:
                    machine_code.append(f"0101 {parts[1]}")
            elif parts[0] == 'CWD':
                machine_code.append("0110")
        return '\n'.join(machine_code)

    def compile(self, source_code):
        tokens = self.lexical_analysis(source_code)
        print("Tokens:", tokens)
        ast = self.syntax_analysis(tokens)
        print("AST:", ast)
        semantically_correct_ast = self.semantic_analysis(ast)
        ir = self.generate_ir(semantically_correct_ast)
        print("IR:", ir)
        optimized_ir = self.optimize_ir(ir)
        assembly_code = self.code_generation(optimized_ir)
        print("Assembly Code:\n", assembly_code)
        machine_code = self.assemble(assembly_code)
        return machine_code


# Demo run: push a small multi-line C-style program through every stage.
# The text starts with a blank line and ends with two newlines.
source_code = (
    "\n"
    "int a = 10;\n"
    "int b = 20;\n"
    "int result;\n"
    "\n"
    "if (a < b) {\n"
    "    result = a + b;\n"
    "}\n"
    "\n"
    "return result;\n"
    "\n"
)
compiler = BasicCompiler()
machine_code = compiler.compile(source_code)
print("Generated Machine Code:\n", machine_code)


Tokens: [('KEYWORD', 'int'), ('IDENT', 'a'), ('ASSIGN', '='), ('NUMBER', 10), ('SEMI', ';'), ('KEYWORD', 'int'), ('IDENT', 'b'), ('ASSIGN', '='), ('NUMBER', 20), ('SEMI', ';'), ('KEYWORD', 'int'), ('IDENT', 'result'), ('SEMI', ';'), ('KEYWORD', 'if'), ('LPAREN', '('), ('IDENT', 'a'), ('LT', '<'), ('IDENT', 'b'), ('RPAREN', ')'), ('LBRACE', '{'), ('IDENT', 'result'), ('ASSIGN', '='), ('IDENT', 'a'), ('OP', '+'), ('IDENT', 'b'), ('SEMI', ';'), ('RBRACE', '}'), ('KEYWORD', 'return'), ('IDENT', 'result'), ('SEMI', ';')]
AST: ASTNode(type='Program', value=None, children=[ASTNode(type='Declaration', value=('int', 'a'), children=[ASTNode(type='Literal', value=10, children=[])]), ASTNode(type='Declaration', value=('int', 'b'), children=[ASTNode(type='Literal', value=20, children=[])]), ASTNode(type='Declaration', value=('int', 'result'), children=[]), ASTNode(type='IfStatement', value=None, children=[ASTNode(type='Comparison', value='LT', children=[ASTNode(type='Variable', value='a', children=