In [9]:
import re
class LexicalAnalyzer:
  """Regex-based lexical analyzer for a small C-like language.

  ``token_types`` maps a regular expression to the token category it
  produces. Patterns are tried in insertion order against the start of the
  remaining input and the first one that matches wins, so more specific
  patterns must be listed before more general ones.
  """

  token_types = {
    # Line comment; the body may be empty (a bare `//`).
    r'//[\s\S]*': 'COMMENT',
    # Reserved words. The \b anchors stop a keyword from being split off the
    # front of a longer identifier such as `integer` or `iffy`.
    r'\b(?:int|float|if|else|exit|while|read|write|return)\b': 'KEYWORD',
    r'[A-Za-z][\w]*': 'IDENTIFIER',
    r'\{|\}|\(|\)|;|,|\[|\]': 'SEPARATOR',  # { } ( ) [ ] ; ,
    # Two-character operators come first so `<=` / `>=` are not cut to `<` / `>`.
    r'==|!=|<=|>=|\|\||&&|<|>|!': 'LOGIC-OP',
    r'[+\-*/=]': 'ARITH-OP',  # + - * / =
    # Floats, integers and single- or double-quoted strings (2, 4.5, "cat").
    r'[-+]?\d*\.\d+|\d+|("[^"]*")|(\'[^\']*\')': 'CONSTANT',
  }

  # Matches one /* ... */ block comment, possibly spanning several lines.
  _BLOCK_COMMENT = re.compile(r'/\*[^*]*\*+(?:[^/*][^*]*\*+)*/')

  def __init__(self, path: str = ""):
    """Load the source text to tokenize from ``path``.

    If the file does not exist, a message is printed and ``code`` stays an
    empty string, so ``run`` returns no tokens instead of failing on a
    missing attribute.
    """
    self.code = ""
    try:
      with open(path) as file:
        self.code = file.read()
    except FileNotFoundError:
      print("File does not exist")

  @staticmethod
  def _as_line_comment(match):
    """Return the `//` single-line form of a matched block comment."""
    text = match.group(0).strip().replace('\n', ' ')
    text = text.replace('/*', '// ').replace('*/', '').strip()
    # A `//` comment runs to the end of the line, so code that followed the
    # block comment on the same line is pushed onto a line of its own.
    rest_of_line = match.string[match.end():].split('\n', 1)[0]
    return text + '\n' if rest_of_line.strip() else text

  def run(self):
    """Tokenize ``self.code``.

    Side effect: block comments in ``self.code`` are rewritten in place as
    single-line ``//`` comments before scanning.

    Returns:
      A list of ``(token_type, value)`` tuples in source order. For comments
      the value is the comment text without the leading ``//``. When no
      pattern matches, an error is printed and the rest of that line is
      skipped. If a pattern in ``token_types`` is not a valid regular
      expression, an error is printed and an empty list is returned.
    """
    self.code = self._BLOCK_COMMENT.sub(self._as_line_comment, self.code)

    # Compile once per run rather than once per token; compiling here (not at
    # class creation) keeps later edits to `token_types` effective.
    try:
      matchers = [(re.compile(pattern), token_type)
                  for pattern, token_type in LexicalAnalyzer.token_types.items()]
    except re.error as exc:
      print(f"Error: invalid token pattern: {exc}")
      return []

    tokens = []
    for line in self.code.split('\n'):
      line = line.strip()
      while line:
        for regex, token_type in matchers:
          match = regex.match(line)
          # Empty matches are ignored so they cannot stall the loop.
          if match and match.end() > 0:
            break
        else:
          print(f"Error: Unrecognized token at the beginning of line: {line}")
          break

        value = match.group(0)
        if token_type == 'COMMENT':
          tokens.append((token_type, value[2:].strip()))
        else:
          tokens.append((token_type, value))
        # Remove the matched token and any whitespace that follows it.
        line = line[len(value):].lstrip()

    return tokens

In [10]:
# Tokenize the sample program, then show the source (as rewritten by run())
# followed by the resulting token list.
obj = LexicalAnalyzer('/content/testcode.cminus')
result = obj.run()
print('For the following code -\n', obj.code, sep='\n')
print('\nThe Lexical Analyzation is as follows', result, sep='\n')

For the following code -

int main() {
int x, y = 2;
//  this is a multiline comment
float z_9 = 4.5
read(x); read(y);
while ((x!=0) || (y!=0)) {
write (x*y);
read (x); read (y);
write('string');
write("string");
}


exit;
}

The Lexical Analyzation is as follows
[('KEYWORD', 'int'), ('IDENTIFIER', 'main'), ('SEPARATOR', '('), ('SEPARATOR', ')'), ('SEPARATOR', '{'), ('KEYWORD', 'int'), ('IDENTIFIER', 'x'), ('SEPARATOR', ','), ('IDENTIFIER', 'y'), ('ARITH-OP', '='), ('CONSTANT', '2'), ('SEPARATOR', ';'), ('COMMENT', 'this is a multiline comment'), ('KEYWORD', 'float'), ('IDENTIFIER', 'z_9'), ('ARITH-OP', '='), ('CONSTANT', '4.5'), ('KEYWORD', 'read'), ('SEPARATOR', '('), ('IDENTIFIER', 'x'), ('SEPARATOR', ')'), ('SEPARATOR', ';'), ('KEYWORD', 'read'), ('SEPARATOR', '('), ('IDENTIFIER', 'y'), ('SEPARATOR', ')'), ('SEPARATOR', ';'), ('KEYWORD', 'while'), ('SEPARATOR', '('), ('SEPARATOR', '('), ('IDENTIFIER', 'x'), ('LOGIC-OP', '!='), ('CONSTANT', '0'), ('SEPARATOR', ')'), ('LOGIC-OP', '||