In [1]:
import re

def _find_closing_delimiter(text: str, start: int, opener: str, closer: str) -> int:
    """Return the index of the `closer` balancing an `opener` located just before `start`.

    Scanning begins at `start` with a nesting depth of 1, so nested
    opener/closer pairs inside the span are skipped over.

    Raises:
        ValueError: if the end of `text` is reached before the depth drops to 0.
    """
    depth = 1
    for index in range(start, len(text)):
        char = text[index]
        if char == opener:
            depth += 1
        elif char == closer:
            depth -= 1
            if depth == 0:
                return index
    raise ValueError(
        "unbalanced {!r}/{!r} starting at offset {}".format(opener, closer, start)
    )


def extract_if_block(source: str) -> tuple:
    """Split C-like `source` around its first `if (...) { ... }` statement.

    Args:
        source: the text to scan.

    Returns:
        A 4-tuple `(code_before_block, condition, content, all_code_after_block)`:
        the text before the `if`, the text between its parentheses, the text
        between its braces, and everything after the closing brace. Each piece
        is stripped and has the leading whitespace of every line removed.
        When `source` contains no `if (`, the tuple is
        `(source, None, None, None)` with `source` returned untouched.

    Raises:
        ValueError: if the condition's parentheses or the block's braces are
            unbalanced, or if no `{` follows the condition.

    Note:
        Delimiters are counted character by character, so parentheses or
        braces inside string literals or comments are counted as well.
    """
    # With MULTILINE, `^\s+` matches at every line start; because `\s` also
    # matches newlines, blank lines are removed together with the indentation.
    indent_regex = re.compile(r'^\s+', flags=re.MULTILINE)

    # Anchor on a whole-word `if` followed by `(` so identifiers that merely
    # contain "if" (e.g. `diff`) are not mistaken for the keyword, and so the
    # "before" slice and the condition are located by the same match.
    header = re.search(r'\bif\s*\(', source)
    if header is None:
        return source, None, None, None

    code_before_block = indent_regex.sub('', source[:header.start()].strip())

    # Balance parentheses so conditions such as `f(a) > b` are captured whole.
    condition_start = header.end()
    condition_end = _find_closing_delimiter(source, condition_start, '(', ')')
    condition = source[condition_start:condition_end].strip()

    # str.index raises ValueError when no `{` follows the condition.
    content_start = source.index('{', condition_end) + 1
    content_end = _find_closing_delimiter(source, content_start, '{', '}')

    content = indent_regex.sub('', source[content_start:content_end].strip())
    all_code_after_block = indent_regex.sub('', source[content_end + 1:].strip())
    return code_before_block, condition, content, all_code_after_block

In [2]:
# Read the sample C file and show the four pieces extract_if_block splits it into.
with open('source.c', 'r') as f:
    source = f.read()

code_before_block, condition, content, all_code_after_block = extract_if_block(source)

# Each call prints the banner and the piece on separate lines.
print("================================ before: ", code_before_block, sep="\n")
print("================================ condition: ", condition, sep="\n")
print("================================ content: ", content, sep="\n")
print("================================ after: ", all_code_after_block, sep="\n")

a = 10;
a > b
a = a + 1; 
b = 12; 
if (a > b) {
a = a + 14; 
}
c = 11;
c = 123;


In [3]:
class IfNode:
    """A node of the tree built from nested `if` blocks.

    Attributes:
        data: text carried by the node (a condition or a run of plain code).
        children: list of child nodes, stored as passed in.
        condition: flag passed by the creator; defaults to False.
        variables: dict that starts out empty for every node.
    """

    def __init__(self, data, children, condition=False):
        self.data, self.children = data, children
        self.condition = condition
        # A fresh dict per instance, so nodes never share this mapping.
        self.variables = {}

In [4]:
def build_tree_helper(parent: "IfNode", content: str) -> "IfNode":
    """Parse `content` and append the resulting nodes to `parent.children`.

    The first `if` block found by `extract_if_block` becomes a condition node
    whose children are built recursively from the block body. Code before the
    block becomes a plain node, and code after the block is parsed recursively
    as further children of `parent`.

    Args:
        parent: node that receives the new children.
        content: source text to parse; empty or None means nothing to add.

    Returns:
        `parent`, in every case.
    """
    if not content:
        # Nothing to parse; still hand the node back so callers always get one.
        return parent

    # `block_body` is kept distinct from the `content` parameter to avoid shadowing.
    code_before_block, condition, block_body, all_code_after_block = extract_if_block(content)

    if code_before_block:
        parent.children.append(IfNode(code_before_block, []))

    # `condition` is None only when no `if` was found. An empty string still
    # marks a real block, whose body and trailing code must not be dropped.
    if condition is not None:
        if_node = IfNode(condition, [], True)
        parent.children.append(if_node)
        build_tree_helper(if_node, block_body)
        build_tree_helper(parent, all_code_after_block)
    return parent

def build_tree(content: str) -> IfNode:
    """Build the if-tree for `content` under a new node whose data is 'Root'.

    Args:
        content: source text to parse.

    Returns:
        The root node. It is returned even when `content` is empty, in which
        case it has no children.
    """
    root = IfNode('Root', [])
    # The helper fills `root` in place; return `root` itself so the result
    # never depends on what the helper returns.
    build_tree_helper(root, content)
    return root

In [5]:
def print_tree(node: "IfNode", indent=-1, childNum=None) -> None:
    """Print `node` and its descendants, one line of code per output line.

    Condition nodes print as `if (<data>) {` with a closing `}` after their
    children; other nodes print each line of their data. Every line except the
    closing brace is followed by `->` and the list of child indices leading
    from the root to that node.

    Args:
        node: the node to print.
        indent: nesting depth; each level adds two spaces. The default of -1
            gives the root an empty prefix and puts its children at depth 0.
        childNum: index path of `node`; defaults to an empty list.
    """
    # None sentinel instead of a mutable `[]` default shared across calls.
    if childNum is None:
        childNum = []

    # A negative repeat count yields '', so the root prints unindented.
    prefix = '  ' * indent

    if node.condition:
        print(prefix + 'if (' + node.data + ') {', '->', childNum)
    else:
        for line in node.data.splitlines():
            print(prefix + line, '->', childNum)

    for idx, child in enumerate(node.children):
        # Build a new list per child so sibling paths stay independent.
        print_tree(child, indent + 1, childNum + [idx])

    if node.condition:
        print(prefix + '}')

In [6]:
# Re-read the sample C file so this cell does not rely on `source` from an earlier cell.
with open('source.c', 'r') as f:
    source = f.read()
# Parse the file into a tree of if-blocks, then print it with each node's index path.
tree = build_tree(source)
print_tree(tree)

Root -> []
a = 10; -> [0]
if (a > b) { -> [1]
  a = a + 1;  -> [1, 0]
  b = 12; -> [1, 0]
  if (a > b) { -> [1, 1]
    a = a + 14; -> [1, 1, 0]
  }
  c = 11; -> [1, 2]
}
c = 123; -> [2]


In [7]:
import re

def extract_variables(code: str) -> set:
    """Return the set of identifier-like names in `code` that are not C keywords.

    A name is a run of word characters that begins with an ASCII letter, so
    names starting with an underscore or a digit are not picked up. Words in
    the keyword set below (types, qualifiers, control flow, `true`, `false`,
    `NULL`, ...) are excluded from the result.
    """
    c_keywords = {
        'int', 'float', 'double', 'char', 'long', 'short', 'unsigned', 'signed',
        'void', 'struct', 'union', 'enum', 'typedef', 'const', 'volatile', 'auto',
        'register', 'static', 'extern', 'inline', 'restrict', 'bool', 'complex',
        'imaginary', 'break', 'case', 'continue', 'default', 'do', 'else', 'for',
        'goto', 'if', 'return', 'sizeof', 'switch', 'while', 'alignas', 'alignof',
        'atomic', 'noreturn', 'static_assert', 'thread_local', 'true', 'false',
        'NULL',
    }
    candidates = re.findall(r'\b[a-zA-Z]\w*\b', code)
    # The set comprehension filters out keywords and removes duplicates in one pass.
    return {name for name in candidates if name not in c_keywords}

In [8]:
# Re-read and re-parse the sample file so this cell runs without relying on earlier cells' `tree`.
with open('source.c', 'r') as f:
    source = f.read()
tree = build_tree(source)
# Take the first child of the root's second child and list the names used in its code;
# as the cell's last expression, the resulting set is the cell's displayed value.
extract_variables(tree.children[1].children[0].data)

{'a', 'b'}