Notebook created by Rosa Filgueira - r.filgueira@epcc.ed.ac.uk


# Parsing Java Code

This section demonstrates how to extract key elements from Java code using the **javalang** library:

* Installation:
Install the javalang library using pip.

* Code Example:
A sample Java file is created with classes, methods, fields, and documentation comments.

* Parsing Process:
  * Parse the Java code to extract:
    * Imports: Lists all imported packages.
    * Classes: Identifies class names, methods, fields, and associated documentation.
    * Methods: Extracts method names, parameters, return types, **code**, and **comments**.

* Output:
A detailed breakdown of imports, classes, methods, and their metadata (e.g., documentation).

In [1]:
!pip install javalang


Collecting javalang
  Downloading javalang-0.13.0-py3-none-any.whl.metadata (805 bytes)
Downloading javalang-0.13.0-py3-none-any.whl (22 kB)
Installing collected packages: javalang
Successfully installed javalang-0.13.0


In [2]:
# Write example Java code to a file
java_code = """
// Example.java

// Import statements
import java.util.List;
import java.util.ArrayList;

// A simple utility class
public class Example {
    // A private field
    private String name;

    // Constructor
    public Example(String name) {
        this.name = name;
    }

    /**
     * A method to get the name.
     * @return the name of the object.
     */
    public String getName() {
        return this.name;
    }

    /**
     * A method to sum two integers.
     * @param a the first integer.
     * @param b the second integer.
     * @return the sum of a and b.
     */
    public int add(int a, int b) {
        return a + b;
    }

    /**
     * A method that returns a list of strings.
     * @return a list of sample strings.
     */
    public List<String> getSampleList() {
        List<String> samples = new ArrayList<>();
        samples.add("Sample 1");
        samples.add("Sample 2");
        return samples;
    }
}

// A utility class for mathematical and printing functions
class Utility {
    /**
     * A method to print a welcome message.
     */
    public void printWelcome() {
        System.out.println("Welcome to the utility method demonstration!");
    }

    /**
     * A method to calculate the square of a number.
     * @param num the number to square.
     * @return the square of the number.
     */
    public int square(int num) {
        return num * num;
    }
}

// Another helper class
class Helper {
    /**
     * A helper method to print a message.
     * @param message the message to print.
     */
    public void printMessage(String message) {
        System.out.println("Message: " + message);
    }
}
"""

# Save it as example_java.java
with open("example_java.java", "w") as file:
    file.write(java_code)


In [3]:
!cat example_java.java


// Example.java

// Import statements
import java.util.List;
import java.util.ArrayList;

// A simple utility class
public class Example {
    // A private field
    private String name;

    // Constructor
    public Example(String name) {
        this.name = name;
    }

    /**
     * A method to get the name.
     * @return the name of the object.
     */
    public String getName() {
        return this.name;
    }

    /**
     * A method to sum two integers.
     * @param a the first integer.
     * @param b the second integer.
     * @return the sum of a and b.
     */
    public int add(int a, int b) {
        return a + b;
    }

    /**
     * A method that returns a list of strings.
     * @return a list of sample strings.
     */
    public List<String> getSampleList() {
        List<String> samples = new ArrayList<>();
        samples.add("Sample 1");
        samples.add("Sample 2");
        return samples;
    }
}

// A utility class for mathematical and printing functi

In [10]:
import javalang


def extract_code(file_path, start_line, end_line):
    """Extract code from a Java file given start and end lines."""
    with open(file_path, 'r') as file:
        lines = file.readlines()
    return ''.join(lines[start_line - 1:end_line]) if start_line and end_line else ""


def extract_preceding_line_comment(lines, start_line):
    """Extract single-line comments (`//`) directly preceding a line of code."""
    comment_lines = []
    for i in range(start_line - 2, -1, -1):  # Traverse upwards from the start line
        line = lines[i].strip()
        if line.startswith("//"):
            comment_lines.insert(0, line[2:].strip())  # Remove `//` and strip whitespace
        elif line:  # Stop at non-empty, non-comment line
            break
    return " ".join(comment_lines) if comment_lines else "No documentation provided."


def parse_java_file(file_path):
    """Parse a Java file and extract imports, classes, methods, and their code."""
    with open(file_path, 'r') as file:
        code = file.read()

    # Parse the code using javalang
    tree = javalang.parse.parse(code)

    # Extract imports
    imports = []
    for imp in tree.imports:
        imports.append({
            'path': imp.path,
            'is_static': imp.static,
            'is_wildcard': imp.wildcard
        })

    # Extract classes and their methods
    classes = []
    with open(file_path, 'r') as file:
        lines = file.readlines()

    for path, node in tree:
        if isinstance(node, javalang.tree.ClassDeclaration):
            # Get the class code
            class_start = node.position.line if node.position else None
            class_end = None
            if node.position and hasattr(node, 'body') and node.body:
                class_end = max(
                    (child.position.line for child in node.body if child.position),
                    default=None
                )
            class_code = extract_code(file_path, class_start, class_end)

            class_info = {
                'name': node.name,
                'fields': [
                    variable.name
                    for field in node.body
                    if isinstance(field, javalang.tree.FieldDeclaration)
                    for variable in field.declarators
                ],
                'documentation': extract_preceding_line_comment(lines, class_start),
                'methods': [],

            }

            # Extract methods within the class
            for member in node.body:
                if isinstance(member, javalang.tree.MethodDeclaration):
                    method_start = member.position.line if member.position else None
                    method_end = None
                    if member.position and member.body:
                        method_end = max(
                            (statement.position.line for statement in member.body if statement.position),
                            default=None
                        )
                    method_code = extract_code(file_path, method_start, method_end)

                    method_info = {
                        'name': member.name,
                        'parameters': [
                            f"{param.type.name} {param.name}" for param in member.parameters
                        ],
                        'return_type': member.return_type.name if member.return_type else "void",
                        'documentation': extract_preceding_line_comment(lines, method_start),
                        'is_static': 'static' in member.modifiers,
                        'code': method_code
                    }
                    class_info['methods'].append(method_info)

            classes.append(class_info)

    return {
        'imports': imports,
        'classes': classes
    }




In [11]:
# Example usage
java_file_path = './example_java.java'  # Path to your Java file
result = parse_java_file(java_file_path)

# Print imports
print("Imports:")
for imp in result['imports']:
    print(f"- {imp['path']} (Static: {imp['is_static']}, Wildcard: {imp['is_wildcard']})")

# Print classes and their methods
print("\nClasses:")
for cls in result['classes']:
    print(f"Class: {cls['name']}")
    print(f"  Documentation: {cls['documentation']}")
    print(f"  Fields: {', '.join(cls['fields']) if cls['fields'] else 'No fields'}")
    for method in cls['methods']:
        print(f"  Method: {method['name']}")
        print(f"    Parameters: {', '.join(method['parameters']) if method['parameters'] else 'None'}")
        print(f"    Return Type: {method['return_type']}")
        print(f"    Documentation: {method['documentation']}")
        print(f"    Code:\n{method['code']}")


Imports:
- java.util.List (Static: False, Wildcard: False)
- java.util.ArrayList (Static: False, Wildcard: False)

Classes:
Class: Example
  Documentation: A simple utility class
  Fields: name
  Method: getName
    Parameters: None
    Return Type: String
    Documentation: No documentation provided.
    Code:
    public String getName() {
        return this.name;

  Method: add
    Parameters: int a, int b
    Return Type: int
    Documentation: No documentation provided.
    Code:
    public int add(int a, int b) {
        return a + b;

  Method: getSampleList
    Parameters: None
    Return Type: List
    Documentation: No documentation provided.
    Code:
    public List<String> getSampleList() {
        List<String> samples = new ArrayList<>();
        samples.add("Sample 1");
        samples.add("Sample 2");
        return samples;

Class: Utility
  Documentation: A utility class for mathematical and printing functions
  Fields: No fields
  Method: printWelcome
    Parameters:

# Testing with C Code

This section demonstrates how to extract key elements from C code using the **pycparser** library:

**Important**: Advised to read these to understand the code bellow:
* [blog](https://eli.thegreenplace.net/2015/on-parsing-c-type-declarations-and-fake-headers)
* [GitHub pycparser repo](https://github.com/eliben/pycparser/tree/main/examples)


Notes:

* Preprocessing: Before parsing, the C code must be preprocessed using gcc. This step generates a simplified version of the file (with an .i extension) by removing macros, includes, and other preprocessing directives. Preprocessing ensures that the code is in a clean, standardized format suitable for analysis by pycparser, which doesn't handle raw C files directly.

* Code Examples: Two samples C files (one for V1, and one for V2 and V3)  are created containing functions, global variables, structures, and documentation comments.

* Parsing Process:
  * Parse the preprocessed C file to extract:
     * Functions: Extract function names, return types, and parameters.
     * Structures: Identify structure names and their fields.
     * Global Variables: Retrieve global variable names and types.

* Documentation Handling:

   * In V1, comments and documentation are ignored, focusing solely on the structural elements of the code.
   * In V2, documentation comments are captured and linked to their respective code elements. This involves extracting single-line (//) and multi-line (/* */) comments and associating them with the nearest relevant constructs (e.g., functions, variables).
   *In V3, I have added the part of also extracting the code. This is very important for what I need these tools in the future.

* Output: A comprehensive breakdown of functions, structures, global variables, (in V2) their associated documentation, and (in V3) their associated code.

In [None]:
# Install GCC for preprocessing
!sudo apt update
!sudo apt install gcc -y

# Install pycparser Python library
!pip install pycparser

# Clone the pycparser repository to access fake headers
!git clone https://github.com/eliben/pycparser.git


[33m0% [Working][0m            Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
[33m0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.82)] [[0m                                                                               Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
                                                                               Hit:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
[33m0% [2 InRelease 15.6 kB/128 kB 12%] [Waiting for headers] [Waiting for headers][0m                                                                               Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
[33m0% [2 InRelease 47.5 kB/128 kB 37%] [Waiting for headers] [Waiting for headers][0m                                                                               Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:6

## V1: Without parsing the comments

**pycparser** does not parsert the comments. Lets ignore this for now.

In [None]:
# Write example C code to a file
c_code = """
#include <stdio.h>

// A simple C function
int add(int a, int b) {
    return a + b;
}

int main() {
    int result = add(3, 4);
    printf("Result: %d\\n", result);
    return 0;
}
"""

# Save it as example_c.c
with open("example_c.c", "w") as file:
    file.write(c_code)



### Parsing the C Code

Before parsing a C file using pycparser, the code must be preprocessed.

The gcc command below generates a preprocessed file (example_c.i) from the source file (example_c.c).

This step removes macros, includes, and other preprocessing directives, making the code ready for analysis:

In [None]:
!gcc -E example_c.c -o example_c.i -nostdinc -Ipycparser/utils/fake_libc_include -D'__attribute__(x)='


### Parsing the Preprocessed File
Once the file is preprocessed (meaning we have a file with the extension ".i"), pycparser can be used to parse it and build an Abstract Syntax Tree (AST). The following Python code demonstrates this:

In [None]:
from pycparser import parse_file

# Path to the preprocessed file
file_path = "example_c.i"

# Parse and display the AST
try:
    ast = parse_file(file_path, use_cpp=False)
    print("AST parsed successfully!")
    ast.show()
except Exception as e:
    print(f"Error parsing the C file: {e}")

AST parsed successfully!
FileAST: 
  Typedef: size_t, [], ['typedef']
    TypeDecl: size_t, [], None
      IdentifierType: ['int']
  Typedef: __builtin_va_list, [], ['typedef']
    TypeDecl: __builtin_va_list, [], None
      IdentifierType: ['int']
  Typedef: __gnuc_va_list, [], ['typedef']
    TypeDecl: __gnuc_va_list, [], None
      IdentifierType: ['int']
  Typedef: va_list, [], ['typedef']
    TypeDecl: va_list, [], None
      IdentifierType: ['int']
  Typedef: __int8_t, [], ['typedef']
    TypeDecl: __int8_t, [], None
      IdentifierType: ['int']
  Typedef: __uint8_t, [], ['typedef']
    TypeDecl: __uint8_t, [], None
      IdentifierType: ['int']
  Typedef: __int16_t, [], ['typedef']
    TypeDecl: __int16_t, [], None
      IdentifierType: ['int']
  Typedef: __uint16_t, [], ['typedef']
    TypeDecl: __uint16_t, [], None
      IdentifierType: ['int']
  Typedef: __int_least16_t, [], ['typedef']
    TypeDecl: __int_least16_t, [], None
      IdentifierType: ['int']
  Typedef: __uint_l

### Parsing C Code with AST (Ignoring Documentation)

This script demonstrates how to parse a preprocessed C file using pycparser to extract key code elements such as functions, structures, and global variables. The parse_c_file function walks through the Abstract Syntax Tree (AST) and categorizes elements as follows:

* Functions: Extracts the function name, return type, and parameter details.
* Structures: Retrieves the structure name and its fields (name and type).
* Global Variables: Gathers global variable names and types.

The parsing process focuses on the structure of the code while skipping detailed documentation extraction at this stage. Although documentation could theoretically be inferred from comments, it is not implemented here due to the complexity of reliably linking comments to code elements. This limitation arises from:

* Lack of Comment Mapping: pycparser does not inherently map comments to specific AST nodes.
* Manual Effort: Detecting and associating documentation would require additional logic to handle multi-line comments, proximity to code, and edge cases.

In [None]:
from pycparser import c_ast, parse_file

def parse_c_file(file_path):
    try:
        # Parse the C code file
        ast = parse_file(file_path, use_cpp=False)
        print("AST successfully parsed!")  # Debugging output
    except Exception as e:
        print(f"Error parsing the C file: {e}")
        return None

    # Store results
    functions = []
    structures = []
    global_vars = []

    # Walk through the AST
    for node in ast.ext:
        if isinstance(node, c_ast.FuncDef):  # Function definitions
            try:
                # Extract return type
                return_type_node = node.decl.type.type
                return_type = ' '.join(return_type_node.names) if isinstance(return_type_node, c_ast.IdentifierType) else "void"

                # Extract parameters
                parameters = []
                if node.decl.type.args:
                    for param in node.decl.type.args.params:
                        param_type_node = param.type.type
                        param_type = ' '.join(param_type_node.names) if isinstance(param_type_node, c_ast.IdentifierType) else "void"
                        parameters.append({'name': param.name, 'type': param_type})


                # Append function details
                func_info = {
                    'name': node.decl.name,
                    'return_type': return_type,
                    'parameters': parameters
                }
                functions.append(func_info)
            except Exception as e:
                print(f"Error processing function {node.decl.name}: {e}")

        elif isinstance(node, c_ast.Struct):  # Structures
            struct_info = {
                'name': node.name,
                'fields': []
            }
            for decl in (node.decls or []):  # Structure fields
                field_type = ' '.join(decl.type.type.names) if isinstance(decl.type.type, c_ast.IdentifierType) else "void"
                struct_info['fields'].append({'name': decl.name, 'type': field_type})
            structures.append(struct_info)

        elif isinstance(node, c_ast.Decl):  # Global variables
            if isinstance(node.type, c_ast.TypeDecl):
                global_var = {
                    'name': node.name,
                    'type': ' '.join(node.type.type.names) if isinstance(node.type.type, c_ast.IdentifierType) else "void",
                }
                global_vars.append(global_var)

    return {
        'functions': functions,
        'structures': structures,
        'global_vars': global_vars,
    }

# Example usage
file_path = 'example_c.i'  # Preprocessed C file
result = parse_c_file(file_path)

# Display the results
if result:
    print("\nFunctions:")
    for func in result['functions']:
        print(f"Function: {func['name']}")
        print(f"  Return Type: {func['return_type']}")
        for param in func['parameters']:
            print(f"  Parameter: {param['type']} {param['name']}")

    print("\nStructures:")
    for struct in result['structures']:
        print(f"Structure: {struct['name']}")
        for field in struct['fields']:
            print(f"  Field: {field['type']} {field['name']}")

    print("\nGlobal Variables:")
    for var in result['global_vars']:
        print(f"Variable: {var['name']} ({var['type']})")


AST successfully parsed!

Functions:
Function: add
  Return Type: void
  Parameter: int a
  Parameter: int b
Function: main
  Return Type: void

Structures:

Global Variables:


## V2: With parsing comments - and improved in general

In [None]:
# Write example C code to a file
c_code = """
#include <stdio.h>

// A simple structure to represent a point in 2D space
struct Point {
    int x;  // X coordinate
    int y;  // Y coordinate
};

// A global variable for testing
int global_var = 42;

/**
 * Adds two integers together.
 *
 * @param a The first integer.
 * @param b The second integer.
 * @return The sum of a and b.
 */
int add(int a, int b) {
    return a + b;
}

/**
 * Prints a message.
 *
 * @param message The message to print.
 */
void print_message(const char* message) {
    printf("Message: %s\\n", message);
}

int main() {
    struct Point p = {3, 4};
    printf("Point: (%d, %d)\\n", p.x, p.y);

    int result = add(3, 4);
    printf("Result: %d\\n", result);

    print_message("Hello, C world!");
    return 0;
}
"""

# Save it as example_c_1.c
with open("example_c_1.c", "w") as file:
    file.write(c_code)


### Preprocessing the C File

In [None]:
!gcc -E example_c_1.c -o example_c_1.i -nostdinc -Ipycparser/utils/fake_libc_include -D'__attribute__(x)='


### Parsing the Preprocess File

In [None]:
from pycparser import parse_file

# Path to the preprocessed file
file_path = "example_c_1.i"

# Parse and display the AST
try:
    ast = parse_file(file_path, use_cpp=False)
    print("AST parsed successfully!")
    ast.show()
except Exception as e:
    print(f"Error parsing the C file: {e}")

AST parsed successfully!
FileAST: 
  Typedef: size_t, [], ['typedef']
    TypeDecl: size_t, [], None
      IdentifierType: ['int']
  Typedef: __builtin_va_list, [], ['typedef']
    TypeDecl: __builtin_va_list, [], None
      IdentifierType: ['int']
  Typedef: __gnuc_va_list, [], ['typedef']
    TypeDecl: __gnuc_va_list, [], None
      IdentifierType: ['int']
  Typedef: va_list, [], ['typedef']
    TypeDecl: va_list, [], None
      IdentifierType: ['int']
  Typedef: __int8_t, [], ['typedef']
    TypeDecl: __int8_t, [], None
      IdentifierType: ['int']
  Typedef: __uint8_t, [], ['typedef']
    TypeDecl: __uint8_t, [], None
      IdentifierType: ['int']
  Typedef: __int16_t, [], ['typedef']
    TypeDecl: __int16_t, [], None
      IdentifierType: ['int']
  Typedef: __uint16_t, [], ['typedef']
    TypeDecl: __uint16_t, [], None
      IdentifierType: ['int']
  Typedef: __int_least16_t, [], ['typedef']
    TypeDecl: __int_least16_t, [], None
      IdentifierType: ['int']
  Typedef: __uint_l

### Parsing C Code with AST & **Documentation**

This script extends the parsing functionality of pycparser by integrating a method to extract and associate comments from the original .c source file with code elements like functions, structures, and global variables.

**Problem**

The core problem is that pycparser does not handle comments within the C source code. As a result, documentation associated with functions, variables, or structures (e.g., comments) is not captured by default. This is crucial for analyzing or documenting the purpose of the code elements.

**Solution**

The script solves this by:

* Extracting Comments: Single-line (//) and multi-line (/* */) comments are captured using regular expressions. The comments are stored along with their respective line numbers for later association.

* Matching Comments to Code:
Comments are matched to code elements based on proximity, assuming that comments appearing immediately before a code element describe that element. This is done by comparing the line numbers of comments and code elements.

* Improved Type Handling:
The get_type_name function recursively resolves complex C types (e.g., pointers, arrays) to ensure accurate type representation for parameters, fields, and variables.

*Parsing and Linking:
The .i file (preprocessed by GCC) is parsed to extract the AST, and the .c file is read to extract comments. These are combined to provide detailed outputs that include documentation for each code element.

This approach allows for seamless documentation retrieval without relying on pycparser's internal comment parsing, which it lacks.

In [None]:
import re
from pycparser import c_ast, parse_file

def extract_comments(c_file_path):
    """Extract comments from the C file."""
    comments = []
    with open(c_file_path, 'r') as file:
        lines = file.readlines()
        for i, line in enumerate(lines):
            # Capture single-line comments
            single_line_comment = re.search(r'//(.*)', line)
            if single_line_comment:
                comments.append({'line': i + 1, 'text': single_line_comment.group(1).strip()})

            # Capture multi-line comments
            if '/*' in line:
                comment_text = line.strip()
                while '*/' not in comment_text:
                    i += 1
                    comment_text += lines[i].strip()
                comments.append({'line': i + 1, 'text': re.sub(r'/\*|\*/', '', comment_text).strip()})
    return comments

def match_comments_to_code(comments, code_elements):
    """Match comments to code elements by proximity."""
    for element in code_elements:
        line = element.get('line', -1)
        relevant_comments = [c['text'] for c in comments if c['line'] < line]
        element['documentation'] = relevant_comments[-1] if relevant_comments else "No documentation provided."
    return code_elements

def get_type_name(type_node):
    """Recursively get the type name from a type node."""
    if isinstance(type_node, c_ast.IdentifierType):  # Basic types (e.g., int, char)
        return ' '.join(type_node.names)
    elif isinstance(type_node, c_ast.TypeDecl):  # Type declarations
        return get_type_name(type_node.type)
    elif isinstance(type_node, c_ast.PtrDecl):  # Pointers
        return get_type_name(type_node.type) + " *"
    elif isinstance(type_node, c_ast.ArrayDecl):  # Arrays
        return get_type_name(type_node.type) + "[]"
    return "unknown_type"

def parse_c_file_with_docs(file_i_path, file_c_path):
    """Parse C code using pycparser for structure and regex for documentation."""
    try:
        # Parse the preprocessed C file
        ast = parse_file(file_i_path, use_cpp=False)
        print("AST successfully parsed!")
    except Exception as e:
        print(f"Error parsing the C file: {e}")
        return None

    # Extract comments from the C source file
    comments = extract_comments(file_c_path)

    # Parse code elements
    functions = []
    global_vars = []
    structures = []

    for node in ast.ext:
        if isinstance(node, c_ast.FuncDef):  # Functions
            func_info = {
                'name': node.decl.name,
                'return_type': get_type_name(node.decl.type.type),
                'parameters': [
                    {'name': param.name, 'type': get_type_name(param.type)}
                    for param in (node.decl.type.args.params if node.decl.type.args else [])
                ],
                'line': node.decl.coord.line,
            }
            functions.append(func_info)
        elif isinstance(node, c_ast.Decl) and isinstance(node.type, c_ast.TypeDecl):  # Global variables
            global_vars.append({
                'name': node.name,
                'type': get_type_name(node.type.type),
                'line': node.coord.line,
            })
        elif isinstance(node, c_ast.Struct):  # Structures
            structures.append({
                'name': node.name,
                'fields': [
                    {'name': decl.name, 'type': get_type_name(decl.type)}
                    for decl in (node.decls or [])
                ],
                'line': node.coord.line if hasattr(node, 'coord') else -1,
            })

    # Match comments to parsed code elements
    functions = match_comments_to_code(comments, functions)
    global_vars = match_comments_to_code(comments, global_vars)
    structures = match_comments_to_code(comments, structures)

    return {
        'functions': functions,
        'structures': structures,
        'global_vars': global_vars,
    }

# Example usage
file_i_path = "example_c_1.i"
file_c_path = "example_c_1.c"
result = parse_c_file_with_docs(file_i_path, file_c_path)

# Display the results
if result:
    print("\nFunctions:")
    for func in result['functions']:
        print(f"Function: {func['name']}")
        print(f"  Return Type: {func['return_type']}")
        print(f"  Documentation: {func['documentation']}")
        for param in func['parameters']:
            print(f"  Parameter: {param['type']} {param['name']}")

    print("\nStructures:")
    for struct in result['structures']:
        print(f"Structure: {struct['name']}")
        for field in struct['fields']:
            print(f"  Field: {field['type']} {field['name']}")
        print(f"  Documentation: {struct.get('documentation', 'No documentation provided.')}")

    print("\nGlobal Variables:")
    for var in result['global_vars']:
        print(f"Variable: {var['name']} ({var['type']})")
        print(f"  Documentation: {var['documentation']}")







AST successfully parsed!

Functions:
Function: add
  Return Type: int
  Documentation: ** Adds two integers together.** @param a The first integer.* @param b The second integer.* @return The sum of a and b.
  Parameter: int a
  Parameter: int b
Function: print_message
  Return Type: void
  Documentation: ** Prints a message.** @param message The message to print.
  Parameter: char * message
Function: main
  Return Type: int
  Documentation: ** Prints a message.** @param message The message to print.

Structures:

Global Variables:
Variable: global_var (int)
  Documentation: A global variable for testing


## V3: Extracting also the code of each function - The best version among the three.

In [None]:
import re
from pycparser import c_ast, parse_file

def extract_comments(c_file_path):
    """Extract comments from the C file."""
    comments = []
    with open(c_file_path, 'r') as file:
        lines = file.readlines()
        for i, line in enumerate(lines):
            # Capture single-line comments
            single_line_comment = re.search(r'//(.*)', line)
            if single_line_comment:
                comments.append({'line': i + 1, 'text': single_line_comment.group(1).strip()})

            # Capture multi-line comments
            if '/*' in line:
                comment_text = line.strip()
                while '*/' not in comment_text:
                    i += 1
                    comment_text += lines[i].strip()
                comments.append({'line': i + 1, 'text': re.sub(r'/\*|\*/', '', comment_text).strip()})
    return comments

def match_comments_to_code(comments, code_elements):
    """Match comments to code elements by proximity."""
    for element in code_elements:
        line = element.get('line', -1)
        relevant_comments = [c['text'] for c in comments if c['line'] < line]
        element['documentation'] = relevant_comments[-1] if relevant_comments else "No documentation provided."
    return code_elements

def get_type_name(type_node):
    """Recursively get the type name from a type node."""
    if isinstance(type_node, c_ast.IdentifierType):  # Basic types (e.g., int, char)
        return ' '.join(type_node.names)
    elif isinstance(type_node, c_ast.TypeDecl):  # Type declarations
        return get_type_name(type_node.type)
    elif isinstance(type_node, c_ast.PtrDecl):  # Pointers
        return get_type_name(type_node.type) + " *"
    elif isinstance(type_node, c_ast.ArrayDecl):  # Arrays
        return get_type_name(type_node.type) + "[]"
    return "unknown_type"

def extract_function_code(file_path, start_line):
    """Extract the code snippet for a function from the source file."""
    with open(file_path, 'r') as file:
        lines = file.readlines()
        # Guess the end of the function by counting braces
        open_braces = 0
        end_line = start_line - 1
        for i in range(start_line - 1, len(lines)):
            open_braces += lines[i].count('{')
            open_braces -= lines[i].count('}')
            if open_braces == 0 and '{' in lines[start_line - 1]:  # Function body ends
                end_line = i
                break
        return ''.join(lines[start_line - 1:end_line + 1])

def parse_c_file_with_docs_and_code(file_i_path, file_c_path):
    """Parse C code using pycparser for structure and regex for documentation."""
    try:
        # Parse the preprocessed C file
        ast = parse_file(file_i_path, use_cpp=False)
        print("AST successfully parsed!")
    except Exception as e:
        print(f"Error parsing the C file: {e}")
        return None

    # Extract comments from the C source file
    comments = extract_comments(file_c_path)

    # Parse code elements
    functions = []
    global_vars = []
    structures = []

    for node in ast.ext:
        if isinstance(node, c_ast.FuncDef):  # Functions
            start_line = node.decl.coord.line
            func_info = {
                'name': node.decl.name,
                'return_type': get_type_name(node.decl.type.type),
                'parameters': [
                    {'name': param.name, 'type': get_type_name(param.type)}
                    for param in (node.decl.type.args.params if node.decl.type.args else [])
                ],
                'line': start_line,
                'code': extract_function_code(file_c_path, start_line)
            }
            functions.append(func_info)
        elif isinstance(node, c_ast.Decl) and isinstance(node.type, c_ast.TypeDecl):  # Global variables
            global_vars.append({
                'name': node.name,
                'type': get_type_name(node.type.type),
                'line': node.coord.line,
            })
        elif isinstance(node, c_ast.Struct):  # Structures
            structures.append({
                'name': node.name,
                'fields': [
                    {'name': decl.name, 'type': get_type_name(decl.type)}
                    for decl in (node.decls or [])
                ],
                'line': node.coord.line if hasattr(node, 'coord') else -1,
            })

    # Match comments to parsed code elements
    functions = match_comments_to_code(comments, functions)
    global_vars = match_comments_to_code(comments, global_vars)
    structures = match_comments_to_code(comments, structures)

    return {
        'functions': functions,
        'structures': structures,
        'global_vars': global_vars,
    }

# Example usage
file_i_path = "example_c_1.i"  # Preprocessed C file
file_c_path = "example_c_1.c"  # Original C file
result = parse_c_file_with_docs_and_code(file_i_path, file_c_path)

# Display the results
if result:
    print("\nFunctions:")
    for func in result['functions']:
        print(f"Function: {func['name']}")
        print(f"  Return Type: {func['return_type']}")
        print(f"  Documentation: {func['documentation']}")
        print(f"  Code:\n{func['code']}")
        for param in func['parameters']:
            print(f"  Parameter: {param['type']} {param['name']}")

    print("\nStructures:")
    for struct in result['structures']:
        print(f"Structure: {struct['name']}")
        for field in struct['fields']:
            print(f"  Field: {field['type']} {field['name']}")
        print(f"  Documentation: {struct.get('documentation', 'No documentation provided.')}")

    print("\nGlobal Variables:")
    for var in result['global_vars']:
        print(f"Variable: {var['name']} ({var['type']})")
        print(f"  Documentation: {var['documentation']}")



AST successfully parsed!

Functions:
Function: add
  Return Type: int
  Documentation: ** Adds two integers together.** @param a The first integer.* @param b The second integer.* @return The sum of a and b.
  Code:
int add(int a, int b) {
    return a + b;
}

  Parameter: int a
  Parameter: int b
Function: print_message
  Return Type: void
  Documentation: ** Prints a message.** @param message The message to print.
  Code:
void print_message(const char* message) {
    printf("Message: %s\n", message);
}

  Parameter: char * message
Function: main
  Return Type: int
  Documentation: ** Prints a message.** @param message The message to print.
  Code:
int main() {
    struct Point p = {3, 4};
    printf("Point: (%d, %d)\n", p.x, p.y);

    int result = add(3, 4);
    printf("Result: %d\n", result);

    print_message("Hello, C world!");
    return 0;
}


Structures:

Global Variables:
Variable: global_var (int)
  Documentation: A global variable for testing
