In [5]:
import json
import pandas as pd
from collections import Counter

# Function provided to find the function tag from an AST node
def find_tag(root) -> "str | None":
    """Return the function name stored in a function-definition AST node.

    Scans the direct children of ``root`` for nodes whose ``kind`` is
    ``"FunctionDeclarator"`` and, inside those, for the first child whose
    ``kind`` is ``"IdentifierDeclarator"`` and that carries a ``data`` value.

    Args:
        root: AST node as a dict with optional ``"children"`` list.
            Presumably a FunctionDefinition node; the kind of ``root``
            itself is not checked.

    Returns:
        The identifier's ``data`` converted to ``str``, or ``None`` when
        ``root`` is not a dict or no named identifier declarator is found.
    """
    if not isinstance(root, dict):
        return None
    # ``or []`` also covers an explicit ``"children": null`` in the JSON.
    for definition_child in root.get("children") or []:
        if not isinstance(definition_child, dict):
            continue
        if definition_child.get("kind") != "FunctionDeclarator":
            continue
        for declarator_child in definition_child.get("children") or []:
            if not isinstance(declarator_child, dict):
                continue
            if declarator_child.get("kind") != "IdentifierDeclarator":
                continue
            name = declarator_child.get("data")
            # A missing name must not become the literal string "None",
            # which callers would count as a real function name.
            if name is not None:
                return str(name)
    return None

# Step 1: Read the .ndjson file (one JSON-encoded function AST per line)
ndjson_file = "functionsASTs.ndjson"

# Store all function names
function_names = []

# JSON text is UTF-8; pin the encoding instead of relying on the platform default.
with open(ndjson_file, "r", encoding="utf-8") as file:
    for line in file:
        stripped = line.strip()
        if not stripped:
            # Blank lines (e.g. a trailing newline) are not records; skip quietly.
            continue
        try:
            ast_node = json.loads(stripped)
        except json.JSONDecodeError:
            print(f"Error parsing line: {line}")
            continue
        if not isinstance(ast_node, dict):
            # Valid JSON that is not an object cannot be an AST node; report it
            # rather than letting the name lookup fail on it.
            print(f"Error parsing line: {line}")
            continue
        function_name = find_tag(ast_node)
        if function_name:
            function_names.append(function_name)

# Step 2: Perform frequency analysis
function_counter = Counter(function_names)
total_functions = sum(function_counter.values())

# Step 3: Prepare data for the CSV
# (the loop body never runs when total_functions is 0, so no division by zero)
data = []
for function, freq in function_counter.items():
    percentage = (freq / total_functions) * 100
    data.append({"FunctionName": function, "Frequency": freq, "Percentage": round(percentage, 2)})

# Step 4: Create DataFrame, sort by frequency, and update header with total count
# Explicit columns keep the frame well-formed even when no names were found,
# so the sort and rename below do not fail on an empty input file.
df = pd.DataFrame(data, columns=["FunctionName", "Frequency", "Percentage"])
# Descending by frequency; a stable sort keeps ties in first-seen order.
df = df.sort_values(by="Frequency", ascending=False, kind="mergesort")

# Update the header to include the total count of all functions
df = df.rename(columns={"Frequency": f"Frequency (Total: {total_functions})"})

# Save as CSV
df.to_csv("freq_analysis_gcj.csv", index=False)

# Display the DataFrame
df.head()


Unnamed: 0,FunctionName,Frequency (Total: 24507),Percentage
31,solve,1004,4.1
4,check,573,2.34
86,min,273,1.11
98,max,244,1.0
0,cmp,241,0.98
