In [13]:
import json
import math
from collections import Counter
from typing import Optional

import pandas as pd
from sklearn.model_selection import train_test_split


# Input file: parsed line by line below, one JSON-encoded AST per line.
ndjson_file = "functionsASTs_dropped_singles_doubles.ndjson"
# Fractions of each function name's samples that go to the training and
# validation sets.
train_perc = 0.7
valid_perc = 0.3
# Both sizes are rounded down with math.floor per function name, so the
# samples left over after flooring end up in the test set.

def write_ndjson(file_name, array):
    """Write every element of ``array`` to ``file_name`` as one JSON line.

    The file is opened in write mode, so existing content is replaced. Each
    element is serialized with ``json.dumps`` and terminated by a newline.
    """
    with open(file_name, 'w') as out:
        out.writelines(json.dumps(record) + '\n' for record in array)

def find_tag(root) -> Optional[str]:
    """Return the declared name of a function from its AST node.

    Looks through the direct children of ``root`` (expected to be a
    ``FunctionDefinition`` node) for ``FunctionDeclarator`` nodes, and through
    their direct children for the first ``IdentifierDeclarator``. That node's
    ``data`` field is taken as the function name.

    Args:
        root: AST node as a dict; the keys read are ``kind``, ``children``
            and ``data``.

    Returns:
        The identifier as a string, or ``None`` when no identifier declarator
        is found or the one found carries no ``data``.
    """
    # ``or []`` also covers an explicit ``"children": null`` in the JSON.
    for definition_child in root.get("children") or []:
        if definition_child.get("kind") != "FunctionDeclarator":
            continue
        for declarator_child in definition_child.get("children") or []:
            if declarator_child.get("kind") == "IdentifierDeclarator":
                data = declarator_child.get("data")
                # str(None) would produce the truthy string "None", which a
                # caller would mistake for a real function name.
                return None if data is None else str(data)
    return None


# Read the input once and keep every AST whose function name can be resolved.
# function_names and name_ast are filled in lockstep (same order, same length).
function_names = []
name_ast = []
# JSON text is UTF-8; do not depend on the platform default encoding.
with open(ndjson_file, "r", encoding="utf-8") as file:
    for line in file:
        stripped = line.strip()
        if not stripped:
            # Blank lines hold no record; skip them instead of reporting an error.
            continue
        try:
            ast_node = json.loads(stripped)
        except json.JSONDecodeError:
            print(f"Error parsing line: {line}")
            continue
        if not isinstance(ast_node, dict):
            # find_tag needs a JSON object; report other values rather than crash.
            print(f"Skipping non-object line: {line}")
            continue
        function_name = find_tag(ast_node)
        if function_name:
            function_names.append(function_name)
            # Keep the parsed object, not the raw text line: write_ndjson runs
            # json.dumps on each item, and dumping a str would emit a quoted,
            # escaped JSON string instead of the original JSON object.
            name_ast.append({"FunctionName": function_name, "AST": ast_node})

# Occurrences of each function name and the overall number of functions kept.
function_counter = Counter(function_names)
total_functions = sum(function_counter.values())

# One row per distinct name: absolute count plus its share in percent
# (rounded to two decimals).
name_freq = [
    {
        "FunctionName": name,
        "Frequency": count,
        "Percentage": round((count / total_functions) * 100, 2),
    }
    for name, count in function_counter.items()
]

# Preview of the (name, AST) records.
df_name_ast = pd.DataFrame(name_ast)
print(df_name_ast.head())

train = []
validation = []
test = []

# Split per function name: each name's samples are divided on their own using
# train_perc / valid_perc. The ASTs are grouped by name first, so every record
# is visited once instead of rescanning the whole list for each distinct name.
asts_by_name = {}
for item in name_ast:
    asts_by_name.setdefault(item['FunctionName'], []).append(item['AST'])

# dicts preserve insertion order, so names are processed in order of first
# appearance and ASTs keep their input order within each name.
for function_name, asts in asts_by_name.items():
    freq = len(asts)
    train_size = math.floor(freq * train_perc)
    valid_size = math.floor(freq * valid_perc)
    valid_end = train_size + valid_size

    train.extend(asts[:train_size])
    validation.extend(asts[train_size:valid_end])
    # Samples left unassigned by the flooring above go to the test set.
    test.extend(asts[valid_end:])
            
            
# Persist the three splits, one NDJSON file each.
for _path, _rows in (
    ('strat_train_functionsASTs.ndjson', train),
    ('strat_validate_functionsASTs.ndjson', validation),
    ('strat_test_functionsASTs.ndjson', test),
):
    write_ndjson(_path, _rows)

# Report how many samples landed in each split.
print(f"Training set size: {len(train)}")
print(f"Validation set size: {len(validation)}")
print(f"Test set size: {len(test)}")

# Frequency table, most common names first, exported as CSV without the index.
df = pd.DataFrame(name_freq).sort_values(by="Frequency", ascending=False)
df.columns = ["FunctionName", f"Frequency (Total: {total_functions})", "Percentage"]
df.to_csv("freq_analysis_gcj_dropped.csv", index=False)
df.head()


  FunctionName                                                AST
0          cmp  {"kind": "FunctionDefinition", "code_pos": "<1...
1        check  {"kind": "FunctionDefinition", "code_pos": "<1...
2         calc  {"kind": "FunctionDefinition", "code_pos": "<1...
3         fans  {"kind": "FunctionDefinition", "code_pos": "<2...
4         calc  {"kind": "FunctionDefinition", "code_pos": "<1...
Training set size: 10131
Validation set size: 3871
Test set size: 1444


Unnamed: 0,FunctionName,Frequency (Total: 15446),Percentage
14,solve,1004,6.5
1,check,573,3.71
35,min,273,1.77
39,max,244,1.58
0,cmp,241,1.56
