In [1]:
import json
import matplotlib.pyplot as plt
import numpy as np
from multiprocessing import Pool
import random

plt.style.use("_mpl-gallery")

In [2]:
with open("data/source/all.jsonl", "r") as fp:
    lines = fp.readlines()

with Pool() as p:
    objs = p.map(json.loads, lines)

py_objs = list(filter(lambda o: o["lang"] == "python", objs))
len(py_objs)

1218311

In [3]:
counter = {}
for o in objs:
    k = o["code"]
    v = o["test"]
    if k in counter:
        counter[k] += 1
    else:
        counter[k] = 0

len(counter.keys())

961661

In [4]:
mult_cnt = 0
for k, v in counter.items():
    if v > 1:
        mult_cnt += 1

mult_cnt

228757

In [5]:
mult_cnt / len(counter.keys())

0.23787696495958555

In [3]:
random.seed(0)
sampled = random.sample(py_objs, 100)
len(sampled)

100

In [None]:
for i, s in enumerate(sampled):
    with open(f"samples/pair_{i}.py", "w+") as fp:
        fp.write(f"# sample id {i+1}\n")
        fp.write(s["code"])
        fp.write("\n")
        fp.write(s["test"])

In [None]:
import ast


def get_call_cnt(obj) -> int | None:
    class FunctionCallCounter(ast.NodeVisitor):
        def __init__(self):
            super().__init__()
            self.call_count = 0  # Initialize the counter

        def visit_Call(self, node):
            self.call_count += 1  # Increment the counter for each function call
            self.generic_visit(node)  # Continue traversing to find more calls

    try:
        tree = ast.parse(obj["test"])
    except SyntaxError:
        return None
    counter = FunctionCallCounter()
    counter.visit(tree)
    return counter.call_count

In [None]:
with Pool() as p:
    cnts = p.map(get_call_cnt, sampled)