In [105]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline

# Tokenizer for the "microsoft/graphcodebert-base" checkpoint.
tokenizer_GraphCodeBert = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")

# Masked-language-model weights for the same checkpoint; both objects are
# passed to the fill-mask pipeline built in renameIdentifiers.
model_GraphCodeBert = AutoModelForMaskedLM.from_pretrained("microsoft/graphcodebert-base")

In [106]:
import ast

def getLineAssignment(tree, lineno):
    """Return the first ``ast.Name`` node located on line ``lineno``.

    Nodes are examined in ``ast.walk`` order, so "first" means first in that
    traversal, which is not necessarily the leftmost name in the source text.

    Args:
        tree: Root AST node to search.
        lineno: 1-based source line number to match against ``node.lineno``.

    Returns:
        The matching ``ast.Name`` node, or ``None`` if the line has no name.
    """
    for candidate in ast.walk(tree):
        if not isinstance(candidate, ast.Name):
            continue
        if candidate.lineno == lineno:
            return candidate
    return None

def get_variables(expression):
    tree = ast.parse(expression)
    variables = []
    explored =[]
    for node in ast.walk(tree):
        if isinstance(node, ast.FunctionDef):
            for node2 in ast.walk(node):
                if isinstance(node2, ast.Assign):
                    nodeVariable = getLineAssignment(tree, node2.lineno).id
                    variable = [nodeVariable, node.name]
                    if variable not in variables:
                        variables.append([nodeVariable, node.name])
                        explored.append(node2)
                if isinstance(node2, ast.arg):
                    variable = [node2.arg, node.name]
                    if variable not in variables:
                        variables.append(variable)
        if not isinstance(node, ast.Assign):
            continue
        if node in explored:
            continue
        nodeVariable = getLineAssignment(tree, node.lineno).id
        variable = [nodeVariable, None]
        if variable not in variables:
            variables.append(variable)
    return variables

def replaceVar(tree, variable, replacement):
    """Rename every occurrence of ``variable`` in ``tree`` to ``replacement``.

    The tree is modified in place.

    Args:
        tree: Root AST node to rewrite.
        variable: A ``[name, scope]`` list. With a function name as ``scope``,
            names and parameters inside every ``def`` of that name are
            rewritten; with ``None``, every matching ``ast.Name`` in the whole
            tree is rewritten.
        replacement: New identifier text to store on the matching nodes.

    Returns:
        The number of nodes that were rewritten.
    """
    renamed = 0
    for outer in ast.walk(tree):
        if isinstance(outer, ast.FunctionDef):
            scope = outer.name
            for inner in ast.walk(outer):
                if isinstance(inner, ast.Name):
                    if [inner.id, scope] == variable:
                        inner.id = replacement
                        renamed += 1
                elif isinstance(inner, ast.arg):
                    if [inner.arg, scope] == variable:
                        inner.arg = replacement
                        renamed += 1
        elif isinstance(outer, ast.Name) and [outer.id, None] == variable:
            outer.id = replacement
            renamed += 1
    return renamed

In [107]:
import astunparse
import keyword
import random

def _pick_replacement(candidates, taken):
    """Return a random usable identifier from fill-mask output, or ``None``.

    Args:
        candidates: Result of a fill-mask pipeline call: a list of dicts for
            one mask, or a list of such lists for several masks.
        taken: Names that must not be reused.

    Returns:
        A token that is a valid identifier, is not a keyword, is not ``self``
        and is not in ``taken``; ``None`` when no candidate qualifies.
    """
    # Normalise the single-mask shape so both cases share one code path.
    if candidates and isinstance(candidates[0], dict):
        candidates = [candidates]
    tokens = [entry['token_str'].strip() for per_mask in candidates for entry in per_mask]
    random.shuffle(tokens)
    for token in tokens:
        # The model can propose punctuation or digits; those cannot be names.
        if not token.isidentifier():
            continue
        if keyword.iskeyword(token) or token == 'self':
            continue
        if token in taken:
            continue
        return token
    return None


def renameIdentifiers(CODE):
    """Rename the variables of ``CODE`` using GraphCodeBERT fill-mask suggestions.

    Each variable reported by ``get_variables`` is masked in turn, the masked
    source is passed to the fill-mask pipeline, and a randomly chosen valid
    suggestion replaces the variable. A variable keeps its original name when
    the model offers nothing usable.

    Args:
        CODE: Python source code as a string.

    Returns:
        The rewritten source as produced by ``astunparse.unparse``.

    Side effects:
        Prints the variable list before and after renaming.
    """
    variables = get_variables(CODE)
    print(variables)
    tree = ast.parse(CODE)

    # Build the pipeline once; it does not depend on the loop variable.
    fill_mask_GCB = pipeline("fill-mask", model=model_GraphCodeBert, tokenizer=tokenizer_GraphCodeBert)

    for variable in variables:
        original_name, scope = variable
        numMask = replaceVar(tree, variable, '<mask>')
        if numMask == 0:
            # Nothing was masked, and the pipeline rejects input without a mask.
            continue
        masked = astunparse.unparse(tree)
        candidates = fill_mask_GCB(masked)

        # Names already used in this scope, including this variable's own
        # original name and any names chosen in earlier iterations.
        varNamesInScope = [name for name, other_scope in variables if other_scope == scope]

        replacement = _pick_replacement(candidates, varNamesInScope)
        if replacement is None:
            # No usable suggestion: restore the original identifier rather
            # than writing an invalid or colliding one into the tree.
            replacement = original_name

        replaceVar(tree, ['<mask>', scope], replacement)
        variable[0] = replacement
    print(variables)

    new_code = astunparse.unparse(tree)
    return new_code


In [110]:
# Demo input: three small functions that reuse the placeholder names
# jawd12 / jawd13, so the per-function renaming is easy to spot in the output.
CODE = """
import numpy as np

def add(a, b):
    jawd12 = a * b
    return jawd12

def average(arr1, arr2):
    jawd12 = np.mean(arr1) + np.mean(arr2)
    return jawd12

def main():
    jawd12 = add(1, 2)
    jawd13 = average([1, 2], [3, 4])
"""
# Print the original source, a separator, then the renamed source.
print(CODE)
print("\n\n####################\n\n")
print(renameIdentifiers(CODE))

##TODO: add fragmentation for long programs to split into chunks of length ~528


import numpy as np

def add(a, b):
    jawd12 = a * b
    return jawd12

def average(arr1, arr2):
    jawd12 = np.mean(arr1) + np.mean(arr2)
    return jawd12

def main():
    jawd12 = add(1, 2)
    jawd13 = average([1, 2], [3, 4])



####################


[['jawd12', 'add'], ['a', 'add'], ['b', 'add'], ['jawd12', 'average'], ['arr1', 'average'], ['arr2', 'average'], ['jawd12', 'main'], ['jawd13', 'main']]
[['res', 'add'], ['r', 'add'], ['l', 'add'], ['r', 'average'], ['array', 'average'], ['l', 'average'], ['res', 'main'], ['l', 'main']]

import numpy as np

def add(r, l):
    res = (r * l)
    return res

def average(array, l):
    r = (np.mean(array) + np.mean(l))
    return r

def main():
    res = add(1, 2)
    l = average([1, 2], [3, 4])

