In [None]:
!pip install nb-black
!pip install protobuf==3.20.1
!pip install -q transformers sentencepiece

%load_ext lab_black

In [1]:
import json
import numpy as np
import os
import pandas as pd
import torch

import ast

In [15]:
with open("../data/raw/Conala/conala-corpus/conala-train.json") as f:
    train = json.load(f)

with open("../data/raw/Conala/conala-corpus/conala-test.json") as f:
    test = json.load(f)

In [16]:
def parse_to_pandas(data):
    df = pd.DataFrame(columns=["intent", "snippet"])
    for d in data:
        df = df.append(
            {
                "intent": d["intent"],
                "intent_std": d["rewritten_intent"],
                "snippet": d["snippet"],
            },
            ignore_index=True,
        )
    return df

In [17]:
train = parse_to_pandas(train)
train = train.drop_duplicates()

test = parse_to_pandas(test)
test = test.drop_duplicates()
print(train.shape)
print(test.shape)

(2367, 3)
(494, 3)


### Code Clone Detection
#### Preparing the dataset

Type 1: Identical code clones except for differences in white-spaces, layouts
and comments. It is known as exact clones. Table 1 presents an example of two
fragments of code clones where the difference between them is the comment
highlighted in grey. The pair of code fragments are exact copies of each other.
Hence, they are clones of Type 1.
– Type 2: Syntactically identical code clones except for differences in identifiers
name, data types, whitespace, layouts, and comments are Type 2 clones. As
shown in Table 2, the two fragments will be exact when we ignore the naming
differences (function name, name of input variables). These two code fragments
are Type 2 clones of each other.
– Type 3: Code clones with some modification, addition or deletion of lines in
addition to a difference in identifiers, data types, whitespaces, and comments.
Examples of two Type 3 code fragments are showed in Table 3. These two
code fragments are different in the function name and the addition of 2 lines
for another condition in the second code fragment.

In [12]:
train.shape

(2379, 3)

In [21]:
train[train.intent=='How to convert a list of multiple integers into a single integer?']

Unnamed: 0,intent,snippet,intent_std
0,How to convert a list of multiple integers int...,"sum(d * 10 ** i for i, d in enumerate(x[::-1]))",Concatenate elements of a list 'x' of multiple...
1,How to convert a list of multiple integers int...,"r = int(''.join(map(str, x)))",convert a list of integers into a single integer


In [26]:
from pprint import pprint

In [28]:
!pip install astunparse

Collecting astunparse
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Installing collected packages: astunparse
Successfully installed astunparse-1.6.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [27]:
tree = ast.parse(train.snippet[0])
pprint(ast.dump(tree))

("Module(body=[Expr(value=Call(func=Name(id='sum', ctx=Load()), "
 "args=[GeneratorExp(elt=BinOp(left=Name(id='d', ctx=Load()), op=Mult(), "
 "right=BinOp(left=Constant(value=10), op=Pow(), right=Name(id='i', "
 "ctx=Load()))), generators=[comprehension(target=Tuple(elts=[Name(id='i', "
 "ctx=Store()), Name(id='d', ctx=Store())], ctx=Store()), "
 "iter=Call(func=Name(id='enumerate', ctx=Load()), "
 "args=[Subscript(value=Name(id='x', ctx=Load()), "
 'slice=Slice(step=UnaryOp(op=USub(), operand=Constant(value=1))), '
 'ctx=Load())], keywords=[]), ifs=[], is_async=0)])], keywords=[]))], '
 'type_ignores=[])')


In [32]:
import inspect
import astunparse

In [38]:
train.snippet[0]

'sum(d * 10 ** i for i, d in enumerate(x[::-1]))'

In [36]:
# get back the source code
print(astunparse.unparse(ast.parse(train.snippet[0])))


sum(((d * (10 ** i)) for (i, d) in enumerate(x[::(- 1)])))



In [41]:
tree = ast.parse(train.snippet[0])

In [49]:
tree.body[0].value

<ast.Call at 0x7f6b6d2a68b0>

In [50]:
class AnalysisNodeVisitor(ast.NodeVisitor):
    def visit_Import(self,node):
        ast.NodeVisitor.generic_visit(self, node)

    def visit_ImportFrom(self,node):
        ast.NodeVisitor.generic_visit(self, node)

    def visit_Assign(self,node):
        print('Node type: Assign and fields: ', node._fields)
        ast.NodeVisitor.generic_visit(self, node)
    
    def visit_BinOp(self, node):
        print('Node type: BinOp and fields: ', node._fields)
        ast.NodeVisitor.generic_visit(self, node)

    def visit_Expr(self, node):
        print('Node type: Expr and fields: ', node._fields)
        ast.NodeVisitor.generic_visit(self, node)

    def visit_Num(self,node):
        print('Node type: Num and fields: ', node._fields)

    def visit_Name(self,node):
        print('Node type: Name and fields: ', node._fields)
        ast.NodeVisitor.generic_visit(self, node)

    def visit_Str(self, node):
        print('Node type: Str and fields: ', node._fields)

In [51]:
v = AnalysisNodeVisitor()
v.visit(tree)

Node type: Expr and fields:  ('value',)
Node type: Name and fields:  ('id', 'ctx')
Node type: BinOp and fields:  ('left', 'op', 'right')
Node type: Name and fields:  ('id', 'ctx')
Node type: BinOp and fields:  ('left', 'op', 'right')
Node type: Num and fields:  ('value', 'kind')
Node type: Name and fields:  ('id', 'ctx')
Node type: Name and fields:  ('id', 'ctx')
Node type: Name and fields:  ('id', 'ctx')
Node type: Name and fields:  ('id', 'ctx')
Node type: Name and fields:  ('id', 'ctx')
Node type: Num and fields:  ('value', 'kind')


In [34]:
# get a pretty-printed dump of the AST
print(astunparse.dump(ast.parse(train.snippet[0])))

Module(
  body=[Expr(value=Call(
    func=Name(
      id='sum',
      ctx=Load()),
    args=[GeneratorExp(
      elt=BinOp(
        left=Name(
          id='d',
          ctx=Load()),
        op=Mult(),
        right=BinOp(
          left=Constant(
            value=10,
            kind=None),
          op=Pow(),
          right=Name(
            id='i',
            ctx=Load()))),
      generators=[comprehension(
        target=Tuple(
          elts=[
            Name(
              id='i',
              ctx=Store()),
            Name(
              id='d',
              ctx=Store())],
          ctx=Store()),
        iter=Call(
          func=Name(
            id='enumerate',
            ctx=Load()),
          args=[Subscript(
            value=Name(
              id='x',
              ctx=Load()),
            slice=Slice(
              lower=None,
              upper=None,
              step=UnaryOp(
                op=USub(),
                operand=Constant(
                  value=

In [20]:
train[train.duplicated('intent')].loc[1]['intent']

'How to convert a list of multiple integers into a single integer?'

### Variable Misuse Detection
#### Preparing the dataset

### Code Summarization
#### Preparing the dataset

### Code Query Matching
#### Preparing the dataset

In [37]:
node_type_set = set()
node_type_ID_mapping = {}

embedding_size = 10

# Initialize the embedding matrix

embedding_matrix = np.zeros((len(node_type_ID_mapping), embedding_size))

# Recursive function to traverse and embed the AST

def encode_ast(node):

    if isinstance(node, ast.AST):
        
        node_type = node.__class__.__name__

        if node_type not in node_type_set:
            node_type_set.add(node_type)
            node_type_ID_mapping[node_type] = len(node_type_ID_mapping)

        node_id = node_type_ID_mapping.get(node_type)

        # Embed the current node
        embedding = np.random.rand(
            embedding_size
        )  # Replace with your desired embedding method
        embedding_matrix[node_id] = embedding

        # Recursively iterate for child nodes
        for child_node in ast.iter_child_nodes(node):
            encode_ast(child_node)


# Parse the Python code into an AST
tree = ast.parse(train.snippet[0])

# Embed the AST
encode_ast(tree)

# Print the embedding matrix
print(embedding_matrix)

KeyError: 'Mult'