In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import json
from annoy import AnnoyIndex
from mpl_toolkits.mplot3d import Axes3D
from sklearn import decomposition
from pathlib import Path
from transformers import pipeline
from tqdm import tqdm

# this part is to use a existing model to test on our data set

In [24]:
def jsonl_list_to_dataframe(file_list, columns=['language', 'docstring', 'code']):
    """Load a list of jsonl.gz files into a pandas DataFrame."""
    return pd.concat([pd.read_json(f,
                                   orient='records', 
                                   compression='gzip',
                                   lines=True)[columns] 
                      for f in file_list], sort=False)

In [25]:
def get_dfs(path, splits = ["train", "valid", "test"]):
    """Grabs the different data splits and converts them into dataframes"""
    dfs = []
    for split in ["train", "valid", "test"]:
        files = sorted((path/split).glob("**/*.gz"))
        df = jsonl_list_to_dataframe(files)
        dfs.append(df)
        
    return dfs

In [18]:
path = Path('/mnt/sdc/zheng/sde/data_bert')
java_df = get_dfs(path/"java/final/jsonl", ["valid"])[0]
python_df = get_dfs(path/"python/final/jsonl", ["valid"])[0]
python_df.head()

Unnamed: 0,language,docstring,code
0,python,Trains a k-nearest neighbors classifier for fa...,"def train(train_dir, model_save_path=None, n_n..."
1,python,Recognizes faces in given image using a traine...,"def predict(X_img_path, knn_clf=None, model_pa..."
2,python,Shows the face recognition results visually.\n...,"def show_prediction_labels_on_image(img_path, ..."
3,python,Convert a dlib 'rect' object to a plain tuple ...,"def _rect_to_css(rect):\n """"""\n Convert ..."
4,python,"Make sure a tuple in (top, right, bottom, left...","def _trim_css_to_bounds(css, image_shape):\n ..."


In [19]:
langs_df = pd.concat([java_df, python_df]).sample(frac = 0.001)
feature_extractor = pipeline(
    "feature-extraction",
    model="huggingface/CodeBERTa-small-v1",
    tokenizer="huggingface/CodeBERTa-small-v1"
)
len(langs_df)

867

In [27]:
fill_mask = pipeline(
    "fill-mask",
    model="huggingface/CodeBERTa-small-v1",
    tokenizer="huggingface/CodeBERTa-small-v1"
)

In [31]:
# The sun <mask>.
# =>

fill_mask("private int drain(byte[] b, int off, int len) {\n    int remaining = Math.min(len, byteBuffer.remaining());\n    byteBuffer.put(b, off, remaining);\n    <mask> remaining;\n  }")

[{'sequence': '<s> private int drain(byte[] b, int off, int len) {\n    int remaining = Math.min(len, byteBuffer.remaining());\n    byteBuffer.put(b, off, remaining); return remaining;\n  }</s>',
  'score': 0.6020331382751465,
  'token': 345},
 {'sequence': '<s> private int drain(byte[] b, int off, int len) {\n    int remaining = Math.min(len, byteBuffer.remaining());\n    byteBuffer.put(b, off, remaining); // remaining;\n  }</s>',
  'score': 0.1525881588459015,
  'token': 413},
 {'sequence': '<s> private int drain(byte[] b, int off, int len) {\n    int remaining = Math.min(len, byteBuffer.remaining());\n    byteBuffer.put(b, off, remaining); += remaining;\n  }</s>',
  'score': 0.03793520852923393,
  'token': 1039},
 {'sequence': '<s> private int drain(byte[] b, int off, int len) {\n    int remaining = Math.min(len, byteBuffer.remaining());\n    byteBuffer.put(b, off, remaining); = remaining;\n  }</s>',
  'score': 0.03272952511906624,
  'token': 272},
 {'sequence': '<s> private int dra

In [29]:
fill_mask("This is a <mask>.")

# This is the beginning of a beautiful <mask>.
# =>

[{'sequence': '<s> This is a value.</s>',
  'score': 0.03230028599500656,
  'token': 602},
 {'sequence': '<s> This is a null.</s>',
  'score': 0.020313432440161705,
  'token': 469},
 {'sequence': '<s> This is a boolean.</s>',
  'score': 0.015249653719365597,
  'token': 1411},
 {'sequence': '<s> This is a strict.</s>',
  'score': 0.01490972749888897,
  'token': 7056},
 {'sequence': '<s> This is a Boolean.</s>',
  'score': 0.014499250799417496,
  'token': 4409}]

In [20]:
def get_features(methods, extractor):
    method_features = []
    for method in tqdm(methods):
        features = np.mean(extractor(method)[0], axis = 0)
        method_features.append(features)
    
    return method_features

In [21]:
X = get_features(langs_df.code.values, feature_extractor)
y = langs_df.language.values

100%|██████████| 867/867 [01:17<00:00, 11.21it/s]


In [22]:
np.shape(X)

(867, 768)

In [9]:
for i, lang in enumerate(y):
    if lang == 'java':
        y[i] = 0
    elif lang == 'python':
        y[i] = 1
y = y.astype(np.int32)

%matplotlib inline

In [1]:
fig = plt.figure(1, figsize=(4, 3))
plt.clf()
ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=64, azim=134)

plt.cla()
pca = decomposition.PCA(n_components=3)
pca.fit(X)
X = pca.transform(X)

for name, label in [('java', 0), ('python', 1)]:
    ax.text3D(X[y == label, 0].mean(),
              X[y == label, 1].mean() + 1.5,
              X[y == label, 2].mean(), name,
              horizontalalignment='center',
              bbox=dict(alpha=.5, edgecolor='w', facecolor='w'))
# Reorder the labels to have colors matching the cluster results
# y = np.choose(y, [0, 1]).astype(np.float)
ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=y, cmap=plt.cm.rainbow,
           edgecolor='k')

ax.w_xaxis.set_ticklabels([])
ax.w_yaxis.set_ticklabels([])
ax.w_zaxis.set_ticklabels([])

plt.show()

NameError: name 'plt' is not defined

In [13]:
import pylab as pl
from sklearn.manifold import TSNE
import copy
X = copy.deepcopy(X_O)
X = TSNE(n_components=2, random_state=None,).fit_transform(X)

# Reorder the labels to have colors matching the cluster results
# y = np.choose(y, [0, 1]).astype(np.float)
pl.figure()
for name, label in [('java', 0), ('python', 1)]:
    pl.scatter(X[y==label,0], X[y==label,1], label=name)
pl.legend(['java','python'])
pl.title('2d t-SNE of python and java')

pl.show()

NameError: name 'X_O' is not defined

# In this part, we train a BPE on our own first, then train the model

In [5]:
# In this part, we train a BPE on our own first, then train the model
path = Path('/mnt/sdc/zheng/sde/data_bert/java')

In [6]:
java_df = get_dfs(path/"final/jsonl", ["train"])

In [60]:
java_df_val = get_dfs(path/"final/jsonl", ["valid"])
java_df_test = get_dfs(path/"final/jsonl", ["test"])

In [61]:
print(len(java_df), len(java_df_val), len(java_df_test))

3 3 3


In [7]:
paths = [str(x) for x in Path("/mnt/sdc/zheng/sde/data/transformer/data").glob("**/train.txt")]

In [44]:
paths

['/mnt/sdc/zheng/sde/data/transformer/data/train.txt']

In [13]:
from tokenizers import ByteLevelBPETokenizer
# Initialize a tokenizer
import time

for size in [2000, 5000, 10000, 20000]:
    os.makedirs('./'+str(size), exist_ok = True)
    start = time.time()

    tokenizer = ByteLevelBPETokenizer()

    # Customize training
    tokenizer.train(files=paths, vocab_size=size, min_frequency=2, special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ])
    end = time.time()
    print("Test --- %s seconds ---" % (end - start))
    
    # Save files to disk
    tokenizer.save('./'+str(size), "esperberto_"+str(size)+"size")

Test --- 623.974826335907 seconds ---
Test --- 632.8605041503906 seconds ---
Test --- 637.707799911499 seconds ---
Test --- 644.5306007862091 seconds ---


In [24]:
size = 10000
vocab_json =json.load(open('/home/zheng/sde_course/hugg/'+str(size)+'/esperberto_'+str(size)+'size-vocab.json','r'))

In [25]:
vocab_json

{'<s>': 0,
 '<pad>': 1,
 '</s>': 2,
 '<unk>': 3,
 '<mask>': 4,
 '!': 5,
 '"': 6,
 '#': 7,
 '$': 8,
 '%': 9,
 '&': 10,
 "'": 11,
 '(': 12,
 ')': 13,
 '*': 14,
 '+': 15,
 ',': 16,
 '-': 17,
 '.': 18,
 '/': 19,
 '0': 20,
 '1': 21,
 '2': 22,
 '3': 23,
 '4': 24,
 '5': 25,
 '6': 26,
 '7': 27,
 '8': 28,
 '9': 29,
 ':': 30,
 ';': 31,
 '<': 32,
 '=': 33,
 '>': 34,
 '?': 35,
 '@': 36,
 'A': 37,
 'B': 38,
 'C': 39,
 'D': 40,
 'E': 41,
 'F': 42,
 'G': 43,
 'H': 44,
 'I': 45,
 'J': 46,
 'K': 47,
 'L': 48,
 'M': 49,
 'N': 50,
 'O': 51,
 'P': 52,
 'Q': 53,
 'R': 54,
 'S': 55,
 'T': 56,
 'U': 57,
 'V': 58,
 'W': 59,
 'X': 60,
 'Y': 61,
 'Z': 62,
 '[': 63,
 '\\': 64,
 ']': 65,
 '^': 66,
 '_': 67,
 '`': 68,
 'a': 69,
 'b': 70,
 'c': 71,
 'd': 72,
 'e': 73,
 'f': 74,
 'g': 75,
 'h': 76,
 'i': 77,
 'j': 78,
 'k': 79,
 'l': 80,
 'm': 81,
 'n': 82,
 'o': 83,
 'p': 84,
 'q': 85,
 'r': 86,
 's': 87,
 't': 88,
 'u': 89,
 'v': 90,
 'w': 91,
 'x': 92,
 'y': 93,
 'z': 94,
 '{': 95,
 '|': 96,
 '}': 97,
 '~': 98,
 

In [45]:
# choose 1% data to train a small model
paths = [str(x) for x in Path("/mnt/sdc/zheng/sde/data/transformer/data").glob("**/*.txt")]

In [71]:
name_dict = {0:'train_slice_001.txt',1:'valid_slice_001.txt',2:'test_slice_001.txt'}
for i in range(3):
    print_df = java_df[i].sample(frac = 0.01)
    print(print_df.shape)
    f = open('/home/zheng/'+name_dict[i],'a')
    for code_item in print_df['code']:        
        print(code_item,file=f)

(4545, 3)
(153, 3)
(269, 3)


In [69]:
print_df.columns

Index(['language', 'docstring', 'code'], dtype='object')

In [40]:
print(langs_df.shape, langs_df_val.shape, langs_df_test.shape)

(4545, 3) (4545, 3) (4545, 3)


In [None]:
for item in langs_df:
    print(item['code'], file=f)

In [67]:
langs_df['code'].iloc[0]

'public static Operation createUndeployOperation(final Set<UndeployDescription> undeployDescriptions) {\n        Assertions.requiresNotNullOrNotEmptyParameter("undeployDescriptions", undeployDescriptions);\n        final CompositeOperationBuilder builder = CompositeOperationBuilder.create(true);\n        for (UndeployDescription undeployDescription : undeployDescriptions) {\n            addUndeployOperationStep(builder, undeployDescription);\n        }\n        return builder.build();\n    }'

In [73]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


tokenizer = ByteLevelBPETokenizer(
    '/home/zheng/sde_course/hugg/'+str(size)+'/esperberto_'+str(size)+'size-vocab.json',
    '/home/zheng/sde_course/hugg/'+str(size)+'/esperberto_'+str(size)+'size-merges.txt',
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

print(
    tokenizer.encode("Mi estas Julien.")
)
# Encoding(num_tokens=7, ...)
# tokens: ['<s>', 'Mi', 'Ġestas', 'ĠJuli', 'en', '.', '</s>']

Encoding(num_tokens=11, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [None]:
tokenizer

In [74]:
tokenizer

Tokenizer(vocabulary_size=10000, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)