## Imports and tokenizer

In [1]:
import json
import os
import shutil
from collections import Counter, defaultdict

from transformers import AutoTokenizer

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Load the pretrained GPT-2 tokenizer; later cells use its pre-tokenizer to
# split code into words before the BPE merges are learned and applied.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

## Create a corpus from all raw data

In [3]:
# Load every JSON file found one directory level below the dataset root.
# Each parsed file becomes one entry of arrOfData.

src = "/workspaces/galeras-benchmark/datasets/code_smells"

arrOfData = []

# Sorted so the corpus (and therefore merge tie-breaking) is the same on every run;
# os.listdir returns entries in arbitrary order.
src_files = sorted(os.listdir(src))
for file_name in src_files:
    full_file_name = os.path.join(src, file_name)
    # A plain file directly under the root cannot be listed; skip it.
    if not os.path.isdir(full_file_name):
        continue
    src_files_2 = sorted(os.listdir(full_file_name))
    for json_name in src_files_2:
        json_file_name = os.path.join(full_file_name, json_name)
        if os.path.isfile(json_file_name):
            # The context manager closes the handle as soon as the file is parsed.
            with open(json_file_name, encoding="utf-8") as json_file:
                arrOfData.append(json.load(json_file))

In [4]:
# Count how often each pre-tokenized word occurs across every record's "code" field.
word_freqs = defaultdict(int)
pre_tokenizer = tokenizer.backend_tokenizer.pre_tokenizer

for records in arrOfData:
    for record in records:
        # pre_tokenize_str yields (word, offsets) pairs; only the word is counted.
        for word, _offsets in pre_tokenizer.pre_tokenize_str(record["code"]):
            word_freqs[word] += 1

In [5]:
# Every distinct character that appears in any counted word, in sorted order.
alphabet = sorted({letter for word in word_freqs for letter in word})

# The vocabulary starts with the "<|endoftext|>" entry followed by the characters.
vocab = ["<|endoftext|>", *alphabet]

In [6]:
# Every word starts out split into its individual characters.
splits = {word: list(word) for word in word_freqs}

In [7]:
def compute_pair_freqs(splits):
    """Count adjacent symbol pairs, weighted by how often each word occurs.

    Reads the module-level ``word_freqs`` mapping (word -> count).

    Args:
        splits: mapping from each word in ``word_freqs`` to its current list
            of symbols.

    Returns:
        A ``defaultdict(int)`` mapping ``(left, right)`` symbol pairs to the
        summed frequency of the words in which the two symbols are adjacent.
    """
    totals = defaultdict(int)
    for word, count in word_freqs.items():
        symbols = splits[word]
        # Pairing each symbol with its successor yields nothing for a word
        # that has fewer than two symbols.
        for left, right in zip(symbols, symbols[1:]):
            totals[(left, right)] += count
    return totals

In [8]:
# Pair statistics for the initial, character-level splits.
pair_freqs = compute_pair_freqs(splits)

# Most frequent pair; on a tie the pair encountered first wins.
# With no pairs at all the values stay at "" and None.
best_pair, max_freq = max(
    pair_freqs.items(),
    key=lambda item: item[1],
    default=("", None),
)

# Learned merges: (left, right) -> merged symbol.
merges = {}
        

In [9]:
# Applies one merge rule to every word.
def merge_pair(a, b, splits):
    """Fuse every adjacent occurrence of ``a`` followed by ``b`` into ``a + b``.

    Iterates over the words of the module-level ``word_freqs`` and rewrites
    their entries in ``splits``.

    Args:
        a: left symbol of the pair.
        b: right symbol of the pair.
        splits: mapping from word to its current list of symbols; updated in place.

    Returns:
        The same ``splits`` mapping that was passed in.
    """
    for word in word_freqs:
        if len(splits[word]) == 1:
            # A single symbol has no neighbour to merge with.
            continue

        symbols = list(splits[word])
        pos = 0
        while pos < len(symbols) - 1:
            if (symbols[pos], symbols[pos + 1]) == (a, b):
                # Collapse the two symbols into one and re-examine this position.
                symbols[pos:pos + 2] = [a + b]
            else:
                pos += 1
        splits[word] = symbols
    return splits

# Learns merges: repeatedly fuses the most frequent pair until the vocabulary is full.
def makePairs(splits, vSize):
    """Learn BPE merges until the vocabulary holds ``vSize`` entries.

    Each round recomputes the pair frequencies, merges the most frequent pair
    in ``splits`` (the pair encountered first wins a tie), records it in the
    module-level ``merges`` dict and appends the merged symbol to the
    module-level ``vocab`` list.

    Args:
        splits: mapping from word to its current list of symbols; updated in
            place by ``merge_pair``.
        vSize: target length of ``vocab``.

    Returns:
        None. Results are left in ``splits``, ``merges`` and ``vocab``.
    """
    while len(vocab) < vSize:
        pair_freqs = compute_pair_freqs(splits)
        if not pair_freqs:
            # Every word is already a single symbol, so nothing is left to
            # merge. Without this guard the empty best pair would be unpacked
            # into merge_pair and raise TypeError.
            break
        best_pair = max(pair_freqs, key=pair_freqs.get)
        merged = best_pair[0] + best_pair[1]
        splits = merge_pair(*best_pair, splits)
        merges[best_pair] = merged
        vocab.append(merged)

In [10]:
# Replace 414 with the target vocabulary size: merging continues until len(vocab) reaches it.
makePairs(splits, 414)

## Deduplicate file

#### manually

In [11]:
def tokenize(text):
    """Tokenize ``text`` with the merges learned in this notebook.

    The text is split into words by the GPT-2 pre-tokenizer, each word is
    broken into characters, and every rule in the module-level ``merges``
    dict is then applied in insertion order.

    Args:
        text: the string to tokenize.

    Returns:
        A flat list of token strings covering all words in order.
    """
    # Use the public accessor, as the word-frequency cell does, rather than
    # the private ``_tokenizer`` attribute.
    pre_tokenizer = tokenizer.backend_tokenizer.pre_tokenizer
    words = [word for word, _offsets in pre_tokenizer.pre_tokenize_str(text)]
    splits = [list(word) for word in words]

    for (left, right), merged in merges.items():
        for idx, split in enumerate(splits):
            i = 0
            while i < len(split) - 1:
                if split[i] == left and split[i + 1] == right:
                    split = split[:i] + [merged] + split[i + 2:]
                else:
                    i += 1
            splits[idx] = split

    # Flatten in one pass; sum(splits, []) copies the growing list for every word.
    return [token for split in splits for token in split]

In [12]:
# Similarity measures applied by isDupeHelper to the BPE token lists.
def jaccard(list1, list2):
    """Multiset Jaccard similarity of two token lists.

    Repeated tokens count as often as they occur: the intersection takes the
    smaller count of each token and the union the larger one. Identical lists
    therefore always score 1.0, which was not the case when a set-based
    intersection was divided by a length-based union.

    Args:
        list1: first iterable of hashable tokens.
        list2: second iterable of hashable tokens.

    Returns:
        A float in [0.0, 1.0]. Two empty inputs are treated as identical (1.0).
    """
    counts1 = Counter(list1)
    counts2 = Counter(list2)
    union = sum((counts1 | counts2).values())
    if union == 0:
        # Both inputs are empty; avoid dividing by zero.
        return 1.0
    intersection = sum((counts1 & counts2).values())
    return float(intersection) / union

def jaccardSet(list1, list2):
    """Jaccard similarity of the distinct tokens in two lists.

    Args:
        list1: first iterable of hashable tokens; duplicates are ignored.
        list2: second iterable of hashable tokens; duplicates are ignored.

    Returns:
        ``len(A & B) / len(A | B)`` as a float, where A and B are the sets of
        tokens. Two empty inputs are treated as identical (1.0).
    """
    set1 = set(list1)
    set2 = set(list2)
    union = set1 | set2
    if not union:
        # Both inputs are empty; avoid dividing by zero.
        return 1.0
    return float(len(set1 & set2)) / len(union)

In [13]:
def isDupeHelper(data, i, j):
    """Report whether entries ``i`` and ``j`` of ``data`` count as duplicates.

    Both entries must carry a "BPE_tokens" list. They are duplicates when
    ``jaccard`` of the two lists is at least 0.7 and ``jaccardSet`` is at
    least 0.8.

    NOTE(review): an earlier comment here read "this is broken, will only ever
    iterate a single time"; nothing in this function iterates, so it probably
    described a previous version - confirm.

    Args:
        data: indexable collection of records.
        i: index of the first record.
        j: index of the second record.

    Returns:
        True when both thresholds are met, otherwise False.
    """
    tokens_i = data[i]["BPE_tokens"]
    tokens_j = data[j]["BPE_tokens"]
    # The second measure is only evaluated when the first one passes.
    return jaccard(tokens_i, tokens_j) >= 0.7 and jaccardSet(tokens_i, tokens_j) >= 0.8

In [14]:
# Manual deduplication of a single file, based on each record's "code" field.
src = "/workspaces/galeras-benchmark/datasets/code_smells/aider/data_1.json"
# Read as UTF-8, like the corpus loader, and close the handle when done.
with open(src, encoding="utf-8") as f:
    data = json.load(f)

tokenSet = set()

for record in data:
    # Remember every distinct code string and attach the BPE tokens that the
    # similarity checks read.
    tokenSet.add(record["code"])
    record["BPE_tokens"] = tokenize(record["code"])

def Deduper(data):
    """Collect the code strings that should be dropped as near-duplicates.

    A code string is flagged when ``isDupeHelper`` matches it against a
    different code string that first appears later in ``data``. The last
    member of a group of near-duplicates is therefore never flagged.

    Comparisons run between distinct code strings only. The result holds
    strings, so flagging one because of an identical copy (or because of
    itself, which always matches) would remove every copy of it.

    Args:
        data: list of records, each with a "code" string and the
            "BPE_tokens" list that ``isDupeHelper`` reads.

    Returns:
        A set of "code" strings to exclude.
    """
    # Index of the first record carrying each distinct code string, in file order.
    first_seen = {}
    for idx, record in enumerate(data):
        first_seen.setdefault(record["code"], idx)
    candidates = list(first_seen.values())

    retSet = set()
    for pos, i in enumerate(candidates):
        # Only look at later candidates: an entry always matches itself.
        if any(isDupeHelper(data, i, j) for j in candidates[pos + 1:]):
            retSet.add(data[i]["code"])
    return retSet

In [15]:
Lset = Deduper(data)

# Code strings that were not flagged as near-duplicates.
goodSet = tokenSet - Lset
retFile = []
emitted = set()

for record in data:
    code = record["code"]
    # Keep one record per surviving code string so exact copies collapse too.
    if code in goodSet and code not in emitted:
        emitted.add(code)
        # Leave "BPE_tokens" out of the output (original note: Python had
        # trouble interpreting the special characters used in BPE). Filtering
        # a copy keeps `data` intact, so this cell can be re-run.
        retFile.append({key: value for key, value in record.items() if key != "BPE_tokens"})

In [16]:
# Write UTF-8 explicitly: ensure_ascii=False emits non-ASCII characters verbatim,
# and the platform's default encoding may not be able to represent them.
with open("testing_dedupe", "w", encoding="utf-8") as f:
    json.dump(retFile, f, ensure_ascii=False, indent=4)

#### with tool

In [17]:
#with tool
from dpu_utils.codeutils.deduplication import DuplicateDetector

In [20]:
src = "/workspaces/galeras-benchmark/datasets/code_smells/aider/data_1.json"
# Read as UTF-8, like the corpus loader, and close the handle when done.
with open(src, encoding="utf-8") as f:
    data = json.load(f)


# NOTE: Deduper below builds its own detector and does not use this instance.
dupes = DuplicateDetector()


def Deduper(data):
    """Return the records of ``data`` with duplicates removed via DuplicateDetector.

    Each distinct "code" string is registered once with the detector, using
    the string itself as the document id and ``tokenize(code)`` as its tokens.
    Records whose code the detector excludes are dropped, and only the first
    record of each remaining code string is kept, so exact copies collapse too.

    Args:
        data: list of records, each with a "code" string.

    Returns:
        A new list with the surviving records, in their original order.
    """
    detector = DuplicateDetector()

    registered = set()
    for record in data:
        code = record["code"]
        if code not in registered:
            registered.add(code)
            detector.add_file(code, tokenize(code))

    try:
        exclude = detector.compute_ids_to_exclude()
    except TypeError:
        # The dataset-wide cell of this notebook guards this call the same way
        # and notes that it fails when a file contains no duplicates.
        # NOTE(review): confirm against the installed dpu_utils version.
        exclude = set()

    retFileTool = []
    emitted = set()
    for record in data:
        code = record["code"]
        if code not in exclude and code not in emitted:
            emitted.add(code)
            retFileTool.append(record)

    return retFileTool

In [21]:
# Run the tool-based deduplication on the records loaded from `src`.
retFileTool = Deduper(data)

In [22]:
# Save the tool-based result. The previous version dumped `retFile`, the output
# of the manual approach, so the tool's result was never written.
# Note that this overwrites the file written by the manual section.
with open("testing_dedupe", "w", encoding="utf-8") as f:
    json.dump(retFileTool, f, ensure_ascii=False, indent=4)

## For the dataset

What deduplicating the entire dataset with this tool might look like

In [24]:
src = "/workspaces/galeras-benchmark/datasets/code_smells"
DEDUPED_SUFFIX = "Deduped.json"

src_files = os.listdir(src)


for file_name in src_files:
    full_file_name = os.path.join(src, file_name)
    # A plain file directly under the root cannot be listed; skip it.
    if not os.path.isdir(full_file_name):
        continue
    src_files_2 = os.listdir(full_file_name)
    for json_name in src_files_2:
        json_file_name = os.path.join(full_file_name, json_name)
        if not os.path.isfile(json_file_name):
            continue
        # Outputs are written next to their inputs; skip them so a re-run
        # does not deduplicate its own results.
        if json_name.endswith(DEDUPED_SUFFIX):
            continue

        with open(json_file_name, encoding="utf-8") as f:
            data = json.load(f)

        # Register each distinct code string once, using the string as the id.
        totalSet = set()
        dupes = DuplicateDetector()
        for record in data:
            code = record["code"]
            if code not in totalSet:
                dupes.add_file(code, tokenize(code))
                totalSet.add(code)

        # Original note: this call raises when the file has no duplicates.
        # Fall back to excluding nothing; printing and carrying on left
        # `exclude` unbound or still holding the previous file's ids.
        # NOTE(review): confirm the exception type against the installed dpu_utils.
        try:
            exclude = dupes.compute_ids_to_exclude()
        except TypeError:
            exclude = set()

        # Keep the first record of every surviving code string.
        retFileTool = []
        emitted = set()
        for record in data:
            code = record["code"]
            if code not in exclude and code not in emitted:
                emitted.add(code)
                retFileTool.append(record)

        # Dump this file's own result (the previous version wrote `retFile`,
        # the output of the manual section, into every output file).
        with open(json_file_name + DEDUPED_SUFFIX, "w", encoding="utf-8") as f:
            json.dump(retFileTool, f, ensure_ascii=False, indent=4)

KeyboardInterrupt: 