## Imports and tokenizer

In [1]:
import os
import shutil
from transformers import AutoTokenizer
from collections import defaultdict
import json

In [2]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")

## Create a corpus from all raw data

In [3]:
#loads in data from all json files

src = "C:\\Users\\theso\\semeruDatasets"

arrOfData = []

src_files = os.listdir(src)
for file_name in src_files:
    full_file_name = os.path.join(src, file_name)
    src_files_2 = os.listdir(full_file_name)
    for json_name in src_files_2:
        json_file_name = os.path.join(full_file_name, json_name)
        if os.path.isfile(json_file_name):
            arrOfData += [json.load(open(json_file_name, encoding="utf-8"))]

In [7]:
word_freqs = defaultdict(int)

for i in range(len(arrOfData)):
    #print(i / len(arrOfData) * 100)
    for j in range(len(arrOfData[i])):
        words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(arrOfData[i][j]["code"])
        new_words = [word for word, offset in words_with_offsets]
        for word in new_words:
            word_freqs[word] += 1

In [8]:
alphabet = []

for word in word_freqs.keys():
    for letter in word:
        if letter not in alphabet:
            alphabet.append(letter)
alphabet.sort()

vocab = ["<|endoftext|>"] + alphabet.copy()

print(vocab)
print(len(vocab))

['<|endoftext|>', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', 'ª', '«', '¬', '®', '¯', '°', '±', '²', '³', '´', 'µ', '¶', '·', '¸', '¹', 'º', '»', '¼', '½', '¾', '¿', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'ï', 'ð', 'ĉ', 'Ċ', 'Č', 'č', 'Ġ', 'Ģ', 'ģ', 'Ĥ', 'ĥ', 'Ħ', 'ħ', 'Ĩ', 'ĩ', 'Ī', 'ī', 'Ĭ', 'ĭ', 'Į', 'į', 'İ', 'ı', 'Ĳ', 'ĳ', 'Ĵ', 'ĵ', 'Ķ', 'ķ', 'ĸ', 'Ĺ', '

In [9]:
splits = {word: [c for c in word] for word in word_freqs.keys()}

In [10]:
def compute_pair_freqs(splits):
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        if len(split) == 1:
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            pair_freqs[pair] += freq
    return pair_freqs

In [11]:
pair_freqs = compute_pair_freqs(splits)


#finding best pairs
best_pair = ""
max_freq = None

for pair, freq in pair_freqs.items():
    if max_freq is None or max_freq < freq:
        best_pair = pair
        max_freq = freq
        
merges = {}
        
print(best_pair, max_freq)

('Ġ', 'Ġ') 47819588


In [12]:
#merges pairs
def merge_pair(a, b, splits):
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue

        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                split = split[:i] + [a + b] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits

#finds most frequent pairs and adds them to splits
def makePairs(splits, vSize):    
    vocab_size = vSize
    while len(vocab) < vocab_size:
        pair_freqs = compute_pair_freqs(splits)
        best_pair = ""
        max_freq = None
        for pair, freq in pair_freqs.items():
            if max_freq is None or max_freq < freq:
                best_pair = pair
                max_freq = freq
        splits = merge_pair(*best_pair, splits)
        merges[best_pair] = best_pair[0] + best_pair[1]
        vocab.append(best_pair[0] + best_pair[1])

In [14]:
#replce 414 with length of vocab
makePairs(splits, 414)

print(vocab)
print(len(vocab))

['<|endoftext|>', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '¡', '¢', '£', '¤', '¥', '¦', '§', '¨', '©', 'ª', '«', '¬', '®', '¯', '°', '±', '²', '³', '´', 'µ', '¶', '·', '¸', '¹', 'º', '»', '¼', '½', '¾', '¿', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'ï', 'ð', 'ĉ', 'Ċ', 'Č', 'č', 'Ġ', 'Ģ', 'ģ', 'Ĥ', 'ĥ', 'Ħ', 'ħ', 'Ĩ', 'ĩ', 'Ī', 'ī', 'Ĭ', 'ĭ', 'Į', 'į', 'İ', 'ı', 'Ĳ', 'ĳ', 'Ĵ', 'ĵ', 'Ķ', 'ķ', 'ĸ', 'Ĺ', '

## Deduplicate file

#### manually

In [18]:
def tokenize(text):
    pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]
    splits = [[l for l in word] for word in pre_tokenized_text]
    for pair, merge in merges.items():
        for idx, split in enumerate(splits):
            i = 0
            while i < len(split) - 1:
                if split[i] == pair[0] and split[i + 1] == pair[1]:
                    split = split[:i] + [merge] + split[i + 2 :]
                else:
                    i += 1
            splits[idx] = split

    return sum(splits, [])

In [19]:
#call these functions with word freqs
def jaccard(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

def jaccardSet(list1, list2):
    list1 = set(list1)
    list2 = set(list2)
    
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

In [20]:
def isDupeHelper(data, i, j):
    
    #nvm this is broken, will only ever iterate a single time 
    if jaccard(data[i]["BPE_tokens"], data[j]["BPE_tokens"]) >= 0.7 and jaccardSet(data[i]["BPE_tokens"],data[j]["BPE_tokens"]) >= 0.8:
        return True
    else:
        return False    

In [21]:
#manual dedupes
#for code section
src = "C:\\Users\\theso\\semeruDatasets\\kivy\\data_1.json"
data = json.load(open(src))

tokenSet = set()

for i in range(len(data)):
    #compare each code sections to every code section following it
    tokenSet.add(data[i]["code"])
    data[i].update({"BPE_tokens": tokenize(data[i]["code"])})

#for i in len(list): iterate through every item

def Deduper(data):
    
    retSet = set()
    
    for i in range(len(data)): 
        dupe = False
        for j in range(i, len(data)):
            if isDupeHelper(data, i, j):
                #print(str(jaccard(data[i]["BPE_tokens"], data[j]["BPE_tokens"])) + " " + str(jaccardSet(data[i]["BPE_tokens"],data[j]["BPE_tokens"]))) 
                retSet.add(data[i]["code"])
                #print(retSet)
    return retSet

In [22]:
Lset = Deduper(data)

goodSet = tokenSet - Lset
retFile = []

for i in range(len(data)): 
    if data[i]["code"] in goodSet:
        del data[i]["BPE_tokens"] #this is because python has an interpertation problem with the special characters used in BPE
        retFile.append(data[i])

In [23]:
with open("testing_dedupe", "w") as f:
    json.dump(retFile, f, ensure_ascii=False, indent=4)

In [24]:
print("size after removing duplicates: " + str(len(retFile)))

size after removing duplicates: 288


#### with tool

In [25]:
#with tool
from dpu_utils.codeutils.deduplication import DuplicateDetector

In [43]:
src = "C:\\Users\\theso\\semeruDatasets\\kivy\\data_1.json"
data = json.load(open(src))


dupes = DuplicateDetector()


def Deduper(data):
    dupes = DuplicateDetector()

    totalSet = set()
          
    for i in range(len(data)):

        if data[i]["code"] not in totalSet:   
            dupes.add_file(data[i]["code"], tokenize(data[i]["code"]))

        totalSet.add(data[i]["code"])
        
    what = dupes.compute_duplicates()
        
    exclude = dupes.compute_ids_to_exclude()
    
    retFileTool = []

    for i in range(len(data)): 
        if data[i]["code"] not in exclude: 
            retFileTool.append(data[i])
            
    return retFileTool

In [44]:
retFileTool = Deduper(data)

In [41]:
with open("testing_dedupe", "w") as f:
    json.dump(retFile, f, ensure_ascii=False, indent=4)

In [42]:
print("size after removing duplicates: " + str(len(retFileTool)))

size after removing duplicates: 261


## For the dataset

What deduplicating the entire dataset with this tool might look like

In [49]:
src = "C:\\Users\\theso\\semeruDatasets"

src_files = os.listdir(src)

dupes = DuplicateDetector()

totalSet = set()


for file_name in src_files:
    full_file_name = os.path.join(src, file_name)
    print(file_name)
    src_files_2 = os.listdir(full_file_name)
    for json_name in src_files_2:
        json_file_name = os.path.join(full_file_name, json_name)
        #print(json_file_name)
        if os.path.isfile(json_file_name):
            data = json.load(open(json_file_name, encoding="utf-8"))
            for i in range(len(data)):
                print(i)
                if data[i]["code"] not in totalSet:   
                    dupes.add_file(data[i]["code"], tokenize(data[i]["code"]))
                    
exclude = dupes.compute_ids_to_exclude()
    
retFileTool = []

for i in range(len(data)): 
    if data[i]["code"] not in exclude: 
        retFileTool.append(data[i])        


In [None]:
with open(file_name + "Deduped", "w") as f:
                json.dump(retFile, f, ensure_ascii=False, indent=4)
            print("finished")