In [26]:
""" Script to check vocabulary overlaps """
import re

from supervenn import supervenn
import matplotlib.pyplot as plt
plt.rcParams["font.family"] = "Nimbus Roman"
from os import listdir

from itertools import combinations
from operator import itemgetter

def regex_cnt(string, pattern):
    """Count the non-overlapping matches of ``pattern`` found in ``string``.

    Args:
        string: Text to scan.
        pattern: Regular expression (interpreted by the ``re`` module).

    Returns:
        int: Number of matches.
    """
    return sum(1 for _ in re.finditer(pattern, string))

def load_vocab_txt(dir):
    """Load a tokenizer vocabulary file and return it as a normalised set of tokens.

    The file is read line by line. The sub-word marker used by the file is
    detected on the first half of the lines and rewritten to "##", so that
    vocabularies written with different markers can be compared as sets.
    Entries that start with a bracketed control token such as "[CLS]" or
    "<s>" are dropped.

    If neither known marker occurs more than 1000 times in the inspected half,
    the marker falls back to a single space, i.e. spaces inside entries are
    rewritten to "##".

    Args:
        dir: Path of the vocabulary text file. The name shadows the builtin
            ``dir`` but is kept so existing keyword calls keep working.

    Returns:
        set[str]: The normalised tokens, without trailing newlines.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8: the "Ġ" marker is non-ASCII, so relying on
    # the platform default encoding makes the result locale-dependent.
    with open(dir, 'r', encoding='utf-8') as fvocab:
        vocab = fvocab.readlines()

    # Possible special tokens - update if needed
    special = ["##", "Ġ"]

    # Check which special token it is on half the data
    check = " ".join(vocab[:len(vocab)//2])
    special_token = " "
    for s in special:
        # Literal substring count: the markers are plain strings, not regex patterns.
        if check.count(s) > 1000:
            special_token = s
            break

    # Rewrite the detected marker to "##" (a no-op when it already is "##"),
    # then strip the line terminator.
    tokens = []
    for line in vocab:
        if special_token != "##":
            line = line.replace(special_token, "##")
        tokens.append(line.replace("\n", ""))

    # Drop entries starting with a control token such as [CLS], [unused1], <s>, </s>.
    control_token = re.compile(r"[\<\[][\w\\\/]+[\>\]]")
    return {token for token in tokens if control_token.match(token) is None}

def sub_lists(my_list):
    """Return every combination of the elements of ``my_list`` as a list of lists.

    Combinations are produced for each length from 0 (the empty list) up to
    ``len(my_list)``, shortest first, in the order ``itertools.combinations``
    yields them.

    Args:
        my_list: A sized iterable (e.g. a list or ``range``).

    Returns:
        list[list]: All sub-lists, starting with ``[]``.
    """
    return [
        list(subset)
        for size in range(len(my_list) + 1)
        for subset in combinations(my_list, size)
    ]


DIR = "./RESULTS/VOCABS/"

# Smallest number of vocabularies a plot must contain. Single sets and pairs
# are skipped. Must stay >= 2: itemgetter returns a bare item, not a tuple,
# when given a single index.
MIN_SETS_PER_PLOT = 3

files = listdir(DIR)
# To compare every vocabulary found in DIR instead of the fixed selection:
# vocabs = [f for f in files if f.endswith(".txt")]
vocabs = ["SciBERT.txt", "BERT.txt", "CliReBERT.txt", "BioBERT.txt"]
list_of_vocabs = [load_vocab_txt(DIR + vocab_file) for vocab_file in vocabs]

# Plot labels are the file names without the ".txt" suffix.
labels = [vocab_file.replace(".txt", "") for vocab_file in vocabs]

# Every subset of vocabulary indices, shortest first (includes [] and singletons).
index_combinations = sub_lists(range(0, len(list_of_vocabs)))
print(index_combinations)

for comb in index_combinations:
    if len(comb) < MIN_SETS_PER_PLOT:
        continue

    labels_temp = itemgetter(*comb)(labels)
    vocab_list_temp = itemgetter(*comb)(list_of_vocabs)

    supervenn(vocab_list_temp, labels_temp, widths_minmax_ratio=0.1, rotate_col_annotations=True, col_annotations_area_height=1.2, fontsize=20, chunks_ordering="minimize gaps")

    filename = "_".join(labels_temp)
    figure = plt.gcf() # get current figure
    # Resize before tight_layout so the layout is computed for the final
    # 16x9 canvas rather than for the figure's previous size.
    figure.set_size_inches(16, 9)
    plt.tight_layout()
    plt.savefig(f"./REPORTS/IMAGES/VENN/NEW/{filename}", dpi=900)
    # Clear the shared figure so the next combination starts from an empty canvas.
    plt.clf()


[[], [0], [1], [2], [3], [0, 1], [0, 2], [0, 3], [1, 2], [1, 3], [2, 3], [0, 1, 2], [0, 1, 3], [0, 2, 3], [1, 2, 3], [0, 1, 2, 3]]


<Figure size 1600x900 with 0 Axes>

In [22]:
# Look the vocabularies up by label instead of by list position, so that a
# change to the order or content of `vocabs` raises a KeyError here rather
# than silently binding a name to the wrong vocabulary.
vocab_by_label = dict(zip(labels, list_of_vocabs))
SciBERT = vocab_by_label["SciBERT"]
BERT = vocab_by_label["BERT"]
CliReBERT = vocab_by_label["CliReBERT"]
BioBERT = vocab_by_label["BioBERT"]

In [23]:
# Tokens present in the BioBERT vocabulary and in none of the other three.
unique_BioBERT = BioBERT - (SciBERT | BERT | CliReBERT)
print(unique_BioBERT)
print(len(unique_BioBERT))

9707


In [25]:
# Tokens shared by the SciBERT, CliReBERT and BioBERT vocabularies that the
# BERT vocabulary does not contain.
intersect_SciBERT_CliReBERT_BioBERT = SciBERT.intersection(CliReBERT, BioBERT) - BERT
print(intersect_SciBERT_CliReBERT_BioBERT)
print(len(intersect_SciBERT_CliReBERT_BioBERT))

{'##ucks', '##bur', '##tles', '##onium', '##icious', '##read', '##icking', '##lex', '##esi', '##cription', '##eper', '##ogel', '##iper', '##asis', '##ared', '##ilities', '##ushing', '##cripts', '##uries', '##umes', '##ression', '##ergic', '##ief', '##ione', '##unted', '##aut', '##come', '##ational', '##etics', '##oil', '##ceptor', '##ograph', '##elves', '##ends', '##fts', '##phe', '##ables', '##ulture', '##otes', '##rene', '##iant', '##estone', '##ifiable', '##aving', '##mass', '##ires', '##uster', '##eptide', '##ronic', '##ought', '##icked', '##iding', '##stro', '##acity', '##arity', '##ases', '##usive', '##hou', '##arma', '##amel', '##bark', '##rout', '##ilt', '##ulatory', '##inet', '##cover', '##ymph', '##asing', '##cell', '##otherapy', '##actions', '##ender', '##ificant', '##ahan', '##oring', '##iment', '##iler', '##icks', '##data', '##actor', '##anti', '##icidal', '##state', '##action', '##ival', '##opy', '##ardt', '##green', '##omb', '##rome', '##ilton', '##encing', '##iciency', 

In [5]:
# Exploratory lookup: display the style sheet names matplotlib has registered.
# NOTE(review): no style is applied in the visible cells - this looks like a
# leftover and can probably be removed.
plt.style.available

['Solarize_Light2',
 '_classic_test_patch',
 '_mpl-gallery',
 '_mpl-gallery-nogrid',
 'bmh',
 'classic',
 'dark_background',
 'fast',
 'fivethirtyeight',
 'ggplot',
 'grayscale',
 'seaborn-v0_8',
 'seaborn-v0_8-bright',
 'seaborn-v0_8-colorblind',
 'seaborn-v0_8-dark',
 'seaborn-v0_8-dark-palette',
 'seaborn-v0_8-darkgrid',
 'seaborn-v0_8-deep',
 'seaborn-v0_8-muted',
 'seaborn-v0_8-notebook',
 'seaborn-v0_8-paper',
 'seaborn-v0_8-pastel',
 'seaborn-v0_8-poster',
 'seaborn-v0_8-talk',
 'seaborn-v0_8-ticks',
 'seaborn-v0_8-white',
 'seaborn-v0_8-whitegrid',
 'tableau-colorblind10']

In [2]:
# Sanity check: show the loaded labels, then the size of each vocabulary set.
# NOTE(review): the recorded output lists two labels, not the four files set
# in `vocabs`, so this cell was last run against a different selection.
print(labels)
for size in map(len, list_of_vocabs):
    print(size)

['ClimateBERT_added', 'CliReBERT']
235
30517


In [4]:
# Coverage of the first vocabulary by the second: a token counts as covered
# when the second vocabulary holds it verbatim or with the "##" prefix.
# Uncovered tokens are printed, followed by the covered count and percentage.
reference_vocab = list_of_vocabs[1]
count = 0
for element in list_of_vocabs[0]:
    covered = element in reference_vocab or "##"+element in reference_vocab
    if not covered:
        print(element)
        continue
    count += 1

print(count)
print(count/len(list_of_vocabs[0])*100)

CO2
CH4
N2O
+/-
Committee
GHG
229
97.44680851063829
