In [1]:
import numpy as np
import pandas as pd
import re
import requests
import bs4
import time

In [2]:
file = '/tempo/merged_clean.csv'

#convert df into a numpy array
df = pd.read_csv(file)

# convert the relationships into categories
rel_cat = df['Relation'].astype('category')

# label each unique value in the column 'Relation' with a number
labels = rel_cat.cat.codes.to_numpy()

# get the conversion dictionary between the labels and the numbers
conversion_dict = np.array(list(enumerate(rel_cat.cat.categories)))

In [3]:
# extract all unique words (both word A and word B) that are 3 or 4 letters long
acronyms = np.concatenate((df['Word A'], df['Word B']))
acronyms = np.unique(acronyms)
# filter by length
acronyms = acronyms[np.where(np.array([len(acronym) < 5 for acronym in acronyms]) == True)]
print(acronyms)

['aadl' 'aam' 'ab' 'abc' 'abm' 'abv' 'ac' 'acf' 'acl' 'acl2' 'aco' 'ada'
 'adam' 'add' 'ade' 'adls' 'adp' 'adts' 'aer' 'aes' 'afs' 'agda' 'agg'
 'ahp' 'ai' 'al' 'alc' 'alp' 'amcs' 'ann' 'anns' 'ant' 'aop' 'api' 'apis'
 'ar' 'arc' 'asa' 'asic' 'asl' 'asm' 'asp' 'asr' 'at' 'atl' 'atpg' 'au'
 'auc' 'auv' 'bbc' 'bbo' 'bcg' 'bci' 'bcis' 'bdd' 'bdds' 'beam' 'beta'
 'bfo' 'bim' 'blas' 'bleu' 'blob' 'blog' 'blur' 'bmc' 'bmi' 'bn' 'bnc'
 'bnf' 'bnns' 'bns' 'bot' 'bp' 'bpa' 'bpm' 'bpmn' 'bpn' 'bpnn' 'bpp'
 'bpso' 'bptt' 'bug' 'bus' 'ca' 'cad' 'cam' 'cart' 'casl' 'cba' 'cbir'
 'cbr' 'cca' 'ccg' 'cd' 'ce' 'cer' 'cf' 'cfg' 'cfgs' 'cgal' 'cgp' 'cgs'
 'chat' 'chc' 'chin' 'chr' 'chrs' 'ci' 'ciao' 'cit' 'clip' 'clir' 'clp'
 'cmac' 'cmc' 'cnn' 'cnns' 'cob' 'code' 'cog' 'coil' 'coq' 'core' 'cp'
 'cpg' 'cpgs' 'cps' 'cpu' 'cqa' 'cql' 'cqp' 'cr' 'crf' 'crfs' 'crm' 'crt'
 'csa' 'cscl' 'cseq' 'csp' 'csps' 'csr' 'ct' 'cta' 'ctc' 'ctl' 'cuda'
 'cure' 'cut' 'cv' 'cvc3' 'da' 'dag' 'dbms' 'dbn' 'dbns' 'dca' 'dcc' 

In [4]:
def regex_finder(acronym, sent):
    """
    Check the sentences using regex
    """
    if acronym[-1] == 's':
        definition = regex_finder(acronym[:-1], sent)
        if definition:
            return definition

    # split the sentence into words by spaces and hyphens
    words = re.split(r'[-\s]', sent)
    original_words = words.copy()
    prepositions = ["at", "on", "by", "in", "by", "to", "of"]

    # delete prepositions and words without a single letter
    words = [word for word in words if re.search(r'[a-zA-Z]', word) and word not in prepositions]
    
    position = None

    # get the word position of the acronym in the sentence, using regex. It can be case insensitive. The acronym can be in brackets
    for i in range(len(words)):
        if re.search(rf"\b{acronym.upper()}\b", words[i]):
            position = i
    # get the number of letters in the acronym, excluding special characters
    if position is not None:
        length = len(acronym)
        # disregard if it would cause an index out of bounds error
        if position - length >= 0:
            words_before = words[position - length:position]
            # remove special characters frpm each word
            words_before = [re.sub(r"[^a-zA-Z]", "", word) for word in words_before]
        else:
            words_before = None
        if position + length < len(words):
            words_after = words[position + 1:position + 1 + length]
            words_after = [re.sub(r"[^a-zA-Z]", "", word) for word in words_after]
        else:
            words_after = None
        
        # print(words_before)
        # check if the words_before and words_after start with the same letters
        try:
            if words_before:
                for i in range(length):
                    if words_before[i][0].lower() != acronym[i]:
                        break
                else:
                    return " ".join(words_before)
        
            if words_after:
                for i in range(length):
                    if words_after[i][0].lower() != acronym[i]:
                        break
                else:
                    return " ".join(words_after)
        except:
                raise Exception(f"{words_after=} {acronym=} {length=} {position=} {sent=}")
        return None

def API_finder(acronym):
    """
    This function takes a string and returns the definition of the acronym.
    """
    time.sleep(5)
    url = f"https://www.acronymfinder.com/Information-Technology/{acronym}.html"
    response = requests.get(url)
    # find the class "result-list__body__meaning"
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    meaning = soup.find(class_="result-list__body__meaning")
    if meaning is None:
        return None

    # delete everything after the bracket
    definition = re.sub(r"\(.*\)", "", meaning.text)
    length = len(acronym)
    # only keep the first {length} words of the definition
    definition = definition.split()[:length]

    return " ".join(definition)

def get_acronym_defintion(acronym):
    # get all the indexes where acronym is either word A or word B
    indexes = df[(df['Word A'] == acronym) | (df['Word B'] == acronym)].index

    for i in indexes:
        definition = regex_finder(acronym, df['Sentence'][i])
        if definition:
            return definition
    # definition = API_finder(acronym)
    # if definition:
    #     return definition
    return None

sent = "We investigate the fading cognitive multiple access wiretap channel (CMAC-WT), in which two secondary-user transmitters (STs) send secure messages to a secondary-user receiver (SR) in the presence of an eavesdropper (ED) and subject to interference threshold constraints at multiple primary-user receivers (PRs)."
word = "wt"
print(regex_finder(word, sent))

which two


In [211]:
acronym = 'wt'
for i in range(len(df)):
    if  (df['Word A'][i] == acronym) | (df['Word B'][i] == acronym):
        print(df['Word A'][i], ' ', df['Word B'][i])
        print(df['Sentence'][i])

st   wt
We investigate the fading cognitive multiple access wiretap channel (CMAC-WT), in which two secondary-user transmitters (STs) send secure messages to a secondary-user receiver (SR) in the presence of an eavesdropper (ED) and subject to interference threshold constraints at multiple primary-user receivers (PRs).
st   wt
The validity of the ladder approximation (LA) in QCD and QED in the context of the corresponding Schwinger-Dyson (SD) equations and Slavnov-Taylor (ST) and Ward-Takahashi (WT) identities is investigated.
artificial neural network   wt
An artificial neural network (ANN) is used to obtain an optimal coordination signal to improve frequency response. As a proof of concept, the proposed coordination is tested on a 9-bus test system that includes a wind farm with 5 WTs.
ann   wt
An artificial neural network (ANN) is used to obtain an optimal coordination signal to improve frequency response. As a proof of concept, the proposed coordination is tested on a 9-bus test sy

In [5]:
definitions = dict()
for acronym in acronyms:
    definition = get_acronym_defintion(acronym)
    if definition:
        print(f"{acronym} : {definition}")
        definitions[acronym] = definition

# acronym = "elm"
# definition = get_acronym_defintion(acronym)
# print(f"{acronym} -> {definition}")

aadl : Architecture Analysis Design Language
aam : active appearance model
ab : Aharonov Bohm
abc : artificial bee colony
aco : Ant Colony Optimization
add : Algebraic Decision Diagrams
adls : Activities Daily Living
adp : adaptive dynamic programming
adts : abstract data type
aer : Automatic emotion recognition
aes : advanced encryption standard
afs : Andrew File System
agg : abstract ground graph
ahp : Analytic Hierarchy Process
ai : artificial intelligence
alp : Abductive Logic Programming
amcs : Adaptive Mixed Criticality
ann : Artificial Neural Network
anns : Artificial Neural Network
aop : Aspect Oriented Programming
api : Application Program Interface
apis : Application Programming Interface
ar : Augmented Reality
arc : Astrophysical Research Consortium
asa : Adaptive simulated annealing
asic : application specific integrated circuit
asm : Active Shape Models
asp : answer set programming
asr : automatic speech recognition
atl : adaptive triplet loss
au : Action Unit
auc : area u

In [9]:
# save the dictionary to a file (numpy)
np.save('/tempo/processed-files/definitions.npy', definitions)

In [194]:
print(f"% of acronyms with definitions: {len(definitions)}/{len(acronyms)} = {round(len(definitions) / len(acronyms) * 100,2)}%")


% of acronyms with definitions: 399/698 = 57.16%


In [213]:
# count the number of pairs in the df, where at least one of the words has a definition
count = 0
total = 0
for i in range(len(df)):
    if (df['Word A'][i] in acronyms) | (df['Word B'][i] in acronyms):
        total += 1
    if (df['Word A'][i] in definitions) | (df['Word B'][i] in definitions):
        count += 1

print(f"Acronyms with a definition / Total acronyms: \t {count}/{total} = {round(count / total * 100,2)}%")
print(f"Total acronyms / Total pairs: \t\t\t {total}/{len(df)} = {round(total / len(df) * 100,2)}%")
print(f"Acronyms with a definition / Total pairs: \t {count}/{len(df)} = {round(count / len(df) * 100,2)}%")

Acronyms with a definition / Total acronyms: 	 173912/235460 = 73.86%
Total acronyms / Total pairs: 			 235460/1440317 = 16.35%
Acronyms with a definition / Total pairs: 	 173912/1440317 = 12.07%
