In [41]:
# Entity labels checked by validate_token: a token carrying one of them is rejected.
banned_ent_types = {"ORGANIZATION", "EVENT", "GPE", "LOC"}

# Part-of-speech tags rejected by validate_token.
banned_pos = ["PUNCT", "SPACE"]

# Dependency labels that forme_reflexive_verifier inspects for short pronoun forms.
reflexive_deps = ["expl:poss", "expl:pv", "iobj", "obj"]

root_forms = ["ROOT", "advcl", "acl", "cop", "conj", "parataxis"]

# Short (hyphenated) pronoun form -> long form.
reflexive_short_to_long_form = {
    "mi-": "îmi",
    "ți-": "îți",
    "și-": "își",
    "v-": "vă",
    "s-": "se",
    "ne-": "ne",
    "te-": "te",
}

# Universal Dependencies POS tag -> dexonline part-of-speech code.
ud_to_dex = {
    "VERB": "V",
    "AUX": "V",
    "PART": "I",
    "NOUN": "M",
    "PROPN": "SP",
    "PRON": "P",
    "DET": "P",
    "SCONJ": "I",
    "CCONJ": "I",
    "NUM": "P",
    "INTJ": "I",
    "ADV": "I",
    "ADP": "I",
    "ADJ": "A",
}

end_of_phrase = ["!", "?", ".", "\n"]

# Archive name and the release URL it is downloaded from.
json_archive = "utils_json.zip"
json_archive_url = f"https://github.com/PetruTH/nlp_lic/releases/download/Resources/{json_archive}"

# Sentinel compared against lookup results to detect "nothing found".
UNIDENTIFIED_TOKEN = "unidentified"

# Relative paths of the JSON resources.
MAPARE_PATH = "util/forme_morfologice.json"
ALL_INFLECTED_FORMS_PATH = "util/inflected_form_lexemeId_inflectionId.json"
WORD_TO_ID_POS_PATH = "util/word_id_pos.json"
ID_TO_WORD_POS_PATH = "util/id_word_pos.json"
ID_TO_INFLECTED_FORMS_PATH = "util/wordId_inflected_forms.json"

print("gata..")

gata..


In [42]:
from data_loader import load_jsons
from util_data import (
    UNIDENTIFIED_TOKEN,
    ud_to_dex,
    banned_pos,
    banned_ent_types,
    reflexive_deps,
    reflexive_short_to_long_form
)
from spacy.tokens import Token

# Unpack the ten lookup objects returned by load_jsons(); the functions below
# reach them as module-level names.
(
    mapare,
    all_inflected_forms,
    word_to_id_pos,
    id_to_word_pos,
    id_to_inflected_forms,
    entry_lexeme,
    tree_entry,
    relation,
    synonyms,
    context,
) = load_jsons()


def get_all_forms_worker(token: Token) -> [str]:
    """
    Collect the lexeme ids of every entry whose inflected form equals the token text.

    Hyphens are removed from the token text before the lookup. An entry is kept
    when the dexonline part of speech mapped from its inflection id equals the
    token's spaCy tag translated through ud_to_dex; a token translated to "M"
    also accepts entries mapped to "F" or "N". When every match shares a single
    lexeme id, that id is kept regardless of its part of speech.

    Returns the lexeme ids as strings, ordered by numeric value, or an empty
    list when the lookup answers UNIDENTIFIED_TOKEN.
    """
    token_text = token.text.replace("-", "")

    matches = all_inflected_forms.find_all_inflected_forms_double_verification(
        token_text, token_text.lower()
    )
    if matches == UNIDENTIFIED_TOKEN:
        return []

    # .get() instead of indexing: a spaCy tag missing from ud_to_dex used to
    # raise KeyError here. Such a token now simply has no acceptable tag.
    token_pos = ud_to_dex.get(token.pos_)
    if token_pos is None:
        accepted_pos = ()
    elif token_pos == "M":
        accepted_pos = ("M", "F", "N")
    else:
        accepted_pos = (token_pos,)

    # Dict keys give insertion-ordered de-duplication.
    found_ids = {}

    lexeme_ids = [match['lexemeId'] for match in matches]
    if len(set(lexeme_ids)) == 1:
        found_ids[str(lexeme_ids[0])] = None

    for match in matches:
        # mapare translates a dexonline inflection id into a dexonline part of
        # speech, which is what the spaCy tag is compared against.
        pos_found = mapare.find_dexonline_pos_id(match['inflectionId'])
        if pos_found in accepted_pos:
            found_ids[str(match['lexemeId'])] = None

    return sorted(found_ids, key=int)


def get_all_forms(token: Token) -> [{str, str}]:
    """
        Return all inflected forms recorded for the lexeme that matches the token.

        The candidate lexeme ids come from get_all_forms_worker:
          * several candidates: the one whose base form equals token.lemma_ is
            chosen (the last such candidate if more than one matches); when none
            matches, the first candidate is used instead of failing;
          * one candidate: it is used directly;
          * no candidate: the lemma/text pair is looked up in word_to_id_pos and
            the first id found is used.

        Returns whatever id_to_inflected_forms holds for the chosen id, or an
        empty list when nothing could be identified.
    """
    candidates = get_all_forms_worker(token)

    if not candidates:
        words_found = word_to_id_pos.find_word_id_pos_double_verification(token.lemma_, token.text)
        # Also guard against an empty result, which would make [0] fail.
        if words_found == UNIDENTIFIED_TOKEN or not words_found:
            return []
        lexeme_id = str(words_found[0]['id'])
    elif len(candidates) == 1:
        lexeme_id = candidates[0]
    else:
        # Default to the first candidate: previously the id stayed unbound
        # (UnboundLocalError) when no candidate's base form equalled the lemma.
        lexeme_id = candidates[0]
        for candidate in candidates:
            if id_to_word_pos.find_id_to_word_pos_form(candidate) == token.lemma_:
                lexeme_id = candidate

    return id_to_inflected_forms.find_id_to_inflected_forms(lexeme_id)


def validate_token(token: Token) -> bool:
    """
        Tell whether a token should be looked up in the dexonline data.

        Any token whose text contains a hyphen is accepted immediately.
        Otherwise the token is rejected when its part of speech is in
        banned_pos, its language is not "ro", its text is not purely
        alphabetic, or its entity type is in banned_ent_types.
    """
    if "-" in token.text:
        return True

    rejected = (
        token.pos_ in banned_pos
        or token.lang_ != "ro"
        or not token.text.isalpha()
        or token.ent_type_ in banned_ent_types
    )
    return not rejected


def get_wanted_form(token: Token, pos_finder: str, person: str, number: str) -> str:
    """
       Return the first inflected form of the token whose 'pos' description
       contains pos_finder, person and number.

       Returns the string "UNKNOWN" when no form matches.
    """
    # The debug line `print(type(all_morph, "!!!!!"))` was removed: type() accepts
    # one or three arguments, so it raised TypeError on every call.
    for candidate in get_all_forms(token):
        description = candidate['pos']
        if pos_finder in description and person in description and number in description:
            return candidate['form']
    return "UNKNOWN"


def verify_word_at_certain_pos(token: Token, pos_verifier: str) -> bool:
    """
    Check whether the token's own form carries a given fragment in its
    part-of-speech description.

    Example: for a form described as
    "Verb, Indicativ, perfect simplu, persoana I, singular" the function returns
    True when pos_verifier is "perfect simplu" or "persoana I".

    pos_verifier may be a single string or an iterable of strings; with an
    iterable, one matching fragment is enough. Returns False when no form equal
    to token.text contains any of the fragments.
    """
    # A bare string is one fragment. Iterating it directly would test its
    # individual characters against the description.
    fragments = [pos_verifier] if isinstance(pos_verifier, str) else pos_verifier

    for candidate in get_all_forms(token):
        if token.text != candidate['form']:
            continue
        if any(fragment in candidate['pos'] for fragment in fragments):
            return True
    return False


def is_composed_subj(token: Token) -> bool:
    """
    Extra check for a composed subject (like 'eu cu tine mergem').

    Returns True when the token is neither a VERB nor an AUX and has at least
    one child whose text is not one of "m", "te", "s"; False otherwise.
    """
    # Verbs and auxiliaries are never treated as a composed subject. The
    # original fell through and returned None here, and 1/0 elsewhere.
    if token.pos_ in ("VERB", "AUX"):
        return False
    return any(child.text not in ("m", "te", "s") for child in token.children)


def get_right_person_and_number(token: Token) -> (str, str):
    """
        Read person and number from token.morph and convert them to the labels
        used in the dexonline data, so the right verb form can be selected.

        Missing features default to third person and singular. A composed
        subject (see is_composed_subj) forces the plural.

        Returns a tuple in the order (number, person): number is "plural" or
        "singular", person is "I", "II" or "III".
    """
    person = token.morph.get("Person", ['3'])
    number = token.morph.get("Number", ['Sing'])

    if is_composed_subj(token):
        number = ["Plur"]

    actual_number = "plural" if number == ["Plur"] else "singular"

    # Exact-match table with a third-person fallback, the same default used when
    # the feature is absent. Any other Person value used to leave the result
    # unbound (UnboundLocalError).
    person_labels = {("1",): "I", ("2",): "II", ("3",): "III"}
    actual_person = person_labels.get(tuple(person), "III")

    return actual_number, actual_person


def forme_reflexive_verifier(token: Token) -> str:
    """
        Map a short reflexive form to its long form.

        The mapping is applied only when the token's dependency label is in
        reflexive_deps, its Case is "Dat" or "Acc" and its Variant is "Short".
        In every other situation the token text is returned unchanged, also
        when the text has no entry in reflexive_short_to_long_form.
    """
    if token.dep_ not in reflexive_deps:
        return token.text

    case = token.morph.get("Case", ["dummy"])[0]
    variant = token.morph.get("Variant", ["dummy"])[0]

    if case in ("Dat", "Acc") and variant == "Short":
        # .get() with the text as fallback: a short form missing from the table
        # used to raise KeyError.
        return reflexive_short_to_long_form.get(token.text, token.text)

    return token.text



from spacy.tokens import Token

# Register two method extensions on spaCy tokens: token._.forms_() runs
# get_all_forms and token._.is_valid() runs validate_token. force=True
# overwrites an extension already registered under the same name.
Token.set_extension("forms_", method=get_all_forms, force=True)
Token.set_extension("is_valid", method=validate_token, force=True)

 [INFO] 2024-01-03 14:26:23,030 root:177 - Start loading needed data in memory!
 [INFO] 2024-01-03 14:26:23,033 data_loader:189 - Mapare file loaded.
 [INFO] 2024-01-03 14:26:28,568 data_loader:191 - All inflected forms file loaded.
 [INFO] 2024-01-03 14:26:28,982 data_loader:193 - Mapping word to id and pos file loaded.
 [INFO] 2024-01-03 14:26:29,382 data_loader:195 - Mapping word id to word and pos file loaded.
 [INFO] 2024-01-03 14:26:33,241 data_loader:197 - Mapping id to inflected forms file loaded.
 [INFO] 2024-01-03 14:26:33,445 data_loader:199 - Mapping entry id to lexeme id file loaded.
 [INFO] 2024-01-03 14:26:33,637 data_loader:201 - Mapping tree id to entry id file loaded.
 [INFO] 2024-01-03 14:26:33,683 data_loader:203 - Mapping meaning id to tree id file loaded.
 [INFO] 2024-01-03 14:26:33,811 data_loader:205 - Mapping synonyms file loaded.
 [INFO] 2024-01-03 14:26:34,185 data_loader:207 - Mapping contexts file loaded.
 [INFO] 2024-01-03 14:26:34,185 root:209 - The data 

In [85]:
import re
from spacy.tokens import Token


def find_lexeme_ids(inflected_forms: [str]) -> [str]:
    """
    Return the distinct "lexemeId" values of the given entries, in first-seen order.

    The sentinel list ["UNKNOWN"] yields an empty result. An entry without a
    "lexemeId" key contributes None.
    """
    if inflected_forms == ["UNKNOWN"]:
        return []

    unique_ids = []
    for entry in inflected_forms:
        lexeme_id = entry.get("lexemeId")
        if lexeme_id in unique_ids:
            continue
        unique_ids.append(lexeme_id)
    return unique_ids

def find_inflection_possibilites(token: Token, inflected_forms: [str], pos_wanted: str) -> [str]:
    """
    Return the distinct inflection ids (as strings, first-seen order) that are
    acceptable for pos_wanted.

    An entry is accepted when the part of speech mapped from its inflection id
    equals pos_wanted, when both belong to the verb group ("VT", "V"), when both
    belong to the noun group ("M", "F", "N"), or when the token's dependency
    label is "ROOT" or "nmod". The sentinel list ["UNKNOWN"] yields an empty result.
    """
    if inflected_forms == ["UNKNOWN"]:
        return []

    verb_group = ["VT", "V"]
    noun_group = ["M", "F", "N"]
    accepted_ids = []

    for entry in inflected_forms:
        entry_pos = mapare.find_dexonline_pos_id(entry["inflectionId"])
        entry_id = str(entry["inflectionId"])

        if entry_id in accepted_ids:
            continue

        pos_compatible = (
            entry_pos == pos_wanted
            or (entry_pos in verb_group and pos_wanted in verb_group)
            or (entry_pos in noun_group and pos_wanted in noun_group)
        )
        if pos_compatible or token.dep_ in ["ROOT", "nmod"]:
            accepted_ids.append(entry_id)

    return accepted_ids

def find_matching_lexemeIds(token: Token, possible_lexeme_ids: [str], pos_wanted: str) -> [str]:
    """
    Keep the lexeme ids whose recorded part of speech is compatible with pos_wanted.

    Compatible means equal, both in the verb group ("VT", "V"), or both in the
    noun group ("M", "F", "N"). The token argument is not consulted; the
    dependency-based fallback that would use it is disabled.
    """
    verb_group = ["VT", "V"]
    noun_group = ["M", "F", "N"]

    def is_compatible(entry_pos):
        if entry_pos == pos_wanted:
            return True
        if entry_pos in verb_group and pos_wanted in verb_group:
            return True
        return entry_pos in noun_group and pos_wanted in noun_group

    matching_ids = []
    for candidate_id in possible_lexeme_ids:
        entry = id_to_word_pos.find_id_to_word_pos(candidate_id)
        if is_compatible(entry['pos']):
            matching_ids.append(candidate_id)
    return matching_ids

def find_entryIds(lexeme_ids: list) -> list:
    """
    Collect the entry ids linked to each lexeme id, in lookup order.

    A lookup that answers ["no entry"] contributes nothing. Duplicates are kept.
    (The annotations used to say `str`; the argument is iterated as a collection
    of ids and a list is returned.)
    """
    entry_ids = []
    for lexeme_id in lexeme_ids:
        linked_entries = entry_lexeme.find_entry_lexeme(lexeme_id)
        if linked_entries != ["no entry"]:
            entry_ids.extend(linked_entries)
    return entry_ids

def find_treeIds(entry_ids: list) -> list:
    """
    Collect the tree ids linked to each entry id, in lookup order.

    A lookup that answers ["no entry tree"] contributes nothing. Duplicates are
    kept. (The annotations used to say `str`; the argument is iterated as a
    collection of ids and a list is returned.)
    """
    tree_ids = []
    for entry_id in entry_ids:
        linked_trees = tree_entry.find_tree_entry(entry_id)
        if linked_trees != ["no entry tree"]:
            tree_ids.extend(linked_trees)
    return tree_ids

def find_meaningIds(tree_ids: list) -> list:
    """
    Collect the meaning ids related to each tree id, in lookup order.

    Each tree id is converted to a string before the lookup. A lookup that
    answers ["no relation"] contributes nothing. Duplicates are kept. (The
    annotations used to say `str`; the argument is iterated as a collection of
    ids and a list is returned.)
    """
    meaning_ids = []
    for tree_id in tree_ids:
        related_meanings = relation.find_relation(str(tree_id))
        if related_meanings != ["no relation"]:
            meaning_ids.extend(related_meanings)
    return meaning_ids


def synonyms_builder(token: Token, pos_wanted: str) -> ([str], [str], [(str, str)]):
    """
    Gather the material needed to propose synonyms for a token.

    The token text is lower-cased and stripped of everything except Romanian
    letters, then resolved through the chain
    inflected forms -> lexeme ids -> entry ids -> tree ids -> meaning ids.
    For every meaning the recorded synonyms are split into words, and each word
    is paired with the lexeme id of its first inflected-form entry.

    Returns a 3-tuple:
      * tree_ids: the tree ids reached from the token;
      * inflection_possibilities: the inflection ids acceptable for pos_wanted;
      * candidate synonyms: (word, lexeme id) pairs, without the token's own
        text, without duplicates, and with consecutive pairs sharing a lexeme
        id collapsed to the first one.
    """
    token_text = re.sub('[^a-zA-ZăâîșțĂÂÎȘȚ]', '', token.text.lower())
    inflected_forms = all_inflected_forms.find_all_inflected_forms(token_text)

    inflection_possibilities = find_inflection_possibilites(token, inflected_forms, pos_wanted)
    possible_lexeme_ids = find_lexeme_ids(inflected_forms)
    lexeme_ids = find_matching_lexemeIds(token, possible_lexeme_ids, pos_wanted)
    entry_ids = find_entryIds(lexeme_ids)
    tree_ids = find_treeIds(entry_ids)
    meaning_ids = find_meaningIds(tree_ids)

    candidates = []

    for meaning_id in meaning_ids:
        possible_synonyms = synonyms.find_synonyms(meaning_id)
        if possible_synonyms == ["no synonyms"]:
            continue

        for synonym in possible_synonyms:
            words = re.sub('[^a-zA-ZăâîșțĂÂÎȘȚ ]', '', synonym[1]).split(" ")

            for word in words:
                word_forms = all_inflected_forms.find_all_inflected_forms(
                    word, unidentified={"lexemeId": "UNKNOWN"}
                )
                lexeme_id = word_forms[0].get("lexemeId", "dummy")

                # An unidentified word carries the sentinel lexeme id requested
                # above. The original compared the list of words with the
                # misspelled ["UNKOWN"], so the check never fired. As before,
                # the remaining words of this synonym are abandoned.
                # NOTE(review): assumes the lookup returns the `unidentified`
                # value wrapped in a list, as the [0] access implies; confirm.
                if lexeme_id == "UNKNOWN":
                    break

                pair = (word, lexeme_id)
                if pair not in candidates and word != token_text:
                    candidates.append(pair)

    # Drop a pair when the pair right before it has the same lexeme id.
    candidates = [
        pair
        for index, pair in enumerate(candidates)
        if index == 0 or pair[1] != candidates[index - 1][1]
    ]

    return tree_ids, inflection_possibilities, candidates

def is_valid_for_syn(token: Token) -> bool:
    """
    Tell whether synonyms should be searched for this token.

    Punctuation, tokens whose dependency label contains "aux" and tokens whose
    text is not purely alphabetic are refused.
    """
    return (
        token.pos_ != "PUNCT"
        and "aux" not in token.dep_
        and token.text.isalpha()
    )


def get_synonyms(token: Token) -> ([str], dict):
    """
    Propose synonyms for a token, inflected like the token itself.

    Returns a tuple (synonyms_found, contexts_found):
      * synonyms_found: distinct synonym forms whose part-of-speech description
        equals the description of one of the token's acceptable inflections;
      * contexts_found: a dict mapping each tree id reached from the token to
        what context.find_context returns for it.

    Returns an empty list instead of the tuple when the token is refused by
    is_valid_for_syn or its spaCy tag has no dexonline equivalent, so callers
    can keep testing the result for truthiness.
    """
    if not is_valid_for_syn(token):
        return []

    # .get() instead of indexing: a tag missing from ud_to_dex used to raise KeyError.
    pos_found = ud_to_dex.get(token.pos_)
    if pos_found is None:
        return []

    tree_ids, inflection_possibilites, candidate_synonyms = synonyms_builder(token, pos_found)

    # The description of each acceptable inflection does not depend on the
    # synonym, so it is resolved once instead of once per synonym.
    wanted_descriptions = [
        mapare.find_dexonline_pos_detail(str(inflection_id))
        for inflection_id in inflection_possibilites
    ]

    synonyms_found = []
    for _, synonym_lexeme_id in candidate_synonyms:
        synonym_forms = id_to_inflected_forms.find_id_to_inflected_forms(str(synonym_lexeme_id))

        for description in wanted_descriptions:
            for synonym_form in synonym_forms:
                if synonym_form.get("pos") != description:
                    continue
                form = synonym_form.get("form")
                if form not in synonyms_found:
                    synonyms_found.append(form)

    contexts_found = {tree_id: context.find_context(tree_id) for tree_id in tree_ids}

    return synonyms_found, contexts_found




"""
    Short demo showing how the synonym lookup works; it is driven by the main() function defined below.
"""

# Register get_synonyms as a method extension; main() calls it as
# token._.get_synonyms(). force=True overwrites an existing registration.
Token.set_extension("get_synonyms", method=get_synonyms, force=True)

import time
import spacy
# Start of the wall-clock measurement that main() reports.
t1 = time.time()


# Read the sample text. The context manager closes the handle, which was
# previously left open; `reader` stays bound (closed) afterwards.
with open("/Users/inttstbrd/Desktop/licenta/nlp_lic/text.txt", "r") as reader:
    text = reader.read()

# word used for testing
cuv = "port"
# the contexts the word is tested in
context1 = "Eu locuiesc aproape de port."
context2 = "Eu ador un port popular."

# main() iterates over `doc`, i.e. the analysis of context2.
nlp = spacy.load("ro_core_news_sm")
doc = nlp(context2)

from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine
import torch

# Checkpoint name handed to both from_pretrained calls below.
model_name = "dumitrescustefan/bert-base-romanian-cased-v1"
# Module-level tokenizer and model, read by calculate_context_similarity.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def calculate_context_similarity(input_sentence, sentences):
    """
    Print and return the mean cosine similarity between input_sentence and
    each of the given sentences.

    Every sentence is encoded with the module-level tokenizer and model; the
    vector at position 0 of the last hidden state represents the sentence.

    Returns the mean similarity, or None (printing nothing) when `sentences`
    is empty, where the average used to raise ZeroDivisionError.
    """
    if not sentences:
        return None

    def embed(sentence):
        # Vector at sequence position 0 of the last hidden state.
        encoded = tokenizer(sentence, return_tensors="pt")
        return model(**encoded)["last_hidden_state"][:, 0, :].squeeze()

    with torch.no_grad():
        context_vectors = [embed(sentence) for sentence in sentences]
        input_vector = embed(input_sentence)

    reference = input_vector.numpy()
    # scipy's cosine() is a distance, so 1 - distance is the similarity.
    similarities = [1 - cosine(reference, vector.numpy()) for vector in context_vectors]

    mean_similarity = sum(similarities) / len(similarities)
    print(mean_similarity)
    return mean_similarity
    

def main():
    """
    Demo: for every token of `doc` equal to `cuv`, print its synonyms, then
    each context with its examples and the similarity of context1 to those
    examples, and finally the elapsed time since t1.
    """
    for token in doc:
        if token.text != cuv:
            continue

        syns = token._.get_synonyms()
        # make sure all of them are taken in the singular instead of token.lemma
        if not syns:
            continue

        found_synonyms, contexts = syns[0], syns[1]
        print(token, "lista de sinonime dexonline:", found_synonyms, "\n")

        for ctx, examples in contexts.items():
            print(ctx, examples)
            # compare the sentence with all the examples of this context and show the mean
            calculate_context_similarity(context1, examples)

        print(contexts)

    elapsed = time.time() - t1
    print("TIMP: ", elapsed)

# Run the demo as soon as this code is executed.
main()



port lista de sinonime dexonline: ['loc'] 

44381 ['Faptul de a purta sau de a deține.', 'Conduită obișnuită, firească, normală.', 'Îmbrăcăminte caracteristică unui popor, unei regiuni, unei epoci etc.', '$A purta portul$ (cuiva) = a se asemăna, a fi la fel cu cineva, a se adapta la felul de a fi al cuiva.', 'Îmbrăcăminte folosită la anumite ocazii.', '@purta@', '$Portul armelor este interzis.$', '$Ori te poartă cum ți-e vorba, ori vorbește cum ți-e portul.$', '$Nu te cunoscusem, fă, mai dinainte! Dar așa ți-e portul?$ COȘBUC, P. I 247.', '$Ori te poartă cum ți-e vorba, ori vorbește cum ți-e portul,$ se spune unui fățarnic, care una vorbește și alta face.', 'Aspect, înfățișare a unui lucru.', '$A luat-o pe lîngă casele cu port turcesc.$ GALACTION, O. I 188.', '$Cu furca-n brîu, cu gîndul dus, Era frumoasă de nespus în portu-i de la țară.$ IOSIF, V. 41.', '$S-a-ntins poporul adunat Să joace-n drum după tilinci: Feciori, la zece fete, cinci, Cu zdrîngăneii la opinci, Ca-n port de sat.$ C