In [1]:
banned_ent_types = {"ORGANIZATION", "EVENT", "GPE", "LOC"}
banned_pos = ["PUNCT", "SPACE"]
reflexive_deps = ["expl:poss", "expl:pv", "iobj", "obj"]
root_forms = ["ROOT", "advcl", "acl", "cop", "conj", "parataxis"]

reflexive_short_to_long_form = {
    "mi-": "îmi",
    "ți-": "îți",
    "și-": "își",
    "v-": "vă",
    "s-": "se",
    "ne-": "ne",
    "te-": "te"
}

ud_to_dex = {
        "VERB": "V",
        "AUX": "V",
        "PART": "I",
        "NOUN": "M",
        "PROPN": "SP",
        "PRON": "P",
        "DET": "P",
        "SCONJ": "I",
        "CCONJ": "I",
        "NUM": "P",
        "INTJ": "I",
        "ADV": "I",
        "ADP": "I",
        "ADJ": "A",
        "X": "V"
   }
end_of_phrase = ["!", "?", ".", "\n"]

json_archive = "utils_json.zip"
json_archive_url = f"https://github.com/PetruTH/nlp_lic/releases/download/Resources/{json_archive}"
UNIDENTIFIED_TOKEN = "unidentified"
MAPARE_PATH = "util/forme_morfologice.json"
ALL_INFLECTED_FORMS_PATH = "util/inflected_form_lexemeId_inflectionId.json"
WORD_TO_ID_POS_PATH = "util/word_id_pos.json"
ID_TO_WORD_POS_PATH = "util/id_word_pos.json"
ID_TO_INFLECTED_FORMS_PATH = "util/wordId_inflected_forms.json"
print("gata..")

gata..


In [2]:
from data_loader import load_jsons
from util_data import (
    UNIDENTIFIED_TOKEN,
    ud_to_dex,
    banned_pos,
    banned_ent_types,
    reflexive_deps,
    reflexive_short_to_long_form
)
from spacy.tokens import Token

mapare, all_inflected_forms, word_to_id_pos, id_to_word_pos, id_to_inflected_forms, entry_lexeme, tree_entry, relation, synonyms, context = load_jsons()


def get_all_forms_worker(token: Token) -> [int]:
    """
    thiw will extract every word having inflected form == token.text
    """
    token_text = token.text
    if "-" in token.text:
        token_text = token_text.replace("-", "")

    all_inflected_words_found = all_inflected_forms.find_all_inflected_forms_double_verification(
                token_text, token_text.lower()
            )

    if all_inflected_words_found == UNIDENTIFIED_TOKEN:
        return []

    words_prel = []
    only_one_word = [word['lexemeId'] for word in all_inflected_words_found]

    if len(set(only_one_word)) == 1:
        words_prel.append(str(only_one_word[0]))
    for word in all_inflected_words_found:
        pos_found = mapare.find_dexonline_pos_id(word['inflectionId'])
        """
            mapare.mapping['DEXONLINE_MORPH']: ["morph dexonline", "pos dexonline"],
            this will help for mapping spacy pos to dexonline pos
            mapping spacy pos with dexonline pos
            looking after an id found from dexonline
        """

        if ud_to_dex[token.pos_] == pos_found:
            if str(word['lexemeId']) not in words_prel:
                words_prel.append(str(word['lexemeId']))

        elif ud_to_dex[token.pos_] == "M" and pos_found == "F":
            if str(word['lexemeId']) not in words_prel:
                words_prel.append(str(word['lexemeId']))

        elif ud_to_dex[token.pos_] == "M" and pos_found == "N":
            if str(word['lexemeId']) not in words_prel:
                words_prel.append(str(word['lexemeId']))

    words_prel.sort(key=lambda x: int(x))

    return words_prel


def get_all_forms(token: Token) -> [{str, str}]:
    """
        This function will return all the inflected forms for a certain token given as a parameter.
        It will search for that token in dexonline database and it will find the lexemeId.
        Based on get_all_forms_worker, it will choose the word from the list returned that
        has lemma like the first form found in dexonline database. After, that,
        based on that lexemeId, it will return all inflected forms found with the same lexemeId (a list of
        dictionaries containig words form and morphological details also from dexonline database)
    """
    words_prel = get_all_forms_worker(token)
    token_text = token.text

    if len(words_prel) > 1:
        for element in words_prel:
            if id_to_word_pos.find_id_to_word_pos_form(element) == token.lemma_:
                id = element

    elif len(words_prel) == 1:
        id = words_prel[0]

    elif len(words_prel) == 0:
        words_found = word_to_id_pos.find_word_id_pos_double_verification(token.lemma_, token_text)

        if words_found != UNIDENTIFIED_TOKEN:
            words_prel = [str(x['id']) for x in words_found]
            id = words_prel[0]
        else:
            return []

    result = id_to_inflected_forms.find_id_to_inflected_forms(id)

    return result


def validate_token(token: Token) -> bool:
    """
        Function that validates if a token can be found in dexonline database.
        It will exclude words that describe names or places, organizations, etc.
    """
    if "-" in token.text:
        return True
    if token.pos_ in banned_pos:
        return False
    if token.lang_ != "ro":
        return False
    if not token.text.isalpha():
        return False
    if token.ent_type_ in banned_ent_types:
        return False
    return True


def get_wanted_form(token: Token, pos_finder: str, person: str, number: str) -> str:
    """
       This function will return the morph form wanted by pos_finder, person and number
    """
    all_morph = get_all_forms(token)
    for wanted_form in all_morph:
        if pos_finder in wanted_form['pos'] and person in wanted_form['pos'] and number in wanted_form['pos']:
            return wanted_form['form']
    return "UNKNOWN"


def verify_word_at_certain_pos(token: Token, pos_verifier: str) -> bool:
    """
    verifiy if a token is contains a specified string in its part of speech
    for example this function will return true if a verb has this description from dexonline
    as its pos "Verb, Indicativ, perfect simplu, persoana I, singular" and pos_verifier parameter
    is "perfect simplu" or "persoana I", etc
    """
    all_morph = get_all_forms(token)
    for wanted_form in all_morph:
        if token.text == wanted_form['form']:
            for pos in pos_verifier:
                if pos in wanted_form['pos']:
                    return True


def is_composed_subj(token: Token) -> bool:
    # extra step to verify if there is a composed subject (like 'eu cu tine mergem')
    if not token.pos_ == "VERB" and not token.pos_ == "AUX":
        if len(list(token.children)):
            for t in token.children:
                if t.text not in ["m", "te", "s"]:
                    return 1
        return 0


def get_right_person_and_number(token: Token) -> (str, str):
    """
        This function will get the person and number data from token.morph
        and will convert these into dexonline database format information
        in order to select right form of verb.
    """
    # extract correct person and number for a phrase
    person = token.morph.get("Person", ['3'])
    number = token.morph.get("Number", ['Sing'])

    if is_composed_subj(token):
        number = ["Plur"]

    # formatting number and person to be recognized dexonline json
    actual_number = "plural" if number == ["Plur"] else "singular"

    if person == ['1']:
        actual_person = "I"
    elif person == ['2']:
        actual_person = "II"
    elif person == ['3']:
        actual_person = "III"

    return actual_number, actual_person


def forme_reflexive_verifier(token: Token) -> str:
    """
        This function will map short reflexive forms into long ones
        using data from reflexive_deps from util_data.py
    """
    word_added = token.text
    if token.dep_ in reflexive_deps:
        case_condition = token.morph.get("Case", ["dummy"])[0] in ["Dat", "Acc"]
        variant_condition = token.morph.get("Variant", ["dummy"])[0] == "Short"
        if case_condition and variant_condition:
            word_added = reflexive_short_to_long_form[token.text]

    return word_added



from spacy.tokens import Token

Token.set_extension("forms_", method=get_all_forms, force=True)
Token.set_extension("is_valid", method=validate_token, force=True)


def find_lexeme_ids(inflected_forms: [str]) -> [str]:
    possible_lexeme_ids = []

    if inflected_forms != ["UNKNOWN"]:
        for inflected_form in inflected_forms:
            if inflected_form.get("lexemeId") not in possible_lexeme_ids:
                possible_lexeme_ids.append(inflected_form.get("lexemeId"))
  
    
    return possible_lexeme_ids

def find_inflection_possibilites(token: Token, inflected_forms: [str], pos_wanted: str) -> [str]:
    inflection_possibilites = []

    if inflected_forms != ["UNKNOWN"]:
        for inflected_form in inflected_forms:
            inflectionId = mapare.find_dexonline_pos_id(inflected_form["inflectionId"])
            
            inflected_form_id = str(inflected_form["inflectionId"])

            if inflectionId == pos_wanted and inflected_form_id not in inflection_possibilites:
                inflection_possibilites.append(str(inflected_form["inflectionId"]))
            elif inflectionId in ["VT", "V"] and pos_wanted in ["V", "VT"] and inflected_form_id not in inflection_possibilites:
                inflection_possibilites.append(str(inflected_form["inflectionId"]))
            elif inflectionId in ["M", "F", "N"] and pos_wanted in ["M", "F", "N"] and inflected_form_id not in inflection_possibilites:
                inflection_possibilites.append(str(inflected_form["inflectionId"]))
            elif token.dep_ in ["ROOT", "nmod"] and inflected_form_id not in inflection_possibilites:
                inflection_possibilites.append(str(inflected_form["inflectionId"]))

    return inflection_possibilites

def find_matching_lexemeIds(possible_lexeme_ids: [str], pos_wanted: str) -> [str]:
    lexeme_ids = [] 
   
    for lexemeId in possible_lexeme_ids:
        variant = id_to_word_pos.find_id_to_word_pos(lexemeId)
        if variant['pos'] == pos_wanted:
            lexeme_ids.append(lexemeId)
        elif variant['pos'] in ["VT", "V", "AUX"] and pos_wanted in ["V", "VT", "AUX"]:
            lexeme_ids.append(lexemeId)
        elif variant['pos'] in ["M", "F", "N"] and pos_wanted in ["M", "F", "N"]:
            lexeme_ids.append(lexemeId)
    
    return lexeme_ids

def find_entryIds(lexeme_ids: str) -> str:
    entry_ids = []
    for lexemeId in lexeme_ids:
        all_entries = entry_lexeme.find_entry_lexeme(lexemeId)
        if all_entries != ["no entry"]:
            for entry in all_entries:
                entry_ids.append(entry)

    return entry_ids

def find_treeIds(entry_ids: str) -> str:
    tree_ids = []
    for entryId in entry_ids:
        tree_entries = tree_entry.find_tree_entry(entryId)
        if tree_entries != ["no entry tree"]:
            for treeId in tree_entries:
                tree_ids.append(treeId)
    
    return tree_ids

def find_meaningIds(tree_ids: str) -> str:
    meaning_ids = []

    for treeId in tree_ids:
        all_meaningIds = relation.find_relation(str(treeId))
        if all_meaningIds != ["no relation"]:
            for meaningId in all_meaningIds:
                meaning_ids.append(meaningId)

    return meaning_ids

 [INFO] 2024-04-08 21:38:14,358 root:177 - Start loading needed data in memory!
 [INFO] 2024-04-08 21:38:14,361 data_loader:189 - Mapare file loaded.
 [INFO] 2024-04-08 21:38:20,098 data_loader:191 - All inflected forms file loaded.
 [INFO] 2024-04-08 21:38:21,186 data_loader:193 - Mapping word to id and pos file loaded.
 [INFO] 2024-04-08 21:38:22,303 data_loader:195 - Mapping word id to word and pos file loaded.
 [INFO] 2024-04-08 21:38:25,506 data_loader:197 - Mapping id to inflected forms file loaded.
 [INFO] 2024-04-08 21:38:26,660 data_loader:199 - Mapping entry id to lexeme id file loaded.
 [INFO] 2024-04-08 21:38:26,849 data_loader:201 - Mapping tree id to entry id file loaded.
 [INFO] 2024-04-08 21:38:26,898 data_loader:203 - Mapping meaning id to tree id file loaded.
 [INFO] 2024-04-08 21:38:27,052 data_loader:205 - Mapping synonyms file loaded.
 [INFO] 2024-04-08 21:38:28,389 data_loader:207 - Mapping contexts file loaded.
 [INFO] 2024-04-08 21:38:28,390 root:209 - The data 

In [3]:

import re
from spacy.tokens import Token
from json_creator import incarcare_eficienta

def synonyms_builder_step1(token: Token, pos_wanted: str)  -> ([str], [str]):
    token_text = re.sub('[^a-zA-ZăâîșțĂÂÎȘȚ]', '', token.text.lower())
    inflected_forms = all_inflected_forms.find_all_inflected_forms(token_text)
    inflection_possibilities = find_inflection_possibilites(token, inflected_forms, pos_wanted)
    possible_lexeme_ids = find_lexeme_ids(inflected_forms)
    lexeme_ids = find_matching_lexemeIds(possible_lexeme_ids, pos_wanted)
    entry_ids = find_entryIds(lexeme_ids)
    tree_ids = find_treeIds(entry_ids)
    meaning_ids = find_meaningIds(tree_ids)

    if len(inflection_possibilities) > 1:
        inflection_possibilities =  inflection_filter(token=token, inflection_possibilities=inflection_possibilities)
    return tree_ids, inflection_possibilities, meaning_ids

def synonyms_builder_step2(meaning_ids, tree_id_forced, token):
    candidate_synonyms_base_form = []
    token_text = re.sub('[^a-zA-ZăâîșțĂÂÎȘȚ]', '', token.text.lower())
    # print(tree_id_forced, "tree ids forced step2")

    for meaningId in meaning_ids:
        
        possible_synonyms = synonyms.find_synonyms(meaningId)
        tree_ids_verifier = [syn[0] for syn in possible_synonyms]
        # print(tree_ids_verifier, f"tree ids for meaning id {meaningId}", int(tree_id_forced[0]) in tree_ids_verifier)

        # print(possible_synonyms)
        if possible_synonyms != ["no synonyms"]:
        
            for synonym in possible_synonyms:
                # aici adauga si treeid si mai departe selectezi doar form si cand le sugerezi dai pe cele cu matching treeid (chosencontext de mai jos)
                # undo ca inainte stergi syn_treeId
                syn_to_add = re.sub('[^a-zA-ZăâîșțĂÂÎȘȚ ]', '', synonym[1]).split(" ")
               
                for syn in syn_to_add:
                    syn_to_add_helper = all_inflected_forms.find_all_inflected_forms(syn, unidentified={"lexemeId": "UNKNOWN"})
                    if syn_to_add == ["UNKOWN"]:
                        break

                    syn_tuple = (syn, syn_to_add_helper[0].get("lexemeId", "dummy"))
                    if syn_tuple not in candidate_synonyms_base_form and syn_tuple[0] != token_text:  
                        if int(tree_id_forced[0]) in tree_ids_verifier:      
                            candidate_synonyms_base_form.append(syn_tuple)

    candidate_synonyms_base_form = [syn for i, syn in enumerate(candidate_synonyms_base_form) if i == 0 or syn[1] != candidate_synonyms_base_form[i-1][1]]
    # print(candidate_synonyms_base_form, "!!!!")
    return candidate_synonyms_base_form

def is_valid_for_syn(token: Token) -> bool:
    if token.pos_ == "PUNCT":
        return False
    if "aux" in token.dep_:
        return False
    if not token.text.isalpha():
        return False
    return True


def get_synonyms(token: Token, tree_id_forced = []) -> [str]:
    if is_valid_for_syn(token):
        pos_found = ud_to_dex[token.pos_]
        tree_ids, inflection_possibilites, meaning_ids = synonyms_builder_step1(token, pos_found)

        candidate_synonyms_base_form = synonyms_builder_step2(meaning_ids, tree_id_forced, token)

        synonyms_found = []
        # print(candidate_synonyms_base_form, "candidate la sinonime")
        for syn in candidate_synonyms_base_form:
            inflected_forms_syn = id_to_inflected_forms.find_id_to_inflected_forms(str(syn[1]))

            for inflectionId in inflection_possibilites:
                inflection = mapare.find_dexonline_pos_detail(str(inflectionId))

                for pos_syn in inflected_forms_syn:
                    pos_found_on_syn = pos_syn.get("pos")
                    form_found_on_syn = pos_syn.get("form")
                    if pos_found_on_syn == inflection:
                            if form_found_on_syn not in synonyms_found:
                                synonyms_found.append(form_found_on_syn)
                                
       
        return synonyms_found
    else:
        return []

def force_plural_noun(token):
    associated_tokens = token.subtree
    for token in associated_tokens:
        if token.pos_ in ["DET", "NUM", "PRON"]:
            if token.morph.get("Number", ["dummy"])[0] == "Plur":
                return True
    return False

def force_person_and_number_verb(token):
    number, person = "_", "_"
    inf = False
    subtree = token.subtree
    
    for t in subtree:
        if t == token:
            break
        if t.dep_ == "nsubj":
            number = t.morph.get("Number", ["Sing"])[0]

            if number == "Plur":
                number = "plural"
            else:
                number = "singular"
                        
            if t.pos_ == "NOUN":
                person = "a III-a"
            elif t.pos_ == "PRON":
                person = t.morph.get("Person", ["dummy"])[0]

                if person == "1":
                    person = "I"
                elif person == "2":
                    person = "a II-a"
                elif person == "3":
                    person = "a III-a"
        elif t.dep_ == "mark":
            inf = True
    
            
    return inf, number, person

def get_verb_tense(token):
    mood = token.morph.get("Mood", ["Ind"])[0]
    tense = token.morph.get("Tense", ["dummy"])[0]
    verbform = token.morph.get("VerbForm", ["dummy"])[0]

    if tense == "Imp" or tense == "Pres" and mood == "Indicativ" and verbform == "Inf":
        tense = "imperfect"
    elif tense == "Pres":
        tense = "prezent"
    elif tense == "Past":
        tense = "perfect simplu"
    elif tense == "Pqp":
        tense = "mai mult ca perfect"
    return tense

def build_inflection_for_verb(token, inflection_dex_details):
    mood = "Indicativ" if token.morph.get("Mood", ["Ind"])[0] == "Ind" else "None"

    inf, number, person = force_person_and_number_verb(token)

    tense = get_verb_tense(token)

    if token.dep_ == "ccomp" or inf == True:
        found_dex_pos = "Verb, Infinitiv prezent"
    else:
        found_dex_pos = f"Verb, {mood}, {tense}, persoana {person}, {number}"

    if found_dex_pos == inflection_dex_details:
        return True
    else:
        found_dex_pos = f"Verb, {mood}, prezent, persoana {person}, {number}"
        if found_dex_pos == inflection_dex_details:
            return True
    return False

def get_case_for_noun(token):
    case = token.morph.get("Case", ["Acc", "Nom"])
    if "Acc" in case[0] or "Nom" in case[0]:
        case = "Acc, Nom"

    if case == "Acc, Nom":
        case = "Nominativ-Acuzativ"
    elif case == "Dat, Gen":
        case = "Genitiv-Dativ"
    else:
        case = "Vocativ"
    return case

def get_definite(token):
    definite = token.morph.get("Definite", ["dummy"])[0]
    if definite == "Ind":
        definite = "nearticulat"
    else:
        definite = "articulat"
    return definite

def get_number(token):
    number = token.morph.get("Number", ["Sing"])[0]
    
    if force_plural_noun(token):
        number = "plural"
    elif number == "Sing":
        number = "singular"
    else:
        number = "plural"
    return number

def get_gender(token):
    gender = token.morph.get("Gender", ["dummy"])[0]
    if gender == "Masc":
        gender = "masculin"
    else: 
        gender = "feminin"

def get_case_for_pron(token):
    case = token.morph.get("Poss", ["No"])[0]
    if case == "Yes":
        case = "Genitiv-Dativ"
    else:
        case = "Nominativ-Acuzativ"
    return case

def build_inflection_for_noun(token, inflection_dex_details):
    case = get_case_for_noun(token)
    definite = get_definite(token)
    number = get_number(token)
    gender = get_gender(token)

    if token.pos_ == "NOUN":
        dex_pos = "Substantiv"
    elif token.pos_ == "ADJ":
        dex_pos = "Adjectiv"

    found_dex_pos = f"{dex_pos} {gender}, {case}, {number}, {definite}"
    if found_dex_pos == inflection_dex_details:
        return True
    elif f"{dex_pos} neutru, {case}, {number}, {definite}" == inflection_dex_details:
        return True
    return False
    
def build_inflection_for_pron(token, inflection_dex_details):
    case = get_case_for_pron(token)
    number = get_number(token)

    for gender in ["masculin", "feminin"]:  
        found_dex_pos = f"Pronume, {gender}, {case}, {number}"
        if found_dex_pos == inflection_dex_details:
            return True
    return False

def inflection_filter(token, inflection_possibilities):
    if '85' in inflection_possibilities:
        return '85'

    for infl in inflection_possibilities:
        inflection_dex_details = mapare.find_dexonline_pos_detail(str(infl))
        if token.pos_ in ["VERB", "AUX"]:  
            if build_inflection_for_verb(token, inflection_dex_details) is True:
                inflection_possibilities = [infl]


        elif token.pos_ in ["NOUN", "ADJ"]:

            if build_inflection_for_noun(token, inflection_dex_details) is True:
                inflection_possibilities = [infl]

        elif token.pos_ in ["PRON", "DET"]:
            
            if build_inflection_for_pron(token, inflection_dex_details) is True:
                inflection_possibilities = [infl]

    return inflection_possibilities



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from transformers import AutoTokenizer, AutoModel
import torch
import logging
logging.getLogger('torch').setLevel(logging.ERROR)
import json
import numpy

all_contexts = json.load(open("util/context.json"))

tokenizer = AutoTokenizer.from_pretrained("dumitrescustefan/bert-base-romanian-uncased-v1", do_lower_case=True)
model = AutoModel.from_pretrained("dumitrescustefan/bert-base-romanian-uncased-v1")


def get_embeddings(text):
        tokenized_text = tokenizer(text, return_tensors="pt")

        with torch.no_grad():
            outputs = model(**tokenized_text)

        word_embeddings = outputs.last_hidden_state
        averaged_embedding = torch.mean(word_embeddings, dim=0)

        return averaged_embedding

cos = torch.nn.CosineSimilarity(dim=0)

def calculate_similarity(input, compared_to):
    if type(input) != "<class 'torch.Tensor'>":
        input=torch.tensor(input)
   
    return cos(input, torch.tensor(compared_to))


def transform_with_mean_pooling(numpy_array, target_shape=(384,)):
    pooled_array = numpy.reshape(numpy_array, (-1, target_shape[0]))
    mean_array = numpy.mean(pooled_array, axis=0) 
    return mean_array


def get_similarity_scores_for_syns(actual_context: list, syn_candidate_context: list, mean: bool, reshapeFlag = True):
    if actual_context is None or syn_candidate_context is None:
        return 0
    meansim = []
    
    if reshapeFlag == True:
        if actual_context[0].shape != (384,):
            actual_context = [transform_with_mean_pooling(numpy.array(act_ctx)) for act_ctx in actual_context]

    for act_ctx in actual_context:
        for syn_ctx in syn_candidate_context:
            actual_similarity = calculate_similarity(input=act_ctx, compared_to=syn_ctx)
            meansim.append(actual_similarity)

    if len(meansim):
        if mean is True:
            return sum(meansim)/len(meansim)
        else:
            return max(meansim)
    else:
        return 0


def get_context_for_each_candidate_syn(token_text, pos_wanted):
    pos_wanted = ud_to_dex[pos_wanted]
    inflected_forms = all_inflected_forms.find_all_inflected_forms(token_text)
    possible_lexeme_ids = find_lexeme_ids(inflected_forms)
    lexeme_ids = find_matching_lexemeIds(possible_lexeme_ids, pos_wanted)
    entry_ids = find_entryIds(lexeme_ids)
    tree_ids = find_treeIds(entry_ids)

    contexts_found = {}

    for treeId in tree_ids:
        contexts_found[treeId] = incarcare_eficienta(treeId)
    return contexts_found   

""" 
    Short demo to show how it actually works. Uncomment and run the main() function.
"""

Token.set_extension("get_synonyms", method=get_synonyms, force=True)


In [5]:
def raw_word(text):
    diacritics = {
      "ă": "a",
      "â": "a",
      "î": "i",
      "ș": "s",
      "ț": "t",
      "č": "c",
      "ş": "s",
      "ž": "z",
      "Ä": "A",
      "Â": "A",
      "Î": "I",
      "Ș": "S",
      "Ț": "T",
      "Č": "C",
      "Ș": "S",
      "Ž": "Z",
      "á": "a",
      "é": "e",
      "í": "i",
      "ó": "o",
      "ú": "u",
      "ű": "u",
      "Á": "A",
      "É": "E",
      "Í": "I",
      "Ó": "O",
      "Ú": "U",
      "Ű": "U",
      "ö": "o",
      "Ö": "O",
      "ü": "u",
      "Ü": "U",
    }

    for k, v in diacritics.items():
        text = text.replace(k, v)
    return text.lower()

def count_consecutive_vowels(word):
    vowels = "aeiouAEIOU"
    consecutive_vowel_count = 0
    total_consecutive_vowels = 0
    for char in word:
        if char in vowels:
            consecutive_vowel_count += 1
        else:
            total_consecutive_vowels += consecutive_vowel_count
            consecutive_vowel_count = 0
        
    total_consecutive_vowels += consecutive_vowel_count
    
    return total_consecutive_vowels


def aproximate_syllables(word: str):
    vowels = "aeiouăîâe"
    groups = ["ch", "gh"]
    word = raw_word(word).lower()
    for group in groups:
        if group == "ch":
            word = word.replace(group, "C")
        elif group == "gh":
            word = word.replace(group, "G")
    
    i = 1
    syllables = []
    last_syllable_index = 0
    while i < len(word) - 1:
        current_char = word[i]
        last_char = word[i-1]
        next_char = word[i+1]
        if i+2 < len(word):
            next2_char = word[i+2]
        # RULE1
        if current_char not in vowels and last_char in vowels and next_char in vowels:
            syllables.append(word[last_syllable_index : i])
            last_syllable_index = i
        # RULE2
        elif current_char not in vowels and next_char not in vowels and last_char in vowels and next2_char in vowels:
            # case 1
            if current_char in "bcdfghptv" and next_char in "lr":
                syllables.append(word[last_syllable_index : i])
            # case 2
            else:
                syllables.append(word[last_syllable_index : i+1])
                i+=1
            last_syllable_index = i
        # RULE3
        elif current_char not in vowels and last_char in vowels:
            cons_group = [current_char]
            j = i + 1 
            while j < len(word):
                if word[j] not in vowels:
                    cons_group.append(word[j])
                else:
                    break
                j+=1
            special_cons_groups = [["l", "p", "t"], ["m", "p", "t"],  ["n", "c", "t"],  ["n", "c", "s"], ["n", "d", "v"], ["r", "c", "t"], ["r", "t", "f"], ["s", "t", "m"]]
            
            # case1
            if cons_group in special_cons_groups:
                syllables.append(word[last_syllable_index:j-1])
                last_syllable_index = j-1
            # case2
            else:    
                syllables.append(word[last_syllable_index:i+1])
                last_syllable_index = i+1
            i=j

        i+=1

    syllables.append(word[last_syllable_index:])
    
    # handle hiat, diftong, triftong
    for syllable in syllables:
        vowels_num = count_consecutive_vowels(syllable)
        if 1 < vowels_num <= 3:
            double_vowel = True
            for i in range(len(syllable) - 1):
                
                if syllable[i] in vowels and syllable[i+1] != syllable[i]:
                    # print(syllable[i])
                    double_vowel = False
            if syllable == syllables[-1] and syllable[-1] == "i" and syllable[i-2] == "i":
                double_vowel = False

            if double_vowel is True:
                syllables.append("dbl_vowel")
        
        elif vowels_num > 3:   
            syllables.append("4vowel")
        # iae, ieu, oeu, oau, eoau, eoeu
        
        for long_hiat in ["iae", "ieu", "oeu", "oau", "eoau", "eoeu"]:
            if long_hiat in syllable:
                syllables.append("lng_hi")

            # print(syllable)
    return syllables




In [1]:
from wordfreq import zipf_frequency
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

def choose_meaning(contexts_found, actual_context):
    max_score = -100
    key_to_return = ""
    actual_context = torch.mean(get_embeddings(actual_context), dim=0)

    for key in contexts_found:
        score = get_similarity_scores_for_syns(actual_context=[actual_context], syn_candidate_context=contexts_found[key], mean=False)
        print(score)
        if score > max_score:
            max_score = score
            key_to_return = key

    return key_to_return


def heuristic_comparator(word: str, actual_context: str, token_pos: str, dexonline_order: int, syns_number: int):
    # these can be modified, still testing
    len_weight = -50
    number_of_syllabes_weight = -120
    freq_in_ro_lang_weight = 30
    similarity_with_actual_context_weight = 1500
    dexonline_order_weight = 90

    def transform_with_mean_pooling(numpy_array, target_shape=(384,)):
        pooled_array = numpy.reshape(numpy_array, (-1, target_shape[0]))
        mean_array = numpy.mean(pooled_array, axis=0) 
        return mean_array

    emb1 = [torch.tensor(transform_with_mean_pooling(numpy.array(get_embeddings(actual_context))))]
   
    sin_context = get_context_for_each_candidate_syn(token_text=word, pos_wanted=token_pos)
    
    similarity_score = 0
    if len(sin_context):
        for key in sin_context:
            similarity_score = get_similarity_scores_for_syns(actual_context=emb1, syn_candidate_context=sin_context[key], mean=False)
    
    try:
        base_form = get_all_forms(nlp(word)[0])[0].get("form")
    except:
        base_form = word

    word_len = len_weight * len(word)
    apx_syllables_number = number_of_syllabes_weight * len(aproximate_syllables(word))
    freq = freq_in_ro_lang_weight * (zipf_frequency(base_form, 'ro')) 
    dexonline_order = dexonline_order_weight * (syns_number - dexonline_order)
    
    return word_len + apx_syllables_number + freq + similarity_with_actual_context_weight * similarity_score + dexonline_order


def get_matching_syns(token, actual_context, pos_found):
    tree_ids, inflection_possibilites, meaning_ids = synonyms_builder_step1(token, pos_found)
    contexts_found = {}
        
    for treeId in tree_ids:
        contexts_found[treeId] = incarcare_eficienta(treeId)    
    print(tree_ids)
    try:
        if len(contexts_found.keys()) > 1:
            chosen_context = str(choose_meaning(contexts_found=contexts_found, actual_context=actual_context))
        else:
            chosen_context = str(list(contexts_found.keys())[0])
        print(chosen_context)
        sinonime_posibile_dex = token._.get_synonyms([chosen_context])
        
        sinonime_to_return = []
        for x in sinonime_posibile_dex[:10]:
            print(x)

        for i in range(len(sinonime_posibile_dex)):
            syn = sinonime_posibile_dex[i]
            if syn == token.text[1:] or syn == sinonime_posibile_dex[i-1][1:]:
                continue
            else:
                sinonime_to_return.append((syn, heuristic_comparator(syn, actual_context, token.pos_, i, len(sinonime_posibile_dex))))
        
        return sorted(sinonime_to_return, key=lambda x: x[1], reverse=True)
    
    except IndexError:
        print("no synonyms")
        pass



import time
import spacy
t1 = time.time()

# pentru teste
cuv = "pas"
cuv2="uram"
# contextele pe care vreau sa l testez
actual_context = "Noi ne deplasam la pas."

nlp = spacy.load("ro_core_news_sm")
doc = nlp(actual_context)

def main():
    for token in doc:
        if token.text == cuv:
            pos_found = ud_to_dex[token.pos_]
            syns = get_matching_syns(token, actual_context=doc.text, pos_found=pos_found)
            if syns:
                for x in syns[:10]:
                    if x[0] == cuv:
                        continue
                    else:
                        print(x)
                
    t2 = time.time() - t1
    print("TIMP: ", t2)

main()

# force meaning id ca filter pentru verb ca uram (aduci sinonimele in functie de chosen_context si fortezi)


NameError: name 'ud_to_dex' is not defined

In [136]:
#MODEL EVALUATION

# random tree id din care "fur" doua exemple din lista de contexte = test data
# compar test data cu alte 10 id uri + id ul din care l-am creat si ma astept sa fie match pe id
import copy
import random
from json_creator import incarcare_eficienta
import math

def get_random_treeId(minrange: int, maxrange: int) -> int:
    return random.randint(minrange, maxrange)

def prepare_treeIds(max_retries):
    rand_treeIds = {}
    max_elements = 0
    for _ in range(10):
        for attempt in range(max_retries):
            treeId = get_random_treeId(1, 2888812)
            try:
                # treeIdData = context.find_context(treeId)
                treeIdData = incarcare_eficienta(treeId)
                if len(treeIdData) > 2:
                    rand_treeIds[treeId] = treeIdData
                    max_elements += 1
                    if max_elements > 14:
                        return rand_treeIds
            except:
                pass 

    return rand_treeIds


def get_random_test_data(rand_treeIds: dict) -> list:
    randomTreeId = random.choice(list(rand_treeIds.keys()))
    num_elements = min(int(math.sqrt(len(rand_treeIds[randomTreeId]))), len(rand_treeIds[randomTreeId]))

    max_idx = len(rand_treeIds[randomTreeId]) - 1  
    idx_list = random.sample(range(max_idx + 1), num_elements)

    test_data_list = []
    for idx in reversed(idx_list):
        try:
            test_data = rand_treeIds[randomTreeId].pop(idx)
            test_data_list.append(test_data)
        except IndexError:
            pass

    return test_data_list, rand_treeIds, randomTreeId

def model_evaluation():
    rand_treeIds_initial = prepare_treeIds(max_retries=100)
    rand_treeIds_param = copy.deepcopy(rand_treeIds_initial)

    test_data, rand_treeIds, randomTreeId = get_random_test_data(rand_treeIds_param)

    sim_scores = []
    
    for key in rand_treeIds.keys():
        sim_scores.append((randomTreeId, key, get_similarity_scores_for_syns(test_data, rand_treeIds[key], mean=True, reshapeFlag=True)))
        
        # cu nlp.similarity
        # for tst in test_data:
        #     for tr in rand_treeIds[key]:
                # print(tst, tr, nlp(tst).similarity(nlp(tr)))
                # sim_scores.append((randomTreeId, key, nlp(tst).similarity(nlp(tr))))

    sim_scores.sort(key=lambda x: x[2], reverse=True)
    return sim_scores

def count_percentage_equal_to_one(list_data):

    count_one = len([element for element in list_data if element == 1])
    total_elements = len(list_data)

    if total_elements == 0:
        return 0.0

    return (count_one / total_elements) * 100

results = []
for _ in range(10):
    evl = model_evaluation()
    score = 1 if evl[0][0] == evl[0][1] or evl[1][0] == evl[1][1] else 0
    if score == 1:
        print(evl)
    results.append(score)

print(count_percentage_equal_to_one(results))

[(26248, 241918, tensor(0.7635, dtype=torch.float64)), (26248, 26248, tensor(0.6032, dtype=torch.float64)), (26248, 58218, tensor(0.5412, dtype=torch.float64)), (26248, 64581, tensor(0.5079, dtype=torch.float64)), (26248, 73302, tensor(0.4759, dtype=torch.float64)), (26248, 55612, tensor(0.3508, dtype=torch.float64)), (26248, 73347, tensor(0.3470, dtype=torch.float64)), (26248, 64235, tensor(0.3011, dtype=torch.float64)), (26248, 59349, tensor(0.2716, dtype=torch.float64)), (26248, 220204, tensor(0.2604, dtype=torch.float64)), (26248, 34771, tensor(0.2095, dtype=torch.float64)), (26248, 27874, tensor(0.2086, dtype=torch.float64)), (26248, 11667, tensor(0.1959, dtype=torch.float64)), (26248, 47232, tensor(0.0689, dtype=torch.float64)), (26248, 57744, tensor(0.0270, dtype=torch.float64))]
[(236008, 236008, tensor(0.3590, dtype=torch.float64)), (236008, 229771, tensor(0.3181, dtype=torch.float64)), (236008, 209382, tensor(0.2590, dtype=torch.float64)), (236008, 62506, tensor(0.2455, dtype

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
tokenizer = AutoTokenizer.from_pretrained("dumitrescustefan/bert-base-romanian-uncased-v1", do_lower_case=True)
model = AutoModel.from_pretrained("dumitrescustefan/bert-base-romanian-uncased-v1")
# pt fine tuning + antrenare mai departe cred ca trebuie un nou dataset cu label, iar apoi folosit pipelineul de evaluare de mai sus pt modelul tunat

print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(50000, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [None]:
# Load the RO-STS dataset
train_file = "dataset/RO-STS.train.tsv"
dev_file = "dataset/RO-STS.dev.tsv"
test_file = "dataset/RO-STS.test.tsv"

def read_tsv_file(filename):
  """Reads a TSV file and returns a list of dictionaries."""
  with open(filename, "r", encoding="utf-8") as f:
    next(f, None)  
    data = []
    for line in f:
      fields = line.strip().split("\t")
      data_point = {"sentence1": fields[1], "sentence2": fields[2], "score": float(fields[0])}
      data.append(data_point)
  return data

dataset = {}

dataset["train"] = read_tsv_file(train_file)
dataset["dev"] = read_tsv_file(dev_file)
dataset["test"] = read_tsv_file(test_file)


In [131]:
#EXPERIMENT

from transformers import AutoTokenizer, AutoModel
import torch, util
tokenizer = AutoTokenizer.from_pretrained("dumitrescustefan/bert-base-romanian-uncased-v1", do_lower_case=True)
model = AutoModel.from_pretrained("dumitrescustefan/bert-base-romanian-uncased-v1")

# schimb aici exemplele si similaritatea cosinus are un sens ok 
# consider ca principala problema este de la formatul datelor din DEX si ca ar trebui o "curatare" a unor caractere: =, nume de scriitori, $, etc
sentence1 = nlp("El a plecat joi cu prietenii la pescuit.")
sentence2 = nlp("Eu voi merge miercuri la pescuit.")
# 220812 context id
# 220816 context id
# sentence1 = nlp(context.find_context(220812)[0])
# sentence2 = nlp(context.find_context(220816)[0])

print(sentence1.similarity(sentence2))

# print(sentence1, sentence2)
# def get_embeddings(text):
#         tokenized_text = tokenizer(text, return_tensors="pt")

#         with torch.no_grad():
#             outputs = model(**tokenized_text)

#         word_embeddings = outputs.last_hidden_state
#         averaged_embedding = torch.mean(word_embeddings, dim=0)

#         return averaged_embedding

# def transform_with_mean_pooling(numpy_array, target_shape=(384,)):
#     pooled_array = numpy.reshape(numpy_array, (-1, target_shape[0]))
#     mean_array = numpy.mean(pooled_array, axis=0) 
#     return mean_array

# emb1 = torch.tensor(transform_with_mean_pooling(numpy.array(get_embeddings(sentence1))))
# emb2 = torch.tensor(transform_with_mean_pooling(numpy.array(get_embeddings(sentence2))))

# embeddings1 = torch.mean(get_embeddings(sentence1), dim=0)
# embeddings2 = torch.mean(get_embeddings(sentence2), dim=0)

# print(embeddings2.shape)
# # Calculate cosine similarity
# similarity_score = calculate_similarity(embeddings1, embeddings2)
# print(f"Similarity score between sentences: {similarity_score}")


# print(emb1.shape)
# similarity_score2 = calculate_similarity(emb1, emb2)
# print(f"Similarity score between sentences: {similarity_score2}")



0.6837704277788871


  print(sentence1.similarity(sentence2))


In [10]:
# calcul embedding cu modelul
# am salvat embedding urile tuturor contextelor in folderul context_stock astfel:
#   -> fiecare fisier are 45 de cuvinte ordonate dupa treeId (fiecare cuvant (reprezentat prin treeId) are o lista de numpy arrays care reprezinta embeddingurile exemplelor)
#   -> caut eficient => iau tree id si caut binar pe intervalele date din fisierele minrange_maxrange_range.feather
#   -> pt calcul de similaritati intre contexte folosesc calculate_similarity

# din experimentul de mai sus am vazut ca modelul pare sa calculeze bine similaritatile intre propozitii "naturale"
# daca dau in schimb doua exemple din contexte din dexonline, se comporta mai ciudat, poate si din cauza formatului lor
# exemplele din dexonline mai contin caractere: $, #, nume scrise cu litere mari, "propozitii" care nu au predicat
# trebuie sa preprocesez? sa antrenez modelul si pe setul de date din dexonline? cum pot face asta pentru ca ar trebui sa stabilesc manual o valoare de cos similarity?

# am incercat si alte modele dar rezultatul este acelasi. problema cred ca vine de la reprezentarea embedding urilor


# 30% las asa asta e
# criterii de organizare a sinonimelor (nu va mai fi similaritate, lungime de caractere, nr de silabe, frecv in a corpus)
# cu bert (tokeni din bert - fertility)


# SORTARE IN FUNCTIE DE SIMILARITATEA CONTEXTELOR SINONIMELOR
 # for sin in sinonime_posibile_dex:
    #     if sin != token.text:
    #         i+=1
    #         sin_context = get_context_for_each_candidate_syn(token_text=sin, pos_wanted=token.pos_)

    #         for key in sin_context:
    #             if key in all_keys_verified:
    #                 continue
    #             else:
    #                 similarity_score = get_similarity_scores_for_syns(actual_context=context_actual_token[chosen_context], syn_candidate_context=sin_context[key], mean=True)
    #                 scores.append((similarity_score, key, sin))
    #                 all_keys_verified.append(key)
                    

In [102]:
def raw_word(text):
    diacritics = {
      "ă": "a",
      "â": "a",
      "î": "i",
      "ș": "s",
      "ț": "t",
      "č": "c",
      "ş": "s",
      "ž": "z",
      "Ä": "A",
      "Â": "A",
      "Î": "I",
      "Ș": "S",
      "Ț": "T",
      "Č": "C",
      "Ș": "S",
      "Ž": "Z",
      "á": "a",
      "é": "e",
      "í": "i",
      "ó": "o",
      "ú": "u",
      "ű": "u",
      "Á": "A",
      "É": "E",
      "Í": "I",
      "Ó": "O",
      "Ú": "U",
      "Ű": "U",
      "ö": "o",
      "Ö": "O",
      "ü": "u",
      "Ü": "U",
    }

    for k, v in diacritics.items():
        text = text.replace(k, v)
    return text.lower()

def count_consecutive_vowels(word):
    vowels = "aeiouAEIOU"
    consecutive_vowel_count = 0
    total_consecutive_vowels = 0
    for char in word:
        if char in vowels:
            consecutive_vowel_count += 1
        else:
            total_consecutive_vowels += consecutive_vowel_count
            consecutive_vowel_count = 0
        
    total_consecutive_vowels += consecutive_vowel_count
    
    return total_consecutive_vowels


def split_syllables(word: str):
    vowels = "aeiouăîâe"
    diacritics = "ăâîșțčşžÄÂÎȘȚČȘŽáéíóúűÁÉÍÓÚŰöÖüÜ"
    groups = ["ch", "gh"]
    word = raw_word(word).lower()
    for group in groups:
        if group == "ch":
            word = word.replace(group, "C")
        elif group == "gh":
            word = word.replace(group, "G")
    
    i = 1
    syllables = []
    last_syllable_index = 0
    while i < len(word) - 1:
        current_char = word[i]
        last_char = word[i-1]
        next_char = word[i+1]
        if i+2 < len(word):
            next2_char = word[i+2]
        # RULE1
        if current_char not in vowels and last_char in vowels and next_char in vowels:
            syllables.append(word[last_syllable_index : i])
            last_syllable_index = i
        # RULE2
        elif current_char not in vowels and next_char not in vowels and last_char in vowels and next2_char in vowels:
            # case 1
            if current_char in "bcdfghptv" and next_char in "lr":
                syllables.append(word[last_syllable_index : i])
            # case 2
            else:
                syllables.append(word[last_syllable_index : i+1])
                i+=1
            last_syllable_index = i
        # RULE3
        elif current_char not in vowels and last_char in vowels:
            cons_group = [current_char]
            j = i + 1 
            while j < len(word):
                if word[j] not in vowels:
                    cons_group.append(word[j])
                else:
                    break
                j+=1
            special_cons_groups = [["l", "p", "t"], ["m", "p", "t"],  ["n", "c", "t"],  ["n", "c", "s"], ["n", "d", "v"], ["r", "c", "t"], ["r", "t", "f"], ["s", "t", "m"]]
            
            # case1
            if cons_group in special_cons_groups:
                syllables.append(word[last_syllable_index:j-1])
                last_syllable_index = j-1
            # case2
            else:    
                print("AICI")
                syllables.append(word[last_syllable_index:i+1])
                last_syllable_index = i+1
            i=j
        # RULE4
        
        # if len(syllables):
            # a mereu vocala restu semivocale
            # daca e doar e = vocala restu semivocale etc, fa un ranking si poti doar sa "aproximezi"
            # print(syllables)
            # print(word[last_syllable_index:], "restul cuv")


        i+=1

    syllables.append(word[last_syllable_index:])
    
    for syllable in syllables:
        vowels_num = count_consecutive_vowels(syllable)
        if 1 < vowels_num <= 3:
            double_vowel = True
            for i in range(len(syllable) - 1):
                
                if syllable[i] in vowels and syllable[i+1] != syllable[i]:
                    # print(syllable[i])
                    double_vowel = False
            if syllable == syllables[-1] and syllable[-1] == "i" and syllable[i-2] == "i":
                double_vowel = False

            if double_vowel is True:
                syllables.append("dbl_vowel")
        
        elif vowels_num > 3:   
            syllables.append("4vowel")
        # iae, ieu, oeu, oau, eoau, eoeu
        
        for long_hiat in ["iae", "ieu", "oeu", "oau", "eoau", "eoeu"]:
            if long_hiat in syllable:
                syllables.append("lng_hi")

            # print(syllable)
    return syllables


# Example usage
words = ["steaua", "antiaerian", "alee", "unghi", "examen", "exercițiu", "leoaica", "achitat", "leghe", "oglindă", "sculptură", "somptuos", "funcții", "arctic", "jertfă", "astmatic", "sculptor"]
for word in words:
    syllables = split_syllables(word)
    print(f"{word}: {syllables}")



steaua: ['steaua', '4vowel']
antiaerian: ['an', 'tiae', 'rian', 'lng_hi']
alee: ['a', 'lee', 'dbl_vowel']
unghi: ['un', 'Gi']
examen: ['e', 'xa', 'men']
exercițiu: ['e', 'xer', 'ci', 'tiu']
leoaica: ['leoai', 'ca', '4vowel']
achitat: ['a', 'Ci', 'tat']
leghe: ['le', 'Ge']
oglindă: ['o', 'glin', 'da']
sculptură: ['sculp', 'tu', 'ra']
somptuos: ['somp', 'tuos']
funcții: ['func', 'tii']
arctic: ['arc', 'tic']
jertfă: ['jert', 'fa']
astmatic: ['ast', 'ma', 'tic']
sculptor: ['sculp', 'tor']


In [184]:
from wordfreq import zipf_frequency
word = nlp("băteam")
base_form = get_all_forms(word[0])[0].get("form")
print(zipf_frequency(base_form, 'ro'))

4.55
