In [None]:
# Construct the project's lexical benchmark object; the log printed below this
# cell is the output emitted while it is being built.
from benchmark import Lexical_Benchmark
be = Lexical_Benchmark()

UWIN: Total definitions in dictionary: 182,371
UWIN: WordMap Built
40 unknown word ids cleared
117 invalid hypernyms are deleted
104635 invalid hyponyms are deleted
UWIN built

URIMAL_SAM: Total words in dictionary: 1,139,969
URIMAL_SAM: Total senses in dictionary: 422,677
URIMAL_SAM built

CORPUS: MODU built: 7,265 sentences
CORPUS: Sejong built: 2,355 sentences
CORPUS: modu+sejong concatenated: 9,620 sentences
WORD_DICTIONARY: 6,669 unique lexemes and 9,393 unique scodes are assembled (sejong, modu)
Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated
VECTORIZERS: general vectorizers created
VECTORS: General Vectors Built.
VECTORS: Alternative Vectors Built.


In [None]:
import os, re, pickle

# Load every pickled per-sense-class classifier into classifiers[model][lexeme].
# A file is only considered when its name carries a model tag (bert/w2v/ft),
# a Hangul name and a mode tag; anything else found in the folder is skipped
# instead of crashing on `None.group()`.
# NOTE(review): both modes share the same classifiers[model][lex] slot, so a
# later file silently overwrites an earlier one with the same model and name.
# NOTE(security): pickle executes arbitrary code on load; only use trusted files.
classifiers={}
path = "./classifiers/"
for name in os.listdir(path):
    model_match = re.search("(bert|w2v|ft)", name)
    lex_match = re.search("[가-힣]+", name)
    # "alterative" is kept for backward compatibility with existing file names.
    mode_match = re.search("(general|alterative|alternative)", name)
    if not (model_match and lex_match and mode_match):
        continue
    model = model_match.group()
    lex = lex_match.group()
    mode = mode_match.group()
    with open(os.path.join(path, name), "rb") as f:
        classifiers.setdefault(model, {})[lex] = pickle.load(f)

In [None]:
import pickle

# Load the pickled general vectors for each embedding model.
# Context managers close each file handle as soon as it has been read.
with open("./vectors/concated_w2v_general_vectors.pkl", "rb") as f:
    w2v_general_vecs = pickle.load(f)
with open("./vectors/concated_ft_general_vectors.pkl", "rb") as f:
    ft_general_vecs = pickle.load(f)
with open("./vectors/concated_bert_general_vectors.pkl", "rb") as f:
    bert_general_vecs = pickle.load(f)

In [None]:
def get_general_vector(vectors):
    """Stack labelled vectors into one array and collect their labels.

    Args:
        vectors: non-empty sequence of ``(vector, label)`` pairs. The vectors
            are joined along axis 0, so they must agree on all other axes.

    Returns:
        ``(stacked, labels)`` where ``stacked`` is a new array (inputs are not
        modified) and ``labels`` is the list of labels in input order.

    Raises:
        ValueError: if ``vectors`` is empty.
    """
    # Imported locally: no module-level numpy import is visible in this notebook.
    import numpy as np

    if len(vectors) == 0:
        raise ValueError("get_general_vector() needs at least one (vector, label) pair")

    # One concatenate call instead of re-allocating the array for every item.
    stacked = np.concatenate([v for v, _ in vectors], axis=0)
    labels = [l for _, l in vectors]
    return stacked, labels

# train uniformed sum vector
def get_uniform_vector(vectors):
    """Return the element-wise sum of all vectors (every vector weighted equally).

    Args:
        vectors: non-empty sequence of vectors that support ``+``.

    Returns:
        The summed vector. With a single input that same object is returned.

    Raises:
        IndexError: if ``vectors`` is empty.
    """
    total = vectors[0]
    for v in vectors[1:]:
        # Out-of-place addition: `total += v` would overwrite the caller's
        # first vector, corrupting data that is reused by later experiments.
        total = total + v
    return total

# train weighted sum vector
def get_weighted_vector(vectors, frequencies):
    """Return the frequency-weighted sum of the vectors plus their labels.

    Args:
        vectors: non-empty sequence of ``(vector, lexeme, sense, label)`` tuples.
        frequencies: mapping ``lexeme -> {sense: count}``.

    Returns:
        ``(weighted_sum, labels)``. Each vector is scaled by its sense count
        divided by the total count of the first item's lexeme.

    Raises:
        KeyError: if a lexeme or sense is missing from ``frequencies``
            (the offending sense and lexeme are printed first).
        ZeroDivisionError: if the total count of the first lexeme is zero.
    """
    # The original read an undefined name `lexeme`; the normaliser belongs to
    # the lexeme of the vectors being combined.
    whole_freq = sum(frequencies[vectors[0][1]].values())

    vector = None
    labels = []
    for v, lex, s, l in vectors:
        try:
            target_s_freq = frequencies[lex][s]
        except KeyError:
            print(s, lex)
            raise
        term = (target_s_freq / whole_freq) * v
        vector = term if vector is None else vector + term
        labels.append(l)
    return vector, labels

# get random indices within certain percent
def random_indices(length, exclusion=(), percentage=0.3):
    """Draw ``int(length * percentage)`` distinct random indices from ``range(length)``.

    Args:
        length: size of the index range to draw from.
        exclusion: indices that must not be returned.
        percentage: fraction of ``length`` to draw.

    Returns:
        A list of distinct indices in random order, none of them in ``exclusion``.

    Raises:
        ValueError: if fewer non-excluded indices exist than were requested
            (the previous rejection loop would never terminate in that case).
    """
    import random

    excluded = set(exclusion)
    candidates = [i for i in range(length) if i not in excluded]
    # A non-positive request yields an empty result, as before.
    wanted = max(int(length * percentage), 0)

    if wanted > len(candidates):
        raise ValueError(
            "cannot draw {} indices: only {} of {} are not excluded".format(
                wanted, len(candidates), length))
    return random.sample(candidates, wanted)

def return_random_values_of_list(l, exclusion=[], percentage=0.3):
    """Pick a random subset of ``l`` and report which positions were used.

    The positions come from ``random_indices``; ``exclusion`` and
    ``percentage`` are forwarded to it unchanged.

    Returns:
        ``(values, positions)`` where ``values[k] == l[positions[k]]``.
    """
    positions = random_indices(len(l), exclusion, percentage)
    picked = [l[position] for position in positions]

    # Sanity check: the subset has exactly the requested size.
    assert len(picked) == int(len(l)*percentage)
    return picked, positions


def investigate_sclass_classifiers(vecs, model, mode="general", percentage=0.5, quiet=False):
    """Train, score and save one binary MLP classifier per sense class.

    For every sense class found in ``vecs`` a one-vs-rest data set is built,
    a random ``percentage`` of it is used for training, a disjoint random
    ``percentage`` for testing, and the fitted classifier is pickled.

    Args:
        vecs: sequence of entries that are either ``None`` or indexable as
            ``(test_vector, training_vector, lexeme, scode, sclass)``.
        model: embedding model name; only used in output text and file names.
        mode: vector mode name; only used in output text and file names.
        percentage: fraction of each class drawn for training and for testing.
        quiet: when true, per-class result lines are not printed.

    Side effects:
        Rewrites ``./results/{model}_{mode}_accuracies.txt`` after each class
        and writes ``./classifiers/{sclass}_{model}_{mode}.pkl`` per class.
    """
    from sklearn import neural_network as nn
    from tqdm.notebook import tqdm
    from random import shuffle
    import numpy as np
    import pickle

    def _stack(pairs):
        """Turn (vector, label) pairs into a feature matrix and a label array."""
        features = np.concatenate([v for v, _ in pairs], axis=0)
        labels = np.array([l for _, l in pairs])
        return features, labels

    sclasses = {vec[4] for vec in vecs if vec is not None}
    # discard() instead of remove(): data without unlabelled entries is fine.
    sclasses.discard(None)

    results = ["{} / {} 결과\n".format(model, mode)]

    with tqdm(sclasses, leave=False, bar_format="{percentage:2.2f}% {bar} {desc} | {remaining}") as t:
        for s, target_sclass in enumerate(sclasses):
            mlp = nn.MLPClassifier(max_iter=300,
                                    activation="relu",
                                    hidden_layer_sizes=(64, 64, 64),
                                    solver="adam")

            target_vecs_training = []
            non_target_vecs_training = []
            target_vecs_test = []
            non_target_vecs_test = []

            # The training and test lists are filled in lockstep so that one
            # index refers to the same occurrence in both of them.
            for n, vec in enumerate(vecs):
                t.set_description_str("센스 클래스: {}({}/{}) | 벡터 번호 {}/{}".format(target_sclass, s+1, len(sclasses), n+1, len(vecs)))
                if vec is None:
                    continue
                test, training, sclass = vec[0], vec[1], vec[4]
                if training is None:
                    continue
                if target_sclass == sclass: # positive
                    target_vecs_training.append((training, 1))
                    target_vecs_test.append((test, 1))
                else: # negative
                    non_target_vecs_training.append((training, 0))
                    non_target_vecs_test.append((test, 0))

            target_cnt = len(target_vecs_training)
            non_target_cnt = len(non_target_vecs_training)

            # Splitting: the indices drawn for training are excluded from the test draw.
            target_vecs_training, exclusion_target = return_random_values_of_list(target_vecs_training, percentage=percentage)
            target_vecs_test, _ = return_random_values_of_list(target_vecs_test, exclusion_target, percentage)

            non_target_vecs_training, exclusion_non_target = return_random_values_of_list(non_target_vecs_training, percentage=percentage)
            non_target_vecs_test, _ = return_random_values_of_list(non_target_vecs_test, exclusion_non_target, percentage)

            # Cap every list at the size of the smaller class.
            # NOTE(review): the cap uses the pre-split counts, so the two
            # classes are not exactly balanced afterwards - confirm intent.
            mn_val = min(target_cnt, non_target_cnt)

            training_vectors = target_vecs_training[:mn_val] + non_target_vecs_training[:mn_val]
            vectors_test = target_vecs_test[:mn_val] + non_target_vecs_test[:mn_val]
            # Occurrences without a test vector cannot be scored.
            vectors_test = [(v, l) for v, l in vectors_test if v is not None]

            shuffle(training_vectors)
            shuffle(vectors_test)

            # TRAINING
            features_training, labels_training = _stack(training_vectors)
            mlp = mlp.fit(features_training, labels_training)

            # SCORING: the test pairs must be stacked exactly like the training
            # pairs; previously an undefined `labels_test` made this step fail.
            features_test, labels_test = _stack(vectors_test)
            score = mlp.score(features_test, labels_test)

            # Writing Results & Saving Classifier
            result = "센스 클래스 '{}'| positive: {:,} / negative: {:,} |정확도: {}\n".format(target_sclass, target_cnt, non_target_cnt, round(score, 2))

            if not quiet:
                print(result, end="")

            results.append(result)
            with open("./results/{}_{}_accuracies.txt".format(model, mode), "w") as f:
                f.write("".join(results))
            with open("./classifiers/{}_{}_{}.pkl".format(target_sclass, model, mode), "wb") as f:
                pickle.dump(mlp, f)

            t.update()



In [None]:
import pickle

# Load the pickled general vectors plus the frequency and sense-class tables.
# Context managers close each file handle as soon as it has been read.
with open("./vectors/concated_w2v_general_vectors.pkl", "rb") as f:
    w2v_general_vecs = pickle.load(f)
with open("./vectors/concated_ft_general_vectors.pkl", "rb") as f:
    ft_general_vecs = pickle.load(f)
with open("./vectors/concated_bert_general_vectors.pkl", "rb") as f:
    bert_general_vecs = pickle.load(f)
with open("./dictionary/matched_words/frequencies.pkl", "rb") as f:
    frequencies = pickle.load(f)
with open("./dictionary/sense_classes.pkl", "rb") as f:
    sense_classes = pickle.load(f)

In [None]:
def train_ambiguity_classifier(vecs, model, sense_classes, mode="general", percentage=0.5, quiet=False):
    """Train and save an MLP that predicts whether a lexeme is ambiguous.

    A training vector is labelled 1 when ``sense_classes[lexeme]`` has two or
    more entries and 0 otherwise.

    Args:
        vecs: sequence of entries that are either ``None`` or indexable as
            ``(test_vector, training_vector, lexeme, scode, sclass)``.
        model: embedding model name; used in the output file name.
        sense_classes: mapping from lexeme to a sized collection of its sense classes.
        mode, percentage, quiet: accepted for signature compatibility; unused.

    Raises:
        KeyError: if a lexeme with a training vector is missing from ``sense_classes``.
        ValueError: if no entry provides a training vector.

    Side effects:
        Writes ``./classifiers/{model}_ambiguity_detector.classifier_({pos}_{neg}).pkl``.
    """
    from sklearn import neural_network as nn
    from tqdm.notebook import tqdm
    from random import shuffle
    import numpy as np
    import pickle

    mlp = nn.MLPClassifier(max_iter=300,
                                activation="relu",
                                hidden_layer_sizes=(64, 64, 64),
                                solver="adam")

    target_vecs_training = []
    non_target_vecs_training = []

    with tqdm(vecs, leave=False, bar_format="{percentage:2.2f}% {bar} {desc} | {remaining}") as t:
        for n, vec in enumerate(vecs):
            t.update()
            t.set_description_str("벡터 번호 {}/{}".format(n+1, len(vecs)))
            if vec is None:
                continue
            training, lexeme = vec[1], vec[2]
            if training is None:
                continue
            if len(sense_classes[lexeme]) >= 2: # ambiguous
                target_vecs_training.append((training, 1))
            else: # unambiguous
                non_target_vecs_training.append((training, 0))

    target_cnt = len(target_vecs_training)
    non_target_cnt = len(non_target_vecs_training)

    training_vectors = target_vecs_training + non_target_vecs_training
    if not training_vectors:
        raise ValueError("no training vectors found in vecs")

    shuffle(training_vectors)

    # TRAINING: a single concatenate call avoids re-copying the growing
    # matrix once per vector.
    features = np.concatenate([v for v, _ in training_vectors], axis=0)
    labels_training = np.array([l for _, l in training_vectors])
    mlp = mlp.fit(features, labels_training)

    # Saving Classifier
    with open("./classifiers/{}_{}_({}_{}).pkl".format(model, "ambiguity_detector.classifier", target_cnt, non_target_cnt), "wb") as f:
        pickle.dump(mlp, f)


In [None]:
# Train and save one ambiguity detector per embedding model (w2v, ft, bert).
train_ambiguity_classifier(w2v_general_vecs, model="w2v", sense_classes=sense_classes)
train_ambiguity_classifier(ft_general_vecs, model="ft", sense_classes=sense_classes)
train_ambiguity_classifier(bert_general_vecs, model="bert", sense_classes=sense_classes)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=44115.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=44115.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=45095.0), HTML(value='')))

In [None]:
def get_general_vector(vectors):
    """Join a sequence of vectors into one array along axis 0.

    Args:
        vectors: non-empty sequence of arrays that agree on every axis but the first.

    Returns:
        A new array holding all inputs in order.

    Raises:
        ValueError: if ``vectors`` is empty.

    Note:
        The previous version started from ``vectors[0][0]`` (one row of the
        first vector), which could not be concatenated with the remaining
        vectors. All vectors are now joined whole, so a single-element input
        returns a copy of that vector rather than its first row.
    """
    # Imported locally: no module-level numpy import is visible in this notebook.
    import numpy as np

    if len(vectors) == 0:
        raise ValueError("get_general_vector() needs at least one vector")
    return np.concatenate(list(vectors), axis=0)

# train uniformed sum vector
def get_uniform_vector(vectors):
    """Return the element-wise sum of all vectors (every vector weighted equally).

    Args:
        vectors: non-empty sequence of vectors that support ``+``.

    Returns:
        The summed vector. With a single input that same object is returned.

    Raises:
        IndexError: if ``vectors`` is empty.
    """
    total = vectors[0]
    for v in vectors[1:]:
        # Out-of-place addition: `total += v` would overwrite the caller's
        # first vector, corrupting data that is reused by later experiments.
        total = total + v
    return total


# train weighted sum vector
def get_weighted_vector(vectors, frequencies):
    """Return the frequency-weighted sum of the vectors.

    Args:
        vectors: non-empty sequence of ``(vector, lexeme, scode)`` tuples.
        frequencies: mapping ``lexeme -> {scode: [sclass, count]}``.

    Returns:
        The sum of every vector scaled by its sense count divided by the
        total count of the first item's lexeme.

    Raises:
        KeyError: if a lexeme or sense code is missing from ``frequencies``
            (the offending sense code and lexeme are printed first).
        ZeroDivisionError: if the total count of the first lexeme is zero.
    """
    whole_freq = sum(freq for _sclass, freq in frequencies[vectors[0][1]].values())

    vector = None
    for v, lex, s in vectors:
        try:
            target_s_freq = frequencies[lex][s][1]
        except KeyError:
            # Re-raise for every item, the first included: swallowing the
            # error there used to surface later as an unrelated NameError.
            print(s, lex)
            raise
        term = (target_s_freq / whole_freq) * v
        vector = term if vector is None else vector + term
    return vector

def ambiguity_experiment(model, mode, vecs, sense_classes, frequencies):
    """Evaluate a saved ambiguity detector and write the scores to a TSV file.

    A lexeme counts as ambiguous (answer 1) when two or more distinct sense
    codes are observed for it in ``vecs``.

    Args:
        model: ``"w2v"``, ``"ft"`` or ``"bert"``; selects the classifier file.
        mode: ``"general"`` (predict every test vector separately),
            ``"uniformed"`` (predict the plain sum of a lexeme's test vectors) or
            ``"weighted"`` (predict their frequency-weighted sum).
        vecs: sequence of entries that are either ``None`` or indexable as
            ``(test_vector, training_vector, lexeme, scode, sclass)``.
        sense_classes: unused; kept for signature compatibility.
        frequencies: ignored; the table is rebuilt from ``vecs`` as
            ``lexeme -> {scode: [sclass, count]}``.

    Raises:
        ValueError: if ``model`` is not one of the supported names.

    Side effects:
        Rewrites ``./results/ambiguity_{model}_{mode}_result.tsv`` after every
        scored lexeme and once more at the end with the average.

    Note:
        NOTE(review): the score column is the running accuracy over all
        lexemes processed so far, not a per-lexeme accuracy, and the final
        average is the mean of those running values - confirm this is intended.
    """
    from statistics import mean
    from tqdm import tqdm
    import pickle

    assert mode in ["general", "uniformed", "weighted"]
    if model not in ("w2v", "ft", "bert"):
        raise ValueError("unknown model: {!r}".format(model))

    # One pass over vecs: collect lexemes, sense frequencies and each lexeme's
    # test vectors, so the main loop does not rescan vecs per lexeme.
    lexs = set()
    frequencies = {}
    test_vectors = {}
    for vec in vecs:
        if vec is None:
            continue
        test, lexeme, scode, sclass = vec[0], vec[2], vec[3], vec[4]

        lexs.add(lexeme)
        if scode is not None:
            entry = frequencies.setdefault(lexeme, {}).setdefault(scode, [sclass, 0])
            entry[1] += 1
        if test is not None:
            item = (test, lexeme, scode) if mode == "weighted" else test
            test_vectors.setdefault(lexeme, []).append(item)

    path = "./classifiers_all/ambiguity detection/{}_ambiguity_detector.classifier_(2627_966).pkl".format(model)
    # NOTE(security): pickle executes arbitrary code on load; only use trusted files.
    with open(path, "rb") as f:
        mlp = pickle.load(f)

    result_path = "./results/ambiguity_{}_{}_result.tsv".format(model, mode)
    strings = ["lexeme\tambiguous\tmode\tmodel\tscore\n"]
    scores = []
    corrects = 0   # running number of correct predictions
    answered = 0   # running number of predictions

    for lex in tqdm(lexs):
        target_vectors = test_vectors.get(lex)
        # Skip lexemes without test vectors or without any observed sense code
        # (the latter used to raise KeyError).
        if not target_vectors or lex not in frequencies:
            continue

        answer = 1 if len(frequencies[lex]) >= 2 else 0 # ambiguity answer by lex

        if mode == "general":
            predictions = [mlp.predict(vector)[0] for vector in target_vectors]
        elif mode == "uniformed":
            predictions = [mlp.predict(get_uniform_vector(target_vectors))[0]]
        else:
            predictions = [mlp.predict(get_weighted_vector(target_vectors, frequencies))[0]]

        answered += len(predictions)
        corrects += sum(1 for binary in predictions if binary == answer)

        score = round((corrects / answered) * 100, 4)
        scores.append(score)
        strings.append("{}\t{}\t{}\t{}\t{}\n".format(lex, answer, mode, model, score))
        # Checkpoint after every lexeme so partial results survive a crash.
        with open(result_path, "w") as f:
            f.write("".join(strings))

    average = round(mean(scores), 4) if scores else float("nan")
    strings.append("\nAverage score: {}\n".format(average))
    with open(result_path, "w") as f:
        f.write("".join(strings))

In [None]:
import pickle

# Load the pickled general vectors plus the frequency and sense-class tables.
# Context managers close each file handle as soon as it has been read.
with open("./vectors/concated_w2v_general_vectors.pkl", "rb") as f:
    w2v_general_vecs = pickle.load(f)
with open("./vectors/concated_ft_general_vectors.pkl", "rb") as f:
    ft_general_vecs = pickle.load(f)
with open("./vectors/concated_bert_general_vectors.pkl", "rb") as f:
    bert_general_vecs = pickle.load(f)
with open("./dictionary/matched_words/frequencies.pkl", "rb") as f:
    frequencies = pickle.load(f)
with open("./dictionary/sense_classes.pkl", "rb") as f:
    sense_classes = pickle.load(f)

In [None]:
# Ambiguity detection on individual test vectors ("general" mode), one run per model.
ambiguity_experiment("w2v", "general", w2v_general_vecs, sense_classes, frequencies)
ambiguity_experiment("ft", "general", ft_general_vecs, sense_classes, frequencies)
ambiguity_experiment("bert", "general", bert_general_vecs, sense_classes, frequencies)

100%|██████████| 6455/6455 [01:56<00:00, 55.47it/s]
100%|██████████| 6455/6455 [02:11<00:00, 49.14it/s]
100%|██████████| 6455/6455 [01:07<00:00, 95.05it/s] 


In [None]:
# Ambiguity detection on the summed vector of each lexeme ("uniformed" mode), one run per model.
ambiguity_experiment("w2v", "uniformed", w2v_general_vecs, sense_classes, frequencies)
ambiguity_experiment("ft", "uniformed", ft_general_vecs, sense_classes, frequencies)
ambiguity_experiment("bert", "uniformed", bert_general_vecs, sense_classes, frequencies)

100%|██████████| 6455/6455 [01:47<00:00, 60.03it/s]
100%|██████████| 6455/6455 [02:01<00:00, 52.96it/s]
100%|██████████| 6455/6455 [01:24<00:00, 76.75it/s]


In [None]:
# Ambiguity detection on the frequency-weighted vector of each lexeme ("weighted" mode), one run per model.
ambiguity_experiment("w2v", "weighted", w2v_general_vecs, sense_classes, frequencies)
ambiguity_experiment("ft", "weighted", ft_general_vecs, sense_classes, frequencies)
ambiguity_experiment("bert", "weighted", bert_general_vecs, sense_classes, frequencies)

100%|██████████| 6455/6455 [01:57<00:00, 55.15it/s]
100%|██████████| 6455/6455 [01:57<00:00, 54.78it/s]
100%|██████████| 6455/6455 [01:14<00:00, 86.37it/s] 


In [None]:
def ambiguity_baseline(vecs):
    """Random-guess baseline for ambiguity detection, written to a TSV file.

    A lexeme counts as ambiguous (answer 1) when two or more distinct sense
    codes are observed for it in ``vecs``; the prediction is a coin flip.

    Args:
        vecs: sequence of entries that are either ``None`` or indexable as
            ``(test_vector, training_vector, lexeme, scode, sclass)``.

    Side effects:
        Rewrites ``./results/ambiguity_baseline_result.tsv`` after every
        lexeme and once more at the end with the average.

    Note:
        NOTE(review): the score column is the running accuracy over all
        lexemes processed so far, not a per-lexeme accuracy - confirm intent.
    """
    # `random` was used without being imported anywhere in this cell.
    from random import random
    from statistics import mean

    # get all lexemes and their observed sense codes
    lexs = set()
    frequencies = {}
    for vec in vecs:
        if vec is None:
            continue
        lexeme, scode, sclass = vec[2], vec[3], vec[4]

        lexs.add(lexeme)
        if scode is not None:
            entry = frequencies.setdefault(lexeme, {}).setdefault(scode, [sclass, 0])
            entry[1] += 1

    result_path = "./results/ambiguity_{}_result.tsv".format("baseline")
    strings = ["lexeme\tambiguous\tscore\n"]
    scores = []
    corrects = 0   # running number of correct guesses
    answered = 0   # running number of guesses

    for lex in lexs:
        # Lexemes without any observed sense code have no defined answer
        # (they used to raise KeyError).
        if lex not in frequencies:
            continue

        answer = 1 if len(frequencies[lex]) >= 2 else 0 # ambiguity answer by lex
        binary = int(random()*2) # random answer
        answered += 1
        if binary == answer:
            corrects += 1

        score = round((corrects / answered) * 100, 4)
        scores.append(score)
        strings.append("{}\t{}\t{}\n".format(lex, answer, score))
        # Checkpoint after every lexeme so partial results survive a crash.
        with open(result_path, "w") as f:
            f.write("".join(strings))

    average = round(mean(scores), 4) if scores else float("nan")
    strings.append("\nAverage score: {}\n".format(average))
    with open(result_path, "w") as f:
        f.write("".join(strings))

import pickle

# Run the random baseline on the w2v vectors; the file handle is closed
# as soon as the pickle has been read.
with open("./vectors/concated_w2v_general_vectors.pkl", "rb") as f:
    w2v_general_vecs = pickle.load(f)
ambiguity_baseline(w2v_general_vecs)

In [None]:
def get_general_vector(vectors):
    """Join a sequence of vectors into one array along axis 0.

    Args:
        vectors: non-empty sequence of arrays that agree on every axis but the first.

    Returns:
        A new array holding all inputs in order.

    Raises:
        ValueError: if ``vectors`` is empty.

    Note:
        The previous version started from ``vectors[0][0]`` (one row of the
        first vector), which could not be concatenated with the remaining
        vectors. All vectors are now joined whole, so a single-element input
        returns a copy of that vector rather than its first row.
    """
    # Imported locally: no module-level numpy import is visible in this notebook.
    import numpy as np

    if len(vectors) == 0:
        raise ValueError("get_general_vector() needs at least one vector")
    return np.concatenate(list(vectors), axis=0)

# train uniformed sum vector
def get_uniform_vector(vectors):
    """Return the element-wise sum of all vectors (every vector weighted equally).

    Args:
        vectors: non-empty sequence of vectors that support ``+``.

    Returns:
        The summed vector. With a single input that same object is returned.

    Raises:
        IndexError: if ``vectors`` is empty.
    """
    total = vectors[0]
    for v in vectors[1:]:
        # Out-of-place addition: `total += v` would overwrite the caller's
        # first vector, corrupting data that is reused by later experiments.
        total = total + v
    return total


# train weighted sum vector
def get_weighted_vector(vectors, frequencies):
    """Return the frequency-weighted sum of the vectors.

    Args:
        vectors: non-empty sequence of ``(vector, lexeme, scode)`` tuples.
        frequencies: mapping ``lexeme -> {scode: [sclass, count]}``.

    Returns:
        The sum of every vector scaled by its sense count divided by the
        total count of the first item's lexeme.

    Raises:
        KeyError: if a lexeme or sense code is missing from ``frequencies``
            (the offending sense code and lexeme are printed first).
        ZeroDivisionError: if the total count of the first lexeme is zero.
    """
    whole_freq = sum(freq for _sclass, freq in frequencies[vectors[0][1]].values())

    vector = None
    for v, lex, s in vectors:
        try:
            target_s_freq = frequencies[lex][s][1]
        except KeyError:
            # Re-raise for every item, the first included: swallowing the
            # error there used to surface later as an unrelated NameError.
            print(s, lex)
            raise
        term = (target_s_freq / whole_freq) * v
        vector = term if vector is None else vector + term
    return vector

In [None]:
def experiment(model, mode, vecs, frequencies):
    """Evaluate the per-sense-class classifiers on every lexeme and write a TSV.

    For each lexeme every classifier of ``model`` is queried; classifiers of
    the lexeme's own sense classes should answer 1, all others 0. The score
    is the percentage of classifiers that answered as expected.

    Args:
        model: key into the classifier dictionary stored in
            ``./classifiers_all/general_classifiers.pkl``.
        mode: ``"general"`` (query with every test vector separately),
            ``"uniformed"`` (query with the plain sum of the test vectors) or
            ``"weighted"`` (query with their frequency-weighted sum).
        vecs: sequence of entries that are either ``None`` or indexable as
            ``(test_vector, training_vector, lexeme, scode, sclass)``.
        frequencies: ignored; the table is rebuilt from ``vecs`` as
            ``lexeme -> {scode: [sclass, count]}``.

    Side effects:
        Rewrites ``./results/{model}_{mode}_result.tsv`` after every scored
        lexeme and once more at the end with the average.
    """
    from statistics import mean
    from tqdm import tqdm
    import pickle

    assert mode in ["general", "uniformed", "weighted"]

    # One pass over vecs: collect lexemes, sense frequencies and each lexeme's
    # occurrences, so the main loop does not rescan vecs per lexeme.
    lexs = set()
    frequencies = {}
    occurrences = {}
    for vec in vecs:
        if vec is None:
            continue
        test, lexeme, scode, sclass = vec[0], vec[2], vec[3], vec[4]

        lexs.add(lexeme)
        occurrences.setdefault(lexeme, []).append((test, scode, sclass))
        if sclass is not None:
            entry = frequencies.setdefault(lexeme, {}).setdefault(scode, [sclass, 0])
            entry[1] += 1

    # Loaded once: the file was previously re-read (and its handle leaked)
    # for every single lexeme.
    # NOTE(security): pickle executes arbitrary code on load; only use trusted files.
    with open("./classifiers_all/general_classifiers.pkl", "rb") as f:
        model_clfs = pickle.load(f)[model]

    def _grade(vector, negatives, positives):
        """Return 1/0 marks: negatives must predict 0, positives must predict 1."""
        sheet = [1 if clf.predict(vector)[0] == 0 else 0 for clf in negatives]
        sheet.extend(1 if clf.predict(vector)[0] == 1 else 0 for clf in positives)
        return sheet

    result_path = "./results/{}_{}_result.tsv".format(model, mode)
    strings = ["lexeme\tfrequency of lexeme\tsclasses num\tnum correct mlp\tnum all mlp\tmode\tmodel\tscore\n"]
    scores = []

    for lex in tqdm(lexs):
        if lex not in frequencies:
            continue  # no labelled occurrence for this lexeme
        sclasses = [entry[0] for entry in frequencies[lex].values()]

        abs_freq = 0
        target_vectors = []
        for test, scode, sclass in occurrences[lex]:
            if sclass not in sclasses:
                continue
            abs_freq += 1
            if test is None:
                continue  # counted as an occurrence, but nothing to predict on
            target_vectors.append((test, lex, scode) if mode == "weighted" else test)

        if len(target_vectors) == 0:
            continue

        negatives = [clf for name, clf in model_clfs.items() if name not in sclasses]
        positives = [clf for name, clf in model_clfs.items() if name in sclasses]

        if mode == "general":
            answer_sheet = []
            for vector in target_vectors:
                answer_sheet.extend(_grade(vector, negatives, positives))
        else:
            if mode == "uniformed":
                vector = get_uniform_vector(target_vectors)
            else:
                vector = get_weighted_vector(target_vectors, frequencies)
            answer_sheet = _grade(vector, negatives, positives)

        corrects = sum(answer_sheet)
        score = round((corrects / len(answer_sheet)) * 100, 4)
        scores.append(score)
        strings.append("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(lex, abs_freq, len(sclasses), corrects, len(answer_sheet), mode, model, score))
        # Checkpoint after every lexeme so partial results survive a crash.
        with open(result_path, "w") as f:
            f.write("".join(strings))

    average = round(mean(scores), 4) if scores else float("nan")
    strings.append("\nAverage score: {}\n".format(average))
    with open(result_path, "w") as f:
        f.write("".join(strings))

In [None]:
import pickle

# Load the pickled general vectors plus the frequency and sense-class tables.
# Context managers close each file handle as soon as it has been read.
with open("./vectors/concated_w2v_general_vectors.pkl", "rb") as f:
    w2v_general_vecs = pickle.load(f)
with open("./vectors/concated_ft_general_vectors.pkl", "rb") as f:
    ft_general_vecs = pickle.load(f)
with open("./vectors/concated_bert_general_vectors.pkl", "rb") as f:
    bert_general_vecs = pickle.load(f)
with open("./dictionary/matched_words/frequencies.pkl", "rb") as f:
    frequencies = pickle.load(f)
with open("./dictionary/sense_classes.pkl", "rb") as f:
    sense_classes = pickle.load(f)

In [None]:
# Sense-class detection on the frequency-weighted vector of each lexeme, one run per model.
experiment(model="w2v", mode="weighted", vecs=w2v_general_vecs, frequencies=frequencies)
experiment(model="ft", mode="weighted", vecs=ft_general_vecs, frequencies=frequencies)
experiment(model="bert", mode="weighted", vecs=bert_general_vecs, frequencies=frequencies)

100%|██████████| 6455/6455 [02:03<00:00, 52.10it/s]
100%|██████████| 6455/6455 [02:02<00:00, 52.85it/s]
100%|██████████| 6455/6455 [02:01<00:00, 53.30it/s]


In [None]:
# Sense-class detection on the summed vector of each lexeme, one run per model.
experiment(model="w2v", mode="uniformed", vecs=w2v_general_vecs, frequencies=frequencies)
experiment(model="ft", mode="uniformed", vecs=ft_general_vecs, frequencies=frequencies)
experiment(model="bert", mode="uniformed", vecs=bert_general_vecs, frequencies=frequencies)

100%|██████████| 6455/6455 [01:57<00:00, 54.98it/s]
100%|██████████| 6455/6455 [02:02<00:00, 52.67it/s]
100%|██████████| 6455/6455 [02:07<00:00, 50.61it/s]


In [None]:
# Sense-class detection on every individual test vector, one run per model.
experiment(model="w2v", mode="general", vecs=w2v_general_vecs, frequencies=frequencies)
experiment(model="ft", mode="general", vecs=ft_general_vecs, frequencies=frequencies)
experiment(model="bert", mode="general", vecs=bert_general_vecs, frequencies=frequencies)

100%|██████████| 6455/6455 [02:19<00:00, 46.39it/s]
100%|██████████| 6455/6455 [02:22<00:00, 45.18it/s]
100%|██████████| 6455/6455 [02:20<00:00, 45.91it/s]


In [None]:
def baseline_experiment(vecs):
    """Random-guess baseline for sense-class detection, written to a TSV file.

    Mirrors the scoring of the classifier experiment: for every lexeme one
    answer is produced per sense-class classifier of the ``"w2v"`` model,
    but each answer is a coin flip instead of a prediction.

    Args:
        vecs: sequence of entries that are either ``None`` or indexable as
            ``(test_vector, training_vector, lexeme, scode, sclass)``.

    Side effects:
        Rewrites ``./results/sense_detection_baseline_result.tsv`` after
        every scored lexeme and once more at the end with the average.
    """
    from statistics import mean
    from tqdm.notebook import tqdm
    from random import random
    import pickle

    # One pass over vecs: collect lexemes, sense frequencies and the sense
    # class of each occurrence, so the main loop does not rescan vecs.
    lexs = set()
    frequencies = {}
    occurrence_sclasses = {}
    for vec in vecs:
        if vec is None:
            continue
        lexeme, scode, sclass = vec[2], vec[3], vec[4]

        lexs.add(lexeme)
        occurrence_sclasses.setdefault(lexeme, []).append(sclass)
        if sclass is not None:
            entry = frequencies.setdefault(lexeme, {}).setdefault(scode, [sclass, 0])
            entry[1] += 1

    # Only the class names are needed. Loaded once: the file was previously
    # re-read (and its handle leaked) for every single lexeme.
    # NOTE(security): pickle executes arbitrary code on load; only use trusted files.
    with open("./classifiers_all/general_classifiers.pkl", "rb") as f:
        candidate_sclasses = list(pickle.load(f)["w2v"])

    result_path = "./results/sense_detection_{}_result.tsv".format("baseline")
    strings = ["lexeme\tfrequency of lexeme\tsclasses num\tnum correct mlp\tnum all mlp\tscore\n"]
    scores = []

    for lex in tqdm(lexs):
        if lex not in frequencies:
            continue  # no labelled occurrence for this lexeme
        sclasses = [entry[0] for entry in frequencies[lex].values()]

        abs_freq = sum(1 for sclass in occurrence_sclasses[lex] if sclass in sclasses)

        positives = sum(1 for name in candidate_sclasses if name in sclasses)
        negatives = len(candidate_sclasses) - positives

        answer_sheet = []
        for _ in range(negatives): # the answer should be negative
            answer_sheet.append(1 if int(random() * 2) == 0 else 0)
        for _ in range(positives): # the answer should be positive
            answer_sheet.append(1 if int(random() * 2) == 1 else 0)

        corrects = sum(answer_sheet)
        score = round((corrects / len(answer_sheet)) * 100, 4)
        scores.append(score)
        strings.append("{}\t{}\t{}\t{}\t{}\t{}\n".format(lex, abs_freq, len(sclasses), corrects, len(answer_sheet), score))
        # Checkpoint after every lexeme so partial results survive a crash.
        with open(result_path, "w") as f:
            f.write("".join(strings))

    average = round(mean(scores), 4) if scores else float("nan")
    strings.append("\nAverage score: {}\n".format(average))
    with open(result_path, "w") as f:
        f.write("".join(strings))

import pickle

# Run the random sense-detection baseline on the w2v vectors; the file
# handle is closed as soon as the pickle has been read.
with open("./vectors/concated_w2v_general_vectors.pkl", "rb") as f:
    w2v_general_vecs = pickle.load(f)
baseline_experiment(w2v_general_vecs)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6455.0), HTML(value='')))


