In [None]:
from tqdm import tqdm
import math
import pandas as pd

# Data Loading

In [None]:
def load_bigrams(path, min_count=5):
    """Load bigram counts from a whitespace-separated text file.

    Every non-blank line is expected to consist of exactly three
    whitespace-separated fields: ``<count> <word_1> <word_2>``.

    Args:
        path: Path of the bigram file.
        min_count: Lines whose count is lower than this value are ignored.
            Defaults to 5, the cut-off that used to be hard-coded.

    Returns:
        A ``(bigrams, all_oc)`` tuple. ``bigrams`` maps ``(word_1, word_2)``
        to the summed count of all kept lines for that pair; ``all_oc`` is
        the sum of the counts of all kept lines.

    Raises:
        ValueError: If a non-blank line does not have exactly three fields or
            its first field is not an integer.
    """
    bigrams = {}
    all_oc = 0

    # The context manager closes the file even when a malformed line raises.
    # UTF-8 is requested explicitly so that Polish diacritics do not depend
    # on the locale of the machine running the notebook.
    with open(path, 'r', encoding='utf-8') as file:
        for line in tqdm(file):

            # split() without arguments also discards the trailing newline,
            # so no separate newline stripping is needed.
            fields = line.split()
            if not fields:
                # blank line (e.g. at the end of the file)
                continue

            oc, word_1, word_2 = fields
            oc = int(oc)

            if oc < min_count:
                continue

            all_oc += oc

            key = (word_1, word_2)
            bigrams[key] = bigrams.get(key, 0) + oc

    return bigrams, all_oc

In [None]:
def load_supertags(path):
    """Load a word -> supertag mapping from a text file.

    Every non-blank line is expected to have the form ``<word> <tag>`` with a
    single space between the two fields.

    Args:
        path: Path of the supertag file.

    Returns:
        A dict mapping the lower-cased word to its tag. When a word occurs on
        several lines, the tag from the last such line wins.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            space-separated fields.
    """
    word_to_tag = {}

    # The context manager guarantees the file is closed even if a line fails
    # to parse; UTF-8 is explicit so Polish text does not depend on locale.
    with open(path, 'r', encoding='utf-8') as file:
        for line in tqdm(file):

            # Drop the line terminator up front; this also keeps an empty tag
            # (line ending in a space) from raising an IndexError.
            line = line.rstrip("\n")
            if not line:
                continue

            word, tag = line.split(" ")
            word_to_tag[word.lower()] = tag

    return word_to_tag

In [None]:
def load_unigrams(path):
    """Count word occurrences in a space-tokenised text corpus.

    Each line is lower-cased and split on single spaces; every non-empty
    token counts as one occurrence of that word.

    Args:
        path: Path of the corpus file.

    Returns:
        A ``(word_to_occurrence, all_oc)`` tuple. ``word_to_occurrence`` maps
        each lower-cased token to the number of times it was seen and
        ``all_oc`` is the total number of counted tokens.
    """
    word_to_occurrence = {}
    all_oc = 0

    # The context manager closes the file on every exit path; UTF-8 is
    # explicit so Polish text does not depend on the platform locale.
    with open(path, 'r', encoding='utf-8') as file:
        for line in tqdm(file):

            for word in line.lower().rstrip("\n").split(" "):

                # Consecutive spaces, trailing spaces and blank lines yield
                # empty tokens; they are not words, so they are neither
                # counted nor allowed to trigger an IndexError.
                if not word:
                    continue

                all_oc += 1
                word_to_occurrence[word] = word_to_occurrence.get(word, 0) + 1

    return word_to_occurrence, all_oc

In [None]:
def supertags_stats(unigrams, bigrams, word_to_tag):
    """Project word-level counts onto supertags.

    Args:
        unigrams: Mapping word -> occurrence count.
        bigrams: Mapping (word_1, word_2) -> occurrence count.
        word_to_tag: Mapping word -> supertag.

    Returns:
        A ``(unitags, unitags_all_oc, bitags, bitags_all_oc)`` tuple:
        ``unitags`` maps a tag to the summed count of all unigrams carrying
        it, ``bitags`` maps a ``(tag_1, tag_2)`` pair to the summed count of
        all bigrams whose two words both have a tag, and the ``*_all_oc``
        values are the totals of the counts that were aggregated. Words
        without an entry in ``word_to_tag`` are ignored.
    """
    unitags, bitags = {}, {}
    unitags_all_oc = 0
    bitags_all_oc = 0

    # Unigram pass: fold each tagged word's count into its tag.
    for word, count in unigrams.items():
        if word not in word_to_tag:
            continue

        tag = word_to_tag[word]
        unitags_all_oc += count
        unitags[tag] = unitags.get(tag, 0) + count

    # Bigram pass: only pairs where both words are tagged contribute.
    for pair, count in bigrams.items():
        if pair[0] not in word_to_tag or pair[1] not in word_to_tag:
            continue

        tag_pair = (word_to_tag[pair[0]], word_to_tag[pair[1]])
        bitags_all_oc += count
        bitags[tag_pair] = bitags.get(tag_pair, 0) + count

    return unitags, unitags_all_oc, bitags, bitags_all_oc

In [None]:
# Load the three inputs in turn; each loader shows its own tqdm progress bar.
# Bigram counts and their total:
bigrams, bigrams_all_oc = load_bigrams(
    '/content/poleval_2grams.txt'
)
# Unigram counts and their total:
unigrams, unigrams_all_oc = load_unigrams(
    '/content/polish_corpora.txt'
)
# Word -> supertag dictionary:
word_to_tag = load_supertags(
    '/content/drive/My Drive/Colab Notebooks/NLP/Dane/Copy of supertags.txt'
)

59134224it [01:11, 827856.05it/s]
23011601it [05:20, 71758.68it/s]
1781994it [00:04, 359139.36it/s]


In [None]:
# Aggregate the word-level counts into per-tag and per-tag-pair counts.
(
    unitags,
    unitags_all_oc,
    bitags,
    bitags_all_oc,
) = supertags_stats(unigrams, bigrams, word_to_tag)

# Collocation testing

In [None]:
def pmi(word_1, word_2):
    """Pointwise mutual information of the bigram ``(word_1, word_2)``.

    Reads the module-level ``bigrams``, ``bigrams_all_oc``, ``unigrams`` and
    ``unigrams_all_oc``.

    Returns:
        ``log(p(w1, w2) / (p(w1) * p(w2)))`` with the probabilities estimated
        as relative frequencies, or ``0`` when either word is missing from
        ``unigrams``.

    Raises:
        KeyError: If the pair is not a key of ``bigrams``.
    """
    # Looked up first so that an unknown bigram always raises.
    pair_count = bigrams[(word_1, word_2)]

    if word_1 not in unigrams or word_2 not in unigrams:
        return 0

    p_pair = pair_count / bigrams_all_oc
    p_first = unigrams[word_1] / unigrams_all_oc
    p_second = unigrams[word_2] / unigrams_all_oc

    return math.log(p_pair / (p_first * p_second))

In [None]:
def psm(word_1, word_2):
    """Poisson-Stirling measure of the bigram ``(word_1, word_2)``.

    Computes ``f_xy * (log(f_xy) - log(fhat_xy) - 1)`` where ``f_xy`` is the
    observed bigram count and ``fhat_xy`` is the count expected if the two
    words were independent, estimated here as
    ``unigrams[word_1] * unigrams[word_2] / unigrams_all_oc``.

    Reads the module-level ``bigrams``, ``unigrams`` and ``unigrams_all_oc``.

    Returns:
        The score, or ``0`` when either word is missing from ``unigrams``.

    Raises:
        KeyError: If the pair is not a key of ``bigrams``.
    """
    oc = bigrams[(word_1, word_2)]

    res = 0
    if word_1 in unigrams and word_2 in unigrams:
        f_xy = oc
        fhat_xy = unigrams[word_1] * unigrams[word_2] / unigrams_all_oc
        # The expected frequency enters through its logarithm; subtracting
        # the raw value (as was done before) is not the Poisson-Stirling
        # formula and over-penalises pairs of frequent words.
        res = f_xy * (math.log(f_xy) - math.log(fhat_xy) - 1)

    return res

In [None]:
def t_student(word_1, word_2):
    """t-score style association of the bigram ``(word_1, word_2)``.

    Reads the module-level ``bigrams``, ``bigrams_all_oc``, ``unigrams`` and
    ``unigrams_all_oc``.

    Returns:
        ``(p_xy - p_x * p_y) / sqrt(p_xy)`` with relative-frequency estimates
        of the probabilities, or ``0`` when either word is missing from
        ``unigrams``.

    Raises:
        KeyError: If the pair is not a key of ``bigrams``.
    """
    # Looked up first so that an unknown bigram always raises.
    pair_count = bigrams[(word_1, word_2)]

    if not (word_1 in unigrams and word_2 in unigrams):
        return 0

    observed = pair_count / bigrams_all_oc
    # Probability of the pair under independence of the two words.
    expected = (unigrams[word_1] / unigrams_all_oc) * (unigrams[word_2] / unigrams_all_oc)

    return (observed - expected) / math.sqrt(observed)

In [None]:
def pmiwords_and_pmitags(word_1, word_2):
    """Word-level PMI plus the non-negative part of the supertag-level PMI.

    Reads the module-level ``word_to_tag``, ``bitags``, ``bitags_all_oc``,
    ``unitags`` and ``unitags_all_oc`` and calls ``pmi`` for the word part.

    Returns:
        ``pmi(word_1, word_2) + max(0, tag_pmi)``. ``tag_pmi`` is the PMI of
        the two words' tags; it is taken as 0 when either word has no tag.
    """
    pmi_words = pmi(word_1, word_2)

    tag_bonus = 0
    if word_1 in word_to_tag and word_2 in word_to_tag:
        tag_1, tag_2 = word_to_tag[word_1], word_to_tag[word_2]

        p_tag_pair = bitags[(tag_1, tag_2)] / bitags_all_oc
        p_independent = (unitags[tag_1] / unitags_all_oc) * (unitags[tag_2] / unitags_all_oc)
        ratio = p_tag_pair / p_independent

        # log(0) is undefined, so a zero ratio is passed through unchanged.
        tag_pmi = math.log(ratio) if ratio != 0 else ratio
        # A negative tag PMI must not lower the word-level score.
        tag_bonus = max(0, tag_pmi)

    return pmi_words + tag_bonus

In [None]:
def best_col(f, word):
    """Rank every bigram that contains ``word`` by the score function ``f``.

    Args:
        f: Callable taking ``(word_1, word_2)`` and returning a sortable score.
        word: The word that has to appear in the first or second position.

    Returns:
        A list of ``((word_1, word_2), score)`` tuples built from the keys of
        the module-level ``bigrams``, sorted by score in descending order.
    """
    # Collect the matching pairs first, then score them.
    matching = [(left, right) for left, right in bigrams if word in (left, right)]
    scored = [(pair, f(pair[0], pair[1])) for pair in matching]

    return sorted(scored, key=lambda item: item[1], reverse=True)

In [None]:
words = ['koń', 'ojciec', 'dom', 'poduszka', 'czerwony', 'pisać', 'koszula', 'kot', 'kobieta', 'pies']
pd.set_option("display.max_rows", None, "display.max_columns", None)

# Number of best-scoring collocations shown per measure.
TOP_N = 7

# best_col yields (pair, score) tuples, so within each measure the 'Pair'
# column has to come first and the 'Points' (score) column second; the labels
# used to be the other way round, which put pairs under 'Points'.
# The headers do not depend on the word, so they are built once.
multicol_pmi = pd.MultiIndex.from_tuples([('PMI', 'Pair'), ('PMI', 'Points')])
multicol_psm = pd.MultiIndex.from_tuples([('PSM', 'Pair'), ('PSM', 'Points')])
multicol_pmitags = pd.MultiIndex.from_tuples([('PMITAGS', 'Pair'), ('PMITAGS', 'Points')])
multicol_t_student = pd.MultiIndex.from_tuples([('T_student', 'Pair'), ('T_student', 'Points')])

for word in words:

    # Top collocations of `word` according to each association measure.
    best_pmi = best_col(pmi, word)[:TOP_N]
    best_psm = best_col(psm, word)[:TOP_N]
    best_t_student = best_col(t_student, word)[:TOP_N]
    best_pmitags = best_col(pmiwords_and_pmitags, word)[:TOP_N]

    print('------------------------------------------ {} ------------------------------------------'.format(word))
    # One table per word with the four rankings side by side.
    display(pd.concat([pd.DataFrame(best_pmi, columns=multicol_pmi), pd.DataFrame(best_psm, columns=multicol_psm),
                     pd.DataFrame(best_pmitags, columns=multicol_pmitags), pd.DataFrame(best_t_student, columns=multicol_t_student)], axis = 1))


------------------------------------------ koń ------------------------------------------


Unnamed: 0_level_0,PMI,PMI,PSM,PSM,PMITAGS,PMITAGS,T_student,T_student
Unnamed: 0_level_1,Points,Pair,Points,Pair,Points,Pair,Points,Pair
0,"(koń, gorącokrwisty)",12.277124,"(koń, trojański)",122.489501,"(koń, gorącokrwisty)",14.452644,"(jak, koń)",0.00042
1,"(koń, wierzchowy)",12.094802,"(czarny, koń)",87.814574,"(koń, wierzchowy)",14.270323,"(koń, ,)",0.000378
2,"(koń, trojański)",11.890008,"(koń, pociągowy)",78.895153,"(koń, trojański)",14.065528,"(że, koń)",0.000345
3,"(koń, pociągowy)",11.822868,"(szalony, koń)",52.190982,"(koń, pociągowy)",13.998389,"(koń, trojański)",0.00033
4,"(koń, zaprzęgowy)",11.283872,"(biały, koń)",45.030526,"(koń, zaprzęgowy)",13.459393,"(koń, jest)",0.000316
5,"(koń, zimnokrwisty)",11.283872,"(koń, ca)",39.697251,"(koń, zimnokrwisty)",13.459393,"(czarny, koń)",0.000294
6,"(koń, przewalskiego)",10.667686,"(koń, cu)",33.98385,"(koń, przewalskiego)",10.667686,"(koń, nie)",0.000282


------------------------------------------ ojciec ------------------------------------------


Unnamed: 0_level_0,PMI,PMI,PSM,PSM,PMITAGS,PMITAGS,T_student,T_student
Unnamed: 0_level_1,Points,Pair,Points,Pair,Points,Pair,Points,Pair
0,"(ojciec, chrzestny)",9.792649,"(ojciec, święty)",3665.575348,"(ojciec, lubecki)",12.078753,"(jego, ojciec)",0.00394
1,"(ojciec, rutilio)",9.718055,"(mój, ojciec)",1547.017275,"(ojciec, chrzestny)",11.29924,"(ojciec, był)",0.00218
2,"(ojciec, soborowy)",8.910963,"(ojciec, chrzestny)",915.462059,"(ojciec, święty)",11.228311,"(jej, ojciec)",0.002178
3,"(przybrany, ojciec)",8.611537,"(ojciec, zmarł)",896.489159,"(ojciec, medard)",11.163,"(ojciec, ,)",0.0015
4,"(ojciec, duchesne)",8.514082,"(bóg, ojciec)",627.760588,"(ojciec, soborowy)",10.417555,"(ojciec, święty)",0.001316
5,"(ojciec, goriot)",8.268959,"(ojciec, pracował)",485.139847,"(ojciec, honoriusz)",10.216792,"(mój, ojciec)",0.001064
6,"(ojciec, laurenty)",8.218618,"(ojciec, zginął)",429.526571,"(ojciec, meissner)",9.864775,"(a, ojciec)",0.00097


------------------------------------------ dom ------------------------------------------


Unnamed: 0_level_0,PMI,PMI,PSM,PSM,PMITAGS,PMITAGS,T_student,T_student
Unnamed: 0_level_1,Points,Pair,Points,Pair,Points,Pair,Points,Pair
0,"(dom, mintoff)",9.509237,"(dom, jednorodzinny)",4665.158365,"(dom, przedpogrzebowy)",12.019815,"(dom, kultury)",0.001828
1,"(dom, mansjonarski)",9.509237,"(dom, mieszkalny)",3042.070891,"(dom, maklerski)",11.949349,"(dom, pomocy)",0.001704
2,"(dom, sathanas)",9.509237,"(dom, wolnostojący)",2495.334215,"(dom, zajezdny)",11.941312,"(dom, jednorodzinny)",0.001432
3,"(dom, przedpogrzebowy)",9.49243,"(rodzinny, dom)",2004.821627,"(dom, rekolekcyjny)",11.805293,"(:, dom)",0.001281
4,"(dom, maklerski)",9.421964,"(dom, wydawniczy)",1644.539944,"(dom, misjonarski)",11.785308,"(dom, dziecka)",0.001236
5,"(dom, zajezdny)",9.413927,"(dom, maklerski)",1505.184433,"(dom, jednorodzinny)",11.736651,"(dom, mieszkalny)",0.001214
6,"(dom, sportova)",9.375706,"(dom, rodzinny)",1460.764374,"(dom, wolnostojący)",11.595992,"(dom, ,)",0.001135


------------------------------------------ poduszka ------------------------------------------


Unnamed: 0_level_0,PMI,PMI,PSM,PSM,PMITAGS,PMITAGS,T_student,T_student
Unnamed: 0_level_1,Points,Pair,Points,Pair,Points,Pair,Points,Pair
0,"(poduszka, przeciwodleżynowa)",13.168111,"(poduszka, powietrzna)",626.570939,"(poduszka, przeciwodleżynowa)",16.110733,"(poduszka, powietrzna)",0.000619
1,"(poduszka, frejki)",12.803468,"(poduszka, silnika)",28.017004,"(poduszka, powietrzna)",15.092179,"(,, poduszka)",0.000253
2,"(poduszka, powietrzna)",12.031106,"(poduszka, pod)",23.963334,"(poduszka, ortopedyczna)",14.713212,"(:, poduszka)",0.000216
3,"(poduszka, ortopedyczna)",11.652139,"(poduszka, ortopedyczna)",20.342698,"(poduszka, puchowa)",14.283274,"(poduszka, pod)",0.000215
4,"(poduszka, luxus)",11.398825,"(poduszka, dekoracyjna)",20.337945,"(poduszka, dekoracyjna)",13.354317,"(poduszka, silnika)",0.000199
5,"(pasazera, poduszka)",11.337131,"(poduszka, szara)",20.303695,"(poduszka, frejki)",12.803468,"(-, poduszka)",0.000198
6,"(poduszka, puchowa)",11.222201,"(poduszka, pow.)",20.100797,"(poduszka, szara)",11.505206,"(poduszka, ortopedyczna)",0.000179


------------------------------------------ czerwony ------------------------------------------


Unnamed: 0_level_0,PMI,PMI,PSM,PSM,PMITAGS,PMITAGS,T_student,T_student
Unnamed: 0_level_1,Points,Pair,Points,Pair,Points,Pair,Points,Pair
0,"(czerwony, spągowiec)",11.161286,"(czerwony, krzyż)",2898.161343,"(czerwony, kapturek)",13.281529,"(czerwony, krzyż)",0.00118
1,"(czerwony, kapturek)",10.739073,"(kolor, czerwony)",1655.895581,"(bniec, czerwony)",13.245054,"(kolor, czerwony)",0.000938
2,"(bniec, czerwony)",10.717669,"(czerwony, szlak)",1620.402855,"(prądnik, czerwony)",12.838557,"(czerwony, ,)",0.000926
3,"(chotel, czerwony)",10.606444,"(czerwony, kolor)",694.662381,"(buławnik, czerwony)",12.443455,"(czerwony, szlak)",0.000923
4,"(prądnik, czerwony)",10.311173,"(czerwony, kapturek)",478.160536,"(muchomor, czerwony)",11.840822,"(polski, czerwony)",0.000794
5,"(buławnik, czerwony)",9.91607,"(przechodzi, czerwony)",325.242991,"(czerwony, kartonik)",11.786432,"(:, czerwony)",0.000766
6,"(połuorzeł, czerwony)",9.893752,"(czerwony, karzeł)",256.960501,"(czerwony, spaw)",11.66686,"(,, czerwony)",0.00067


------------------------------------------ pisać ------------------------------------------


Unnamed: 0_level_0,PMI,PMI,PSM,PSM,PMITAGS,PMITAGS,T_student,T_student
Unnamed: 0_level_1,Points,Pair,Points,Pair,Points,Pair,Points,Pair
0,"(zaczniecie, pisać)",7.710349,"(zaczął, pisać)",2207.315434,"(przestań, pisać)",11.171312,"(pisać, o)",0.001177
1,"(przestań, pisać)",7.644803,"(zaczęła, pisać)",654.222522,"(potrafisz, pisać)",9.155096,"(zaczął, pisać)",0.001075
2,"(pisać, głupot)",7.294188,"(pisać, wiersze)",366.094454,"(powinnaś, pisać)",9.014559,"(pisać, ,)",0.000838
3,"(pisać, wiersze)",7.266683,"(będę, pisać)",215.8366,"(przestać, pisać)",8.598764,"(i, pisać)",0.000742
4,"(zacznijmy, pisać)",7.186702,"(byłoby, pisać)",184.946308,"(będziecie, pisać)",8.465185,"(można, pisać)",0.000666
5,"(zaczęłam, pisać)",7.155827,"(czym, pisać)",166.1177,"(lubi, pisać)",8.373549,"(zaczęła, pisać)",0.000646
6,"(pisać, brednie)",7.151655,"(pisać, teksty)",157.403952,"(zamiast, pisać)",8.340988,"(pisać, .)",0.000645


------------------------------------------ koszula ------------------------------------------


Unnamed: 0_level_0,PMI,PMI,PSM,PSM,PMITAGS,PMITAGS,T_student,T_student
Unnamed: 0_level_1,Points,Pair,Points,Pair,Points,Pair,Points,Pair
0,"(koszula, flanelowa)",12.258266,"(biała, koszula)",318.267948,"(koszula, flanelowa)",15.319339,"(biała, koszula)",0.000474
1,"(koszula, ciału)",10.105768,"(koszula, męska)",126.170145,"(koszula, nocna)",12.915744,"(koszula, z)",0.000408
2,"(koszula, nocna)",9.854671,"(koszula, nocna)",68.602223,"(koszula, męska)",12.828568,"(koszula, męska)",0.000334
3,"(bawełniana, koszula)",9.773359,"(koszula, damska)",33.980554,"(koszula, oficerska)",12.8105,"(koszula, nocna)",0.000268
4,"(koszula, męska)",9.767494,"(męska, koszula)",33.974825,"(bawełniana, koszula)",12.683736,"(koszula, ,)",0.000255
5,"(koszula, oficerska)",9.749427,"(koszula, oficerska)",31.145784,"(bliższa, koszula)",12.462819,"(koszula, -)",0.000213
6,"(ciału, koszula)",9.486728,"(elegancka, koszula)",31.131843,"(elegancka, koszula)",12.105996,"(koszula, damska)",0.000211


------------------------------------------ kot ------------------------------------------


Unnamed: 0_level_0,PMI,PMI,PSM,PSM,PMITAGS,PMITAGS,T_student,T_student
Unnamed: 0_level_1,Points,Pair,Points,Pair,Points,Pair,Points,Pair
0,"(kot, napłakał)",11.806221,"(maciej, kot)",701.815382,"(kot, napłakał)",13.577696,"(kot, ,)",0.000725
1,"(gadający, kot)",10.417657,"(czarny, kot)",321.696893,"(kot, bengalski)",11.105265,"(maciej, kot)",0.000652
2,"(kot, szczekał)",9.893586,"(rasy, kot)",163.233366,"(dziki, kot)",11.043098,"(czarny, kot)",0.000479
3,"(kot, filemon)",9.653051,"(mój, kot)",148.688682,"(kot, syberyjski)",10.682547,"(jak, kot)",0.000464
4,"(kot, bengalski)",9.112332,"(kot, napłakał)",141.679178,"(kot, sylwester)",10.530208,"(kot, ()",0.000382
5,"(kot, prot)",9.108324,"(tomasz, kot)",99.493961,"(gadający, kot)",10.417657,"(rasy, kot)",0.000369
6,"(kot, syberyjski)",8.689613,"(twój, kot)",93.644808,"(kot, szczekał)",9.893586,"(mój, kot)",0.000364


------------------------------------------ kobieta ------------------------------------------


Unnamed: 0_level_0,PMI,PMI,PSM,PSM,PMITAGS,PMITAGS,T_student,T_student
Unnamed: 0_level_1,Points,Pair,Points,Pair,Points,Pair,Points,Pair
0,"(kobieta, 65-letnia)",9.986363,"(każda, kobieta)",2837.072662,"(kobieta, spodziewająca)",11.941959,"(kobieta, ,)",0.001537
1,"(54-letnia, kobieta)",9.649891,"(młoda, kobieta)",1673.88638,"(kobieta, ciężarna)",11.829309,"(że, kobieta)",0.001296
2,"(sposora, kobieta)",9.620904,"(pierwsza, kobieta)",1174.354396,"(kobieta, karmiąca)",11.370555,"(każda, kobieta)",0.001273
3,"(ciezarna, kobieta)",9.46757,"(jedna, kobieta)",505.229969,"(przedsiębiorcza, kobieta)",11.349177,"(pierwsza, kobieta)",0.000948
4,"(38-letnia, kobieta)",9.387527,"(piękna, kobieta)",349.250569,"(czarnoskóra, kobieta)",11.161551,"(młoda, kobieta)",0.000938
5,"(50-letnia, kobieta)",9.179887,"(starsza, kobieta)",311.492062,"(demoniczna, kobieta)",11.088451,"(kobieta, nie)",0.000926
6,"(zamozna, kobieta)",9.179887,"(kobieta, powinna)",309.295204,"(ciężarna, kobieta)",11.006518,"(kobieta, ma)",0.000851


------------------------------------------ pies ------------------------------------------


Unnamed: 0_level_0,PMI,PMI,PSM,PSM,PMITAGS,PMITAGS,T_student,T_student
Unnamed: 0_level_1,Points,Pair,Points,Pair,Points,Pair,Points,Pair
0,"(pies, descalzos)",11.374109,"(mój, pies)",366.559152,"(dorosły, pies)",12.763019,"(pies, jest)",0.000673
1,"(berneński, pies)",10.712711,"(twój, pies)",298.180663,"(pies, pasterski)",12.158348,"(że, pies)",0.000667
2,"(pies, stróżujący)",10.589154,"(pies, pogrzebany)",257.103169,"(berneński, pies)",12.10457,"(jak, pies)",0.000658
3,"(bezpański, pies)",10.457818,"(pies, rasy)",149.917169,"(bezpański, pies)",11.849678,"(to, pies)",0.000637
4,"(pies, asystujący)",10.433126,"(pies, myśliwski)",148.900216,"(pies, kanaryjski)",11.521482,"(pies, nie)",0.000573
5,"(pies, huckleberry)",10.121346,"(pies, pasterski)",145.47208,"(pies, stróżujący)",11.489447,"(pies, ,)",0.000572
6,"(pies, tropiący)",9.987815,"(każdy, pies)",129.847856,"(pies, descalzos)",11.374109,"(mój, pies)",0.000522
