# NKJP VOCABULARY COMPARISON

### IMPORTS, VARIABLES

In [1]:
import requests
from urllib.parse import quote
import conllu
from tqdm import tqdm
import pandas as pd

pd.set_option('display.max_rows', None)

In [2]:
# CoNLL-U file whose word forms are looked up in NKJP (see the PREDICTIONS section)
file_words = '../data/memoirs_10k_corrected.conllu'
# CoNLL-U file whose lemmas are looked up in NKJP (see the PREDICTIONS section)
file_lemmas = '../data/memoirs_3k_corrected.conllu'
# test split of the UD Polish-PDB treebank, used in the STANDARD section
test_file = '../data/ud-treebanks/UD_Polish-PDB/pl_pdb-ud-test.conllu'

In [3]:
from functions import *
from preproc_bert import remove_ranges

In [4]:
# retrieving the NKJP API access username and password from a separate file -
# these have to remain confidential
# (the username is read first and the password second, one value per line)

with open('nkjp_access.txt') as f:
    # blank lines (e.g. a trailing empty line at the end of the file) are skipped
    # so that they do not break the two-value unpacking
    username, password = [line.strip() for line in f if line.strip()]

### FUNCTIONS AND CLASSES

In [5]:
def retrieve_nkjp_info(
    username: str, password: str, query: str, start_year: str = '1945',
    end_year: str = '2023', balanced: str = 'false', offset: str = '0',
    limit: str = '1', slope: str = '0', order: str = 'true',
    group_f: str = '-1', group_l: str = '1', group_r: str = '1000',
    group_rs: str = '0', span_search: str = 'true', any_form: bool = False,
    timeout: float = 60.0
):
    '''A function which retrieves the number of hits for a single query in NKJP.
    
    Args:
        username (str): The username used to log into the NKJP API services.
        password (str): The password used to log into the NKJP API services.
        query (str): The string that is to be queried.
        start_year (str): The start year for the search span.
        end_year (str): The end year for the search span.
        balanced (str): Determines whether the balanced subcorpus should be used or not.
        offset (str): Determines the offset.
        limit (str): Determines the query limit.
        slope (str): Determines the query slope.
        order (str): Determines the order.
        group_f (str): Determines the group field.
        group_l (str): Determines the group limit.
        group_r (str): Determines the group range.
        group_rs (str): Determines the group range start.
        span_search (str): Toggles the span search.
        any_form (bool): Determines if the search should find all of the forms of a given word.
        timeout (float): The number of seconds to wait for the server before giving up on the request.
    
    Returns:
        The value stored under ['summary']['total'] in the JSON response, i.e. the total number of hits
        reported for the query (callers in this notebook convert it with int()).
    
    Raises:
        requests.HTTPError: If the server answers with an error status code.
        requests.Timeout: If the server does not answer within the given timeout.
    '''
    # encoding the query into an URL-friendly format
    query = quote(query)
    # adding '**' to signify that any form of this lemma should be searched for (if so desired);
    # the asterisks are appended after the encoding on purpose, so that they are sent verbatim
    if any_form:
        query = query + '**'
    # the credentials are encoded as well - otherwise characters such as '&', '#' or '+' in the
    # username or the password would corrupt the query string
    user = quote(username, safe='')
    pwd = quote(password, safe='')
    # NOTE(review): the credentials travel in the query string of a plain http:// URL - confirm
    # whether the service can be reached over https
    url = (
        'http://pelcra.clarin-pl.eu/NKJP/api/concordances'
        f'?u={user}&p={pwd}&query={query}'
        f'&dismaxQuery=index_date%3A%5B{start_year}-01-01T23%3A00%3A00Z%20TO%20'
        f'{end_year}-12-31T23%3A00%3A00Z%5D%20'
        f'&balanced={balanced}&offset={offset}&limit={limit}&slope={slope}&order={order}'
        f'&groupField={group_f}&groupLimit={group_l}&groupRange={group_r}'
        f'&groupRangeStart={group_rs}&useSpansSearch={span_search}'
    )
    # retrieving the information from NKJP; without a timeout a single stalled connection
    # would block a whole (multi-hour) batch of queries forever
    r = requests.get(url=url, timeout=timeout)
    # failing loudly on HTTP errors instead of trying to parse an error page as JSON
    r.raise_for_status()
    results = r.json()

    return results['summary']['total']

In [6]:
def get_nkjp_stats(username: str, password: str, word_list: list, any_form: bool = False):
    '''A function which retrieves statistics for a given sentence or list of sentences from NKJP. It reduces the sentences to a 
    list of unique words and then retrieves the counts for each of them.
    
    Args:
        username (str): The username used to log into the NKJP API services.
        password (str): The password used to log into the NKJP API services.
        word_list (list): A list of strings or a list of lists of strings representing a sentence or sentences to be searched.
            Whether the list is nested is decided by looking at its first element only.
        any_form (bool): Determines if the search should find all of the forms of a given word.
    
    Returns:
        A list of tuples of (word, count) sorted alphabetically by the word; an empty list if there is nothing to query.
    '''
    # nothing to query - returning early instead of failing on word_list[0]
    if not word_list:
        return []
    
    # unwrapping the lists
    if isinstance(word_list[0], list):
        word_list = [word for sentence in word_list for word in sentence]
        
    # creating a sorted list of unique words (not to call them double for no reason)
    unique_words = sorted(set(word_list))
    # the sentences may all have been empty
    if not unique_words:
        return []
    
    stats = []
    for word in tqdm(unique_words, desc='Loading word counts...'):
        # one API call per unique word
        count = retrieve_nkjp_info(username, password, word, any_form=any_form)
        stats.append((word, int(count)))
        
    return stats

In [7]:
def get_missing_words(stats: list):
    '''A function which processes the relevant information out of a list of counts of words from NKJP.
    
    Args:
       stats (list[tuple]): A list of tuples of word, count.
    
    Returns:
        A Pandas DataFrame indexed by the query ('Query') with a single 'Count' column, sorted by the count in ascending
        order. Also prints the total number of queries, the number of queries with 0 hits and the % value of the total
        that they constitute along with a list of these queries.
    '''
    # retrieving the tokens that have 0 hits
    zero_hits = [word for word, count in stats if count == 0]
    
    # an empty input would otherwise cause a division by zero
    total = len(stats)
    percentage = len(zero_hits) / total * 100 if total else 0.0
    
    # printing out some stats
    print(f'Out of {total} queries {len(zero_hits)} ({percentage}%) had no hits in NKJP.')
    zero_string = ' '.join(zero_hits)
    print(f'The queries with no hits include: {zero_string}')
    print()
    
    df = pd.DataFrame(stats, columns=['Query', 'Count']).sort_values('Count').set_index('Query')
    
    return df
        

### EXECUTION - STANDARD

In [8]:
# reading the UD test file; extract_conllu_data comes from the project's own modules -
# presumably it yields token/lemma data per sentence (TODO confirm against its definition)
test_tokens_lemmas = extract_conllu_data(test_file, 'lemma', sentences=True, combined=True, fulltext=False)
# the helper's result is unpacked into two parallel collections: tokens and lemmas
test_tokens, test_lemmas = make_tagger_friendly(test_tokens_lemmas)

In [9]:
# one NKJP query per unique lemma of the test set, matching any form of the lemma;
# this is slow - the recorded run below took over four hours
test_lemma_stats = get_nkjp_stats(username, password, test_lemmas, any_form=True)

Loading word counts...: 100%|█████████████████████████████████████████████████████| 7583/7583 [4:18:06<00:00,  2.04s/it]


In [11]:
# summarising the lemma counts: prints the zero-hit queries and builds a frame sorted by count
test_lemma_df = get_missing_words(test_lemma_stats)

Out of 7583 queries 44 (0.5802452855070552%) had no hits in NKJP.
The queries with no hits include: ! ) 19:15 25-procentowy 642-65-85 9-miesięczny Arasyb Bielsko-Biała Bushill-Matthews Collridge Eija-Riitta HA-Il Hawełko III-1 IRSC IRSR Instagram Lunzie McMillan-Scott Minecraft PPE-DE Palmiak Stallarholmen Winfryd-Bonifacy Yeosol [ ajtemik antysubsydyjny bezfabularny bio-obrazowanie ciemku krio-elektronowy lajwik merozoit niekonscjentywny non-profit nudności odmaterializować podwaliny przeciwbiałaczkowy przeciwretrowirusowy tekstilandia trichlorobenzen Ździara



In [12]:
# displaying the lemma counts (display.max_rows is set to None at the top, so no rows are hidden)
test_lemma_df

Unnamed: 0_level_0,Count
Query,Unnamed: 1_level_1
!,0
bio-obrazowanie,0
odmaterializować,0
25-procentowy,0
trichlorobenzen,0
[,0
Stallarholmen,0
ajtemik,0
Lunzie,0
krio-elektronowy,0


In [13]:
# one NKJP query per unique word form of the test set (exact form, any_form left at False);
# this is slow - the recorded run below took almost seven hours
test_word_stats = get_nkjp_stats(username, password, test_tokens)

Loading word counts...: 100%|███████████████████████████████████████████████████| 12601/12601 [6:52:41<00:00,  1.97s/it]


In [15]:
# summarising the word form counts: prints the zero-hit queries and builds a frame sorted by count
test_word_df = get_missing_words(test_word_stats)

Out of 12601 queries 56 (0.44440917387508927%) had no hits in NKJP.
The queries with no hits include: ! " % ' '' ( ) , 19:15 25-procentowy 5-proc 642-65-85 9-miesięczna ; Ajtemików Bushill-Matthewsowi Collridge Eija-Riitta HA-Il III-1 IRSC IRSR Instagramie Kalkilli Lunzie Maggego McMillan-Scott Minecrafcie PPE-DE Palmiak Pirkera Stallarholmen Winfryd-Bonifacy Yeosol [ ] ajtemika bio-obrazowania celekoksybem efawirenzem krio-elektronową lajwika non-profit nukleozydowymi przeciwciałem przeciwretrowirusowego ry(d)zykować sakwinawiru tektilandia trichlorobenzenu tuńczykowymi zięciowskim – — ” „



In [16]:
# displaying the word form counts (display.max_rows is set to None at the top, so no rows are hidden)
test_word_df

Unnamed: 0_level_0,Count
Query,Unnamed: 1_level_1
!,0
Palmiak,0
PPE-DE,0
Minecrafcie,0
McMillan-Scott,0
Maggego,0
Lunzie,0
Kalkilli,0
Instagramie,0
IRSR,0


### EXECUTION - PREDICTIONS

In [8]:
# reading both memoir files with the same settings as the test file above
# NOTE(review): 'lemma' is requested for the words file as well - only the first element
# returned by make_tagger_friendly is kept for it below; confirm this is intended
tokens_lemmas = extract_conllu_data(file_lemmas, 'lemma', sentences=True, combined=True, fulltext=False)
tokens_words = extract_conllu_data(file_words, 'lemma', sentences=True, combined=True, fulltext=False)

# keeping only the lemmas from the lemma file and only the word forms from the word file
_, lemmas = make_tagger_friendly(tokens_lemmas)
words, _ = make_tagger_friendly(tokens_words)

In [10]:
# one NKJP query per unique lemma of the memoirs, matching any form of the lemma
hist_lemma_stats = get_nkjp_stats(username, password, lemmas, any_form=True)

Loading word counts...: 100%|███████████████████████████████████████████████████████| 1213/1213 [45:55<00:00,  2.27s/it]


In [12]:
# one NKJP query per unique word form of the memoirs (exact form, any_form left at False)
hist_word_stats = get_nkjp_stats(username, password, words)

Loading word counts...: 100%|█████████████████████████████████████████████████████| 4302/4302 [2:27:30<00:00,  2.06s/it]


In [13]:
# summarising the lemma counts and saving the resulting table as an Excel sheet
hist_lemma_df = get_missing_words(hist_lemma_stats)
hist_lemma_df.to_excel('../data/mistakes/nkjp_lemmas.xlsx')

Out of 1213 queries 86 (7.089859851607584%) had no hits in NKJP.
The queries with no hits include: ! ) Asińdźka Bludniki Będowszczyazna Bęklowizna Cobary Czołhany Dochorów Dorchów Dłużanin Głowecki Kmińszczyzna Kurypów Lesniowice Muczynowska Nawarya Notiak Pierściorowski Pokasowce Ronantowizna Ruszkowizna Rypnin Rzotoławski Semiginów Siemginów Siemiginów Siemignów Strużewo Stryiskie Swieżaska Szołayska Temerowice Treterówna Treterówną Zebold abbum adlinencja assekuracya całorolny cwansiger daruju domnikalny dotacya dośmierć excentryczność generacya gymnazyum instantacya ioyciec jurysdykcya juryzdyksya kadectwo mandatariat mandataryusz mandatyrusz mojomu mortyfikować mychayłowu niepomiąć nieprzynieść obeymować obeyście ordynarya oycowski pełnłnomocnik przystoyna półgrunt półrolny rarachować separacya spaśne stayermarka submittować sukcessor sukcessorka sukcessya sukcesya szambelanic szyzmatycki treterianum ukochanomu warżyć wdokument świętej pamięci Żółtowizna



In [14]:
# displaying the lemma counts of the memoirs
hist_lemma_df

Unnamed: 0_level_0,Count
Query,Unnamed: 1_level_1
!,0
assekuracya,0
separacya,0
Notiak,0
Nawarya,0
Muczynowska,0
rarachować,0
półrolny,0
półgrunt,0
przystoyna,0


In [15]:
# summarising the word form counts and saving the resulting table as an Excel sheet
hist_word_df = get_missing_words(hist_word_stats)
hist_word_df.to_excel('../data/mistakes/nkjp_words.xlsx')

Out of 4302 queries 346 (8.04277080427708%) had no hits in NKJP.
The queries with no hits include: ! ( ) , Abbum Adelunia Asińdźka Badenianką Bełszowcu Bełzkiem Bienkowskey Blizcey Bludnickey Bludnikach Bludnikami Bludniki Bodzowcu Borkoscy Bołszowcu Bołszowice Bołszowickim Będowszczyazna Bęklowizna Chyrowskiey Cobary Czołhanach Czołhany Dobrrzyńskiej Dochorowie Dominikalnym Domnikalnego Dorchów Dołputowie Dołputów Dziduszyckiego Dłużanie Dźurkowie Floyrana Galecyi Galicyiskiego Golejowskiemi Golejowskimi Gwoźdzu Głoweckiego Głuską Helnkę Horodzyńskiego Inżyniryi Jabłonoscy Jenerałówną Jędrżejowicz Kazimierzostwie Kleofasę Kmińszczyzna Knihinicz Knihiniczach Komornikostwa Komornikowej Kopestyńskich Koropacza Korytyńską Koziobrodzkiego Kołmyiskim Kołomyiskimc Kruszelnicy Kruszelnicę Krzywczas Kunaszowa Kunaszowie Kurypów Kutyszcza Kutyszczach Leboskich Lesniowic Luboscy Maksymowic Mandatariaty Mandataryusz Mandataryusza Mandatyrusza Michałoskich Mohorocie Muczynowską Mychayłowu Nawaryi 

In [16]:
# displaying the word form counts of the memoirs
hist_word_df

Unnamed: 0_level_0,Count
Query,Unnamed: 1_level_1
!,0
Treterowej,0
Treterówną,0
Wincentowey,0
Woyniłowa,0
Woyniłowie,0
Woyniłowskich,0
Treterianum,0
Woyniłów,0
Zabilską,0
