In [4]:
import xml.etree.ElementTree as ET
import os
from tqdm import tqdm
import re
import pandas as pd

In [5]:
gram_mapper = {
'subst':  'rzecz',
'depr':   'rzecz',
'num':    'rzecz',
'numcol': 'rzecz',
'adj':    'przym',
'adja':   'przym',
'adjp':   'przym',
'adjc':   'przym',
'adv':    'przys',
'ppron12':'rzecz',
'ppron3': 'rzecz',
'siebie': 'rzecz',
'fin':    'czas',
'bedzie': 'czas',
'aglt':   'czas',
'praet':  'czas',
'impt':   'czas',
'imps':   'czas',
'inf':    'czas',
'pcon':   'czas',
'pant':   'czas',
'ger':    'czas',
'pact':   'czas',
'ppas':   'czas',
'winien': '?',
'pred':   '?',
'prep':   '?',
'conj':   '?',
'comp':   '?',
'qub':    '?',
'brev':   '?',
'burk':   '?',
'interj': '?',
'interp': '?',
'xxx':    '?',
'ign':    '?',
}

In [67]:
def addToWordsDf(xml_path, wordsDf):
    tree = ET.parse(xml_path)
    root = tree.getroot()
    for tok in root.iter('tok'):
        for lex in tok.iter('lex'):
            if "disamb" in lex.attrib and lex.attrib['disamb']=="1":
                base = lex.find('base').text
                ctag = lex.find('ctag').text
                partOfSpeach = gram_mapper[ctag.split(":")[0]]
                if wordsDf[wordsDf['base'] == base].empty:
                    record = pd.DataFrame(data={"base": [base], "parts_of_speech": [partOfSpeach], "n_occurrences": [1]})
                    wordsDf = pd.concat([wordsDf, record], ignore_index=True)
                else:
                    values = wordsDf.loc[wordsDf['base'] == base, ["n_occurrences"]]
                    wordsDf.loc[wordsDf['base'] == base, ["n_occurrences"]] = values + 1
    return wordsDf

In [77]:
def createWordsDf(xmls_train_dir, save=None):
    wordsDf = pd.DataFrame(columns=["base", "parts_of_speech", "n_occurrences"])
    for fileName in tqdm(os.listdir(xmls_train_dir)):
        wordsDf = addToWordsDf(xmls_train_dir+fileName, wordsDf)
        if save is not None:
            wordsDf.to_csv(save)

In [69]:
def getMostFreq(wordsDf, partOfSpeach, vec_size):
    return wordsDf[wordsDf["parts_of_speech"] == partOfSpeach].sort_values(by=['n_occurrences'], ascending=False).head(vec_size)

In [83]:
def createDataset(xmls_dir, mostFreq):
    X, y = [], []
    for train_xml in tqdm(os.listdir(xmls_dir)):
        x_sample, y_sample = getSample(xmls_dir + train_xml, mostFreq)
        X.append(x_sample)
        y.append(y_sample)
    return X, y

def getSample(xml_file_path, mostFreq):
    record_bases = getTextBases(xml_file_path)
    x = []
    fileName = os.path.basename(xml_file_path)
    fileName = os.path.splitext(fileName)[0]
    y = re.findall(r'[a-zA-Z\-]+', fileName)[0]
    for base in mostFreq:
        x.append(record_bases.count(base))
    return x, y

def getTextBases(xml_path):
    tree = ET.parse(xml_path)
    root = tree.getroot()
    bases = []
    for tok in root.iter('tok'):
        for lex in tok.iter('lex'):
            if "disamb" in lex.attrib and lex.attrib['disamb']=="1":
                base = lex.find('base').text
                bases.append(base)
    return bases

In [71]:
def slenderize(old_X, n_col):
    X = []
    for row in old_X:
        X.append(old_X[:n_col])
    return X

In [86]:
xmls_train_dir = "./zad4_xmls/wiki_train_34_categories_results_wcrft2/"
xmls_test_dir = "./zad4_xmls/wiki_test_34_categories_results_wcrft2/"
wordsDf = createWordsDf(xmls_train_dir)

In [80]:
wordsDf = pd.read_csv("wordsDf.csv")

In [84]:
mostFreq = list(getMostFreq(wordsDf, "rzecz", 1000)['base'])
X_train_1000, y_train = createDataset(xmls_train_dir, mostFreq)
X_test_1000, y_test = createDataset(xmls_test_dir, mostFreq)

100%|██████████| 6889/6889 [02:03<00:00, 55.71it/s] 
100%|██████████| 2957/2957 [00:59<00:00, 49.67it/s]


In [92]:
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB
gnb = GaussianNB()
gnb.fit(X_train_1000, y_train)
y_pred = clf.predict(X_test_1000)
f1_score(y_test, y_pred, average=None)

array([0.56684492, 0.64454976, 0.31481481, 0.55319149, 0.32116788,
       0.35555556, 0.53744493, 0.5106383 , 0.24836601, 0.39800995,
       0.72277228, 0.49746193, 0.28      , 0.60714286, 0.51666667,
       0.4260355 , 0.70857143, 0.58385093, 0.67080745, 0.47572816,
       0.50314465, 0.23529412, 0.69005848, 0.62857143, 0.31521739,
       0.7032967 , 0.7388535 , 0.66666667, 0.69565217, 0.63101604,
       0.73366834, 0.72941176, 0.35294118, 0.14457831, 0.        ,
       0.        , 1.        , 1.        , 1.        ])

In [93]:
mnb = MultinomialNB()
mnb.fit(X_train_1000, y_train)
y_pred = clf.predict(X_test_1000)
f1_score(y_test, y_pred, average=None)

array([0.56684492, 0.64454976, 0.31481481, 0.55319149, 0.32116788,
       0.35555556, 0.53744493, 0.5106383 , 0.24836601, 0.39800995,
       0.72277228, 0.49746193, 0.28      , 0.60714286, 0.51666667,
       0.4260355 , 0.70857143, 0.58385093, 0.67080745, 0.47572816,
       0.50314465, 0.23529412, 0.69005848, 0.62857143, 0.31521739,
       0.7032967 , 0.7388535 , 0.66666667, 0.69565217, 0.63101604,
       0.73366834, 0.72941176, 0.35294118, 0.14457831, 0.        ,
       0.        , 1.        , 1.        , 1.        ])