In [1]:
import csv
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing

In [2]:
def conllu_to_df(from_file, to_file):
    data = []
    with open(from_file, encoding='utf-8') as rdata:
        lines = rdata.read().split('\n')
        for line in lines:
            strok = line.split('\t')
            if len(strok) == 1:
                continue
            data.append(strok)

    with open(to_file, "w", encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerows(data)
    
    df = pd.read_csv(to_file, names = 
                 ["ID", "TOKEN", "LEM", "POS", "POSABBR", "FEATS", "HEAD", "DEPREL", "DEPRELS", "MISC"])

    df[["ID", "TOKEN", "LEM", "POS", "POSABBR", "FEATS", "HEAD", "DEPREL", "DEPRELS", "MISC"]] = df[["ID", "TOKEN", "LEM", "POS", "POSABBR", "FEATS", "HEAD", "DEPREL", "DEPRELS", "MISC"]].astype(str)
    return df

In [3]:
traindf = conllu_to_df('fipb-ud-train.conllu', 'traindata.csv')
devdf = conllu_to_df('fipb-ud-dev.conllu', 'devdata.csv')
testdf = conllu_to_df('fipb-ud-test.conllu', 'testdata.csv')

In [4]:
traindf.head()

Unnamed: 0,ID,TOKEN,LEM,POS,POSABBR,FEATS,HEAD,DEPREL,DEPRELS,MISC
0,1,Kävelyreitti,kävely#reitti,NOUN,N,Case=Nom|Number=Sing,0,root,_,_
1,2,III,III,ADJ,Num,NumType=Ord,1,nummod,_,_
2,1,Jäällä,jää,NOUN,N,Case=Ade|Number=Sing,2,nmod,_,_
3,2,kävely,kävely,NOUN,N,Case=Nom|Number=Sing,3,nsubj,3:PBArg_0,_
4,3,avaa,avata,VERB,V,Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbF...,0,root,_,PBSENSE=avata.1


In [5]:
devdf.head()

Unnamed: 0,ID,TOKEN,LEM,POS,POSABBR,FEATS,HEAD,DEPREL,DEPRELS,MISC
0,1,The,The,PROPN,N,_,0,root,_,_
1,2,Garden,Garden,PROPN,N,_,1,name,_,_
2,3,Collection,Collection,PROPN,N,_,1,name,_,_
3,4,by,by,PROPN,N,_,1,name,_,_
4,5,H&M,H&M,PROPN,N,Abbr=Yes|Case=Nom|Number=Sing,1,name,_,_


In [6]:
testdf.head()

Unnamed: 0,ID,TOKEN,LEM,POS,POSABBR,FEATS,HEAD,DEPREL,DEPRELS,MISC
0,1,Pelkkää,pelkkä,ADJ,A,Case=Par|Degree=Pos|Number=Sing,2,amod,_,_
1,2,tyhjyyttä,tyhjyys,NOUN,N,Case=Par|Number=Sing,0,root,_,_
2,1,Kävin,käydä,VERB,V,Mood=Ind|Number=Sing|Person=1|Tense=Past|VerbF...,0,root,_,PBSENSE=käydä.9
3,2,tänään,tänään,ADV,Adv,_,1,advmod,1:PBArgM_tmp,_
4,3,katsomassa,katsoa,VERB,V,Case=Ine|InfForm=3|Number=Sing|VerbForm=Inf|Vo...,1,xcomp,1:PBArg_1,PBSENSE=katsoa.1


In [7]:
generaldf = pd.concat([traindf, devdf, testdf])

In [8]:
id_le = preprocessing.LabelEncoder()
lem_le = preprocessing.LabelEncoder()
posabbr_le = preprocessing.LabelEncoder()
feats_le = preprocessing.LabelEncoder()
head_le = preprocessing.LabelEncoder()
deprel_le = preprocessing.LabelEncoder()

deprels_le = preprocessing.LabelEncoder()

id_cod = id_le.fit_transform(generaldf["ID"])
lem_cod = lem_le.fit_transform(generaldf["LEM"])
posabbr_cod = posabbr_le.fit_transform(generaldf["POSABBR"])
feats_cod = feats_le.fit_transform(generaldf["FEATS"])
head_cod = head_le.fit_transform(generaldf["HEAD"])
deprel_cod = deprel_le.fit_transform(generaldf["DEPREL"])

deprels_cod = deprels_le.fit_transform(generaldf["DEPRELS"])

Model training stage

In [9]:
train_id = pd.DataFrame(id_le.transform(traindf["ID"]), columns=["ID"])
train_lem = pd.DataFrame(lem_le.transform(traindf["LEM"]), columns=["LEM"])
train_posabbr = pd.DataFrame(posabbr_le.transform(traindf["POSABBR"]), columns=["POSABBR"])
train_feats = pd.DataFrame(feats_le.transform(traindf["FEATS"]), columns=["FEATS"])
train_head = pd.DataFrame(head_le.transform(traindf["HEAD"]), columns=["HEAD"])
train_deprel = pd.DataFrame(deprel_le.transform(traindf["DEPREL"]), columns=["DEPREL"])

train_deprels = pd.DataFrame(deprels_le.transform(traindf["DEPRELS"]), columns=["DEPRELS"])

In [10]:
pre_features = pd.concat([train_id, train_lem, train_posabbr, train_feats, train_head, train_deprel], axis=1)

features = pre_features.values
target = train_deprels.values

In [11]:
clf = DecisionTreeClassifier()
clf.fit(features, target)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [12]:
print(clf.feature_importances_)

[ 0.11911812  0.29879778  0.04941744  0.10427211  0.17083229  0.25756227]


Development stage

In [13]:
dev_id = pd.DataFrame(id_le.transform(devdf["ID"]), columns=["ID"])
dev_lem = pd.DataFrame(lem_le.transform(devdf["LEM"]), columns=["LEM"])
dev_posabbr = pd.DataFrame(posabbr_le.transform(devdf["POSABBR"]), columns=["POSABBR"])
dev_feats = pd.DataFrame(feats_le.transform(devdf["FEATS"]), columns=["FEATS"])
dev_head = pd.DataFrame(head_le.transform(devdf["HEAD"]), columns=["HEAD"])
dev_deprel = pd.DataFrame(deprel_le.transform(devdf["DEPREL"]), columns=["DEPREL"])

dev_deprels = pd.DataFrame(deprels_le.transform(devdf["DEPRELS"]), columns=["DEPRELS"])

In [14]:
pre_dev_features = pd.concat([dev_id, dev_lem, dev_posabbr, dev_feats, dev_head, dev_deprel], axis=1)

dev_features = pre_dev_features.values
dev_target = dev_deprels.values

In [15]:
dev_scores = clf.score(dev_features, dev_target)
dev_scores

0.68409562274860825

<ul>
<li> id, lem, feats = 0.50900556707782996 </li>
<li> id, pos, lem, head, feats, deprel =  0.68682458246916278 </li>
<li> lem, head, deprel = 0.674926318087545 </li>
<li> id, lem, head, deprel = 0.66641196375941492 </li>
</ul>

Test stage

In [16]:
test_id = pd.DataFrame(id_le.transform(testdf["ID"]), columns=["ID"])
test_lem = pd.DataFrame(lem_le.transform(testdf["LEM"]), columns=["LEM"])
test_posabbr = pd.DataFrame(posabbr_le.transform(testdf["POSABBR"]), columns=["POSABBR"])
test_feats = pd.DataFrame(feats_le.transform(testdf["FEATS"]), columns=["FEATS"])
test_head = pd.DataFrame(head_le.transform(testdf["HEAD"]), columns=["HEAD"])
test_deprel = pd.DataFrame(deprel_le.transform(testdf["DEPREL"]), columns=["DEPREL"])

test_deprels = pd.DataFrame(deprels_le.transform(testdf["DEPRELS"]), columns=["DEPRELS"])

In [17]:
pre_test_features = pd.concat([test_id, test_lem, test_posabbr, test_feats, test_head, test_deprel], axis=1)

test_features = pre_test_features.values
test_target = test_deprels.values

In [18]:
test_scores = clf.score(test_features, test_target)
test_scores

0.67363238512035006