In [1]:
import re
import string
from iwnlp.iwnlp_wrapper import IWNLPWrapper

In [2]:
# Translation table that deletes every character of string.punctuation
# (ASCII punctuation only; typographic quotes and dashes are left untouched).
translator = str.maketrans(dict.fromkeys(string.punctuation))

In [3]:
# Build the IWNLP lemmatizer from its JSON dump.
# NOTE(review): the path is absolute and machine-specific - adjust it before running elsewhere.
lemmatizer = IWNLPWrapper(lemmatizer_path='C:/Users/1/Desktop/thesis/IWNLP.Lemmatizer_20170501.json')

In [4]:
def capitalize(word):
    """Upper-case the first character of ``word`` and keep the rest as is.

    Strings of length 0 or 1 are returned unchanged.
    """
    if len(word) <= 1:
        return word
    head, tail = word[0], word[1:]
    return head.upper() + tail

In [5]:
def get_lemma(word):
    """Return the capitalized lemma of ``word``.

    Asks the global ``lemmatizer`` for candidates via ``lemmatize_plain``.
    When it answers ``None`` the word itself is used; otherwise the first
    candidate is taken (assumes a non-empty sequence - TODO confirm).
    """
    candidates = lemmatizer.lemmatize_plain(word)
    base_form = word if candidates is None else candidates[0]
    return capitalize(base_form)

In [6]:
#data: 'der sandmann' by hoffmann (german-russian)
# Read the whole file; the context manager closes the handle, which the
# previous open()/read() pair never did.
with open("C:/Users/1/Desktop/thesis/data/de-ru Hoffmann - Der Sandmann.pbo", "r", encoding="utf-8") as pbo_file:
    s = pbo_file.read()
# Collect every non-greedy s="..." match from the file text.
# NOTE(review): the pattern is unanchored, so any attribute whose name ends in "s" matches too.
res_s = re.findall('s=".+?"', s)

In [7]:
#original sentences in german
# Each match has the form s="..."; slicing off the first three characters
# and the final quote leaves only the quoted text.
sentences_s = [match[3:-1] for match in res_s]

In [8]:
#unique german tokens: not lemmatized
# tokens_s keeps first-seen order; seen_tokens mirrors it as a set so the
# membership test is O(1) instead of a scan of the growing list.
tokens_s = []
seen_tokens = set()

for sentence in sentences_s:
    # Strip ASCII punctuation, then split on single spaces.
    # NOTE(review): consecutive spaces yield an empty-string token, which is kept as before.
    sentence_new = sentence.translate(translator)
    tokens = sentence_new.split(" ")
    for token in tokens:
        if token not in seen_tokens:
            seen_tokens.add(token)
            tokens_s.append(token)

In [9]:
# Number of distinct surface tokens collected above.
print(len(tokens_s))

3669


In [10]:
#unique german tokens: lemmatized and capitalized
# One get_lemma result per entry of tokens_s, in the same order.
# Different tokens may map to the same lemma, so repeats are possible here.
tokens_s_lemmatized = [get_lemma(word) for word in tokens_s]

In [11]:
#top german frequency words
# Read all lines; the context manager closes the handle that was previously left open.
with open('C:/Users/1/Desktop/thesis/wortliste.txt', 'r', encoding='utf-8') as file_words:
    lines_words = file_words.readlines()

In [12]:
#capitalizing all words
# Take the text before the first space of every line and capitalize it.
# NOTE(review): a line without a space keeps its trailing newline - confirm the file format.
dictionary = [capitalize(line.split(' ')[0]) for line in lines_words]

In [13]:
# Number of entries in the frequency-word list (one per input line).
print(len(dictionary))

29441


In [14]:
def is_in_dic(word):
    """Tell whether ``word`` is an entry of the global ``dictionary`` list.

    Returns True on an exact, case-sensitive match and False otherwise.
    The miss case used to fall through and return None; callers that rely
    on truthiness or on ``is True`` behave exactly as before.
    """
    return word in dictionary

In [15]:
def is_part(word):
    """Tell whether ``word`` can serve as one component of a compound.

    True when any of the following holds:
      * ``word`` has more than one character and is a dictionary entry;
      * ``word`` ends in n/e/s and the word minus that last character is an entry;
      * ``word`` ends in en/er/es and the word minus those two characters is an entry.

    Returns False otherwise (an explicit bool instead of the former implicit None).
    """
    if len(word) > 1 and is_in_dic(word):
        return True
    # Single-letter linking element or inflection ending.
    if word.endswith(('n', 'e', 's')) and is_in_dic(word[:-1]):
        return True
    # Two-letter linking element or inflection ending.
    if word.endswith(('en', 'er', 'es')) and is_in_dic(word[:-2]):
        return True
    return False

In [16]:
#2-component compounds
def is_compound(word):
    """Return ``word`` if it splits into two parts accepted by ``is_part``.

    Split points are tried from the right end towards the left, so the left
    part shrinks on every step. The right part is capitalized before it is
    checked. Both parts are always non-empty: the split that leaves the
    whole word on the left could never succeed and is skipped without a
    dictionary lookup.

    Returns None when no split qualifies.
    """
    for split_at in range(len(word) - 1, 0, -1):
        left_compound = word[:split_at]
        if not is_part(left_compound):
            continue
        right_compound_upper = capitalize(word[split_at:])
        if is_part(right_compound_upper):
            return word
    return None

In [17]:
#3-component compounds
def is_compound_of_three(word):
    """Return ``word`` if it splits into three parts accepted by ``is_part``.

    Every pair of cut positions (first <= second) is tried. The middle and
    right parts are capitalized before they are checked. The left part
    depends only on the first cut, so it is tested once per first cut
    rather than once per pair.

    Returns None when no pair of cuts qualifies.
    """
    max_ind = len(word)

    for ind1 in range(max_ind):
        left_part = word[:ind1]
        if not is_part(left_part):
            continue
        for ind2 in range(ind1, max_ind):
            middle_part_upper = capitalize(word[ind1:ind2])
            right_part_upper = capitalize(word[ind2:])
            if is_part(middle_part_upper) and is_part(right_part_upper):
                return word
    return None

In [18]:
#2-component compounds using list of top frequency german words
# Keep each lemma that is_compound accepts, once, in first-seen order.
sandmann = []
for token in tokens_s_lemmatized:
    if not is_compound(token):
        continue
    if token not in sandmann:
        sandmann.append(token)

In [19]:
# Count of two-component candidates at this point; a later cell appends
# the three-component finds to the same list.
len(sandmann)

178

In [20]:
#3-component compounds using list of top frequency german words
# Keep each lemma that is_compound_of_three accepts, once, in first-seen order.
sandmann2 = []
for token in tokens_s_lemmatized:
    if not is_compound_of_three(token):
        continue
    if token not in sandmann2:
        sandmann2.append(token)

In [21]:
# Count of lemmas accepted by the three-component check.
len(sandmann2)

6

In [22]:
#compounds from GermaNet
# Read all lines; the context manager closes the handle that was previously left open.
with open("C:/Users/1/Desktop/thesis/compounds_list.txt", "r", encoding="utf-8") as file_compounds:
    lines_compounds = file_compounds.readlines()

In [23]:
# The compound is the text before the first space of each line.
# NOTE(review): a line without a space keeps its trailing newline - confirm the file format.
compounds = [line.split(" ")[0] for line in lines_compounds]

In [24]:
# Number of entries read from the compound list (one per input line).
print(len(compounds))

74151


In [25]:
#compounds using list of compounds
# A set gives O(1) lookups; scanning the full compounds list for every
# token was needlessly slow. Matching stays exact and case-sensitive.
compounds_lookup = set(compounds)
sandmann3 = []
for token in tokens_s_lemmatized:
    if token in compounds_lookup and token not in sandmann3:
        sandmann3.append(token)

In [26]:
# Number of lemmas found in the compound list.
print(len(sandmann3))

68


In [27]:
#adding 3-component compounds to 2-component list
# Extends sandmann in place; entries already present are skipped, so
# re-running this cell adds nothing new.
for token in sandmann2:
    if token in sandmann:
        continue
    sandmann.append(token)

In [28]:
#annotated compounds from sandmann
# Read all lines; the context manager closes the handle that was previously left open.
with open("C:/Users/1/Desktop/thesis/sandmann_komposits_annotated.txt", "r", encoding="utf-8") as z:
    lines_z = z.readlines()

In [29]:
# Strip newline characters from both ends of every annotated line.
lines_z_new = [line.strip("\n") for line in lines_z]

In [30]:
# Normalise the annotated compounds exactly like the text tokens: this loop
# repeated the body of get_lemma, so it now calls that helper instead.
compounds_sandmann = [get_lemma(word) for word in lines_z_new]

In [31]:
# Number of annotated compounds (one per line of the annotation file).
print(len(compounds_sandmann))

205


In [32]:
#list of top frequency words: share of annotated compounds that were found
# NOTE: the denominator is the annotated list, so this figure is recall
# rather than precision.
count = sum(1 for candidate in sandmann if candidate in compounds_sandmann)
acc = count/len(compounds_sandmann)
print(acc)

0.8585365853658536


In [33]:
#list of compounds: share of annotated compounds that were found
# NOTE: the denominator is the annotated list, so this figure is recall
# rather than precision.
count2 = sum(1 for candidate in sandmann3 if candidate in compounds_sandmann)
acc2 = count2/len(compounds_sandmann)
print(acc2)

0.33170731707317075


In [34]:
#both approaches
# Start from a copy of the dictionary-based finds, then append the
# compound-list finds that the dictionary-based list lacks.
sandmann4 = list(sandmann)
sandmann4.extend(candidate for candidate in sandmann3 if candidate not in sandmann)

In [35]:
#both approaches: share of annotated compounds that were found
# NOTE: the denominator is the annotated list, so this figure is recall
# rather than precision.
count3 = sum(1 for candidate in sandmann4 if candidate in compounds_sandmann)
acc3 = count3/len(compounds_sandmann)
print(acc3)

0.9170731707317074


In [36]:
#words that were not recognised as compounds
# Annotated compounds that are absent from the combined result list.
for comp in compounds_sandmann:
    if comp in sandmann4:
        continue
    print(comp)

Ammenmärchen
Doppeltgänger
Feuerströme
Holzpüppchen
Kindereien
Kleblocken
Papierschnitzchen
Peipendreher
Provinzialstadt
Schicksalspopanz
Schosshündchen
Schreibpults
Stossrapieren
Taschenperspektiv
Hellblinkende
Herzinnigstgeliebter
Weisshauptige


In [37]:
#words that were wrongly recognised as compounds
# Entries of the combined result list that are absent from the annotation.
for comp in sandmann4:
    if comp in compounds_sandmann:
        continue
    print(comp)

Gestalt
Innerstes
Gestalten
Innersten
Siegmund
Siegmunds
Ersterben
