### Resource processing
This section deals with turning German resources into a spreadsheet which can be translated.

In [1]:
import spacy
nlp = spacy.load('de_core_news_sm', exclude=["ner", "parser", "textcat", "tagger"])
nlp.max_length = 10000000

In [33]:
from importlib import reload
import epub
import re
from epub import epub2text
from collections import Counter, defaultdict
import functions
reload(functions)
from functions import GermanLanguage, Word, Translation, partition_warnings, WarningLevel, token_to_word_tuple
from functools import reduce

In [3]:
de = GermanLanguage()

In [4]:
resource_dir = '/home/peter/Documents/github/word-frequency-analyser/books'
books = os.listdir(resource_dir)

resources = [] # type: List[str]
for i, book in enumerate(books):
    try:
        resource_fragments = epub.epub2text(f'{resource_dir}/{book}')
        resource = '\n'.join(resource_fragments)
        resources.append(resource)
    except KeyError:
        pass

In [5]:
docs = [nlp(resource) for resource in resources]

In [6]:
len('\n'.join(resources).split())

1152400

In [7]:
counters = [Counter(token_to_word_tuple(token) for token in doc if token.is_alpha and not token.is_stop) for doc in docs]
counts = reduce(lambda a, c: a.update(c) or a, counters)

words = [
    Word(de, c[0], c[1], counts[c])
        for c in counts
            if de.is_valid_word(c[0], c[1])
]

In [8]:
test = '\n'.join([w.to_sheet() for w in sorted(words, key=lambda w: w.freq, reverse=True)])
with open('done.txt', 'w') as f:
    f.write(test)
print(test)

")	1	NOUN
dreiunddreißig	=GOOGLETRANSLATE("dreiunddreißig", "de", "en")	1	OTHER
die Todesdrohungen	=GOOGLETRANSLATE("die Todesdrohungen", "de", "en")	1	NOUN
der Sklavenstand	=GOOGLETRANSLATE("der Sklavenstand", "de", "en")	1	NOUN
freikommen	=GOOGLETRANSLATE("zu freikommen", "de", "en")	1	VERB
der Leuchtturm	=GOOGLETRANSLATE("der Leuchtturm", "de", "en")	1	NOUN
das Krankenzimmer	=GOOGLETRANSLATE("das Krankenzimmer", "de", "en")	1	NOUN
beschwörend	=GOOGLETRANSLATE("beschwörend", "de", "en")	1	OTHER
glänzender	=GOOGLETRANSLATE("glänzender", "de", "en")	1	OTHER
gekünstelt	=GOOGLETRANSLATE("zu gekünstelt", "de", "en")	1	VERB
die tragbar	=GOOGLETRANSLATE("die tragbar", "de", "en")	1	NOUN
wasserdichte	=GOOGLETRANSLATE("wasserdichte", "de", "en")	1	OTHER
abwechselnd	=GOOGLETRANSLATE("abwechselnd", "de", "en")	1	OTHER
das Utensil	=GOOGLETRANSLATE("das Utensil", "de", "en")	1	NOUN
die Dezemberwoche	=GOOGLETRANSLATE("die Dezemberwoche", "de", "en")	1	NOUN
der Kracher	=GOOGLETRANSLATE("der Kracher

### Spreadsheet corrections.

The resultant spreadsheet can have issues in it, so it's important to flag likely issues for manual overview.

In [49]:
with open('test.txt', 'r') as f:
    translations = [Translation.from_sheet(line) for line in f.read().splitlines()]
    by_warnings = partition_warnings(translations)

In [62]:
print('\n'.join([f'{t.source}: {t.dest}' for t in by_warnings[functions.WarningLevel.FAILURE]]))

der Potter: the Potter
oh: Oh
hab: hab
der Lord: the Lord
golden: golden
o: O
wild: wild
der Black: the Black
ans: ans
normal: normal
der Dementor: the Dementor
die Bank: the Bank
warm: warm
blond: blond
bitter: bitter
prompt: prompt
die Mum: the Mum
der Durmstrang: the Durmstrang
okay: to okay
s: s
der Prophet: the Prophet
das Champion: the Champion
bring: to bring
mach: mach
tu: tu
der Borgin: the Borgin
blind: blind
abrupt: abrupt
mild: mild
der Schlenker: the Schlenker
das Expelliarmus: the Expelliarmus
stieben: to stieben
die Bellatrix: the Bellatrix
cool: cool
das Deluminator: the Deluminator
der September: the September
wehend: wehend
doppeln: doppeln
der Lump: the Lump
elegant: elegant
die Lestrange: the Lestrange
anscheinen: anscheinen
m: m
total: total
das Horn: the Horn
die Lockhart: the Lockhart
fair: fair
der August: the August
r: r
b: b
pack: pack
das Atrium: the Atrium
red: red
golden: to golden
der Doge: the Doge
einhandeln: to einhandeln
hol: hol
abbekommen: to abbekom

In [73]:
# sentences = resources[0].split('\n')
test = re.sub('–', '', re.sub('[«»]', '"', resources[5]))
sentences = re.split('[\n.]', test)
chosen = [s.strip() for s in sentences if 5 < len(s) < 100 and '"' not in s]

print((s for s in sentences if 'wenden' in s).__next__())
'Harry' in resources[5]
# print(resources[5][:10000])

 Er reichte die Würstchen Harry, der so hungrig war, dass es ihm vorkam, als hätte er noch nie etwas Wundervolleres gekostet, doch immer noch konnte er den Blick nicht von dem Riesen abwenden


True