### Resource processing
This section turns the German source texts (e-books) into a spreadsheet of words, ordered by frequency, that can then be translated.

In [1]:
import spacy
# Load the small German spaCy pipeline, leaving out the components named in
# `exclude`; later cells only read token.lemma_, token.pos_, token.is_alpha
# and token.is_stop.
nlp = spacy.load(
    'de_core_news_sm',
    exclude=["ner", "parser", "textcat", "tagger"],
)
# Raise the maximum text length (in characters) so that a whole book can be
# passed to nlp() in a single call.
nlp.max_length = 10_000_000

In [2]:
import os
from collections import Counter, defaultdict
from functools import reduce
from typing import List

import epub
from epub import epub2text

from functions import GermanLanguage, Word, Translation, partition_warnings, WarningLevel

In [3]:
# Shared German-language helper: later cells pass it to Word(...) and call
# de.is_valid_word(...) to filter the counted lemmas.
de = GermanLanguage()

In [4]:
# Directory holding the source e-books. NOTE: machine-specific absolute path;
# adjust it when running the notebook elsewhere.
resource_dir = '/home/peter/Documents/github/word-frequency-analyser/books'
books = os.listdir(resource_dir)

# Plain text of every book that could be converted, one string per book.
resources: List[str] = []
for book in books:
    book_path = os.path.join(resource_dir, book)
    try:
        resource_fragments = epub.epub2text(book_path)
    except KeyError as err:
        # Best effort: a book whose conversion raises KeyError is skipped,
        # but say so instead of dropping it silently, so that the word counts
        # below can be interpreted correctly.
        print(f'Skipping {book}: {err!r}')
        continue
    resources.append('\n'.join(resource_fragments))

# One spaCy Doc per successfully converted book.
docs = [nlp(resource) for resource in resources]

In [12]:
# Total number of whitespace-separated words across all loaded books.
sum(len(resource.split()) for resource in resources)

1152400

In [5]:
def simplify_pos(pos: str) -> str:
    """Collapse a part-of-speech tag into one of three buckets.

    Args:
        pos: A part-of-speech tag string (e.g. the value of ``token.pos_``).

    Returns:
        ``pos`` unchanged when it is exactly ``'NOUN'`` or ``'VERB'``;
        ``'OTHER'`` for every other value.
    """
    if pos in ('NOUN', 'VERB'):
        return pos
    return 'OTHER'

In [6]:
# Per-book frequency of (lemma, simplified POS) pairs, counting only
# alphabetic tokens that are not stop words.
counters = [
    Counter(
        (token.lemma_, simplify_pos(token.pos_))
        for token in doc
        if token.is_alpha and not token.is_stop
    )
    for doc in docs
]

# Merge the per-book counters into a fresh Counter. Starting from an empty
# Counter keeps this working when no book was loaded (reduce() without an
# initial value raises TypeError on an empty list) and leaves counters[0]
# untouched instead of turning it into the running total.
counts = Counter()
for counter in counters:
    counts.update(counter)

# One Word per (lemma, POS) pair that the language helper accepts, carrying
# its total frequency across all books.
words = [
    Word(de, lemma, pos, freq)
    for (lemma, pos), freq in counts.items()
    if de.is_valid_word(lemma, pos)
]

In [7]:
# One spreadsheet row per word, ordered by ascending frequency.
test = '\n'.join(word.to_sheet() for word in sorted(words, key=lambda word: word.freq))
# Write with an explicit UTF-8 encoding: the rows contain German umlauts and
# 'ß', which must not depend on the platform's default encoding.
with open('done.txt', 'w', encoding='utf-8') as f:
    f.write(test)
print(test)

r Unterricht", "de", "en")	193
das stillen	=GOOGLETRANSLATE("das stillen", "de", "en")	193
der Kessel	=GOOGLETRANSLATE("der Kessel", "de", "en")	193
deutlich	=GOOGLETRANSLATE("deutlich", "de", "en")	195
kurz	=GOOGLETRANSLATE("kurz", "de", "en")	195
schlafen	=GOOGLETRANSLATE("zu schlafen", "de", "en")	195
genauso	=GOOGLETRANSLATE("genauso", "de", "en")	196
die Erinnerung	=GOOGLETRANSLATE("die Erinnerung", "de", "en")	197
schieben	=GOOGLETRANSLATE("zu schieben", "de", "en")	197
hinein	=GOOGLETRANSLATE("hinein", "de", "en")	197
die Reihe	=GOOGLETRANSLATE("die Reihe", "de", "en")	198
der Sohn	=GOOGLETRANSLATE("der Sohn", "de", "en")	199
schließen	=GOOGLETRANSLATE("zu schließen", "de", "en")	200
das Bild	=GOOGLETRANSLATE("das Bild", "de", "en")	201
ruhig	=GOOGLETRANSLATE("ruhig", "de", "en")	201
ständig	=GOOGLETRANSLATE("ständig", "de", "en")	201
meist	=GOOGLETRANSLATE("meist", "de", "en")	202
jung	=GOOGLETRANSLATE("jung", "de", "en")	203
lernen	=GOOGLETRANSLATE("zu lernen", "de", "en")	203

### Spreadsheet corrections.

The resulting spreadsheet can contain errors, so it is important to flag likely problems for manual review.

In [8]:
# Parse one Translation per line of the sheet export.
# NOTE(review): this reads 'test.txt' while the previous section writes
# 'done.txt' — presumably 'test.txt' is the translated spreadsheet exported
# back to text; confirm.
# Explicit UTF-8 so German characters are decoded the same on every platform.
with open('test.txt', 'r', encoding='utf-8') as f:
    translations = [Translation.from_sheet(line) for line in f.read().splitlines()]

# Group the translations by warning level; the result is indexed with
# WarningLevel members below. Done after the file has been closed.
by_warnings = partition_warnings(translations)

In [9]:
# List every translation flagged at FAILURE level as "source: dest".
print([
    f'{translation.source}: {translation.dest}'
    for translation in by_warnings[WarningLevel.FAILURE]
])

['lässet: to lässet', 'ists: to ists', 'satzten: to satzten', 'ausdermaßen: to ausdermaßen', 'gemachet: to gemachet', 'stund: to stund', 'ward: to ward', 'zumalen: to zumalen', 'derohalben: to derohalben', 'satzte: to satzte', 'oftermalen: to oftermalen', 'gesetzet: to gesetzet', 'genennet: to genennet', 'dörfte: to dörfte', 'dergestalten: to dergestalten', 'daselbsten: to daselbsten', 'sonsten: to sonsten', 'allwo: to allwo']
