In [1]:
import glob
import os
import string
from concurrent.futures import ThreadPoolExecutor as Pool
from subprocess import check_output, run

import requests

import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, classification_report

In [2]:
# Path of the fastText executable invoked for training and prediction.
FASTTEXT_PATH = '../../fastText/fasttext'
# Name passed to fastText as `-output`; prediction loads f'{MODEL_NAME}.bin'.
MODEL_NAME = 'model'
# File that receives every `__label__<author> <sentence>` entry.
ALL_SENTENCES = 'sentences.txt'
# Files the training and test portions of ALL_SENTENCES are written to.
TRAIN_SENTENCES = 'sentences.train'
TEST_SENTENCES = 'sentences.test'

In [3]:
# Maps each author to the URLs of the plain-text books to download.
# The key doubles as the directory the files are saved in and as the
# suffix of the `__label__<author>` tag written in front of every sentence.
books = {
    'mickiewicz': [
        'https://wolnelektury.pl/media/book/txt/pan-tadeusz.txt',
        'https://wolnelektury.pl/media/book/txt/dziady-dziady-widowisko-czesc-i.txt',
        'https://wolnelektury.pl/media/book/txt/dziady-dziadow-czesci-iii-ustep-do-przyjaciol-moskali.txt',
        'https://wolnelektury.pl/media/book/txt/ballady-i-romanse-pani-twardowska.txt',
        'https://wolnelektury.pl/media/book/txt/ballady-i-romanse-powrot-taty.txt',
        'https://wolnelektury.pl/media/book/txt/ballady-i-romanse-switez.txt',
        'https://wolnelektury.pl/media/book/txt/dziady-dziady-poema-dziady-czesc-iv.txt',
        'https://wolnelektury.pl/media/book/txt/oda-do-mlodosci.txt'
    ],
    'sienkiewicz': [
        'https://wolnelektury.pl/media/book/txt/quo-vadis.txt',
        'https://wolnelektury.pl/media/book/txt/sienkiewicz-we-mgle.txt',
        'https://wolnelektury.pl/media/book/txt/potop-tom-pierwszy.txt',
        'https://wolnelektury.pl/media/book/txt/potop-tom-drugi.txt',
        'https://wolnelektury.pl/media/book/txt/potop-tom-trzeci.txt',
    ],
    'orzeszkowa': [
        'https://wolnelektury.pl/media/book/txt/orzeszkowa-kto-winien.txt',
        'https://wolnelektury.pl/media/book/txt/nad-niemnem-tom-pierwszy.txt',
        'https://wolnelektury.pl/media/book/txt/nad-niemnem-tom-drugi.txt',
        'https://wolnelektury.pl/media/book/txt/nad-niemnem-tom-trzeci.txt',
        'https://wolnelektury.pl/media/book/txt/gloria-victis-dziwna-historia.txt',
        'https://wolnelektury.pl/media/book/txt/z-pozogi.txt',
        'https://wolnelektury.pl/media/book/txt/pani-dudkowa.txt',
        'https://wolnelektury.pl/media/book/txt/dymy.txt',
        'https://wolnelektury.pl/media/book/txt/syn-stolarza.txt',
        'https://wolnelektury.pl/media/book/txt/dobra-pani.txt',
        'https://wolnelektury.pl/media/book/txt/cnotliwi.txt',
        'https://wolnelektury.pl/media/book/txt/kilka-slow-o-kobietach.txt',
        'https://wolnelektury.pl/media/book/txt/patryotyzm-i-kosmopolityzm.txt',
        'https://wolnelektury.pl/media/book/txt/julianka.txt',
    ],
    'prus': [
        'https://wolnelektury.pl/media/book/txt/lalka-tom-drugi.txt',
        'https://wolnelektury.pl/media/book/txt/lalka-tom-pierwszy.txt',
        'https://wolnelektury.pl/media/book/txt/antek.txt',
        'https://wolnelektury.pl/media/book/txt/katarynka.txt',
        'https://wolnelektury.pl/media/book/txt/prus-anielka.txt',
        'https://wolnelektury.pl/media/book/txt/prus-placowka.txt',

    ],
    'reymont': [
        'https://wolnelektury.pl/media/book/txt/ziemia-obiecana-tom-pierwszy.txt',
        'https://wolnelektury.pl/media/book/txt/chlopi-czesc-pierwsza-jesien.txt',
        'https://wolnelektury.pl/media/book/txt/reymont-chlopi-zima.txt',
        'https://wolnelektury.pl/media/book/txt/chlopi-czesc-trzecia-wiosna.txt',
        'https://wolnelektury.pl/media/book/txt/chlopi-czesc-czwarta-lato.txt',
    ]
}

In [4]:
def download_book(author, url):
    """Download one book and save its body text inside the ``author`` directory.

    Everything up to shortly after the first ``ISBN`` marker and everything
    from the last ``-----`` separator onwards is dropped; the text in between
    is written to ``<author>/<file name>``, where the file name is the last
    path segment of ``url``. A marker that is absent is simply not trimmed.

    Args:
        author: Existing directory the file is written into.
        url: Address of the plain-text book.

    Returns:
        ``url``, so the caller can report which download finished.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Presumably the length of the 'ISBN <number>' line including its line
    # break -- TODO confirm against the downloaded files.
    isbn_line_length = 23

    # A timeout keeps a stalled connection from blocking a worker forever.
    response = requests.get(url, timeout=60)
    # Fail loudly instead of saving an error page as if it were a book.
    response.raise_for_status()
    book_content = response.text

    # str.find returns -1 when the marker is absent; adding the offset to that
    # would silently cut off the first 22 characters of the text.
    isbn_position = book_content.find('ISBN')
    header = isbn_position + isbn_line_length if isbn_position != -1 else 0

    # Without a separator after the header there is no footer to strip.
    footer = book_content.rfind('-----')
    if footer < header:
        footer = len(book_content)

    file_name = url.split('/')[-1]
    with open(os.path.join(author, file_name), 'w', encoding='utf-8') as file:
        file.write(book_content[header:footer])
    return url


def download_books():
    """Fetch every book listed in ``books`` using a pool of worker threads.

    A directory per author is created first when it does not exist yet; then
    ``download_book`` is called for each (author, url) pair and every
    finished URL is printed.
    """
    for directory in books.keys():
        if os.path.exists(directory):
            continue
        os.makedirs(directory)

    # One (author, url) pair per book, in the order the dict lists them.
    jobs = [(author, link) for author, links in books.items() for link in links]
    authors = [author for author, _ in jobs]
    urls = [link for _, link in jobs]

    with Pool() as pool:
        finished = pool.map(download_book, authors, urls)
        for finished_url in finished:
            print(finished_url)


def get_sentences(filename):
    """Read a UTF-8 text file and return its content split into sentences.

    Splitting is delegated to ``nltk.sent_tokenize`` with ``language='polish'``.
    """
    with open(filename, 'r', encoding='utf-8') as book_file:
        text = book_file.read()
    return nltk.sent_tokenize(text, language='polish')


def create_file_with_sentences():
    """Write every sentence of every downloaded book to ``ALL_SENTENCES``.

    For each author in ``books`` all files in the author's directory are split
    into sentences with ``get_sentences``. Each sentence longer than three
    characters is written as one line of the form
    ``__label__<author> <sentence>``.
    """
    with open(ALL_SENTENCES, 'w', encoding='utf-8') as result_file:
        for author in books:
            # Sorted so the output order does not depend on the file system.
            for filename in sorted(glob.glob(os.path.join(author, '*'))):
                for raw_sentence in get_sentences(filename):
                    # A sentence may span several source lines (e.g. verse).
                    # Collapsing every whitespace run to a single space keeps
                    # each labelled example on exactly one output line, so no
                    # continuation line ends up without a label.
                    sentence = ' '.join(raw_sentence.split())
                    if len(sentence) > 3:
                        print(f'__label__{author} {sentence}', file=result_file)

In [131]:
# Uncomment to download the books; the URLs listed below were printed by an earlier run.
# download_books()

https://wolnelektury.pl/media/book/txt/pan-tadeusz.txt
https://wolnelektury.pl/media/book/txt/dziady-dziady-widowisko-czesc-i.txt
https://wolnelektury.pl/media/book/txt/dziady-dziadow-czesci-iii-ustep-do-przyjaciol-moskali.txt
https://wolnelektury.pl/media/book/txt/ballady-i-romanse-pani-twardowska.txt
https://wolnelektury.pl/media/book/txt/ballady-i-romanse-powrot-taty.txt
https://wolnelektury.pl/media/book/txt/ballady-i-romanse-switez.txt
https://wolnelektury.pl/media/book/txt/dziady-dziady-poema-dziady-czesc-iv.txt
https://wolnelektury.pl/media/book/txt/quo-vadis.txt
https://wolnelektury.pl/media/book/txt/sienkiewicz-we-mgle.txt
https://wolnelektury.pl/media/book/txt/potop-tom-pierwszy.txt
https://wolnelektury.pl/media/book/txt/potop-tom-drugi.txt
https://wolnelektury.pl/media/book/txt/potop-tom-trzeci.txt
https://wolnelektury.pl/media/book/txt/orzeszkowa-kto-winien.txt
https://wolnelektury.pl/media/book/txt/nad-niemnem-tom-pierwszy.txt
https://wolnelektury.pl/media/book/txt/nad-nie

In [158]:
# Build the labelled sentence file only when it does not exist yet, so an
# already generated file is reused instead of being rebuilt.
if not os.path.isfile(ALL_SENTENCES):
    create_file_with_sentences()

In [5]:
# Hold out 20% of the labelled lines for evaluation. The fixed random_state
# makes the split -- and with it the files and scores derived from it --
# reproducible across kernel restarts.
with open(ALL_SENTENCES, 'r', encoding='utf-8') as file:
    sentences_train, sentences_test = train_test_split(list(file), test_size=0.2, random_state=42)

In [133]:
# Number of lines in the training portion of the split.
len(sentences_train)

109827

In [134]:
# Number of lines held out for testing.
len(sentences_test)

27457

In [135]:
# Save each portion of the split to its own file for fastText to read.
with open(TRAIN_SENTENCES, 'w', encoding='utf-8') as file:
    file.writelines(sentences_train)

with open(TEST_SENTENCES, 'w', encoding='utf-8') as file:
    file.writelines(sentences_test)

In [136]:
def train_model(train_file):
    """Train a fastText supervised classifier on ``train_file``.

    Runs ``FASTTEXT_PATH supervised -input <train_file> -output MODEL_NAME``.

    Args:
        train_file: Path of the training file passed as ``-input``.

    Raises:
        subprocess.CalledProcessError: If fastText exits with a non-zero status.
    """
    # check=True: a failed training must not go unnoticed, otherwise later
    # cells could evaluate a stale or missing model.
    run([FASTTEXT_PATH, 'supervised', '-input', train_file, '-output', MODEL_NAME], check=True)

In [137]:
# Train the classifier on the training portion of the split.
train_model(TRAIN_SENTENCES)

In [138]:
# The expected label is everything before the first space of each test line.
y_true = [line.partition(' ')[0] for line in sentences_test]

In [139]:
def predict(test_file):
    """Return the lines fastText prints when predicting labels for ``test_file``.

    Runs ``FASTTEXT_PATH predict MODEL_NAME.bin <test_file>`` and splits the
    decoded standard output into a list of lines.

    Args:
        test_file: Path of the file whose lines are classified.

    Returns:
        A list with one string per output line.
    """
    output = check_output([FASTTEXT_PATH, 'predict', f'{MODEL_NAME}.bin', test_file])
    # splitlines() copes with both '\n' and '\r\n' line endings. Splitting on
    # '\r\n' alone only worked on Windows: elsewhere it produced one single
    # chunk, which the trailing [:-1] then threw away.
    return output.decode().splitlines()

In [140]:
# Predict a label for every line of the test file.
predictions = predict(TEST_SENTENCES)

In [149]:
def print_results(y_true, predictions, average):
    """Print precision, recall and F1 of ``predictions`` against ``y_true``.

    ``average`` is forwarded to ``precision_recall_fscore_support`` and is
    printed as a heading before the three scores.
    """
    scores = precision_recall_fscore_support(y_true, predictions, average=average)
    print(average)
    # The fourth returned value is not reported.
    for caption, value in zip(('Precision', 'Recall', 'F1'), scores):
        print(caption, value)

In [151]:
# Report precision, recall and F1 using the 'micro' averaging mode.
print_results(y_true, predictions, average='micro')

micro
Precision 0.7254616309137925
Recall 0.7254616309137925
F1 0.7254616309137925


In [152]:
# Report precision, recall and F1 using the 'macro' averaging mode.
print_results(y_true, predictions, average='macro')

macro
Precision 0.7096142817412083
Recall 0.7071523894874279
F1 0.7082219632044042


In [157]:
# Break the scores down per label.
print(classification_report(y_true, predictions))

                      precision    recall  f1-score   support

 __label__mickiewicz       0.64      0.61      0.62      2702
 __label__orzeszkowa       0.68      0.70      0.69      4790
       __label__prus       0.72      0.70      0.71      6331
    __label__reymont       0.72      0.73      0.73      5388
__label__sienkiewicz       0.79      0.79      0.79      8246

         avg / total       0.73      0.73      0.73     27457

