In [37]:
from urllib import request
from eventlet import GreenPool
import os
import pandas as pd
import regex
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


In [25]:
book_files={
 "Mickiewicz": [
 "https://wolnelektury.pl/media/book/txt/pan-tadeusz.txt",
 "https://wolnelektury.pl/media/book/txt/dziady-dziady-widowisko-czesc-i.txt",
 "https://wolnelektury.pl/media/book/txt/dziady-dziadow-czesci-iii-ustep-do-przyjaciol-moskali.txt",
 "https://wolnelektury.pl/media/book/txt/ballady-i-romanse-pani-twardowska.txt",
 "https://wolnelektury.pl/media/book/txt/ballady-i-romanse-powrot-taty.txt",
 "https://wolnelektury.pl/media/book/txt/ballady-i-romanse-switez.txt",
 "https://wolnelektury.pl/media/book/txt/dziady-dziady-poema-dziady-czesc-iv.txt",
 ],
 "Sienkiewicz": [
 "https://wolnelektury.pl/media/book/txt/quo-vadis.txt",
 "https://wolnelektury.pl/media/book/txt/sienkiewicz-we-mgle.txt",
 "https://wolnelektury.pl/media/book/txt/potop-tom-pierwszy.txt",
 "https://wolnelektury.pl/media/book/txt/potop-tom-drugi.txt",
 "https://wolnelektury.pl/media/book/txt/potop-tom-trzeci.txt",
 ],
 "Orzeszkowa": [
 "https://wolnelektury.pl/media/book/txt/orzeszkowa-kto-winien.txt",
 "https://wolnelektury.pl/media/book/txt/nad-niemnem-tom-pierwszy.txt",
 "https://wolnelektury.pl/media/book/txt/nad-niemnem-tom-drugi.txt",
 "https://wolnelektury.pl/media/book/txt/nad-niemnem-tom-trzeci.txt",
 "https://wolnelektury.pl/media/book/txt/gloria-victis-dziwna-historia.txt",
 "https://wolnelektury.pl/media/book/txt/z-pozogi.txt",
 "https://wolnelektury.pl/media/book/txt/pani-dudkowa.txt",
 "https://wolnelektury.pl/media/book/txt/dymy.txt",
 "https://wolnelektury.pl/media/book/txt/syn-stolarza.txt",
 "https://wolnelektury.pl/media/book/txt/dobra-pani.txt",
 "https://wolnelektury.pl/media/book/txt/cnotliwi.txt",
 "https://wolnelektury.pl/media/book/txt/kilka-slow-o-kobietach.txt",
 "https://wolnelektury.pl/media/book/txt/patryotyzm-i-kosmopolityzm.txt",
 "https://wolnelektury.pl/media/book/txt/julianka.txt",
 ],
 "Prus": [
 "https://wolnelektury.pl/media/book/txt/lalka-tom-drugi.txt",
 "https://wolnelektury.pl/media/book/txt/lalka-tom-pierwszy.txt",
 "https://wolnelektury.pl/media/book/txt/antek.txt",
 "https://wolnelektury.pl/media/book/txt/katarynka.txt",
 "https://wolnelektury.pl/media/book/txt/prus-anielka.txt",
 "https://wolnelektury.pl/media/book/txt/prus-placowka.txt",
 
 ],
 "Reymont": [
 "https://wolnelektury.pl/media/book/txt/ziemia-obiecana-tom-pierwszy.txt",
 "https://wolnelektury.pl/media/book/txt/chlopi-czesc-pierwsza-jesien.txt",
 "https://wolnelektury.pl/media/book/txt/reymont-chlopi-zima.txt",
 "https://wolnelektury.pl/media/book/txt/chlopi-czesc-trzecia-wiosna.txt",
 "https://wolnelektury.pl/media/book/txt/chlopi-czesc-czwarta-lato.txt",
 ]
}

In [29]:
def fetch(url):
    file_path = os.path.join("./data/",os.path.basename(url))
    if os.path.exists(file_path):
        return None, None
    data = request.urlopen(url).read()
    return file_path, data

os.mkdir('data')

for author in book_files:
    pool = GreenPool()
    
    for file_path, data in pool.imap(fetch, book_files[author]):
        if file_path:
            with open(file_path, mode="wb") as f:
                f.write(data)
print ("DONE")




DONE


In [38]:

def preprocess_file(file_path=None, file_url=None):
    if not file_path and file_url:
        file_path = os.path.join("data",os.path.basename(file_url))
        
    text = open(file_path,'rb').read().decode("utf-8").lower()

    text = regex.sub(u"[^ \n\p{Latin}\-'.?!]", " ",text)
    text = regex.sub(u"[ \n]+", " ", text) 
    text = regex.sub(r"----- ta lektura.*","", text) 

    return [regex.sub(r"^ ","",l) for l in regex.split('\.|,|\?|!|:',text)]


def get_book_df(document, author):
    return pd.DataFrame({
        'author': pd.Series(len(document)*[author]),
        'txt': pd.Series(document),
    })
    
book_lines_df = pd.concat([
    get_book_df(preprocess_file(file_url=url),author=author) 
        for author in book_files for url in book_files[author] 
])

book_lines_df.head()

Unnamed: 0,author,txt
0,Mickiewicz,adam mickiewicz pan tadeusz czyli ostatni zaja...
1,Mickiewicz,ojczyzno moja
2,Mickiewicz,ty jesteś jak zdrowie ile cię trzeba cenić ten...
3,Mickiewicz,dziś piękność twą w całej ozdobie widzę i opis...
4,Mickiewicz,panno święta co jasnej bronisz częstochowy i w...


In [39]:
book_lines_df.groupby('author').count()

Unnamed: 0_level_0,txt
author,Unnamed: 1_level_1
Mickiewicz,5093
Orzeszkowa,22177
Prus,31033
Reymont,24107
Sienkiewicz,40381


In [40]:
book_lines_df['words'] = book_lines_df['txt'].apply(lambda row: len(row.split()))
book_lines_df.groupby('author')['words'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Mickiewicz,5093.0,16.103868,13.951437,0.0,6.0,13.0,23.0,145.0
Orzeszkowa,22177.0,19.400821,17.257917,0.0,6.0,15.0,27.0,219.0
Prus,31033.0,12.131537,10.04145,0.0,5.0,10.0,17.0,133.0
Reymont,24107.0,16.359398,18.880965,0.0,5.0,10.0,21.0,316.0
Sienkiewicz,40381.0,13.705579,12.038426,0.0,5.0,10.0,19.0,146.0


In [41]:
book_lines_df.groupby('author')['words'].quantile(0.98)

author
Mickiewicz     54.0
Orzeszkowa     67.0
Prus           39.0
Reymont        72.0
Sienkiewicz    47.0
Name: words, dtype: float64

In [42]:
train_df, test_df = model_selection.train_test_split(
    book_lines_df, 
    test_size=0.1, 
    stratify=book_lines_df['author'])

In [43]:
vect = CountVectorizer()
vect.fit(train_df['txt'])
sample_sentence = train_df.iloc[2]['txt']
vect.transform([sample_sentence])

<1x83597 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [44]:
X_train = vect.transform(train_df['txt'])
X_test=vect.transform(test_df['txt'])
model = LogisticRegression(class_weight='balanced', dual=True)
model.fit(X_train, train_df['author'])



LogisticRegression(C=1.0, class_weight='balanced', dual=True,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [45]:
model.score(X_test, test_df['author'])

0.6417752442996743

In [46]:
target = test_df['author']
predicted = model.predict(X_test)
print(metrics.classification_report(target, predicted))

              precision    recall  f1-score   support

  Mickiewicz       0.05      0.61      0.09        41
  Orzeszkowa       0.76      0.62      0.68      3001
        Prus       0.52      0.85      0.64      3073
     Reymont       0.74      0.72      0.73      3017
 Sienkiewicz       0.92      0.38      0.54      3148

    accuracy                           0.64     12280
   macro avg       0.60      0.64      0.54     12280
weighted avg       0.73      0.64      0.65     12280

