In [4]:
def text2paragraphs(filename, min_size=1):
    #A Text contained in the files 'filename' will be read
    # and chopped into paragraph. Paragraph with a string
    #length less than min_size will be ignored.
    #A list of paragraph string will be returned.
    with open(filename, encoding='utf-8') as f:
        txt = f.read()
    paragraphs = [para for para in txt.split("\n\n") if len(para) > min_size]
    return paragraphs


In [5]:
labels = ['Varginia Woolf', 'Samuel Butler', 'Herman Melville',
         'David Herbert Lawrence','Daniel Defoe','James Joyce']

files = ['night_and_day_virginia_woolf.txt','the_way_of_all_flash_butler.txt',
         'moby_dick_melville.txt','sons_and_lovers_lawrence.txt',
         'robinson_crusoe_defoe.txt','james_joyce_ulysses.txt']
path = "books/"

In [7]:
data = []
targets = []
counter = 0
for fname in files:
    paras = text2paragraphs(path + fname, min_size=150)
    data.extend(paras)
    targets += [counter] * len(paras)
    counter +=1

In [8]:
#cell is useless, because train_test_split will do the shuffling
import random
data_targets = list(zip(data, targets))
#create random permuation on list
data_targets = random.sample(data_targets,len(data_targets))

data, targets = list(zip(*data_targets))

In [9]:
from sklearn.model_selection import train_test_split

res = train_test_split(data, targets,
                      train_size=0.8,
                      test_size=0.2,
                      random_state=42)
train_data, test_data, train_targets, test_targets = res

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

#Use list() to convert ENGLISH_STOP_WORDS from a frazenset to a list
vectorizer = CountVectorizer(stop_words=list(ENGLISH_STOP_WORDS))
vectors = vectorizer.fit_transform(train_data)

#creating a classifier
classifier = MultinomialNB(alpha=.01)
classifier.fit(vectors, train_targets)

vectors_test = vectorizer.transform(test_data)

predictions = classifier.predict(vectors_test)
accuracy_score = metrics.accuracy_score(test_targets, predictions)
f1_score = metrics.f1_score(test_targets, predictions,average='macro')

print('Accuracy Score: ', accuracy_score)
print('F1-Score: ',f1_score)

Accuracy Score:  0.9216113228089275
F1-Score:  0.9151577285091399


In [14]:
paras = text2paragraphs(path + "night_and_day_virginia_woolf.txt",min_size=250)

first_para, last_para = 100, 500
vector_test = vectorizer.transform(paras[first_para: last_para])

predictions = classifier.predict(vector_test)
print(predictions)
targets = [0] * (last_para - first_para)
accuracy_score = metrics.accuracy_score(targets, predictions)
precision_score = metrics.precision_score(targets,predictions,average='macro')
f1_score = metrics.f1_score(targets, predictions, average='macro')

print("accuracy score:", accuracy_score)
print("Precision score: ", precision_score)
print("F1-score: ", f1_score)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 5 0 0 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
accuracy score: 0.9875
Precision score:  0.3333333333333333
F1-score:  0.3312368972746331


In [15]:
predictions = classifier.predict_proba(vector_test)
predictions

array([[1.00000000e+000, 1.54960091e-143, 1.81216709e-136,
        6.80508247e-127, 5.62859917e-176, 1.18259173e-118],
       [1.00000000e+000, 2.53758451e-027, 6.79442950e-034,
        1.12551181e-036, 8.01208252e-040, 1.05653616e-030],
       [1.00000000e+000, 3.04370489e-038, 5.15073580e-030,
        2.26788470e-039, 2.48405618e-045, 8.63038759e-028],
       ...,
       [1.00000000e+000, 1.86642267e-029, 6.50635954e-035,
        4.37698367e-044, 2.36417378e-050, 1.75719197e-028],
       [1.00000000e+000, 1.67954451e-071, 1.68702043e-075,
        2.37790629e-108, 4.29295467e-111, 2.76249804e-078],
       [1.00000000e+000, 1.60843314e-012, 1.05272005e-023,
        9.97976135e-026, 3.58604533e-034, 9.99384162e-027]])

In [17]:
for i in range(0,10):
    print(predictions[i], paras[i + first_para])

[1.00000000e+000 1.54960091e-143 1.81216709e-136 6.80508247e-127
 5.62859917e-176 1.18259173e-118] The night was very still, and on such nights, when the traffic thins
away, the walker becomes conscious of the moon in the street, as if the
curtains of the sky had been drawn apart, and the heaven lay bare, as
it does in the country. The air was softly cool, so that people who
had been sitting talking in a crowd found it pleasant to walk a little
before deciding to stop an omnibus or encounter light again in an
underground railway. Sandys, who was a barrister with a philosophic
tendency, took out his pipe, lit it, murmured “hum” and “ha,” and was
silent. The couple in front of them kept their distance accurately, and
appeared, so far as Denham could judge by the way they turned towards
each other, to be talking very constantly. He observed that when a
pedestrian going the opposite way forced them to part they came together
again directly afterwards. Without intending to watch them he nev

In [19]:
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.neural_network import MLPClassifier
from sklearn import metrics

vectorizer = CountVectorizer(stop_words=list(ENGLISH_STOP_WORDS))
vectors = vectorizer.fit_transform(train_data)
print("Creating a classifier. This will take some time!")
classifier = MLPClassifier(random_state=1, max_iter=300).fit(vectors, train_targets)

Creating a classifier. This will take some time!


In [20]:
vectors_test = vectorizer.transform(test_data)
predictions = classifier.predict(vectors_test)
accuracy_score = metrics.accuracy_score(test_targets,predictions)
f1_score = metrics.f1_score(test_targets,
                            predictions,
                            average='macro')
print("accuracy score: ", accuracy_score)
print("F1-score: ", f1_score)


accuracy score:  0.9205225911812738
F1-score:  0.9217613783828069
