In [2]:
import os 
# os.listdir returns directory entries in arbitrary order; sort them so the
# displayed listing is identical on every run and on every platform.
sorted(os.listdir("books"))

['james_joyce_ulysses.txt',
 'moby_dick_melville.txt',
 'night_and_day_virginia_woolf.txt',
 'robinson_crusoe_defoe.txt',
 'sons_and_lovers_lawrence.txt',
 'the_way_of_all_flash_butler.txt']

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
def text2paragraphs(filename, min_size=1):
    """Read a text file and return its paragraphs as a list of strings.

    The file `filename` is decoded as UTF-8 and cut at every blank line
    (two consecutive newline characters). Only chunks whose string length
    is strictly greater than `min_size` are kept; all others are dropped.
    """
    with open(filename, encoding='utf-8') as handle:
        chunks = handle.read().split("\n\n")

    kept = []
    for chunk in chunks:
        if len(chunk) > min_size:
            kept.append(chunk)
    return kept

In [3]:
# Human-readable author names.
# NOTE: a later cell rebinds `labels` to the sorted two-letter filename
# prefixes, and the class ids used for training index into that later list,
# not into this one.
labels = [
    'Virginia Woolf',
    'Samuel Butler',
    'Herman Melville',
    'David Herbert Lawrence',
    'Daniel Defoe',
    'James Joyce',
]

# Directory prefix (with trailing slash) that is prepended to each book filename.
path = "books/"

In [5]:
# os.listdir returns entries in arbitrary order; sorting keeps the order in
# which books are loaded (and hence the corpus order) stable across runs.
files = sorted(os.listdir("books"))

# Every .txt book is identified by the first two characters of its filename.
# The set removes duplicates; sorted() accepts the set directly and returns
# the list whose positions serve as integer class ids.
labels = sorted({fname[:2] for fname in files if fname.endswith(".txt")})
labels


['ja', 'mo', 'ni', 'ro', 'so', 'th']

In [6]:
# Show the directory listing held in `files`, which the loading loop iterates over.
print(files)

['james_joyce_ulysses.txt', 'moby_dick_melville.txt', 'night_and_day_virginia_woolf.txt', 'robinson_crusoe_defoe.txt', 'sons_and_lovers_lawrence.txt', 'the_way_of_all_flash_butler.txt']


In [11]:
# Build the corpus: `data` receives one string per paragraph and `targets`
# receives, in parallel, one integer class id per paragraph. The class id is
# the position of the book's two-letter filename prefix inside `labels`.
data, targets = [], []
for fname in files:
    if not fname.endswith(".txt"):
        continue  # only .txt files are treated as books
    book_paragraphs = text2paragraphs(path + fname, min_size=150)
    data.extend(book_paragraphs)
    class_id = labels.index(fname[:2])
    targets.extend([class_id] * len(book_paragraphs))


In [12]:
import random

# Seed for the corpus shuffle. Without a fixed seed the permutation, and with
# it the train/test split and the reported scores, changes on every run.
SHUFFLE_SEED = 42

# Pair every paragraph with its class id so both are permuted together.
data_targets = list(zip(data, targets))

# Draw a random permutation from a dedicated, seeded generator. For a given
# input order the result is reproducible, and the global `random` state is
# left untouched.
data_targets = random.Random(SHUFFLE_SEED).sample(data_targets, len(data_targets))

# Unzip back into two parallel tuples.
data, targets = zip(*data_targets)


In [13]:
from sklearn.model_selection import train_test_split

# `data` and `targets` were already shuffled together in the previous cell,
# and train_test_split shuffles again by default (controlled by random_state),
# so the copy-pasted manual re-shuffle that used to precede this call was
# redundant and has been removed.
res = train_test_split(data, targets,
                       train_size=0.8,
                       test_size=0.2,
                       random_state=42)

# train_test_split returns the splits in the order
# (train of 1st array, test of 1st array, train of 2nd array, test of 2nd array).
train_data, test_data, train_targets, test_targets = res


In [14]:
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Bag-of-words features. The vocabulary is learned from the training
# paragraphs only, with the English stop-word list excluded; the test
# paragraphs are then encoded with that same fitted vocabulary.
vectorizer = CountVectorizer(stop_words=list(ENGLISH_STOP_WORDS))
vectors = vectorizer.fit_transform(train_data)
vectors_test = vectorizer.transform(test_data)

# Multinomial naive Bayes classifier (alpha=.01), fitted on the training counts.
classifier = MultinomialNB(alpha=.01)
classifier.fit(vectors, train_targets)

# Evaluate on the held-out paragraphs.
predictions = classifier.predict(vectors_test)
accuracy_score = metrics.accuracy_score(test_targets, predictions)
f1_score = metrics.f1_score(test_targets, predictions, average='macro')

for caption, value in (("accuracy score: ", accuracy_score),
                       ("F1-score: ", f1_score)):
    print(caption, value)


accuracy score:  0.914534567229178
F1-score:  0.906989407012656


In [15]:
# Ad-hoc sentences fed to the fitted classifier in the next cell.
# Fixed a missing space in the second quote ("Möglichezu" -> "Mögliche zu").
some_texts = ["Es ist nicht von Bedeutung, wie langsam du gehst, solange du nicht stehenbleibst.",
            "Man muss das Unmögliche versuchen, um das Mögliche zu erreichen.",
            "It's so much darker when a light goes out than it would have been if it had never shone.",
            "Rien n'est jamais fini, il suffit d'un peu de bonheur pour que tout recommence.",
            "Girano le stelle nella notte ed io ti penso forte forte e forte ti vorrei"]

# Attribution of each sentence above, in the same order.
# Fixed a missing space in "EmileZola".
sources = ["Konfuzius", "Hermann Hesse", "John Steinbeck", "Emile Zola", "Gianna Nannini" ]

In [16]:
# Encode the ad-hoc sentences with the already-fitted vectorizer (no refit)
# and let the trained classifier assign a class id to each one.
vtest = vectorizer.transform(some_texts)
predictions = classifier.predict(vtest)

# Print each predicted class id next to the corresponding entry of `labels`.
for class_index in predictions:
    print(class_index, labels[class_index])


0 ja
1 mo
4 so
0 ja
0 ja
