In [1]:
import sys
import os
import re
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm 
from collections import OrderedDict 
from operator import itemgetter
from collections import Counter
import itertools
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

%config IPCompleter.greedy=True

## 1. Read data and split in train and valid

In [2]:
with open('dane_pozytywistyczne/korpus_orzeszkowej.txt', encoding='UTF-8') as f:
    orzeszkowa = [x for x in f.readlines() if len(x) > 1]
    
with open('dane_pozytywistyczne/korpus_prusa.txt', encoding='UTF-8') as f:
    prus = [x for x in f.readlines() if len(x) > 1]
    
with open('dane_pozytywistyczne/korpus_sienkiewicza.txt', encoding='UTF-8') as f:
    sienkiewicz = [x for x in f.readlines() if len(x) > 1]

In [3]:
len(orzeszkowa), len(prus), len(sienkiewicz)

(6291, 8323, 2615)

In [4]:
train_O, test_O = train_test_split(orzeszkowa, test_size=0.33, random_state=42)
train_P, test_P = train_test_split(prus, test_size=0.33, random_state=42)
train_S, test_S = train_test_split(sienkiewicz, test_size=0.33, random_state=42)

## 2. Create normalized lookup tables for Naive Bayes

In [5]:
def regex(line: str) -> list:
    return re.sub(r'[^\w\s]', '', line).strip().lower().split()
#     return re.sub(r'[\W+]', '', line).strip().lower()

def preprocess(train: list) -> list:
    return list(itertools.chain.from_iterable([regex(line) for line in train]))
#     return ''.join([regex(line) for line in train if len(line) > 1])

In [6]:
all_text_in_one_string = preprocess(train=train_O) + preprocess(train=train_P) + preprocess(train=train_S)
all_letters = set(all_text_in_one_string)

In [7]:
%%time
train_O_letters_occurencies = Counter(preprocess(train=train_O))
train_P_letters_occurencies = Counter(preprocess(train=train_P))
train_S_letters_occurencies = Counter(preprocess(train=train_S))

train_O_letters_occurencies = {k: v / sum(train_O_letters_occurencies.values()) 
                                for k, v in train_O_letters_occurencies.items()}

train_P_letters_occurencies = {k: v / sum(train_P_letters_occurencies.values()) 
                                for k, v in train_P_letters_occurencies.items()}

train_S_letters_occurencies = {k: v / sum(train_S_letters_occurencies.values()) 
                                for k, v in train_S_letters_occurencies.items()}

languages = [
    train_O_letters_occurencies,
    train_P_letters_occurencies,
    train_S_letters_occurencies
]

names = ['Orzeszkowa', 'Prus', 'Sienkiewicz']

CPU times: user 8 s, sys: 59 µs, total: 8 s
Wall time: 8.01 s


## 3. Naive Bayes model

In [8]:
def naive_bayes(
    sentence: str, 
    languages: list = languages, 
    names: list = names,
    normalize: bool = True,
    UNK: float = 1e-5
):
    """
    Returns the most probable language of a sentence
    
    :param languages: List of dictionaries
    :param names: Names of those languages
    """
    
    sentence = regex(line=sentence)

    log_probs = {}
    Z = 0.0
    for i, lang in enumerate(languages):
        P_DC = np.sum(np.array(([np.log(lang[x]) if x in lang else np.log(UNK) for x in sentence])))
        if normalize:
            P_DC = np.exp(P_DC)
        log_probs[names[i]] = P_DC * (1 / len(languages))
        Z += log_probs[names[i]]
    
    if normalize:
        for x in log_probs:
            log_probs[x] /= (Z + 1e-18)
        
    probs = OrderedDict(reversed(sorted(log_probs.items(), key=itemgetter(1))))
    return probs

In [9]:
naive_bayes(
    sentence='W dwa miesiące potem Marysia', 
    languages=languages, 
    names=names
)

OrderedDict([('Sienkiewicz', 0.9430753675093664),
             ('Prus', 0.026027607397559212),
             ('Orzeszkowa', 0.02199449379231031)])

## 4. Validate accuracy

NB: Using only letters accuracy was 24%, 47%, 50% for Orzeszkowa, Prus and Sienkiewicz

In [10]:
classify_O = [next(iter(naive_bayes(x, normalize=False))) for x in test_O]
accuracy_O = classify_O.count('Orzeszkowa') / len(classify_O)
accuracy_O * 100

76.60086663456909

In [11]:
classify_P = [next(iter(naive_bayes(x, normalize=False))) for x in test_P]
accuracy_P = classify_P.count('Prus') / len(classify_P)
accuracy_P * 100

82.70840917364397

In [12]:
classify_S = [next(iter(naive_bayes(x, normalize=False))) for x in test_S]
accuracy_S = classify_S.count('Sienkiewicz') / len(classify_S)
accuracy_S * 100

79.95365005793744

In [13]:
print('Confusion matrix:')

CM = confusion_matrix(
    ['Orzeszkowa'] * len(classify_O) + ['Prus'] * len(classify_P) + ['Sienkiewicz'] * len(classify_S), 
    classify_O + classify_P + classify_S
)
CM

Confusion matrix:


array([[1591,  321,  165],
       [ 197, 2272,  278],
       [  52,  121,  690]])

In [14]:
print('Confusion matrix in percentages:')

(CM / CM.sum(axis=1)[:, np.newaxis] * 100).astype(np.int)

Confusion matrix in percentages:


array([[76, 15,  7],
       [ 7, 82, 10],
       [ 6, 14, 79]])

In [15]:
print('Total accuracy: ')

print(np.diag(CM).sum() / CM.sum() * 100)

Total accuracy: 
80.05978547564621


## Validate on another tests

In [16]:
valid_O, valid_P, valid_S = [], [], []

path = 'dane_pozytywistyczne/testy1'
for file_name in os.listdir(path):
    if file_name.find('orzeszkowej') != -1:
        with open(path + '/' + file_name, encoding='UTF-8') as f:
            valid_O.append([x for x in f.readlines() if len(x) > 1])
    
    if file_name.find('prus') != -1:
        with open(path + '/' + file_name, encoding='UTF-8') as f:
            valid_P.append([x for x in f.readlines() if len(x) > 1])
    
    if file_name.find('sienkiewicz') != -1:
        with open(path + '/' + file_name, encoding='UTF-8') as f:
            valid_S.append([x for x in f.readlines() if len(x) > 1])

In [17]:
valid_O_sentences = [
    sentence 
    for text in valid_O 
        for line in text 
            for sentence in line.split('.') 
                if len(sentence) > 1
]

valid_P_sentences = [
    sentence 
    for text in valid_P 
        for line in text 
            for sentence in line.split('.') 
                if len(sentence) > 1
]

valid_S_sentences = [
    sentence 
    for text in valid_S 
        for line in text 
            for sentence in line.split('.') 
                if len(sentence) > 1
]

## With normalization, every text divided on sentences

In [18]:
UNK=1e-5

classify_valid_O = [next(iter(naive_bayes(sentence, normalize=True, UNK=UNK)))
                        for sentence in valid_O_sentences]
accuracy_valid_O = classify_valid_O.count('Orzeszkowa') / len(classify_valid_O)
print(f'Orzeszkowa: {accuracy_valid_O * 100:.2f}%')


classify_valid_P = [next(iter(naive_bayes(sentence, normalize=True, UNK=UNK)))
                        for sentence in valid_P_sentences]
accuracy_valid_P = classify_valid_P.count('Prus') / len(classify_valid_P)
print(f'Prus: {accuracy_valid_P * 100:.2f}%')


classify_valid_S = [next(iter(naive_bayes(sentence, normalize=True, UNK=UNK)))
                        for sentence in valid_S_sentences]
accuracy_valid_S = classify_valid_S.count('Sienkiewicz') / len(classify_valid_S)
accuracy_valid_S * 100
print(f'Sienkiewicz: {accuracy_valid_S * 100:.2f}%')

CM = confusion_matrix(
    ['Orzeszkowa'] * len(classify_valid_O) + ['Prus'] * len(classify_valid_P) + ['Sienkiewicz'] * len(classify_valid_S), 
    classify_valid_O + classify_valid_P + classify_valid_S
)


print('\n=================================\n')
print('Confusion matrix in percentages:')
print((CM / CM.sum(axis=1)[:, np.newaxis] * 100).astype(np.int))

print('\n=================================\n')
print('Total accuracy: ')
print(np.diag(CM).sum() / CM.sum() * 100)

Orzeszkowa: 47.15%
Prus: 73.07%
Sienkiewicz: 44.90%


Confusion matrix in percentages:
[[47 22 30]
 [ 7 73 19]
 [12 42 44]]


Total accuracy: 
56.057737813535255


## Without normalization, every text as a whole

In [23]:
UNK=1e-5

classify_valid_O = [next(iter(naive_bayes(' '.join(preprocess(test)), normalize=False, UNK=UNK))) 
                        for test in valid_O]
accuracy_valid_O = classify_valid_O.count('Orzeszkowa') / len(classify_valid_O)
print(f'Orzeszkowa: {accuracy_valid_O * 100:.2f}%')

classify_valid_P = [next(iter(naive_bayes(' '.join(preprocess(test)), normalize=False, UNK=UNK))) 
                        for test in valid_P]
accuracy_valid_P = classify_valid_P.count('Prus') / len(classify_valid_P)
print(f'Prus: {accuracy_valid_P * 100:.2f}%')

classify_valid_S = [next(iter(naive_bayes(' '.join(preprocess(test)), normalize=False, UNK=UNK))) 
                        for test in valid_S]
accuracy_valid_S = classify_valid_S.count('Sienkiewicz') / len(classify_valid_S)
print(f'Sienkiewicz: {accuracy_valid_S * 100:.2f}%')

print('\n=================================\n')
print('Confusion matrix:')
CM = confusion_matrix(
    ['Orzeszkowa'] * len(classify_valid_O) + ['Prus'] * len(classify_valid_P) + ['Sienkiewicz'] * len(classify_valid_S), 
    classify_valid_O + classify_valid_P + classify_valid_S
)
print(CM)

print('\n=================================\n')
print('Confusion matrix in percentages:')
print((CM / CM.sum(axis=1)[:, np.newaxis] * 100).astype(np.int))

print('\n=================================\n')
print('Total accuracy: ')
print(np.diag(CM).sum() / CM.sum() * 100)

Orzeszkowa: 66.67%
Prus: 95.24%
Sienkiewicz: 51.85%


Confusion matrix:
[[ 8  0  4]
 [ 0 20  1]
 [ 0 13 14]]


Confusion matrix in percentages:
[[66  0 33]
 [ 0 95  4]
 [ 0 48 51]]


Total accuracy: 
70.0
