In [1]:
import sys
import os
import re
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm 
from collections import OrderedDict 
from operator import itemgetter
from collections import Counter
import itertools
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

%config IPCompleter.greedy=True

## 1. Read data and split in train and valid

In [2]:
with open('dane_pozytywistyczne/korpus_orzeszkowej.txt', encoding='UTF-8') as f:
    orzeszkowa = [x for x in f.readlines() if len(x) > 1]
    
with open('dane_pozytywistyczne/korpus_prusa.txt', encoding='UTF-8') as f:
    prus = [x for x in f.readlines() if len(x) > 1]
    
with open('dane_pozytywistyczne/korpus_sienkiewicza.txt', encoding='UTF-8') as f:
    sienkiewicz = [x for x in f.readlines() if len(x) > 1]

In [3]:
len(orzeszkowa), len(prus), len(sienkiewicz)

(6291, 8323, 2615)

In [4]:
train_O, test_O = train_test_split(orzeszkowa, test_size=0.33, random_state=42)
train_P, test_P = train_test_split(prus, test_size=0.33, random_state=42)
train_S, test_S = train_test_split(sienkiewicz, test_size=0.33, random_state=42)

## 2. Create normalized lookup tables for Naive Bayes

In [5]:
def regex(line: str) -> str:
    return re.sub(r'[^\w\s]', '', line).strip().lower().split()
#     return re.sub(r'[\W+]', '', line).strip().lower()

def preprocess(train: list):
    return list(itertools.chain.from_iterable([regex(line) for line in train]))
#     return ''.join([regex(line) for line in train if len(line) > 1])

In [6]:
all_text_in_one_string = preprocess(train=train_O) + preprocess(train=train_P) + preprocess(train=train_S)
all_letters = set(all_text_in_one_string)

In [7]:
train_O_letters_occurencies = Counter(preprocess(train=train_O))
train_P_letters_occurencies = Counter(preprocess(train=train_P))
train_S_letters_occurencies = Counter(preprocess(train=train_S))

train_O_letters_occurencies = {k: v / sum(train_O_letters_occurencies.values()) 
                                for k, v in train_O_letters_occurencies.items()}

train_P_letters_occurencies = {k: v / sum(train_P_letters_occurencies.values()) 
                                for k, v in train_P_letters_occurencies.items()}

train_S_letters_occurencies = {k: v / sum(train_S_letters_occurencies.values()) 
                                for k, v in train_S_letters_occurencies.items()}

languages = [
    train_O_letters_occurencies,
    train_P_letters_occurencies,
    train_S_letters_occurencies
]

names = ['Orzeszkowa', 'Prus', 'Sienkiewicz']

## 3. Naive Bayes model

In [8]:
def naive_bayes(sentence: str, languages: list = languages, names: list = names):
    """
    Returns the most probable language of a sentence
    
    :param languages: List of dictionaries
    :param names: Names of those languages
    """
    
    sentence = regex(line=sentence)

    log_probs = {}
    Z = 0.0
    for i, lang in enumerate(languages):
        P_DC = np.sum(np.array(([np.log(lang[x]) if x in lang else np.log(1e-18) for x in sentence])))
        P_DC = np.exp(P_DC)
        log_probs[names[i]] = P_DC * (1 / len(languages))
        Z += log_probs[names[i]]
        
    for x in log_probs:
        log_probs[x] /= (Z + 1e-18)
        
    # TODO compute language probabilitie and order from most to least probable
    probs = OrderedDict(reversed(sorted(log_probs.items(), key=itemgetter(1))))
    return probs

In [9]:
naive_bayes(
    sentence='W dwa miesiące potem Marysia', 
    languages=languages, 
    names=names
)

OrderedDict([('Sienkiewicz', 0.9906483844720596),
             ('Prus', 2.7340558462639588e-15),
             ('Orzeszkowa', 2.310399623751893e-15)])

In [10]:
naive_bayes(
    sentence='Przetwarzanie języka to fajny przedmiot', 
    languages=languages, 
    names=names
)

OrderedDict([('Prus', 4.299640558205488e-30),
             ('Orzeszkowa', 4.125905761418803e-30),
             ('Sienkiewicz', 2.5972229690104778e-43)])

In [11]:
naive_bayes(
    sentence='Przetwarzanie języka naturalnego to fajny przedmiot', 
    languages=languages, 
    names=names
)

OrderedDict([('Orzeszkowa', 5.462677595915203e-35),
             ('Prus', 4.299640558205432e-48),
             ('Sienkiewicz', 2.5972229690104077e-61)])

## 4. Validate accuracy

NB: Using only letters accuracy was 24%, 47%, 50% for Orzeszkowa, Prus and Sienkiewicz

In [12]:
classify_O = [next(iter(naive_bayes(x))) for x in test_O]
accuracy_O = classify_O.count('Orzeszkowa') / len(classify_O)
accuracy_O * 100

73.32691381800674

In [13]:
classify_P = [next(iter(naive_bayes(x))) for x in test_P]
accuracy_P = classify_P.count('Prus') / len(classify_P)
accuracy_P * 100

79.06807426283218

In [14]:
classify_S = [next(iter(naive_bayes(x))) for x in test_S]
accuracy_S = classify_S.count('Sienkiewicz') / len(classify_S)
accuracy_S * 100

69.17728852838934

In [15]:
print('Confusion matrix:')

CM = confusion_matrix(
    ['Orzeszkowa'] * len(classify_O) + ['Prus'] * len(classify_P) + ['Sienkiewicz'] * len(classify_S), 
    classify_O + classify_P + classify_S
)
CM

Confusion matrix:


array([[1523,  211,  343],
       [ 290, 2172,  285],
       [ 118,  148,  597]], dtype=int64)

In [16]:
print('Confusion matrix in percentages:')

(CM / CM.sum(axis=1)[:, np.newaxis] * 100).astype(np.int)

Confusion matrix in percentages:


array([[73, 10, 16],
       [10, 79, 10],
       [13, 17, 69]])

In [17]:
print('Total accuracy: ')

print(np.diag(CM).sum() / CM.sum() * 100)

Total accuracy: 
75.47037102162828


## Validate on another tests

In [18]:
valid_O, valid_P, valid_S = [], [], []

path = 'dane_pozytywistyczne/testy1'
for file_name in os.listdir(path):
    if file_name.find('orzeszkowej') != -1:
        with open(path + '/' + file_name, encoding='UTF-8') as f:
            valid_O.append([x for x in f.readlines() if len(x) > 1])
    
    if file_name.find('prus') != -1:
        with open(path + '/' + file_name, encoding='UTF-8') as f:
            valid_P.append([x for x in f.readlines() if len(x) > 1])
    
    if file_name.find('sienkiewicz') != -1:
        with open(path + '/' + file_name, encoding='UTF-8') as f:
            valid_S.append([x for x in f.readlines() if len(x) > 1])

In [19]:
naive_bayes(valid_O[0][0])

OrderedDict([('Prus', 4.268399789214876e-21),
             ('Orzeszkowa', 3.621125660342394e-21),
             ('Sienkiewicz', 3.594389432622781e-21)])

In [20]:
valid_O = [
    sentence 
    for text in valid_O 
        for line in text 
            for sentence in line.split('.') 
                if len(sentence) > 1
]

valid_P = [
    sentence 
    for text in valid_P 
        for line in text 
            for sentence in line.split('.') 
                if len(sentence) > 1
]

valid_S = [
    sentence 
    for text in valid_S 
        for line in text 
            for sentence in line.split('.') 
                if len(sentence) > 1
]

In [21]:
classify_valid_O = [next(iter(naive_bayes(sentence))) for sentence in valid_O]
accuracy_valid_O = classify_valid_O.count('Orzeszkowa') / len(classify_valid_O)
accuracy_valid_O * 100

59.854014598540154

In [22]:
classify_valid_P = [next(iter(naive_bayes(line))) for line in valid_P]
accuracy_valid_P = classify_valid_O.count('Prus') / len(classify_valid_P)
accuracy_valid_P * 100

10.12970969734404

In [23]:
classify_valid_S = [next(iter(naive_bayes(line))) for line in valid_S]
accuracy_valid_S = classify_valid_S.count('Prus') / len(classify_valid_S)
accuracy_valid_S * 100

50.36420395421436

In [24]:
print('Confusion matrix:')

CM = confusion_matrix(
    ['Orzeszkowa'] * len(classify_valid_O) + ['Prus'] * len(classify_valid_P) + ['Sienkiewicz'] * len(classify_valid_S), 
    classify_valid_O + classify_valid_P + classify_valid_S
)
CM

Confusion matrix:


array([[ 410,  164,  111],
       [ 209, 1215,  195],
       [ 424,  968,  530]], dtype=int64)

In [25]:
print('Confusion matrix in percentages:')

(CM / CM.sum(axis=1)[:, np.newaxis] * 100).astype(np.int)

Confusion matrix in percentages:


array([[59, 23, 16],
       [12, 75, 12],
       [22, 50, 27]])

In [26]:
print('Total accuracy: ')

print(np.diag(CM).sum() / CM.sum() * 100)

Total accuracy: 
50.99384761003313
