In [None]:
import regex
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
# Unpack the corpus archive (verbose) into the current working directory.
# NOTE(review): presumably needs Google Drive mounted at /content/drive and the
# working directory to be /content, since later cells read
# /content/dane_pozytywistyczne/... - confirm.
!tar -xvf '/content/drive/My Drive/Colab Notebooks/NLP/Dane/dane_pozytywizm.tgz'

# Preprocessing

In [None]:
def preprocess(path, stop_words, author):
    """Read a corpus file and return its tokenised, stop-word-filtered sentences.

    The text is lower-cased; every character other than a space, a newline,
    a Latin-script letter or one of ``, : - ' . ? !`` is replaced by a space;
    each comma becomes a separate token; the text is then cut into sentences
    at ``.``, ``?``, ``!`` and ``:``.

    Parameters
    ----------
    path : str
        Path of a UTF-8 encoded text file.
    stop_words : collection of str
        Tokens removed from every sentence.
    author : int
        Label attached to every sentence of this file.

    Returns
    -------
    sentences : list of list of str
        Sentences that still have more than two tokens after filtering.
    labels : numpy.ndarray
        ``len(sentences)`` copies of ``author``.
    """
    # The context manager closes the handle; the original never closed it.
    with open(path, 'rb') as corpus_file:
        text = corpus_file.read().decode("utf-8").lower()

    # Raw strings: "\p" and "\-" are invalid escapes in ordinary literals.
    text = regex.sub(r"[^ \n\p{Latin},:\-'.?!]", " ", text)
    text = regex.sub(r",", " ,", text)
    text = regex.sub(r"[ \n]+", " ", text)

    sentences = []
    for chunk in regex.split(r"[.?!:]", text):
        # Splitting on a single space yields '' for leading/trailing spaces;
        # those are not words and must not count towards the sentence length.
        tokens = [tok for tok in chunk.split(' ') if tok and tok not in stop_words]
        if len(tokens) > 2:
            sentences.append(tokens)

    return sentences, np.full(len(sentences), author)

In [None]:
# Stop-word list: UTF-8 text, one entry per line.
path_to_stop_words = '/content/drive/My Drive/Colab Notebooks/NLP/Dane/stop_words.txt'

# `with` closes the file (the original leaked the handle). strip() removes a
# stray '\r' or padding that would otherwise stop an entry from ever matching
# a token.
with open(path_to_stop_words, 'rb') as stop_words_file:
    stop_words = {word.strip() for word in stop_words_file.read().decode('utf-8').split('\n')}

In [None]:
# Training corpora, one file per author.
# Author labels: 0 = Orzeszkowa, 1 = Prus, 2 = Sienkiewicz.
path_orz = '/content/dane_pozytywistyczne/korpus_orzeszkowej.txt'
path_pr = '/content/dane_pozytywistyczne/korpus_prusa.txt'
path_sie = '/content/dane_pozytywistyczne/korpus_sienkiewicza.txt'

# Each corpus is preprocessed and its sentence count reported right away.
sentences_orz, labels_orz = preprocess(path_orz, stop_words, 0)
print("Orzeszkowa - number of sentences: {}".format(len(sentences_orz)))

sentences_pr, labels_pr = preprocess(path_pr, stop_words, 1)
print("Prus - number of sentences: {}".format(len(sentences_pr)))

sentences_sie, labels_sie = preprocess(path_sie, stop_words, 2)
print("Sienkiewicz - number of sentences: {}".format(len(sentences_sie)))

Orzeszkowa - number of sentences: 12284
Prus - number of sentences: 12166
Sienkiewicz - number of sentences: 5363


# Train/Val split

In [None]:
# Sentences are token lists of different lengths, so they are joined as plain
# Python lists: np.concatenate would first have to turn them into a ragged
# array, which is an error on recent NumPy versions.
data = sentences_orz + sentences_pr + sentences_sie
labels = np.concatenate((labels_orz, labels_pr, labels_sie))

# 5% of the sentences are held out for validation; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.05, random_state=42)

print("Train/Val split")
print('Train size: {}'.format(len(X_train)))
print('Val size: {}'.format(len(X_val)))

Train/Val split
Train size: 28322
Val size: 1491


# Feature engineering

## Dictionary

In [None]:
def build_dict(sentences):
    """Assign each distinct word an integer id in order of first appearance.

    Parameters
    ----------
    sentences : iterable of iterable of str
        Tokenised sentences.

    Returns
    -------
    dict
        Maps word -> id, with ids running 0, 1, 2, ... in the order in which
        the words are first encountered.
    """
    vocabulary = {}
    for sentence in sentences:
        for token in sentence:
            # setdefault leaves known tokens alone; for a new token the
            # current size of the dict is the next unused id.
            vocabulary.setdefault(token, len(vocabulary))
    return vocabulary

# The vocabulary is built from the training split only.
words_to_keys = build_dict(X_train)
# Reverse lookup: id -> word.
keys_to_words = {idx: word for word, idx in words_to_keys.items()}

print("Vocabulary size: {}".format(len(words_to_keys)))

Vocabulary size: 54287


## Word occurrences / sentence lengths / comma occurrences

In [None]:
def build_features(words_to_keys, dataset, labels):
    """Count per-author word, sentence-length and comma statistics.

    Parameters
    ----------
    words_to_keys : dict
        Maps word -> row index of the occurrence table.
    dataset : iterable of sequence of str
        Tokenised sentences.
    labels : sequence of int
        Author label of each sentence. Labels are used directly as column
        indices, so they must be 0 .. k-1 for k distinct authors.

    Returns
    -------
    occurences : numpy.ndarray, shape (len(words_to_keys), k)
        Number of sentences of each author that contain each word (a word is
        counted once per sentence). Words missing from ``words_to_keys`` are
        ignored.
    sentences_length : numpy.ndarray, shape (4, k)
        Number of sentences per author with length <=5, 6-10, 11-15 and >15.
    commas : numpy.ndarray, shape (1, k)
        Number of ``','`` tokens per author.
    """
    n_authors = len(np.unique(labels))

    sentences_length = np.zeros((4, n_authors))
    commas = np.zeros((1, n_authors))
    occurences = np.zeros((len(words_to_keys), n_authors))

    for sen, label in zip(dataset, labels):
        for word in set(sen):
            # Out-of-vocabulary words have no row; skip them instead of
            # failing with KeyError.
            row = words_to_keys.get(word)
            if row is not None:
                occurences[row, label] += 1

        # The original allocated and returned `commas` but never filled it.
        commas[0, label] += sum(1 for word in sen if word == ',')

        sen_len = len(sen)
        if sen_len <= 5:
            bucket = 0
        elif sen_len <= 10:
            bucket = 1
        elif sen_len <= 15:
            bucket = 2
        else:
            bucket = 3
        sentences_length[bucket, label] += 1

    return occurences, sentences_length, commas

oc, sentences_length, commas = build_features(words_to_keys, X_train, y_train)

# Author ids - Orzeszkowa : 0 | Prus : 1 | Sienkiewicz : 2
# Columns are labelled with the integer author ids: naive_bayes looks columns
# up by integer id, which with string labels only worked through pandas'
# deprecated positional fallback.
author_columns = [0, 1, 2]
words_occurences_dataframe = pd.DataFrame(oc, index=list(words_to_keys), columns=author_columns)
sentences_length_dataframe = pd.DataFrame(sentences_length, index=['l5', 'm5l10', 'm10l15', 'm15'], columns=author_columns)

# Each table is normalised per author (column sums to 1) before stacking, so
# word rows and length-bucket rows are separate distributions.
df = pd.concat([words_occurences_dataframe / words_occurences_dataframe.sum(0), sentences_length_dataframe / sentences_length_dataframe.sum(0)])


df.tail(10)

Unnamed: 0,0,1,2
związuje,0.0,0.0,2.1e-05
drga,0.0,0.0,2.1e-05
błyszczy,0.0,0.0,2.1e-05
wywołało,8e-06,0.0,0.0
powyrywał,0.0,1e-05,0.0
zawiasów,0.0,1e-05,0.0
l5,0.275268,0.321268,0.299863
m5l10,0.279126,0.367077,0.306518
m10l15,0.180626,0.18159,0.185555
m15,0.264981,0.130066,0.208064


# Naive Bayes

In [None]:
"FROM MACHINE LEARNING CLASS ASSIGNMENT 2"

def naive_bayes(sent, authors, df, dict_W):
    """Return the most probable author of a sentence and its probability.

    Each author's score is the sum of log-frequencies of the sentence's known
    words plus the log-frequency of its length bucket. Scores are compared in
    log space: exponentiating first (as the original did) underflows to 0.0
    for longer sentences, which made every author tie and the first one win.

    Parameters
    ----------
    sent : sequence of str
        Tokenised sentence.
    authors : iterable
        Candidate authors; each is a column label of ``df`` or, if no such
        label exists, a column position.
    df : pandas.DataFrame
        Frequency table with one row per word plus the rows ``'l5'``,
        ``'m5l10'``, ``'m10l15'`` and ``'m15'``, and one column per author.
    dict_W : container of str
        Known words; words of ``sent`` not in it are ignored.

    Returns
    -------
    tuple
        ``(author, probability)`` for the highest-scoring author, where the
        probability is normalised over ``authors``. Ties go to the author
        listed first.
    """
    # Rows of `df` that contribute to the score; a repeated word counts each time.
    rows = [word for word in sent if word in dict_W]

    sen_len = len(sent)
    if sen_len <= 5:
        rows.append('l5')
    elif sen_len <= 10:
        rows.append('m5l10')
    elif sen_len <= 15:
        rows.append('m10l15')
    else:
        rows.append('m15')

    # Take the log of the needed rows only, not of the whole table on every
    # call. The tiny constant prevents log(0) for unseen (word, author) pairs.
    log_rows = np.log(df.loc[rows] + 1e-100)

    log_probs = {}
    for author in authors:
        # Prefer a label match; fall back to position so integer author ids
        # keep working with string-labelled columns.
        column = author if author in df.columns else df.columns[author]
        log_probs[author] = log_rows[column].sum()

    # max() returns the first maximal key, i.e. ties go to the earliest author.
    best_author = max(log_probs, key=log_probs.get)

    # Normalise with the log-sum-exp trick: shifting by the best score keeps
    # every exponent <= 0, so the sum is at least 1 and never underflows to 0.
    best_log = log_probs[best_author]
    normaliser = sum(np.exp(log_prob - best_log) for log_prob in log_probs.values())
    return best_author, 1.0 / normaliser

## VALIDATION

In [None]:
from tqdm import tqdm

def error_bayes(X_test, y_test, df, words_to_keys):
    """Print and return the misclassification rate of naive_bayes.

    Parameters
    ----------
    X_test : iterable of sequence of str
        Tokenised sentences.
    y_test : sequence of int
        True author label of each sentence.
    df : pandas.DataFrame
        Frequency table passed through to ``naive_bayes``.
    words_to_keys : dict
        Vocabulary; its keys are the words ``naive_bayes`` treats as known.

    Returns
    -------
    float
        Fraction of sentences whose predicted author differs from the label,
        or ``nan`` when ``y_test`` is empty (the original raised
        ZeroDivisionError in that case).
    """
    total = len(y_test)
    if total == 0:
        error = float('nan')
    else:
        known_words = words_to_keys.keys()
        correct = 0
        for sen, label in zip(X_test, y_test):
            pred = naive_bayes(sen, [0, 1, 2], df, known_words)
            if pred[0] == label:
                correct += 1
        error = 1 - correct / total

    print("Error: {}".format(error))
    return error

In [None]:
# Error rate on the held-out validation split (X_val / y_val).
error_bayes(X_val, y_val, df, words_to_keys)

Error: 0.22132796780684105


## TEST

In [None]:
import os
paths = '/content/dane_pozytywistyczne/testy1/'

# File-name fragment -> author label (same encoding as the training corpora).
author_by_fragment = {"orzeszkowej": 0, "prusa": 1, "sienkiewicza": 2}

test_X = []
test_y = []

# os.listdir order is arbitrary; sorting makes the per-file report reproducible.
for file_name in sorted(os.listdir(paths)):
    author = next(
        (label for fragment, label in author_by_fragment.items() if fragment in file_name),
        None,
    )
    if author is None:
        # A file matching no author used to silently re-use the previous
        # file's sentences and labels (or fail with NameError on the first).
        continue

    # Loop-local names are distinct from the training `labels` built for the
    # train/val split, so this cell no longer overwrites it.
    file_sentences, file_labels = preprocess(os.path.join(paths, file_name), stop_words, author)
    test_X += file_sentences
    test_y += file_labels.tolist()
    print("\n", file_name)
    if len(file_sentences) == 0:
        # Nothing to score; an error rate over zero sentences is undefined.
        print("No sentences left after preprocessing")
        continue
    error_bayes(file_sentences, file_labels, df, words_to_keys)



 test_sienkiewicza15.txt
Error: 0.6774193548387097

 test_prusa0.txt
Error: 0.4590163934426229

 test_sienkiewicza11.txt
Error: 0.6421052631578947

 test_orzeszkowej7.txt
Error: 0.42000000000000004

 test_sienkiewicza41.txt
Error: 0.7948717948717949

 test_orzeszkowej.txt
Error: 0.34042553191489366

 test_sienkiewicza17.txt
Error: 0.8690476190476191

 test_sienkiewicza3.txt
Error: 0.7894736842105263

 test_sienkiewicza25.txt
Error: 0.42666666666666664

 test_prusa14.txt
Error: 0.273972602739726

 test_sienkiewicza47.txt
Error: 0.7397260273972603

 test_sienkiewicza35.txt
Error: 0.8734177215189873

 test_prusa20.txt
Error: 0.07462686567164178

 test_orzeszkowej19.txt
Error: 0.22916666666666663

 test_orzeszkowej13.txt
Error: 0.46153846153846156

 test_prusa8.txt
Error: 0.33333333333333337

 test_sienkiewicza33.txt
Error: 0.7605633802816901

 test_prusa12.txt
Error: 0.2666666666666667

 test_prusa16.txt
Error: 0.18604651162790697

 test_sienkiewicza29.txt
Error: 0.8873239436619719

 tes

In [None]:
# Error rate over the sentences of all test files pooled together (test_X / test_y).
error_bayes(test_X, test_y, df, words_to_keys)

Error: 0.4969450101832994
