# Imports

In [1]:
%reset -f

In [2]:
import re
import string
from collections import Counter

import numpy as np
import pandas as pd
pd.set_option('max_colwidth', 150)

import classla
# classla.download('bg')

In [3]:
DATA_PATH_PREP = '../DATA/prepared'

# Load data

In [4]:
df_samples = pd.read_pickle(f'{DATA_PATH_PREP}/03_df_samples.pkl')
print(df_samples.shape)
df_samples.head()

(600, 2)


Unnamed: 0,text,author
0,"сията й — злато с шепа, моля ти се. Късмет ли да го наречеш, какво да го наречеш — не знайш! Па вземи и децата й: по-голямото, не го ли знаете, ед...",aleko-konstantinov
1,сега не си раз,aleko-konstantinov
2,"ме какво да правя.\n\t— Как какво да правиш! Прави каквото правят хората: свий си опашката, па си налягай парцалите… Какво се кикотиш, не е ли тъй...",aleko-konstantinov
3,"а, че не съчувствувай на македонците…\n\t(Я тури две-три дървета в печката. Стига.)\n\tНямаше си хас, джанъм, този Биконсфилд, дето ни разпокъса. ...",aleko-konstantinov
4,"шва)… значи, и да се разгатне енигмата на зеленото сукно с металическото копче, и пламъкът на петрола в нощната тъмнина, и ужасният смрад, и цвърт...",aleko-konstantinov


## Character-based lexical features

The number of each distinct special character, spaces, punctuation, parentheses and quotation marks as separate features.

In [5]:
tmp_text = df_samples['text'][4]
tmp_text[:250]

'шва)… значи, и да се разгатне енигмата на зеленото сукно с металическото копче, и пламъкът на петрола в нощната тъмнина, и ужасният смрад, и цвъртението… и закритият фаетон в Драгоманското дефиле, и всичко, и всичко… У-у! Косите ми настръхват… Но, ба'

In [6]:
format_specific = ''.join({char for char in tmp_text if not char.isalnum() and char not in string.punctuation and char not in {'\t', '\n'}})
print(f'{format_specific=}')

special = string.punctuation + format_specific + '=#<> '
special

format_specific='—… '


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~—… =#<> '

In [7]:
def count_characters(string):
    return {char: string.count(char) for char in special}

char_counts = df_samples['text'].apply(count_characters)
char_counts[:3]

0    {'!': 2, '"': 0, '#': 0, '$': 0, '%': 0, '&': 0, ''': 0, '(': 0, ')': 0, '*': 0, '+': 0, ',': 27, '-': 1, '.': 3, '/': 0, ':': 2, ';': 0, '<': 0, ...
1    {'!': 0, '"': 0, '#': 0, '$': 0, '%': 0, '&': 0, ''': 0, '(': 0, ')': 0, '*': 0, '+': 0, ',': 0, '-': 0, '.': 0, '/': 0, ':': 0, ';': 0, '<': 0, '...
2    {'!': 1, '"': 0, '#': 0, '$': 0, '%': 0, '&': 0, ''': 0, '(': 1, ')': 1, '*': 0, '+': 0, ',': 14, '-': 5, '.': 6, '/': 0, ':': 1, ';': 0, '<': 0, ...
Name: text, dtype: object

In [8]:
text_features = pd.DataFrame.from_records(char_counts)
text_features

Unnamed: 0,!,"""",#,$,%,&,',(,),*,...,^,_,`,{,|,},~,—,…,Unnamed: 21
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,5,0,138
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
2,1,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,7,9,123
3,0,0,0,0,0,0,0,2,1,0,...,0,0,0,0,0,0,0,1,3,137
4,5,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,9,142
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,81
596,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,7,0,170
597,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,89
598,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,52


In [9]:
only_zeros = text_features.columns[(text_features == 0).all()].tolist()
print(f'{only_zeros=}')
print(f'{len(only_zeros)=}')

text_features = text_features.drop(only_zeros, axis=1)
text_features

only_zeros=['"', '#', '$', '%', '&', "'", '+', '/', '<', '=', '>', '@', '\\', '^', '`', '{', '|', '}', '~']
len(only_zeros)=19


Unnamed: 0,!,(,),*,",",-,.,:,;,?,[,],_,—,…,Unnamed: 16
0,2,0,0,0,27,1,3,2,0,0,0,0,0,5,0,138
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
2,1,1,1,0,14,5,6,1,0,2,0,0,0,7,9,123
3,0,2,1,0,20,4,8,2,0,1,0,0,0,1,3,137
4,5,0,1,0,18,3,2,0,0,0,0,0,0,1,9,142
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0,0,0,0,9,0,3,1,0,0,0,0,0,0,0,81
596,4,0,0,0,12,1,7,3,0,0,0,0,0,7,0,170
597,0,0,0,0,5,1,3,1,0,0,0,0,0,0,0,89
598,0,0,0,0,5,1,2,1,0,3,0,0,0,0,0,52


## Sentence- and word-based features

Distribution of POS-tags, token length, number of sentences, sentence length, average word length, words in all-caps and counts of words above and below 2-3 and 6 characters as separate features.

### Distribution of POS-tags

In [10]:
nlp = classla.Pipeline('bg', processors='tokenize,pos')
doc = nlp("Алеко Константинов е роден в Свищов. Кой е Papi Hans?")
print(doc.to_conll())

2023-02-02 19:22:06 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2023-02-02 19:22:06 INFO: Use device: cpu
2023-02-02 19:22:06 INFO: Loading: tokenize
2023-02-02 19:22:06 INFO: Loading: pos
2023-02-02 19:22:08 INFO: Done loading processors!


# newpar id = 1
# sent_id = 1.1
# text = Алеко Константинов е роден в Свищов.
1	Алеко	_	PROPN	Npmsi	Definite=Ind|Gender=Masc|Number=Sing	_	_	_	_
2	Константинов	_	PROPN	Hmsi	Definite=Ind|Gender=Masc|Number=Sing	_	_	_	_
3	е	_	AUX	Vxitf-r3s	Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act	_	_	_	_
4	роден	_	VERB	Vpptcv--smi	Aspect=Perf|Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass	_	_	_	_
5	в	_	ADP	R	_	_	_	_	_
6	Свищов	_	PROPN	Npmsi	Definite=Ind|Gender=Masc|Number=Sing	_	_	_	SpaceAfter=No
7	.	_	PUNCT	punct	_	_	_	_	_

# sent_id = 1.2
# text = Кой е Papi Hans?
1	Кой	_	PRON	Pie-os-m	Case=Nom|Gender=Masc|Number=Sing|PronType=Int	_	_	_	_
2	е	_	AUX	Vxitf-r3s	Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act	_	_	_	_
3	Papi	_	PROPN	Npmsi	Definite=Ind|Gender=Masc|Number=Sing	_	_	_	_
4	Hans	_	PROPN	Npmsi	Definite=Ind|Gender=Masc|Number=Sing	_	_	_	SpaceAfter=No
5	?	_	PUNCT	punct	_	_	_	_	_




In [11]:
pos_tags = []
print(doc.to_dict()[0])

for parsed_info, _ in doc.to_dict():
    for entry in parsed_info:
        pos_tags.append(entry['upos'])
        print(f"{entry['text'].rjust(15)} | {entry['upos'].rjust(5)} | {entry['xpos']}")
    print()

tag_counts = Counter(pos_tags)
tag_distribution = {tag: count / len(pos_tags) for tag, count in tag_counts.items()}

print(f'{pos_tags=}')
print(f'{tag_distribution=}')
print(pd.Series(tag_distribution))

([{'id': 1, 'text': 'Алеко', 'upos': 'PROPN', 'xpos': 'Npmsi', 'feats': 'Definite=Ind|Gender=Masc|Number=Sing'}, {'id': 2, 'text': 'Константинов', 'upos': 'PROPN', 'xpos': 'Hmsi', 'feats': 'Definite=Ind|Gender=Masc|Number=Sing'}, {'id': 3, 'text': 'е', 'upos': 'AUX', 'xpos': 'Vxitf-r3s', 'feats': 'Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act'}, {'id': 4, 'text': 'роден', 'upos': 'VERB', 'xpos': 'Vpptcv--smi', 'feats': 'Aspect=Perf|Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass'}, {'id': 5, 'text': 'в', 'upos': 'ADP', 'xpos': 'R'}, {'id': 6, 'text': 'Свищов', 'upos': 'PROPN', 'xpos': 'Npmsi', 'feats': 'Definite=Ind|Gender=Masc|Number=Sing', 'misc': 'SpaceAfter=No'}, {'id': 7, 'text': '.', 'upos': 'PUNCT', 'xpos': 'punct'}], '# newpar id = 1\n# sent_id = 1.1\n# text = Алеко Константинов е роден в Свищов.\n')
          Алеко | PROPN | Npmsi
   Константинов | PROPN | Hmsi
              е |   AUX | Vxitf-r3s
          роден |  VERB | Vpptcv--smi


In [42]:
def gen_pos_tags(text: str):
    tokenized = nlp(text).to_dict()
    parsed_infos = sum([parsed_info for parsed_info, _ in tokenized], [])
    pos_tags = [entry['upos'] for entry in parsed_infos]

    tag_counts = Counter(pos_tags)
    total_tags = len(pos_tags)

    tag_distribution = {tag: count / total_tags
                        for tag, count in tag_counts.items()}

    return pd.Series(tag_distribution)

df_tags = df_samples['text'].apply(gen_pos_tags)
df_tags

Unnamed: 0,NOUN,PRON,PUNCT,ADP,VERB,PART,AUX,DET,CCONJ,ADJ,INTJ,SCONJ,ADV,PROPN,NUM,X
0,0.109195,0.155172,0.224138,0.045977,0.155172,0.063218,0.034483,0.051724,0.051724,0.045977,0.017241,0.022989,0.022989,,,
1,,,,,0.250000,0.250000,0.250000,,,,,,0.250000,,,
2,0.072289,0.156627,0.253012,0.024096,0.198795,0.036145,0.072289,0.024096,0.036145,0.012048,0.018072,0.018072,0.066265,0.006024,0.006024,
3,0.084746,0.118644,0.214689,0.056497,0.192090,0.039548,0.056497,0.028249,0.039548,0.028249,0.005650,0.033898,0.039548,0.033898,0.028249,
4,0.211111,0.055556,0.200000,0.094444,0.105556,0.011111,0.066667,0.011111,0.061111,0.077778,0.033333,0.011111,0.055556,,0.005556,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0.197917,0.062500,0.145833,0.093750,0.114583,0.010417,0.041667,0.031250,0.093750,0.083333,0.020833,0.010417,0.072917,0.020833,,
596,0.183168,0.158416,0.163366,0.113861,0.099010,0.004950,0.034653,0.029703,0.044554,0.074257,0.009901,,0.059406,0.024752,,
597,0.181818,0.090909,0.090909,0.101010,0.151515,0.020202,0.040404,0.040404,0.101010,0.101010,,0.010101,0.030303,0.040404,,
598,0.166667,0.045455,0.196970,0.136364,0.045455,0.015152,0.030303,0.030303,0.090909,0.090909,,,0.060606,0.075758,0.015152,


In [46]:
df_tags = df_tags.fillna(0)
df_tags

Unnamed: 0,NOUN,PRON,PUNCT,ADP,VERB,PART,AUX,DET,CCONJ,ADJ,INTJ,SCONJ,ADV,PROPN,NUM,X
0,0.109195,0.155172,0.224138,0.045977,0.155172,0.063218,0.034483,0.051724,0.051724,0.045977,0.017241,0.022989,0.022989,0.000000,0.000000,0.0
1,0.000000,0.000000,0.000000,0.000000,0.250000,0.250000,0.250000,0.000000,0.000000,0.000000,0.000000,0.000000,0.250000,0.000000,0.000000,0.0
2,0.072289,0.156627,0.253012,0.024096,0.198795,0.036145,0.072289,0.024096,0.036145,0.012048,0.018072,0.018072,0.066265,0.006024,0.006024,0.0
3,0.084746,0.118644,0.214689,0.056497,0.192090,0.039548,0.056497,0.028249,0.039548,0.028249,0.005650,0.033898,0.039548,0.033898,0.028249,0.0
4,0.211111,0.055556,0.200000,0.094444,0.105556,0.011111,0.066667,0.011111,0.061111,0.077778,0.033333,0.011111,0.055556,0.000000,0.005556,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0.197917,0.062500,0.145833,0.093750,0.114583,0.010417,0.041667,0.031250,0.093750,0.083333,0.020833,0.010417,0.072917,0.020833,0.000000,0.0
596,0.183168,0.158416,0.163366,0.113861,0.099010,0.004950,0.034653,0.029703,0.044554,0.074257,0.009901,0.000000,0.059406,0.024752,0.000000,0.0
597,0.181818,0.090909,0.090909,0.101010,0.151515,0.020202,0.040404,0.040404,0.101010,0.101010,0.000000,0.010101,0.030303,0.040404,0.000000,0.0
598,0.166667,0.045455,0.196970,0.136364,0.045455,0.015152,0.030303,0.030303,0.090909,0.090909,0.000000,0.000000,0.060606,0.075758,0.015152,0.0


In [47]:
text_features = pd.concat([text_features, df_tags], axis=1)
text_features

Unnamed: 0,!,(,),*,",",-,.,:,;,?,...,AUX,DET,CCONJ,ADJ,INTJ,SCONJ,ADV,PROPN,NUM,X
0,2,0,0,0,27,1,3,2,0,0,...,0.034483,0.051724,0.051724,0.045977,0.017241,0.022989,0.022989,0.000000,0.000000,0.0
1,0,0,0,0,0,0,0,0,0,0,...,0.250000,0.000000,0.000000,0.000000,0.000000,0.000000,0.250000,0.000000,0.000000,0.0
2,1,1,1,0,14,5,6,1,0,2,...,0.072289,0.024096,0.036145,0.012048,0.018072,0.018072,0.066265,0.006024,0.006024,0.0
3,0,2,1,0,20,4,8,2,0,1,...,0.056497,0.028249,0.039548,0.028249,0.005650,0.033898,0.039548,0.033898,0.028249,0.0
4,5,0,1,0,18,3,2,0,0,0,...,0.066667,0.011111,0.061111,0.077778,0.033333,0.011111,0.055556,0.000000,0.005556,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0,0,0,0,9,0,3,1,0,0,...,0.041667,0.031250,0.093750,0.083333,0.020833,0.010417,0.072917,0.020833,0.000000,0.0
596,4,0,0,0,12,1,7,3,0,0,...,0.034653,0.029703,0.044554,0.074257,0.009901,0.000000,0.059406,0.024752,0.000000,0.0
597,0,0,0,0,5,1,3,1,0,0,...,0.040404,0.040404,0.101010,0.101010,0.000000,0.010101,0.030303,0.040404,0.000000,0.0
598,0,0,0,0,5,1,2,1,0,3,...,0.030303,0.030303,0.090909,0.090909,0.000000,0.000000,0.060606,0.075758,0.015152,0.0


### Distribution of Token Length

number of sentences

sentence length

average word length

words in all-caps

counts of words above and below 2-3 and 6 characters

## Contracted word forms

Count of preference towards one type of contraction, e.g. "I’m" versus "I am". The total number of occurrences of contractions and fully written forms are used as separate features.

## Function words

The frequency of each function word is counted and used as a separate feature. We use a combination of previously defined words and the function word list from the NLTK3 library.

## Readability indexes obtained

Flesch reading ease score, Dale-Chall readability score, SMOG grade, Flesch-Kincaid grade, Coleman-Liau index, Gunning-Fog index, automated readability index and the Linsear Write readability metric. Additionally, we count the number of difficult words and keep all indexes as separate features.

# Saving to files