# Imports

In [1]:
%reset -f

In [2]:
import re
import string

import numpy as np
import pandas as pd
pd.set_option('max_colwidth', 150)

In [3]:
DATA_PATH_PREP = '../DATA/prepared'

# Load data

In [4]:
df_samples = pd.read_pickle(f'{DATA_PATH_PREP}/03_df_samples.pkl')
print(df_samples.shape)
df_samples.head()

(600, 2)


Unnamed: 0,text,author
0,"сията й — злато с шепа, моля ти се. Късмет ли да го наречеш, какво да го наречеш — не знайш! Па вземи и децата й: по-голямото, не го ли знаете, ед...",aleko-konstantinov
1,сега не си раз,aleko-konstantinov
2,"ме какво да правя.\n\t— Как какво да правиш! Прави каквото правят хората: свий си опашката, па си налягай парцалите… Какво се кикотиш, не е ли тъй...",aleko-konstantinov
3,"а, че не съчувствувай на македонците…\n\t(Я тури две-три дървета в печката. Стига.)\n\tНямаше си хас, джанъм, този Биконсфилд, дето ни разпокъса. ...",aleko-konstantinov
4,"шва)… значи, и да се разгатне енигмата на зеленото сукно с металическото копче, и пламъкът на петрола в нощната тъмнина, и ужасният смрад, и цвърт...",aleko-konstantinov


## Character-based lexical features

The number of each distinct special character, spaces, punctuation, parentheses and quotation marks as separate features.

In [5]:
tmp_text = df_samples['text'][4]
tmp_text[:250]

'шва)… значи, и да се разгатне енигмата на зеленото сукно с металическото копче, и пламъкът на петрола в нощната тъмнина, и ужасният смрад, и цвъртението… и закритият фаетон в Драгоманското дефиле, и всичко, и всичко… У-у! Косите ми настръхват… Но, ба'

In [6]:
format_specific = ''.join({char for char in tmp_text if not char.isalnum() and char not in string.punctuation and char not in {'\t', '\n'}})
print(f'{format_specific=}')

special = string.punctuation + format_specific + '=#<> '
special

format_specific=' —…'


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ —…=#<> '

In [7]:
def count_characters(string):
    return {char: string.count(char) for char in special}

char_counts = df_samples['text'].apply(count_characters)
char_counts[:3]

0    {'!': 2, '"': 0, '#': 0, '$': 0, '%': 0, '&': 0, ''': 0, '(': 0, ')': 0, '*': 0, '+': 0, ',': 27, '-': 1, '.': 3, '/': 0, ':': 2, ';': 0, '<': 0, ...
1    {'!': 0, '"': 0, '#': 0, '$': 0, '%': 0, '&': 0, ''': 0, '(': 0, ')': 0, '*': 0, '+': 0, ',': 0, '-': 0, '.': 0, '/': 0, ':': 0, ';': 0, '<': 0, '...
2    {'!': 1, '"': 0, '#': 0, '$': 0, '%': 0, '&': 0, ''': 0, '(': 1, ')': 1, '*': 0, '+': 0, ',': 14, '-': 5, '.': 6, '/': 0, ':': 1, ';': 0, '<': 0, ...
Name: text, dtype: object

In [8]:
text_features = pd.DataFrame.from_records(char_counts)
text_features

Unnamed: 0,!,"""",#,$,%,&,',(,),*,...,^,_,`,{,|,},~,Unnamed: 19,—,…
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,138,5,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,0
2,1,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,123,7,9
3,0,0,0,0,0,0,0,2,1,0,...,0,0,0,0,0,0,0,137,1,3
4,5,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,142,1,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,81,0,0
596,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,170,7,0
597,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,89,0,0
598,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,52,0,0


In [9]:
only_zeros = text_features.columns[(text_features == 0).all()].tolist()
print(f'{only_zeros=}')
print(f'{len(only_zeros)=}')

text_features = text_features.drop(only_zeros, axis=1)
text_features

only_zeros=['"', '#', '$', '%', '&', "'", '+', '/', '<', '=', '>', '@', '\\', '^', '`', '{', '|', '}', '~']
len(only_zeros)=19


Unnamed: 0,!,(,),*,",",-,.,:,;,?,[,],_,Unnamed: 14,—,…
0,2,0,0,0,27,1,3,2,0,0,0,0,0,138,5,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0
2,1,1,1,0,14,5,6,1,0,2,0,0,0,123,7,9
3,0,2,1,0,20,4,8,2,0,1,0,0,0,137,1,3
4,5,0,1,0,18,3,2,0,0,0,0,0,0,142,1,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,0,0,0,0,9,0,3,1,0,0,0,0,0,81,0,0
596,4,0,0,0,12,1,7,3,0,0,0,0,0,170,7,0
597,0,0,0,0,5,1,3,1,0,0,0,0,0,89,0,0
598,0,0,0,0,5,1,2,1,0,3,0,0,0,52,0,0


## Sentence- and word-based features

Distribution of POS-tags, token length, number of sentences, sentence length, average word length, words in all-caps and counts of words above and below 2-3 and 6 characters as separate features.

In [12]:
import classla
# classla.download('bg')
nlp = classla.Pipeline('bg', processors='tokenize,pos')
doc = nlp("Алеко Константинов е роден в Свищов.")
print(doc.to_conll())

2023-02-01 23:34:50 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package  |
------------------------
| tokenize  | standard |
| pos       | standard |

2023-02-01 23:34:50 INFO: Use device: cpu
2023-02-01 23:34:50 INFO: Loading: tokenize
2023-02-01 23:34:50 INFO: Loading: pos
2023-02-01 23:34:53 INFO: Done loading processors!


# newpar id = 1
# sent_id = 1.1
# text = Алеко Константинов е роден в Свищов.
1	Алеко	_	PROPN	Npmsi	Definite=Ind|Gender=Masc|Number=Sing	_	_	_	_
2	Константинов	_	PROPN	Hmsi	Definite=Ind|Gender=Masc|Number=Sing	_	_	_	_
3	е	_	AUX	Vxitf-r3s	Aspect=Imp|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act	_	_	_	_
4	роден	_	VERB	Vpptcv--smi	Aspect=Perf|Definite=Ind|Gender=Masc|Number=Sing|VerbForm=Part|Voice=Pass	_	_	_	_
5	в	_	ADP	R	_	_	_	_	_
6	Свищов	_	PROPN	Npmsi	Definite=Ind|Gender=Masc|Number=Sing	_	_	_	SpaceAfter=No
7	.	_	PUNCT	punct	_	_	_	_	_




## Contracted word forms

Count of preference towards one type of contraction, e.g. "I’m" versus "I am". The total number of occurrences of contractions and fully written forms are used as separate features.

## Function words

The frequency of each function word is counted and used as a separate feature. We use a combination of previously defined words and the function word list from the NLTK3 library.

## Readability indexes obtained

Flesch reading ease score, Dale-Chall readability score, SMOG grade, Flesch-Kincaid grade, Coleman-Liau index, Gunning-Fog index, automated readability index and the Linsear Write readability metric. Additionally, we count the number of difficult words and keep all indexes as separate features.

# Saving to files