# Imports

In [2]:
!pip install corpus_distance

Looking in indexes: https://test.pypi.org/simple/
Collecting corpus_distance
  Downloading https://test-files.pythonhosted.org/packages/40/3b/285303c4388e2a631ce690c9a34ce0a4a3e9163e7a3214c05bc942517be9/corpus_distance-0.3-py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.0/125.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: corpus_distance
Successfully installed corpus_distance-0.3


In [3]:
import corpus_distance

# Data loading

In [4]:
# Notebook-wide settings, kept together so that they are easy to find and tune.
CONTENT_DIR: str = "/content/texts"  # location handed to load_data
SPLIT: int = 1  # second argument of load_data; also part of the METRICS label
TOPIC_NORMALISATION: bool = True  # flag handed to add_thematic_modelling

Texts (or collections of texts) should be pre-tokenised single strings, (optionally) stored in separate files. Filenames should contain the lect name immediately before the extension, separated by '.'. For example, in 'Akimov.Belogornoje.txt', *Akimov* is the text name, *Belogornoje* is the lect name, and *txt* is the extension.

Texts become the dictionary keys, and lect names their values.

In [5]:
from corpus_distance.data_preprocessing.data_loading import load_data
# Load the texts found under CONTENT_DIR; SPLIT is forwarded unchanged to the
# loader. The result is used as a dataframe from here on (see df.head() below).
df = load_data(CONTENT_DIR, SPLIT)

The next stage is the transformation of the dictionary into a dataframe of the following format:

| index | text | lect |
| -------- | ------- |------- |
| 0 | text1 | lect1 |
| 1 | text2 | lect1 |
| 2 | text1 | lect2 |
| ... | ... | ... |
| m | textN | lectK |

*m* here represents the overall number of texts, *K* - the overall number of lects, and *N* is the number of texts in lect *K*.  

In [6]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,text,lect
0,﻿ⴕ поклонъ и бласловлѣнье • ѡт ѧкова епискупа ...,Polotsk
1,се азъ кнѧзь ѡлександръ и сынъ мои дмитрии с п...,Novgorod
2,"се язъ князь ярославъ володимѣричь , сгадавъ с...",Novgorod
3,кнѧз гердень кланѧтьс всем темь кто видить сѵю...,Polotsk
4,﻿се язъ кнѧзь смоленьскыи федоръ • сѹдилъ есмь...,Smolensk


# Data processing

Here we get lect names.

In [7]:
from corpus_distance.cdutils import get_lects_from_dataframe

In [8]:
# Collect the lect names from the dataframe; they are reused below for topic
# modelling and for clusterisation.
lects = get_lects_from_dataframe(df)

In [9]:
# Display the collected lect names.
lects

['Polotsk', 'Novgorod', 'Smolensk']

## Topic modelling

Topic modelling is used to delete topic words that reflect the features of the texts, and not the language.

In [10]:
from corpus_distance.data_preprocessing.topic_modelling import get_topic_words_for_lects, add_thematic_modelling

In [11]:
# Obtain the topic words for the given lects; the result is consumed by
# add_thematic_modelling in the next step.
topic_words = get_topic_words_for_lects(df, lects)



In [12]:
# Apply the topic words to the texts; the recorded output below shows an
# additional `text_topic_normalised` column in the result.
# NOTE(review): in that output the `text` column also differs from the one
# shown by the earlier df.head() — check whether `df` is modified in place.
df_without_topics = add_thematic_modelling(df, topic_words, TOPIC_NORMALISATION)

In [13]:
# Preview the first rows after topic normalisation.
df_without_topics.head()

Unnamed: 0,text,lect,text_topic_normalised
0,﻿ⴕ поклонъ бласловлѣнье ѧкова епискупа полотьс...,Polotsk,﻿ⴕ поклонъ бласловлѣнье ѧкова епискупа полотьс...
1,се азъ кнѧзь ѡлександръ сынъ мои дмитрии с пос...,Novgorod,се азъ кнѧзь ѡлександръ сынъ мои дмитрии с пос...
2,се язъ князь ярославъ володимѣричь сгадавъ с п...,Novgorod,се язъ князь ярославъ володимѣричь сгадавъ с п...
3,кнѧз гердень кланѧтьс всем темь кто видить сѵю...,Polotsk,кнѧз гердень кланѧтьс всем темь кто видить сѵю...
4,﻿се язъ кнѧзь смоленьскыи федоръ сѹдилъ есмь б...,Smolensk,﻿се язъ кнѧзь смоленьскыи федоръ сѹдилъ есмь б...


## Vectorisation

I start with creating a model for representing key properties of the lect:

* Its name
* Text it contains, lowercased
* Its alphabet (with obligatory CLS `^` and EOS `$` symbols)
* Amount of entropy of its alphabet
* Vector for each given symbol of alphabet

In [14]:
from corpus_distance.data_preprocessing.vectorisation import create_vectors_for_lects, gather_vector_information, FastTextParams

In [15]:
# Build the per-lect models: the result maps each lect name to a Lect object
# (see the pprint output below).
# NOTE(review): FastTextParams is imported above but not passed here, so the
# library defaults presumably apply — confirm if reproducibility matters.
vectors_for_lects = create_vectors_for_lects(df_without_topics)

100%|██████████| 3/3 [00:08<00:00,  2.78s/it]


In [16]:
from pprint import pprint

In [17]:
# Show the mapping lect name -> Lect object (only the object reprs are printed).
pprint(vectors_for_lects)

{'Novgorod': <corpus_distance.data_preprocessing.vectorisation.Lect object at 0x7ac52bf9f520>,
 'Polotsk': <corpus_distance.data_preprocessing.vectorisation.Lect object at 0x7ac52bf9ee30>,
 'Smolensk': <corpus_distance.data_preprocessing.vectorisation.Lect object at 0x7ac52bf9f5e0>}


# Data preprocessing

The first stage of data preprocessing is splitting tokens into character 3-grams. Character n-grams help to find coinciding sequences more easily than tokens or token n-grams do. 3-grams specifically help to highlight the exact places where a change is happening, providing a minimal left and right context for each symbol within the sequence. Adding the special symbols *^* and *$* to the start and the end of each sequence makes this possible for the first and the last symbol of the sequence as well.

In [18]:
from corpus_distance.data_preprocessing.shingle_processing import split_lects_by_n_grams

In [19]:
# Split the texts of each lect into character n-grams; the recorded output
# below shows one row per lect with 3-grams delimited by ^ and $.
df_with_n_grams = split_lects_by_n_grams(df_without_topics)

New dataframe is in the following format:

| index | lect | n-gram array |
| -------- | ------- |------- |
| 0 | lect1 | n-grams of lect1 |
| 1 | lect2 | n-grams of lect2 |
| ... | ... | ... |
| k | lectK | n-grams of lectK |

Here, *k* is the overall number of lects.

In [20]:
# Preview the n-gram dataframe.
df_with_n_grams.head()

Unnamed: 0,lect,n_grams
0,Polotsk,"[^ⴕ$, ^по, пок, окл, кло, лон, онъ, нъ$, ^бл, ..."
1,Novgorod,"[^се, се$, ^аз, азъ, зъ$, ^кн, кнѧ, нѧз, ѧзь, ..."
2,Smolensk,"[^се, се$, ^яз, язъ, зъ$, ^кн, кнѧ, нѧз, ѧзь, ..."


The next step is to rank n-grams by frequency. The results form the *frequency_arranged_n_grams* column of the dataframe.

In [21]:
from corpus_distance.data_preprocessing.frequency_scoring import count_n_grams_frequencies

In [22]:
# Score the n-gram frequencies of each lect; the frequency columns appear in
# the df_new.head() output further below.
df_new = count_n_grams_frequencies(df_with_n_grams)

In [23]:
# Add the letter vectors and the alphabet information of each lect to the
# dataframe (columns `lect_vectors` and `lect_info` in the output below).
# NOTE(review): df_new is rebound to a value derived from itself, so re-running
# this cell on its own feeds it an already extended frame — confirm that this
# is harmless, or give the result a new name.

df_new = gather_vector_information(df_new, vectors_for_lects)

In [24]:
# Preview the dataframe that is passed on to the metrics step.
df_new.head()

Unnamed: 0,lect,n_grams,frequency_arranged_n_grams,relative_frequency_n_grams,lect_vectors,lect_info
0,Polotsk,"[^ⴕ$, ^по, пок, окл, кло, лон, онъ, нъ$, ^бл, ...","[(^по, 0), (ти$, 1), (пол, 2), (ть$, 3), (оло,...","[(^по, 0.25), (ти$, 0.25049455984174085), (пол...","{'^': [-0.097130634, 0.011102267, 0.04695524, ...",4.505268
1,Novgorod,"[^се, се$, ^аз, азъ, зъ$, ^кн, кнѧ, нѧз, ѧзь, ...","[(оро, 0), (^по, 1), (ть$, 2), (мъ$, 3), (нов,...","[(оро, 0.25), (^по, 0.2504887585532747), (ть$,...","{'^': [0.119499736, -0.118440114, -0.010866955...",4.565759
2,Smolensk,"[^се, се$, ^яз, язъ, зъ$, ^кн, кнѧ, нѧз, ѧзь, ...","[(ть$, 0), (ьск, 1), (^см, 2), (смо, 3), (мол,...","[(ть$, 0.25), (ьск, 0.2504646840148699), (^см,...","{'^': [-0.055083457, -0.089446664, 0.040926043...",4.500708


# Metrics

The first step is to introduce a measure for hybridisation.

One possible measure is the Euclidean distance between the sums of the letter vectors of each n-gram. Summation discards the order of symbols within an n-gram, which is a disadvantage when the measure is used alone (for example, *bra* and *bar* become identical); however, when it is combined with DistRank and Jaro distance, the measures will hopefully yield better results together.

Optional normalisation uses the difference in alphabet information, calculated by subtracting the information of the second alphabet from that of the first. This makes it possible to compensate for cases where a letter of one alphabet has multiple correspondences in the other, depending on the context. The direct measure (rather than the reversed one, `1 - X`) is preferable: the more information one alphabet carries in comparison with the other, the more one-to-many correspondences are possible, the more the vectors are distorted, and the more normalisation is needed.

The final normalisation is the traditional division by the maximal length of the two strings, introduced in Holman et al. (2008).

In [25]:
# Import the hybridisation metric by name rather than with `import *`, so that
# the origin of every name stays visible. If another metric from
# string_similarity is assigned to HYBRID, add it to this import.
from corpus_distance.distance_measurement.string_similarity import jaro_vector_wrapper
from corpus_distance.distance_measurement.hybridisation import HybridisationParameters

In [26]:
# Experiment settings shared by the metric and the clusterisation steps.

# Label of the language group under study and the name of its outgroup.
# NOTE(review): "Zialionka" does not occur among the lects printed earlier in
# this notebook — confirm that the outgroup is meant to be absent from the data.
GROUP, OUTGROUP = "East Slavic", "Zialionka"

# String similarity measure used for hybridisation.
HYBRID = jaro_vector_wrapper

# Whether the hybrid metric aids DistRank at all.
HYBRIDISATION = True

# Whether hybrid values are joined with the DistRank values in a single array
# (True), or both stay independent values that contribute equally to the final
# metric (False).
HYBRIDISATION_AS_ARRAY = False

# Whether the DistRank normalisation includes the Soerensen coefficient.
SOERENSEN_NORMALISATION = True

# Whether the string similarity measure is corrected by the difference in the
# information that the alphabets carry.
ALPHABET_NORMALISATION = True

# Human-readable description of this configuration: every setting, in a fixed
# order, joined by hyphens.
METRICS = "-".join(
    str(part)
    for part in (
        GROUP,
        SPLIT,
        TOPIC_NORMALISATION,
        "DistRank",
        SOERENSEN_NORMALISATION,
        HYBRIDISATION,
        HYBRIDISATION_AS_ARRAY,
        HYBRID.__name__,
        ALPHABET_NORMALISATION,
    )
)

In [27]:
# Bundle the hybridisation switches into a single parameter object. The
# arguments are positional, so their order must stay exactly as it is.
hybridisation_parameters = HybridisationParameters(
    HYBRIDISATION,
    SOERENSEN_NORMALISATION,
    HYBRIDISATION_AS_ARRAY,
    HYBRID,
    ALPHABET_NORMALISATION,
)

In [28]:
# Display the configuration label built above.
METRICS

'East Slavic-1-True-DistRank-True-True-False-jaro_vector_wrapper-True'

In [29]:
from corpus_distance.distance_measurement.metrics_pipeline import score_metrics_for_corpus_dataset

In [30]:
# Calculate the distances for each pair of lects with the configured metric.
# "/content" is presumably the directory for the results — TODO confirm.
overall_results = score_metrics_for_corpus_dataset(df_new, "/content", METRICS, hybridisation_parameters)

100%|██████████| 1077/1077 [00:01<00:00, 930.45it/s] 
100%|██████████| 632/632 [00:33<00:00, 18.95it/s]
100%|██████████| 579/579 [00:36<00:00, 16.07it/s]
100%|██████████| 1077/1077 [00:00<00:00, 1602.41it/s]
100%|██████████| 650/650 [00:37<00:00, 17.43it/s]
100%|██████████| 585/585 [00:38<00:00, 15.36it/s]
100%|██████████| 1024/1024 [00:00<00:00, 1572.41it/s]
100%|██████████| 636/636 [00:42<00:00, 15.01it/s]
100%|██████████| 624/624 [00:40<00:00, 15.39it/s]


# Clusterisation

The final step is to cluster the lects into groups and to decide whether the method works correctly.

In [31]:
from corpus_distance.clusterisation.clusterisation import ClusterisationParameters, clusterise_lects_from_distance_matrix
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor

In [32]:
# Collect the clusterisation settings. DistanceTreeConstructor().upgma passes
# the bound `upgma` method of a fresh Biopython constructor as the tree-building
# routine.
# NOTE(review): OUTGROUP is "Zialionka", which is not among the lects printed
# earlier — confirm that clusterisation accepts an outgroup absent from `lects`.
cluster_params = ClusterisationParameters(lects, OUTGROUP, GROUP, METRICS, DistanceTreeConstructor().upgma, "/content")

In [33]:
# Cluster the lects from the pairwise results computed above, using the
# settings collected in cluster_params.
clusterise_lects_from_distance_matrix(overall_results, cluster_params)