In [1]:
import numpy as np
import pandas as pd
import os
from PIL import Image
import tqdm
from global_kmeans_pp import global_clustering
from sklearn.decomposition import FastICA
from scipy.spatial import distance
from time import time
from seaborn import heatmap, color_palette
import warnings
import pickle

In [2]:
features = []

In [3]:
def preprocessing():
    def npy_to_array(file_name):
        with open(f"morphnpys/{file_name}", 'rb') as f:
            features.append(file_name.rstrip(".npy"))
            return np.load(f)

    lst_dr = os.listdir("morphnpys")
    lst_dr = sorted(lst_dr, key=lambda x: (int(x.split(",")[0].lstrip("Том ")),
                                           float(x.split("карта ")[1].split(",")[0]),
                                           float(x.split("ареал ")[1].rstrip(".npy"))
                                          )
                   )
    
    stck = np.stack(tuple(map(npy_to_array, lst_dr)), axis=2)
    
    total_area = np.array(Image.open('area.png'))
    zero_indices = np.where(total_area == 0)

    zeros = np.zeros(stck.shape[2])
    stck[zero_indices] = zeros
    
    stck = stck.reshape(stck.shape[0] * stck.shape[1], -1)

    positions = np.where(~np.all(stck == zeros, axis=1))[0]

    X = stck[positions]
    
    return stck, positions, X

In [4]:
stck, positions, X = preprocessing()

In [5]:
X = X.astype("int")

Градиент

In [6]:
ica = FastICA(n_components=3)
ica.fit(X)
data = ica.transform(X)

In [7]:
scaled_data = (data - data.min(axis=0)) / (data.max(axis=0) - data.min(axis=0)) * 255
rgb = scaled_data.astype("int")

In [8]:
rgb = np.hstack((rgb, np.full((X.shape[0], 1), 255)))

In [9]:
new_image = np.zeros((stck.shape[0], 4))
new_image[positions] = rgb
res = new_image.reshape((984, 969, 4))
res = Image.fromarray(res.astype(np.uint8))
res.save("gradient.png")

Кластеризация

In [10]:
n_clusters = 20

In [11]:
s = time()
model = global_clustering.GlobalKMeansPP(n_clusters=n_clusters, verbose=2)
model.fit(X)
print(time() - s)

Solving 2-means
Solving 3-means
Solving 4-means
Solving 5-means
Solving 6-means
Solving 7-means
Solving 8-means
Solving 9-means
Solving 10-means
Solving 11-means
Solving 12-means
Solving 13-means
Solving 14-means
Solving 15-means
Solving 16-means
Solving 17-means
Solving 18-means
Solving 19-means
Solving 20-means
2498.674989938736


In [12]:
clusters = model.labels_
cluster_centers = model.cluster_centers_
inertia = model.inertia_

In [13]:
with open('clusters.pkl', 'wb') as f:
    pickle.dump(clusters, f)

with open('cluster_centers.pkl', 'wb') as f:
    pickle.dump(cluster_centers, f)

with open('inertia.pkl', 'wb') as f:
    pickle.dump(inertia, f)

In [14]:
with open('clusters.pkl', 'rb') as f:
    clusters = pickle.load(f)

with open('cluster_centers.pkl', 'rb') as f:
    cluster_centers = pickle.load(f)

with open('inertia.pkl', 'rb') as f:
    inertia = pickle.load(f)

In [15]:
warnings.simplefilter("ignore", UserWarning)

In [16]:
all_cluster_centers = np.concatenate(list(cluster_centers.values()))

In [17]:
all_cluster_centers.shape

(210, 158)

In [18]:
data = ica.transform(all_cluster_centers)

In [19]:
scaled_data = (data - data.min(axis=0)) / (data.max(axis=0) - data.min(axis=0)) * 255
rgb = scaled_data.astype("int")

In [20]:
rgb = np.hstack((rgb, np.full((all_cluster_centers.shape[0], 1), 255)))

In [21]:
for i in tqdm.tqdm(range(1, n_clusters + 1)):
    nth_clusters = clusters[i]
    nth_rgb = rgb[sum(range(i)):sum(range(i + 1))]

    nth_clusters_rgb = [nth_rgb[i] for i in nth_clusters]

    new_image = np.zeros((stck.shape[0], 4))
    new_image[positions] = nth_clusters_rgb
    res = new_image.reshape((984, 969, 4))
    res = Image.fromarray(res.astype(np.uint8))
    res.save(f"{i}_clusters.png")

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:01<00:00,  3.05s/it]


Близость кластеров

In [22]:
nth_clusters = clusters[n_clusters]
nth_cluster_centers = cluster_centers[n_clusters]

In [23]:
for i in tqdm.tqdm(range(n_clusters)):
    distances = np.apply_along_axis(lambda x: distance.euclidean(nth_cluster_centers[i], x), 1, nth_cluster_centers)
    norm_distances = (distances - distances.min()) / (distances.max() - distances.min())
    cmap = color_palette("inferno_r", as_cmap=True)
    heat_rgb = cmap(norm_distances) * 255
    
    heat_clusters_rgb = [heat_rgb[i] for i in nth_clusters]

    new_image = np.zeros((stck.shape[0], 4))
    new_image[positions] = heat_clusters_rgb
    res = new_image.reshape((984, 969, 4))
    res = Image.fromarray(res.astype(np.uint8))
    res.save(f"proximity_to_{i}_out_of_{n_clusters}.png")

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [00:39<00:00,  1.96s/it]


Близость к прототипам

In [24]:
distances = np.array(list(map(lambda x: distance.euclidean(X[x], nth_cluster_centers[nth_clusters[x]]), range(X.shape[0]))))
norm_distances = (distances - distances.min()) / (distances.max() - distances.min())
cmap = color_palette("inferno_r", as_cmap=True)
heat_rgb = cmap(norm_distances) * 255

In [25]:
new_image = np.zeros((stck.shape[0], 4))
new_image[positions] = heat_rgb
res = new_image.reshape((984, 969, 4))
res = Image.fromarray(res.astype(np.uint8))
res.save("proximity_to_centres_of_clusters.png")

Границы кластеров

In [26]:
arr = np.zeros((stck.shape[0]))
arr[positions] = nth_clusters + 1
arr = arr.reshape((984, 969))

In [27]:
bool_arr = (((np.roll(arr, shift=(1, 0), axis=(1, 1)) == arr) | np.logical_not(np.roll(arr, shift=(1, 0), axis=(1, 1)))) \
           & ((np.roll(arr, shift=(-1, 0), axis=(1, 1)) == arr) | np.logical_not(np.roll(arr, shift=(-1, 0), axis=(1, 1)))) \
           & ((np.roll(arr, shift=(1, 0), axis=(0, 1)) == arr) | np.logical_not(np.roll(arr, shift=(1, 0), axis=(0, 1)))) \
           & ((np.roll(arr, shift=(-1, 0), axis=(0, 1)) == arr) | np.logical_not(np.roll(arr, shift=(-1, 0), axis=(0, 1))))) \
           | np.logical_not(arr)

bool_arr = ~bool_arr

In [28]:
res = np.zeros(bool_arr.shape + (4,))
res[bool_arr] = np.array([255] * 4)

res = Image.fromarray(res.astype(np.uint8))
res.save("borders.png")

Важные признаки для кластеров

In [35]:
with open("morph_areals_data.txt", "r", encoding="UTF-8") as file:
    areals_data = [_.strip("\n") for _ in file.readlines()]

In [38]:
nth_clusters = clusters[n_clusters]
nth_cluster_centers = cluster_centers[n_clusters]

info = np.zeros((n_clusters, X.shape[-1]))

for n in range(n_clusters):
    wh = np.where(nth_clusters == n)
    cluster_size = len(wh[0])
    print(cluster_size)
    cluster_size = np.full(X.shape[-1], cluster_size) # размеры кластера
    intersection = X[wh].sum(axis=0) # размеры пересечений
    areal_sizes = X.sum(axis=0) # размеры ареалов
    union = areal_sizes + cluster_size - intersection # размеры объединений
    total_size = np.full(X.shape[-1], X.shape[0]) # размеры всего

    IoU = intersection / union
    info[n] = IoU
    
    top = sorted(enumerate(IoU), reverse=True, key=lambda x: x[1])
    for index, t in top[:5]:
        print(features[index], areals_data[index], t, sep="")
    print()

24734
Том 4, карта 6.6, ареал 3 — зона диссимилятивного аканья на русской территории (см. [ДАРЯ, 1,  карта № 2]).0.5956471935853379
Том 3, карта 12, ареал 7формы вин.-им. п. отмечены в единичных пунктах (не картографировано): рус. кони поить, коровы доить0.5051040788247304
Том 2, карта 5, ареал 5<Р’> = <1, 2, 3 л. ед., 1, 2, 3 л. мн.>, например, рус. диал. л'уб'у́ ~ л'у́б'иш..., л'у́б'ат'0.40477944185285214
Том 1, карта 5, ареал 7[к, г, ɣ, h] ~ [к’, к., г’, ɣ’, h’...] = <1 л. ед., 3 л. мн.> ~ <2, 3 л. ед., 1, 2 л. мн.>0.39405779270441926
Том 2, карта 8.1, ареал 12Р. предложный, М., женѣ, коровѣ, Р. беспредложный, Д. жены, коровы (рус.)0.3188748188748189

21904
Том 3, карта 4.1, ареал 3Переходное смягчение заднеязычных согласных перед окончанием счетной формы существительных а-склонения и о-склонения среднего рода, восходящим к окончанию nom.-acc. dualis: отмечено у существительных о-склонения среднего рода (укр. дв'í jáблуц'i, бел. д'з'в'é в'éц'e),0.36011944260119444
Том 3, карта 4.1, 

Близость к набору признаков

In [55]:
Bud_corp = np.array(list("01000001100010000010000100010100010001111101110000000000000001000001000000101010000000000000000100000000000101010011000000000000001101010010000011101001010111")).astype("int")

In [56]:
Bud_cart = stck.reshape((984, 969, stck.shape[-1]))[590, 395]

In [146]:
distances = np.array(list(map(lambda x: np.array([distance.euclidean(X[x], Bud_cart), 
                                                  distance.euclidean(X[x], Bud_corp)]), 
                              range(X.shape[0]))))

In [147]:
norm_distances = (distances - distances.min()) / (distances.max() - distances.min())

In [150]:
for ind, comp in {0: "dot", 1: "corp"}.items():
    cmap = color_palette("inferno_r", as_cmap=True)
    heat_rgb = cmap(norm_distances[:, ind]) * 255

    new_image = np.zeros((stck.shape[0], 4))
    new_image[positions] = heat_rgb
    res = new_image.reshape((984, 969, 4))
    res = Image.fromarray(res.astype(np.uint8))
    res.save(f"proximity_to_{comp}.png")

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [163]:
from scipy.stats import spearmanr, kendalltau, chi2_contingency

In [164]:
spearmanr(Bud_corp, Bud_cart)

SignificanceResult(statistic=0.5166799166058599, pvalue=3.66653701036644e-12)

In [165]:
kendalltau(Bud_corp, Bud_cart)

SignificanceResult(statistic=0.5166799166058598, pvalue=9.54540370004806e-11)

In [153]:
contingency_table = pd.crosstab(Bud_cart, Bud_corp)

In [155]:
chi2, p, dof, expected = chi2_contingency(contingency_table)

In [161]:
np.sqrt(chi2 / Bud_cart.shape[0])

0.4988555876050462