In [None]:
import json
import os
import gdown
import numpy as np
import random
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt

In [None]:
# Google Drive URL of the dataset and the local file it is cached in.
file_url = 'https://drive.google.com/uc?id=1jFepAVwFQkD0EQ5C7qeoEji9Y1Ydd9j7'
output = 'training_paraphrase.json'

# Download only when the file is not already on disk, so re-running the cell is cheap.
if not os.path.exists(output):
    gdown.download(file_url, output, quiet=False)
print('Данные скачаны')

# Read from `output` instead of a second hard-coded copy of the file name,
# so the download target and the file that gets parsed can never diverge.
with open(output, 'r', encoding='utf-8') as f:
    data = json.load(f)
print('Данные загружены')

In [None]:
def prepare_data(data, min_accuracy=0.9):
    """Filter paraphrase records and group them into complete clusters.

    Records are kept when their ``'accuracy'`` is at least ``min_accuracy``,
    then grouped by their ``'response'`` value. A group is kept only if it
    contains exactly as many records as there are marks (six).

    Args:
        data: Iterable of dicts with the keys ``'accuracy'``, ``'response'``,
            ``'text'`` and ``'best_vectors'``.
        min_accuracy: Inclusive lower bound on ``'accuracy'`` (default 0.9).

    Returns:
        A dict mapping consecutive integers ``0..k-1`` (groups in order of
        first appearance) to ``{'texts': [...], 'vectors': [...], 'marks': [...]}``.
        Each group owns its own ``'marks'`` list.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    # NOTE(review): the marks are assigned purely by position, which assumes
    # the records of each group arrive in this order — confirm against the dataset.
    marks = ['original', 'lexical', 'lexical', 'semantic', 'semantic', 'semantic']

    # Filter and group in a single pass; dicts preserve first-insertion order.
    grouped = {}
    for item in data:
        if item['accuracy'] >= min_accuracy:
            group = grouped.setdefault(item['response'], {'texts': [], 'vectors': []})
            group['texts'].append(item['text'])
            group['vectors'].append(item['best_vectors'])

    # Keep only groups that have one record per mark.
    complete = [group for group in grouped.values() if len(group['texts']) == len(marks)]

    # Copy `marks` per group so that editing one group's marks cannot leak into the others.
    return {
        index: {**group, 'marks': list(marks)}
        for index, group in enumerate(complete)
    }

In [None]:
# Build the filtered, grouped clusters consumed by the visualisation and silhouette cells.
new_data = prepare_data(data=data)

In [None]:
# t-SNE visualisation
def visualize_vectors(data, sample_size=10, e_vectors=True, seed=None):
    """Show a 2-D t-SNE projection of the vectors of randomly sampled clusters.

    Points are coloured by cluster and shaped by augmentation mark.

    Args:
        data: Mapping of cluster id to a dict holding parallel ``'vectors'``
            and ``'marks'`` sequences. Each entry of ``'vectors'`` is indexed
            with ``[0]`` or ``[1]`` to select the vector that is plotted.
        sample_size: Number of clusters to draw without replacement.
        e_vectors: If truthy plot element ``[0]`` of every vectors entry,
            otherwise element ``[1]``.
        seed: Optional seed for the cluster sampling. ``None`` (default) uses
            the global ``random`` state, as before.

    Returns:
        None. The figure is displayed with ``plt.show()``.

    Raises:
        ValueError: If ``sample_size`` exceeds the number of clusters
            (raised by ``random.sample``).
    """
    sampler = random.Random(seed) if seed is not None else random
    sample_ids = sampler.sample(list(data.keys()), sample_size)

    # Give every sampled cluster its own slot in the 20-colour 'tab20' palette.
    # Passing raw ids to `c=` would normalise them linearly, so ids that are
    # numerically close could end up with the same colour.
    color_of = {cluster_id: position % 20 for position, cluster_id in enumerate(sample_ids)}

    vec_index = 0 if e_vectors else 1
    X_sample = []
    colors_sample = []
    marks_sample = []
    for cluster_id in sample_ids:
        item = data[cluster_id]
        for vec, mark in zip(item['vectors'], item['marks']):
            X_sample.append(vec[vec_index])
            colors_sample.append(color_of[cluster_id])
            marks_sample.append(mark)
    X_sample = np.array(X_sample)
    colors_sample = np.array(colors_sample)
    marks_sample = np.array(marks_sample)

    # t-SNE requires perplexity < number of points; clamp it so that small
    # samples (15 points or fewer) do not raise.
    perplexity = min(15, len(X_sample) - 1)
    random_state = 42
    X_tsne = TSNE(
        n_components=2,
        perplexity=perplexity,
        metric='cosine',
        random_state=random_state,
    ).fit_transform(X_sample)

    marker_map = {
        'original': 'o',
        'lexical': 's',
        'semantic': '^'
    }
    plt.figure(figsize=(8, 6))
    for mark, marker in marker_map.items():
        idx = marks_sample == mark
        # Fixed vmin/vmax keep the colour of a cluster identical across the
        # three scatter calls (one per marker type).
        plt.scatter(
            X_tsne[idx, 0],
            X_tsne[idx, 1],
            c=colors_sample[idx],
            cmap='tab20',
            vmin=0,
            vmax=19,
            marker=marker,
            s=80,
            label=mark,
            edgecolors='k',
        )
    plt.title(f't-SNE: sample of {sample_size} clusters')
    plt.legend(title='Augmentation type')
    plt.show()

In [None]:
# Plot 7 randomly chosen clusters using element [1] of each vectors entry (e_vectors=False).
visualize_vectors(new_data, sample_size=7, e_vectors=False)

In [None]:
# Silhouette criterion
def get_silhouette(data, sample_size=None, e_vectors=True, seed=None):
    """Compute cosine silhouette coefficients, using cluster ids as labels.

    Args:
        data: Mapping of cluster id to a dict with a ``'vectors'`` sequence.
            Each entry of ``'vectors'`` is indexed with ``[0]`` or ``[1]``.
        sample_size: Number of clusters to sample without replacement.
            A falsy value (``None`` or ``0``) uses every cluster.
        e_vectors: If truthy use element ``[0]`` of every vectors entry,
            otherwise element ``[1]``.
        seed: Optional seed for the cluster sampling. ``None`` (default) uses
            the global ``random`` state, as before.

    Returns:
        Tuple ``(values, score)``: the per-sample silhouette coefficients and
        their mean.
    """
    cluster_ids = list(data.keys())
    if not sample_size:
        sample_ids = cluster_ids
    else:
        sampler = random.Random(seed) if seed is not None else random
        sample_ids = sampler.sample(cluster_ids, sample_size)

    vec_index = 0 if e_vectors else 1
    X = []
    labels = []
    for cluster_id in sample_ids:
        for vec in data[cluster_id]['vectors']:
            X.append(vec[vec_index])
            labels.append(cluster_id)

    values = silhouette_samples(X, labels, metric='cosine')
    # The silhouette score is the mean of the per-sample coefficients, so
    # average them here instead of calling silhouette_score, which would
    # recompute all pairwise distances a second time.
    score = float(values.mean())
    return values, score

In [None]:
# Silhouette statistics over every cluster, using element [1] of each vectors entry.
val, score = get_silhouette(new_data, sample_size=None, e_vectors=False)
silhouette_max = max(val)
silhouette_min = min(val)
print(f'Среднее значение критерия силуэта: {score}')
print(f'Максимальное значение критерия силуэта: {silhouette_max}')
print(f'Минимальное значение критерия силуэта: {silhouette_min}')