In [None]:
import spacy
import pandas as pd
from collections import defaultdict
import transliterate
from wordcloud import WordCloud
import json
import matplotlib.pyplot as plt
import numpy as np
from deep_translator import GoogleTranslator
import langdetect
import re


# Load the spaCy pipeline named "ru_core_news_lg"; it is used below for
# parsing phrases and for producing vectors.
model = spacy.load("ru_core_news_lg")

# Read responses.csv as a one-column frame: fields are separated by ';',
# the file has no header row, and the column is named 'responses'.
data = pd.read_csv(
    'responses.csv',
    sep=';',
    header=None,
    names=['responses'],
)

In [None]:
def detect_lang(text):
    """Classify `text` as English ('en') or, for everything else, Russian ('ru').

    Args:
        text: The response to classify.

    Returns:
        'en' when langdetect reports English; 'ru' for any other detected
        language, for non-string or blank input, and when detection fails.
    """
    # Non-string values and blank strings cannot be detected; fall back to
    # the default without calling the detector.
    if not isinstance(text, str) or not text.strip():
        return 'ru'
    try:
        detected = langdetect.detect(text)
    except Exception:
        # Best-effort: a detection failure means "treat as Russian".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long run can still be interrupted.
        return 'ru'
    return 'en' if detected == 'en' else 'ru'

def translate_text(text):
    """Translate `text` from English to Russian with GoogleTranslator.

    A new translator (source='en', target='ru') is created on every call
    and the result of its `translate` method is returned.
    """
    translator = GoogleTranslator(source='en', target='ru')
    return translator.translate(text)

def fix_translit(text):
    """Return `text` passed through `transliterate.translit` with language code 'ru'."""
    target_language = 'ru'
    return transliterate.translit(text, target_language)

def clean_text(text):
    """Strip every run of characters that is neither a word character,
    whitespace, nor a Cyrillic letter (including ё/Ё) from `text`.

    Letters, digits, underscores and whitespace are kept as they are.
    """
    unwanted = re.compile(r'[^\w\sа-яА-ЯёЁ]+')
    return unwanted.sub('', text)

def extract_key_phrase(phrase):
    """Reduce `phrase` to its syntactic root plus at most one modifier.

    Phrases of four characters or fewer are returned unchanged without
    being parsed. Longer phrases are parsed with `model`, and the first
    token whose dependency label is "ROOT" is used:

    * NOUN root: root text followed by its first "amod" dependent, if any;
    * VERB root: root text followed by its first "advmod" dependent, if any;
    * any other root, or no matching dependent: the root text alone.

    Args:
        phrase: The cleaned response text.

    Returns:
        The key phrase as a string. If the parse contains no ROOT token,
        the original phrase is returned unchanged instead of raising.
    """
    # Short inputs are kept verbatim, so skip the parse for them entirely.
    if len(phrase) <= 4:
        return phrase

    doc = model(phrase)
    root = next((token for token in doc if token.dep_ == "ROOT"), None)
    if root is None:
        # Nothing to anchor on; fall back to the input like the short-phrase case.
        return phrase

    # Dependency label of the modifier to attach, chosen by the root's part of speech.
    modifier_dep = {"NOUN": "amod", "VERB": "advmod"}.get(root.pos_)
    if modifier_dep is None:
        return root.text

    modifier = next(
        (token for token in doc if token.dep_ == modifier_dep and token.head == root),
        None,
    )
    if modifier is None:
        return root.text
    return f"{root.text} {modifier.text}"

def _to_russian(text):
    """Translate a response detected as English; return any other response unchanged."""
    if detect_lang(text) == 'en':
        return translate_text(text)
    return text


# Step 1: translate the English responses.
data['translated'] = data['responses'].apply(_to_russian)

# Step 2: transliterate, then strip unwanted characters.
transliterated = data['translated'].apply(fix_translit)
data['translated'] = transliterated.apply(clean_text)

# Step 3: reduce every cleaned response to its key phrase.
data['key_phrases'] = data['translated'].apply(extract_key_phrase)



In [None]:
# Memo of already computed vectors, keyed by the exact input string.
vec_cache = {}


def get_vector(word):
    """Return the vector of the lemmatised form of `word`, computing it once.

    On a cache miss the text is parsed with `model`, the token lemmas are
    joined with single spaces, the joined string is parsed again, and that
    document's `.vector` is stored in `vec_cache` and returned.
    """
    try:
        return vec_cache[word]
    except KeyError:
        lemmas = [token.lemma_ for token in model(word)]
        vector = model(" ".join(lemmas)).vector
        vec_cache[word] = vector
        return vector

def similarity(a, b):
    """Cosine similarity between the vectors of `a` and `b`.

    Args:
        a: First text; its vector is obtained with `get_vector`.
        b: Second text; its vector is obtained with `get_vector`.

    Returns:
        The dot product of the two vectors divided by the product of their
        norms, as a Python float. If either vector has zero norm the
        similarity is undefined, and 0.0 is returned instead of NaN.
    """
    v1, v2 = get_vector(a), get_vector(b)
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        # Avoid 0/0: it would yield NaN plus a RuntimeWarning, and NaN
        # compares False against any threshold anyway.
        return 0.0
    return float(np.dot(v1, v2) / denominator)

# Greedy grouping of key phrases: the first phrase not yet assigned becomes
# the seed (and key) of a new group, and every remaining phrase whose
# similarity to the seed exceeds `threshold` joins that group.
words = list(data['key_phrases'])
groups = defaultdict(list)
threshold = 0.6  # minimum similarity(seed, phrase) required to join the seed's group

# Work on a separate list so `words` keeps the full input.
remaining = list(words)
while remaining:
    # Taking the seed off the front guarantees progress on every pass.
    seed, *candidates = remaining
    # The seed is counted exactly once: each group holds one entry per
    # occurrence, so len(group) is the real number of matching responses.
    members = [seed]
    unassigned = []
    for word in candidates:
        # Exact duplicates always join the seed, even when the similarity
        # cannot clear the threshold (e.g. a phrase whose self-similarity
        # is not above it).
        if word == seed or similarity(seed, word) > threshold:
            members.append(word)
        else:
            unassigned.append(word)
    groups[seed] = members
    remaining = unassigned

# Persist the groups to synonyms.json as UTF-8, pretty-printed with a
# two-space indent and with non-ASCII characters written unescaped.
with open('synonyms.json', mode='w', encoding='utf-8') as out_file:
    serialized = json.dumps(dict(groups), ensure_ascii=False, indent=2)
    out_file.write(serialized)


In [None]:
# Generate the word cloud: every group key is weighted by the number of
# entries stored in its group.
freq = {}
for key, members in groups.items():
    freq[key] = len(members)

cloud = WordCloud(width=800, height=500, background_color="white")
cloud = cloud.generate_from_frequencies(freq)

# Draw the cloud without axes and save it to wordcloud.png.
fig, ax = plt.subplots(figsize=(16, 12))
ax.imshow(cloud, interpolation='bilinear')
ax.axis("off")
fig.savefig('wordcloud.png', bbox_inches='tight')