In [2]:
import json
import re
from pathlib import Path
import pandas as pd
import glob
import os

In [3]:
json_folder = Path('../data/json')
json_paths = glob.glob(str(json_folder / '*.json'))
json_paths

['../data/json/chat2023.json',
 '../data/json/chat2022.json',
 '../data/json/chat2024.json',
 '../data/json/chat2021.json']

In [4]:
def normalize_text(field):
    if isinstance(field, str):
        return field.strip()
    if isinstance(field, list):
        parts = []
        for el in field:
            if isinstance(el, str):
                parts.append(el.strip())
            elif isinstance(el, dict) and 'text' in el and isinstance(el['text'], str):
                parts.append(el['text'].strip())
        return ' '.join([p for p in parts if p])
    return ''

In [5]:
EMOJI_PATTERN = re.compile(
    r"[\U0001F300-\U0001F6FF\u2600-\u26FF\u2700-\u27BF]+",
    flags=re.UNICODE
)

def remove_emojis(text: str) -> str:
    return EMOJI_PATTERN.sub('', text)

In [6]:
rows = []
for p in json_paths:
    with open(p, 'r', encoding='utf-8') as f:
        data = json.load(f)
    msgs = data.get('messages', [])
    for m in msgs:
        text = normalize_text(m.get('text', ''))
        txt = remove_emojis(text).strip()
        if not text:
            continue
        if len(txt.split()) <= 4:
            continue
        rows.append({
            'source': os.path.splitext(os.path.basename(p))[0],
            'id': m.get('id'),
            'date': m.get('date'),
            'from':m.get('from'),
            # 'from_id':m.get('from_id'),
            'reply_to_message_id': m.get('reply_to_message_id'),
            'text': text,
        })


In [7]:
df = pd.DataFrame(rows)
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.sort_values('date').reset_index(drop=True)

In [8]:
df.tail()

Unnamed: 0,source,id,date,from,reply_to_message_id,text
59039,chat2024,70490,2024-09-16 17:42:52,–ú–∞–∫—Å–∏–º –ü—Ä–∏–µ–º–∫–∞,,–ï—â–µ 5-—Ç—å –º–µ—Å—Ç –Ω–∞ —ç–∫—Å–∫—É—Ä—Å–∏—é –¥–ª—è –ø–µ—Ä–≤–∞–∫–æ–≤ 8 –∏–Ω—Å—Ç...
59040,chat2024,70505,2024-11-18 17:41:44,–ú–∞–∫—Å–∏–º –ü—Ä–∏–µ–º–∫–∞,2.0,üëÄ –ú—ã –∑–Ω–∞–µ–º –∫–∞–∫ –≤–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–∞ –∏ —Ç—Ä–µ–≤–æ–∂–Ω–∞ –¥–ª—è –ø–µ—Ä–≤...
59041,chat2024,70507,2024-11-20 14:08:17,(–ê—Ä—Ö–∏–≤ 2024) 8 –∏–Ω—Å—Ç–∏—Ç—É—Ç –ú–ê–ò - —ç—Ç–æ IT 2024,,üëÄ –ú—ã –∑–Ω–∞–µ–º –∫–∞–∫ –≤–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–∞ –∏ —Ç—Ä–µ–≤–æ–∂–Ω–∞ –¥–ª—è –ø–µ—Ä–≤...
59042,chat2024,70509,2024-11-22 10:01:41,(–ê—Ä—Ö–∏–≤ 2024) 8 –∏–Ω—Å—Ç–∏—Ç—É—Ç –ú–ê–ò - —ç—Ç–æ IT 2024,70507.0,8Ô∏è‚É£ –î–æ—Ä–æ–≥–∏–µ —Ä–æ–¥–∏—Ç–µ–ª–∏ —Å—Ç—É–¥–µ–Ω—Ç–æ–≤ 1 –∫—É—Ä—Å–∞ 8 –∏–Ω—Å—Ç–∏...
59043,chat2024,70511,2024-12-07 12:14:06,(–ê—Ä—Ö–∏–≤ 2024) 8 –∏–Ω—Å—Ç–∏—Ç—É—Ç –ú–ê–ò - —ç—Ç–æ IT 2024,2.0,"–í–∞–¥–∏–º –ö–æ–Ω–¥–∞—Ä–∞—Ç—Ü–µ–≤, –∞–∫–∞–¥–µ–º–∏—á–µ—Å–∫–∏–π —Ä—É–∫–æ–≤–æ–¥–∏—Ç–µ–ª—å ..."


### –î–∏–∞–ª–æ–≥–∏

In [9]:
df['global_id'] = df['source'] + '_' + df['id'].astype(int).astype(str)
df['global_parent_id'] = df.apply(lambda r: f"{r.source}_{int(r.reply_to_message_id)}"
                                           if pd.notna(r.reply_to_message_id) else None,
                                  axis=1)

In [10]:
all_gids = set(df['global_id'])

In [11]:
from collections import defaultdict
children = defaultdict(list)
for row in df.itertuples():
    gid  = row.global_id
    pgid = row.global_parent_id
    if pgid and pgid in all_gids:
        children[pgid].append(gid)

In [12]:
roots = [
    row.global_id for row in df.itertuples()
    if not row.global_parent_id or row.global_parent_id not in all_gids
]

In [13]:
def collect_paths(gid, path=None):
    path = (path or []) + [gid]
    if gid not in children or not children[gid]:
        yield path
    else:
        for child in children[gid]:
            yield from collect_paths(child, path)

dialog_paths = []
for root in roots:
    dialog_paths.extend(collect_paths(root))

In [14]:
info_map = df.set_index('global_id')[['from','id', 'text']].to_dict('index')
records = []
for thread_id, path in enumerate(dialog_paths, 1):
    plain, ann, parts = [], [], []
    for gid in path:
        rec  = info_map[gid]
        nick = rec['from']
        uid = rec['id']
        txt  = rec['text']
        plain.append(txt)
        ann.append(f"[{nick} ({uid})] {txt}")
        parts.append((nick, uid))
    records.append({
        'thread_id':        thread_id,
        'root_global_id':   path[0],
        'turns':            len(path),
        'dialog_plain':     "\n".join(plain),
        'dialog_annotated': "\n".join(ann),
        'participants':     parts,
        'global_ids':       path
    })

threads_df = pd.DataFrame(records)

In [15]:
threads_df

Unnamed: 0,thread_id,root_global_id,turns,dialog_plain,dialog_annotated,participants,global_ids
0,1,chat2021_6,1,–ü–æ—Ä—è–¥–æ–∫ —Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏—è –ø–æ –∫–∞—Ñ–µ–¥—Ä–∞–º/–Ω–∞–ø—Ä–∞–≤–ª–µ–Ω–∏—è–º...,[–ò–Ω—Å—Ç–∏—Ç—É—Ç ‚Ññ8 –ú–ê–ò 2021 (6)] –ü–æ—Ä—è–¥–æ–∫ —Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω...,"[(–ò–Ω—Å—Ç–∏—Ç—É—Ç ‚Ññ8 –ú–ê–ò 2021, 6)]",[chat2021_6]
1,2,chat2021_7,1,–ü–æ—Ä—è–¥–æ–∫ –ø–æ–ª—É—á–µ–Ω–∏—è –æ–±—â–µ–∂–∏—Ç–∏—è –ü—Ä–∏ –∑–∞—á–∏—Å–ª–µ–Ω–∏–∏ –º–µ—Å...,[–ò–Ω—Å—Ç–∏—Ç—É—Ç ‚Ññ8 –ú–ê–ò 2021 (7)] –ü–æ—Ä—è–¥–æ–∫ –ø–æ–ª—É—á–µ–Ω–∏—è –æ...,"[(–ò–Ω—Å—Ç–∏—Ç—É—Ç ‚Ññ8 –ú–ê–ò 2021, 7)]",[chat2021_7]
2,3,chat2021_10,1,–í–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å –∑–∞—Å–µ–ª–µ–Ω–∏—è –≤ –æ–¥–Ω—É –∫–æ–º–Ω–∞—Ç—É –∑–Ω–∞–∫–æ–º—ã—Ö/...,[–ò–Ω—Å—Ç–∏—Ç—É—Ç ‚Ññ8 –ú–ê–ò 2021 (10)] –í–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å –∑–∞—Å–µ–ª–µ...,"[(–ò–Ω—Å—Ç–∏—Ç—É—Ç ‚Ññ8 –ú–ê–ò 2021, 10)]",[chat2021_10]
3,4,chat2021_11,1,–ü–µ—Ä–µ–≤–æ–¥ –º–µ–∂–¥—É –Ω–∞–ø—Ä–∞–≤–ª–µ–Ω–∏—è–º–∏ 8 –∏–Ω—Å—Ç–∏—Ç—É—Ç–∞. –ü–µ—Ä–µ–≤...,[–ò–Ω—Å—Ç–∏—Ç—É—Ç ‚Ññ8 –ú–ê–ò 2021 (11)] –ü–µ—Ä–µ–≤–æ–¥ –º–µ–∂–¥—É –Ω–∞–ø—Ä...,"[(–ò–Ω—Å—Ç–∏—Ç—É—Ç ‚Ññ8 –ú–ê–ò 2021, 11)]",[chat2021_11]
4,5,chat2021_12,1,–ü—Ä–æ–≥—Ä–∞–º–º–∞ –æ–±—É—á–µ–Ω–∏—è –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏—é –Ω–∞ 1 –∫—É—Ä—Å–µ...,[–ò–Ω—Å—Ç–∏—Ç—É—Ç ‚Ññ8 –ú–ê–ò 2021 (12)] –ü—Ä–æ–≥—Ä–∞–º–º–∞ –æ–±—É—á–µ–Ω–∏—è...,"[(–ò–Ω—Å—Ç–∏—Ç—É—Ç ‚Ññ8 –ú–ê–ò 2021, 12)]",[chat2021_12]
...,...,...,...,...,...,...,...
42977,42978,chat2024_70489,1,"üëÄ –°—Ç—É–¥–µ–Ω—Ç—ã, –∞ –≤—ã –ø–æ–º–Ω–∏—Ç–µ –∫—Ä—ã—à–µ—Å–Ω–æ—Å–Ω—É—é –ø—Ä–∏–µ–º–Ω—É—é...","[–ú–∞–∫—Å–∏–º –ü—Ä–∏–µ–º–∫–∞ (70489)] üëÄ –°—Ç—É–¥–µ–Ω—Ç—ã, –∞ –≤—ã –ø–æ–º–Ω...","[(–ú–∞–∫—Å–∏–º –ü—Ä–∏–µ–º–∫–∞, 70489)]",[chat2024_70489]
42978,42979,chat2024_70490,1,–ï—â–µ 5-—Ç—å –º–µ—Å—Ç –Ω–∞ —ç–∫—Å–∫—É—Ä—Å–∏—é –¥–ª—è –ø–µ—Ä–≤–∞–∫–æ–≤ 8 –∏–Ω—Å—Ç...,[–ú–∞–∫—Å–∏–º –ü—Ä–∏–µ–º–∫–∞ (70490)] –ï—â–µ 5-—Ç—å –º–µ—Å—Ç –Ω–∞ —ç–∫—Å–∫...,"[(–ú–∞–∫—Å–∏–º –ü—Ä–∏–µ–º–∫–∞, 70490)]",[chat2024_70490]
42979,42980,chat2024_70505,1,üëÄ –ú—ã –∑–Ω–∞–µ–º –∫–∞–∫ –≤–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–∞ –∏ —Ç—Ä–µ–≤–æ–∂–Ω–∞ –¥–ª—è –ø–µ—Ä–≤...,[–ú–∞–∫—Å–∏–º –ü—Ä–∏–µ–º–∫–∞ (70505)] üëÄ –ú—ã –∑–Ω–∞–µ–º –∫–∞–∫ –≤–æ–ª–Ω–∏—Ç...,"[(–ú–∞–∫—Å–∏–º –ü—Ä–∏–µ–º–∫–∞, 70505)]",[chat2024_70505]
42980,42981,chat2024_70507,2,üëÄ –ú—ã –∑–Ω–∞–µ–º –∫–∞–∫ –≤–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–∞ –∏ —Ç—Ä–µ–≤–æ–∂–Ω–∞ –¥–ª—è –ø–µ—Ä–≤...,[(–ê—Ä—Ö–∏–≤ 2024) 8 –∏–Ω—Å—Ç–∏—Ç—É—Ç –ú–ê–ò - —ç—Ç–æ IT 2024 (70...,"[((–ê—Ä—Ö–∏–≤ 2024) 8 –∏–Ω—Å—Ç–∏—Ç—É—Ç –ú–ê–ò - —ç—Ç–æ IT 2024, 7...","[chat2024_70507, chat2024_70509]"


In [16]:
threads_df.dialog_plain[10]

'–°—Ç–∞—Ä–æ—Å—Ç–∞ –ó–∞–¥–∞—á–∏ —Å—Ç–∞—Ä–æ—Å—Ç—ã\n–ü–µ—Ä–µ–¥–∞–≤–∞—Ç—å –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—é –æ—Ç –¥–∏—Ä–µ–∫—Ü–∏–∏ –∏ –Ω–∞—á–∞–ª—å–Ω–∏–∫–∞ –∫—É—Ä—Å–∞, –∑–∞–¥–∞–≤–∞—Ç—å –∏–º –≤–æ–ø—Ä–æ—Å—ã, –∏–Ω—Ç–µ—Ä–µ—Å—É—é—â–∏–µ —Å—Ç—É–¥–µ–Ω—Ç–æ–≤: —Å—Ç–∞—Ä–æ—Å—Ç–∞ - –ª–∏—Ü–æ –≥—Ä—É–ø–ø—ã; —Å–≤—è–∑—ã–≤–∞—Ç—å—Å—è —Å –ø—Ä–µ–ø–æ–¥–∞–≤–∞—Ç–µ–ª—è–º–∏, –ø–µ—Ä–µ–¥–∞–≤–∞—Ç—å –æ—Ç –Ω–∏—Ö –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—é –∏ –∑–∞–¥–∞–≤–∞—Ç—å –≤–æ–ø—Ä–æ—Å—ã; –æ—Ç–º–µ—á–∞—Ç—å –ø–æ—Å–µ—â–∞–µ–º–æ—Å—Ç—å —Å–≤–æ–µ–π –≥—Ä—É–ø–ø—ã, –ø–µ—Ä–µ–¥–∞–≤–∞—Ç—å –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—é –æ –Ω–µ–π –ø—Ä–µ–ø–æ–¥–∞–≤–∞—Ç–µ–ª—è–º –∏ –¥–∏—Ä–µ–∫—Ü–∏–∏; –ø–æ—Å–µ—â–∞—Ç—å —Å–æ–±—Ä–∞–Ω–∏—è —Å—Ç–∞—Ä–æ—Å—Ç. \n–ö–∞–∫ —Å—Ç–∞—Ç—å —Å—Ç–∞—Ä–æ—Å—Ç–æ–π:\n–ï—Å—Ç—å 2 –∏—Å—Ö–æ–¥–∞, –∫–∞–∫ –±—É–¥–µ—Ç —Ä–µ—à–µ–Ω–æ, –∫—Ç–æ –±—É–¥–µ—Ç —Å—Ç–∞—Ä–æ—Å—Ç–æ–π –≥—Ä—É–ø–ø—ã. 1–π -- –í—ã–±–æ—Ä—ã, 2–π -- –Ω–∞–∑–Ω–∞—á–µ–Ω–∏–µ –¥–∏—Ä–µ–∫—Ü–∏–µ–π. –û–±—ã—á–Ω–æ —Å—Ç–∞—Ä–æ—Å—Ç–µ, —Å–æ –≤—Ç–æ—Ä–æ–≥–æ —Å–µ–º–µ—Å—Ç—Ä–∞ –¥–æ–ø–ª–∞—á–∏–≤–∞—é—Ç 670 —Ä—É–±–ª–µ–π. –í —Å–ª—É—á–∞–µ —Å–∏–ª—å–Ω–æ–≥–æ

In [17]:
threads_df.query("turns == 2").head()

Unnamed: 0,thread_id,root_global_id,turns,dialog_plain,dialog_annotated,participants,global_ids
15,16,chat2021_27,2,–ó–¥—Ä–∞–≤—Å—Ç–≤—É–π—Ç–µ. –ù—É–∂–Ω—ã –ª–∏ –∫–∞–∫–∏–µ-–Ω–∏–±—É–¥—å —Å–ø—Ä–∞–≤–∫–∏ –¥–ª...,[–í–∞—Å–∏–ª–∏–π –ó–∞—Ö–∞—Ä–æ–≤ (27)] –ó–¥—Ä–∞–≤—Å—Ç–≤—É–π—Ç–µ. –ù—É–∂–Ω—ã –ª–∏ ...,"[(–í–∞—Å–∏–ª–∏–π –ó–∞—Ö–∞—Ä–æ–≤, 27), (–ò–Ω—Å—Ç–∏—Ç—É—Ç ‚Ññ8 –ú–ê–ò –ß–∞—Ç, ...","[chat2021_27, chat2021_28]"
16,17,chat2021_29,2,"–ù—É –ª–∏—Å—Ç –¥–∏—Å–ø–∞–Ω—Å–µ—Ä–Ω—ã–π, —Ñ–æ—Ä–º–∞ 086-–£\n–°–ø—Ä–∞–≤–∫–∞ –ø–æ ...","[–í–∞—Å–∏–ª–∏–π –ó–∞—Ö–∞—Ä–æ–≤ (29)] –ù—É –ª–∏—Å—Ç –¥–∏—Å–ø–∞–Ω—Å–µ—Ä–Ω—ã–π, —Ñ...","[(–í–∞—Å–∏–ª–∏–π –ó–∞—Ö–∞—Ä–æ–≤, 29), (–ò–Ω—Å—Ç–∏—Ç—É—Ç ‚Ññ8 –ú–ê–ò –ß–∞—Ç, ...","[chat2021_29, chat2021_30]"
19,20,chat2021_40,2,–ö–ª—é—á–µ–≤—ã–µ –¥–∞—Ç—ã –ø—Ä–∏–µ–º–Ω–æ–π –∫–∞–º–ø–∞–Ω–∏–∏ –Ω–∞ –Ω–∞–ø—Ä–∞–≤–ª–µ–Ω–∏—è...,[–ò–Ω—Å—Ç–∏—Ç—É—Ç ‚Ññ8 –ú–ê–ò 2021 (40)] –ö–ª—é—á–µ–≤—ã–µ –¥–∞—Ç—ã –ø—Ä–∏–µ...,"[(–ò–Ω—Å—Ç–∏—Ç—É—Ç ‚Ññ8 –ú–ê–ò 2021, 40), (–ò–Ω—Å—Ç–∏—Ç—É—Ç ‚Ññ8 –ú–ê–ò ...","[chat2021_40, chat2021_42]"
24,25,chat2021_67,2,"–ï—Å–ª–∏ —É–∂–µ –ø–æ–¥–∞–ª –¥–æ–∫—É–º–µ–Ω—Ç—ã, –Ω–æ –Ω–µ —É–∫–∞–∑–∞–ª —Ä–µ–∑—É–ª—å—Ç...","[–£–ª—è (67)] –ï—Å–ª–∏ —É–∂–µ –ø–æ–¥–∞–ª –¥–æ–∫—É–º–µ–Ω—Ç—ã, –Ω–æ –Ω–µ —É–∫–∞...","[(–£–ª—è, 67), (–ò–Ω—Å—Ç–∏—Ç—É—Ç ‚Ññ8 –ú–ê–ò –ß–∞—Ç, 68)]","[chat2021_67, chat2021_68]"
25,26,chat2021_67,2,"–ï—Å–ª–∏ —É–∂–µ –ø–æ–¥–∞–ª –¥–æ–∫—É–º–µ–Ω—Ç—ã, –Ω–æ –Ω–µ —É–∫–∞–∑–∞–ª —Ä–µ–∑—É–ª—å—Ç...","[–£–ª—è (67)] –ï—Å–ª–∏ —É–∂–µ –ø–æ–¥–∞–ª –¥–æ–∫—É–º–µ–Ω—Ç—ã, –Ω–æ –Ω–µ —É–∫–∞...","[(–£–ª—è, 67), (–ò–Ω—Å—Ç–∏—Ç—É—Ç ‚Ññ8 –ú–ê–ò –ß–∞—Ç, 69)]","[chat2021_67, chat2021_69]"


In [19]:
threads_df.to_csv('threads.csv')

## –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –∫ —Ç–µ–º–∞—Ç–∏—á–µ—Å–∫–æ–º—É –º–æ–¥–µ–ª–∏—Ä–æ–≤–∞–Ω–∏—é

In [None]:
! pip install spacy[transformers] rusenttokenize
! python -m spacy download ru_core_news_sm

Collecting rusenttokenize
  Downloading rusenttokenize-0.0.5-py3-none-any.whl.metadata (2.7 kB)
Collecting spacy_transformers<1.4.0,>=1.1.2 (from spacy[transformers])
  Downloading spacy_transformers-1.3.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting transformers<4.50.0,>=3.4.0 (from spacy_transformers<1.4.0,>=1.1.2->spacy[transformers])
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.0/44.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting spacy-alignments<1.0.0,>=0.7.2 (from spacy_transformers<1.4.0,>=1.1.2->spacy[transformers])
  Downloading spacy_alignments-0.9.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->spacy_transformers<1.4.0,>=1.1.2->spacy[transformers])
  Downloading

In [None]:
import spacy
from rusenttokenize import ru_sent_tokenize
from string import punctuation

nlp = spacy.load("ru_core_news_sm")
stop_words = nlp.Defaults.stop_words

In [None]:
def preprocess_text(text: str) -> str:
    doc = nlp(text)
    tokens = []
    for tok in doc:
        if tok.is_punct:
            tokens.append(tok.text)
            continue

        lemma = tok.lemma_.lower().strip()
        if (not lemma
            or lemma in stop_words
            or lemma.isdigit()
            or len(lemma) <= 2
           ):
            continue
        tokens.append(lemma)
    return " ".join(tokens)

threads_df['text_proc'] = threads_df['dialog_plain'].apply(preprocess_text)

In [None]:
threads_df['text_proc'].head()

Unnamed: 0,text_proc
0,–ø–æ—Ä—è–¥–æ–∫ —Ä–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∫–∞—Ñ–µ–¥—Ä–∞–º / –Ω–∞–ø—Ä–∞–≤–ª–µ–Ω–∏–µ –≥...
1,–ø–æ—Ä—è–¥–æ–∫ –ø–æ–ª—É—á–µ–Ω–∏–µ –æ–±—â–µ–∂–∏—Ç–∏–µ –∑–∞—á–∏—Å–ª–µ–Ω–∏–µ –º–µ—Å—Ç–æ —Ä...
2,–≤–æ–∑–º–æ–∂–Ω–æ—Å—Ç—å –∑–∞—Å–µ–ª–µ–Ω–∏–µ –∫–æ–º–Ω–∞—Ç–∞ –∑–Ω–∞–∫–æ–º—ã—Ö / –¥—Ä—É–≥ ...
3,–ø–µ—Ä–µ–≤–æ–¥ –Ω–∞–ø—Ä–∞–≤–ª–µ–Ω–∏–µ –∏–Ω—Å—Ç–∏—Ç—É—Ç . –ø–µ—Ä–µ–≤–æ–¥ –Ω–∞–ø—Ä–∞–≤–ª...
4,–ø—Ä–æ–≥—Ä–∞–º–º–∞ –æ–±—É—á–µ–Ω–∏–µ –ø—Ä–æ–≥—Ä–∞–º–º–∏—Ä–æ–≤–∞–Ω–∏–µ –∫—É—Ä—Å –æ–±—É—á–µ...


## –ö–ª–∞—Å—Ç–µ—Ä–∏–∑–∞—Ü–∏—è

In [None]:
!pip install sentence-transformers hdbscan umap-learn wordcloud bertopic[all]

Collecting bertopic[all]
  Downloading bertopic-0.17.0-py3-none-any.whl.metadata (23 kB)
Downloading bertopic-0.17.0-py3-none-any.whl (150 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m150.6/150.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bertopic
Successfully installed bertopic-0.17.0


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
texts = threads_df['text_proc'].tolist()

### –≠–º–±–µ–¥–¥–∏–Ω–≥–∏ + HDBSCAN + PCA + WordCloud

In [None]:
from itertools import islice

def chunked(iterable, n):
    it = iter(iterable)
    while True:
        chunk = list(islice(it, n))
        if not chunk:
            break
        yield chunk

In [None]:
from langchain.embeddings.base import Embeddings
import time
import requests
from tenacity import Retrying, stop_after_attempt, wait_fixed, retry_if_exception_type

class YandexEmbeddings(Embeddings):
    def __init__(
        self,
        api_key: str,
        folder_id: str,
        sleep_interval: float = 1.0,
        retries: int = 3,
        **kwargs
    ):
        self.api_key = api_key
        self.folder_id = folder_id
        self.sleep_interval = sleep_interval
        self.retries = retries

        self.headers = {
            "Authorization": f"Api-Key {self.api_key}",
            "Content-Type":  "application/json"
        }

    def _get_model_uri(self, is_document: bool) -> str:
        mode = "doc" if is_document else "query"
        return f"emb://{self.folder_id}/text-search-{mode}/latest"

    def _embed_one(self, text: str, is_document: bool) -> list[float]:
        payload = {
            "modelUri": self._get_model_uri(is_document),
            "text":     text
        }
        last_exc = None

        for attempt in Retrying(
            stop=stop_after_attempt(self.retries),
            wait=wait_fixed(self.sleep_interval),
            retry=retry_if_exception_type(requests.RequestException)
        ):
            with attempt:
                resp = requests.post(
                    "https://llm.api.cloud.yandex.net/foundationModels/v1/textEmbedding",
                    json=payload,
                    headers=self.headers,
                    timeout=10
                )
                resp.raise_for_status()
                data = resp.json()
                if "embedding" in data:
                    return data["embedding"]
                raise ValueError(f"–ù–µ—Ç –∫–ª—é—á–∞ 'embedding' –≤ –æ—Ç–≤–µ—Ç–µ: {data}")

        raise RuntimeError(f"–ù–µ —É–¥–∞–ª–æ—Å—å –ø–æ–ª—É—á–∏—Ç—å embedding –ø–æ—Å–ª–µ {self.retries} –ø–æ–ø—ã—Ç–æ–∫")

    def embed_query(self, text: str) -> list[float]:
        return self._embed_one(text, is_document=False)

    def embed_documents(self, texts: list[str], chunk_size: int = 15) -> list[list[float]]:
        embeddings = []
        for txt in texts:
            emb = self._embed_one(txt, is_document=True)
            embeddings.append(emb)
            # print(embeddings[:2])
            time.sleep(self.sleep_interval)
        return embeddings


In [None]:
emb = YandexEmbeddings(api_key="", folder_id="")

In [None]:
emb.embed_query("–ø—Ä–∏–≤–µ—Ç")

[0.0440673828125,
 0.03973388671875,
 -0.06744384765625,
 -0.0767822265625,
 0.038482666015625,
 0.02471923828125,
 -0.01372528076171875,
 -0.1898193359375,
 -0.0142059326171875,
 0.038360595703125,
 -0.0166778564453125,
 -0.10662841796875,
 0.051300048828125,
 -0.06787109375,
 0.0159149169921875,
 -0.0257720947265625,
 0.09423828125,
 -0.08990478515625,
 -0.01433563232421875,
 -0.0810546875,
 0.07763671875,
 0.03826904296875,
 -0.061981201171875,
 -0.037261962890625,
 -0.040252685546875,
 -0.04351806640625,
 0.065185546875,
 0.002593994140625,
 0.09149169921875,
 -0.06085205078125,
 -0.05584716796875,
 0.07086181640625,
 -0.09112548828125,
 0.08563232421875,
 -0.033660888671875,
 -0.0753173828125,
 0.018585205078125,
 0.0253753662109375,
 -0.03021240234375,
 0.061004638671875,
 -0.139892578125,
 -0.019195556640625,
 0.036895751953125,
 0.01910400390625,
 -0.019012451171875,
 0.0006270408630371094,
 0.0125732421875,
 0.037811279296875,
 0.03863525390625,
 0.06561279296875,
 -0.04165649

In [61]:
docs_vecs = emb.embed_documents(texts)

RetryError: RetryError[<Future at 0x78b9dee136d0 state=finished raised HTTPError>]

In [None]:
threads_df['embed_texts'] = docs_vecs

In [None]:
import numpy as np

X = np.array(docs_vecs)
print(X.shape)

In [None]:
import hdbscan
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from wordcloud import WordCloud, STOPWORDS

In [None]:
clusterer = hdbscan.HDBSCAN(min_cluster_size=15, metric='cosine')
labels = clusterer.fit_predict(X)

ValueError: Expected 2D array, got scalar array instead:
array=<__main__.YandexEmbeddings object at 0x78b9e47494d0>.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
threads_df['hdbscan'] = labels

In [None]:
pca = PCA( n_components=2, random_state=42)
coords = pca.fit_transform(emb)
threads_df['pca_x'], threads_df['pca_y'] = coords[:,0], coords[:,1]

In [None]:
plt.figure(figsize=(8,6))

palette = sns.color_palette('tab10', np.unique(labels).max()+2)
sns.scatterplot(x='pca_x', y='pca_y', hue='hdbscan',
                data=threads_df, palette=palette,
                legend='full', s=20)

plt.title("HDBSCAN")
plt.legend(bbox_to_anchor=(1,1)); plt.show()

In [None]:
for cl in sorted(set(labels)):

    if cl == -1: continue  # —à—É–º
    texts_cl = threads_df.loc[threads_df['hdbscan']==cl, 'text_proc']

    wc = WordCloud(width=400, height=200, background_color='white',
                   stopwords=STOPWORDS).generate(" ".join(texts_cl))

    plt.figure(figsize=(4,2))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"Cluster {cl} ({len(texts_cl)} docs)")
    plt.show()

### BERTopic + UMAP + c-TF-IDF

In [None]:
from bertopic import BERTopic
from umap import UMAP

In [None]:
umap_model = UMAP(n_neighbors=15, n_components=7, metric='cosine', random_state=42, min_dist=0.6)

In [None]:
topic_model = BERTopic(embedding_model=emb,
                       umap_model=umap_model,
                       nr_topics="auto",
                       diversity=0.7,
                       top_n_words=20)

TypeError: BERTopic.__init__() got an unexpected keyword argument 'diversity'

In [None]:
topics, probs = topic_model.fit_transform(texts)

In [None]:
threads_df['bertopic'] = topics

In [None]:
topic_info = topic_model.get_topic_info()
display(topic_info.head(10))

In [None]:
for t in topic_info['Topic'].unique():
    if t == -1: continue
    fig = topic_model.visualize_barchart(topic=t, top_n_topics=1)
    fig.show()

In [None]:
vis = topic_model.visualize_topics()
vis.show()