In [5]:
import json
import re
from pathlib import Path
import pandas as pd
import glob
import os

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [2]:
json_folder = Path('../data/json')
json_paths = glob.glob(str(json_folder / '*.json'))
json_paths

['..\\data\\json\\chat2021.json',
 '..\\data\\json\\chat2022.json',
 '..\\data\\json\\chat2023.json',
 '..\\data\\json\\chat2024.json']

In [3]:
def normalize_text(field):
    if isinstance(field, str):
        return field.strip()
    if isinstance(field, list):
        parts = []
        for el in field:
            if isinstance(el, str):
                parts.append(el.strip())
            elif isinstance(el, dict) and 'text' in el and isinstance(el['text'], str):
                parts.append(el['text'].strip())
        return ' '.join([p for p in parts if p])
    return ''

In [4]:
EMOJI_PATTERN = re.compile(
    r"[\U0001F300-\U0001F6FF\u2600-\u26FF\u2700-\u27BF]+",
    flags=re.UNICODE
)

def remove_emojis(text: str) -> str:
    return EMOJI_PATTERN.sub('', text)

In [6]:
rows = []
for p in json_paths:
    with open(p, 'r', encoding='utf-8') as f:
        data = json.load(f)
    msgs = data.get('messages', [])
    for m in msgs:
        text = normalize_text(m.get('text', ''))
        txt = remove_emojis(text).strip()
        if not text:
            continue
        if len(txt.split()) <= 10:
            continue
        rows.append({
            'source': os.path.splitext(os.path.basename(p))[0],
            'id': m.get('id'),
            'date': m.get('date'),
            'from':m.get('from'),
            # 'from_id':m.get('from_id'),
            'reply_to_message_id': m.get('reply_to_message_id'),
            'text': text,
        })


In [7]:
df = pd.DataFrame(rows)
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.sort_values('date').reset_index(drop=True)

In [8]:
df.tail()

Unnamed: 0,source,id,date,from,reply_to_message_id,text
22091,chat2024,70490,2024-09-16 17:42:52,Максим Приемка,,Еще 5-ть мест на экскурсию для перваков 8 инст...
22092,chat2024,70505,2024-11-18 17:41:44,Максим Приемка,2.0,👀 Мы знаем как волнительна и тревожна для перв...
22093,chat2024,70507,2024-11-20 14:08:17,(Архив 2024) 8 институт МАИ - это IT 2024,,👀 Мы знаем как волнительна и тревожна для перв...
22094,chat2024,70509,2024-11-22 10:01:41,(Архив 2024) 8 институт МАИ - это IT 2024,70507.0,8️⃣ Дорогие родители студентов 1 курса 8 инсти...
22095,chat2024,70511,2024-12-07 12:14:06,(Архив 2024) 8 институт МАИ - это IT 2024,2.0,"Вадим Кондаратцев, академический руководитель ..."


### Диалоги

In [9]:
df['global_id'] = df['source'] + '_' + df['id'].astype(int).astype(str)
df['global_parent_id'] = df.apply(lambda r: f"{r.source}_{int(r.reply_to_message_id)}"
                                           if pd.notna(r.reply_to_message_id) else None,
                                  axis=1)

In [10]:
all_gids = set(df['global_id'])

In [11]:
from collections import defaultdict
children = defaultdict(list)
for row in df.itertuples():
    gid  = row.global_id
    pgid = row.global_parent_id
    if pgid and pgid in all_gids:
        children[pgid].append(gid)

In [12]:
roots = [
    row.global_id for row in df.itertuples()
    if not row.global_parent_id or row.global_parent_id not in all_gids
]

In [13]:
def collect_paths(gid, path=None):
    path = (path or []) + [gid]
    if gid not in children or not children[gid]:
        yield path
    else:
        for child in children[gid]:
            yield from collect_paths(child, path)

dialog_paths = []
for root in roots:
    dialog_paths.extend(collect_paths(root))

In [14]:
info_map = df.set_index('global_id')[['from','id', 'text', 'date']].to_dict('index')
records = []
for thread_id, path in enumerate(dialog_paths, 1):
    plain, ann, parts = [], [], []
    for gid in path:
        rec  = info_map[gid]
        nick = rec['from']
        uid = rec['id']
        txt  = rec['text']
        date = rec['date']
        plain.append(txt)
        ann.append(f"[{nick} ({uid})] {txt}")
        parts.append((nick, uid))
    records.append({
        'date' :            date, 
        'thread_id':        thread_id,
        'root_global_id':   path[0],
        'turns':            len(path),
        'dialog_plain':     "\n".join(plain),
        'dialog_annotated': "\n".join(ann),
        'participants':     parts,
        'global_ids':       path
    })

threads_df = pd.DataFrame(records)

In [15]:
threads_df['word_count'] = threads_df['dialog_plain'].str.split().apply(len)
avg_words = threads_df['word_count'].mean()
avg_words

np.float64(29.181937674571444)

In [16]:
N = 4

base_df = df.sort_values('date').reset_index(drop=True)
gid_to_idx = {gid: idx for idx, gid in enumerate(base_df['global_id'])}

threads_df = threads_df.sort_values('root_global_id').reset_index(drop=True)

aug_records = []
for row in threads_df.itertuples():
    rec = row._asdict()
    
    if row.turns > 1:
        rec['chunk_text'] = row.dialog_plain
    else:
        gid = row.root_global_id
        idx = gid_to_idx[gid]

        start = max(0, idx - N)
        end   = min(len(base_df) - 1, idx + N)
        
        context = base_df.loc[start:end, 'text'].tolist()
        rec['chunk_text'] = "\n".join(context)
    
    aug_records.append(rec)


threads_aug_df = pd.DataFrame(aug_records)

In [17]:
threads_aug_df

Unnamed: 0,Index,date,thread_id,root_global_id,turns,dialog_plain,dialog_annotated,participants,global_ids,word_count,chunk_text
0,0,2021-06-10 18:31:04,3,chat2021_10,1,Возможность заселения в одну комнату знакомых/...,[Институт №8 МАИ 2021 (10)] Возможность заселе...,"[(Институт №8 МАИ 2021, 10)]",[chat2021_10],25,Порядок распределения по кафедрам/направлениям...
1,1,2021-08-06 14:23:03,1862,chat2021_10006,1,"Здравствуйте, у меня вопрос: Как узнать кафедр...","[Шестой (10006)] Здравствуйте, у меня вопрос: ...","[(Шестой, 10006)]",[chat2021_10006],29,Математик-аналитик\nИнженер-математик\nРазрабо...
2,2,2021-08-06 14:55:32,1863,chat2021_10026,1,Вопрос теоретический. Просто просвещения для. ...,[@DVPDVPDVP (10026)] Вопрос теоретический. Про...,"[(@DVPDVPDVP, 10026)]",[chat2021_10026],82,можно будет в итоге в направлении ИТПМ выбрать...
3,3,2021-08-06 15:02:50,1864,chat2021_10029,1,Спасибо! Но все же на вопросы 1 - сколько лет ...,[@DVPDVPDVP (10029)] Спасибо! Но все же на воп...,"[(@DVPDVPDVP, 10029)]",[chat2021_10029],28,"Здравствуйте, такой вопрос \nСумма баллов 231,..."
4,4,2021-08-06 15:10:02,1865,chat2021_10030,1,Добрый день! Приказы о зачислении квотников не...,[Marina ☘️ (10030)] Добрый день! Приказы о зач...,"[(Marina ☘️, 10030)]",[chat2021_10030],14,"На платку можете пройти. На бюджет, по ситуаци..."
...,...,...,...,...,...,...,...,...,...,...,...
18254,18254,2024-07-16 15:00:47,11653,chat2024_9987,1,"Помимо Царева и Панфилова есть еще общежития, ...",[Максим Денисов (9987)] Помимо Царева и Панфил...,"[(Максим Денисов, 9987)]",[chat2024_9987],16,У этого года поступления по учебному плану анг...
18255,18255,2024-07-16 15:01:07,11654,chat2024_9988,1,"Если этаж полностью готов к заселению, то это ...",[Галина Андреева (9988)] Если этаж полностью г...,"[(Галина Андреева, 9988)]",[chat2024_9988],14,"не знаю :)\nя веду ин яз, поэтому рассказываю ..."
18256,18256,2024-07-16 15:05:17,11655,chat2024_9992,1,А можно ли с 806 по обмену по профилю it попас...,[Egor (9992)] А можно ли с 806 по обмену по пр...,"[(Egor, 9992)]",[chat2024_9992],18,Но по учебному плану теперь 3 года. Мне кажетс...
18257,18257,2024-07-16 15:19:10,11656,chat2024_9995,2,"Может, англ подвинули из-за нового предмета ""о...","[Ольга (9995)] Может, англ подвинули из-за нов...","[(Ольга, 9995), (Федя Тихонов, 9999)]","[chat2024_9995, chat2024_9999]",44,"Может, англ подвинули из-за нового предмета ""о..."


In [18]:
threads_aug_df['dialog_plain'] = threads_aug_df['chunk_text']

In [19]:
threads_aug_df = threads_aug_df.drop(
    columns=['dialog_annotated', 'participants', 'global_ids', 'date', ]
)

In [20]:
threads_aug_df

Unnamed: 0,Index,thread_id,root_global_id,turns,dialog_plain,word_count,chunk_text
0,0,3,chat2021_10,1,Порядок распределения по кафедрам/направлениям...,25,Порядок распределения по кафедрам/направлениям...
1,1,1862,chat2021_10006,1,Математик-аналитик\nИнженер-математик\nРазрабо...,29,Математик-аналитик\nИнженер-математик\nРазрабо...
2,2,1863,chat2021_10026,1,можно будет в итоге в направлении ИТПМ выбрать...,82,можно будет в итоге в направлении ИТПМ выбрать...
3,3,1864,chat2021_10029,1,"Здравствуйте, такой вопрос \nСумма баллов 231,...",28,"Здравствуйте, такой вопрос \nСумма баллов 231,..."
4,4,1865,chat2021_10030,1,"На платку можете пройти. На бюджет, по ситуаци...",14,"На платку можете пройти. На бюджет, по ситуаци..."
...,...,...,...,...,...,...,...
18254,18254,11653,chat2024_9987,1,У этого года поступления по учебному плану анг...,16,У этого года поступления по учебному плану анг...
18255,18255,11654,chat2024_9988,1,"не знаю :)\nя веду ин яз, поэтому рассказываю ...",14,"не знаю :)\nя веду ин яз, поэтому рассказываю ..."
18256,18256,11655,chat2024_9992,1,Но по учебному плану теперь 3 года. Мне кажетс...,18,Но по учебному плану теперь 3 года. Мне кажетс...
18257,18257,11656,chat2024_9995,2,"Может, англ подвинули из-за нового предмета ""о...",44,"Может, англ подвинули из-за нового предмета ""о..."


In [21]:
threads_aug_df.to_csv('../data/result/threads_df.csv')

In [22]:
threads_aug_df['word_count'] = threads_aug_df['chunk_text'].str.split().apply(len)
avg_words = threads_aug_df['word_count'].max()
avg_words

np.int64(1193)

In [23]:
threads_df['word_count'].mean()

np.float64(29.181937674571444)

In [24]:
threads_df.dialog_plain[10]

'Есть несколько автошкол, но от маи там в основном только название.'

In [63]:
%pip install spacy[transformers] rusenttokenize

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [68]:
!python.exe -m pip install --upgrade pip

Collecting pip
  Downloading pip-25.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ----------- ---------------------------- 0.5/1.8 MB 5.6 MB/s eta 0:00:01
   ----------------- ---------------------- 0.8/1.8 MB 5.6 MB/s eta 0:00:01
   ---------------------- ----------------- 1.0/1.8 MB 2.4 MB/s eta 0:00:01
   ---------------------------------- ----- 1.6/1.8 MB 2.5 MB/s eta 0:00:01
   ---------------------------------------- 1.8/1.8 MB 2.1 MB/s eta 0:00:00
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.3.1
    Uninstalling pip-24.3.1:
      Successfully uninstalled pip-24.3.1
Successfully installed pip-25.1




In [65]:
!python -m spacy download ru_core_news_sm

Collecting ru-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.7.0/ru_core_news_sm-3.7.0-py3-none-any.whl (15.3 MB)
     ---------------------------------------- 0.0/15.3 MB ? eta -:--:--
     - -------------------------------------- 0.5/15.3 MB 5.6 MB/s eta 0:00:03
     -- ------------------------------------- 0.8/15.3 MB 6.7 MB/s eta 0:00:03
     --- ------------------------------------ 1.3/15.3 MB 2.7 MB/s eta 0:00:06
     ---- ----------------------------------- 1.8/15.3 MB 2.4 MB/s eta 0:00:06
     ----- ---------------------------------- 2.1/15.3 MB 2.3 MB/s eta 0:00:06
     ------ --------------------------------- 2.6/15.3 MB 2.2 MB/s eta 0:00:06
     ------- -------------------------------- 2.9/15.3 MB 2.1 MB/s eta 0:00:06
     -------- ------------------------------- 3.4/15.3 MB 2.1 MB/s eta 0:00:06
     --------- ------------------------------ 3.7/15.3 MB 2.1 MB/s eta 0:00:06
     ---------- ----------------------


[notice] A new release of pip is available: 24.3.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [70]:
%pip install transformers -U

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
   ---------------------------------------- 0.0/10.4 MB ? eta -:--:--
   --- ------------------------------------ 0.8/10.4 MB 8.3 MB/s eta 0:00:02
   ---- ----------------------------------- 1.0/10.4 MB 2.8 MB/s eta 0:00:04
   ----- ---------------------------------- 1.3/10.4 MB 2.2 MB/s eta 0:00:05
   ------- -------------------------------- 1.8/10.4 MB 2.3 MB/s eta 0:00:04
   -------- ------------------------------- 2.1/10.4 MB 2.2 MB/s eta 0:00:04
   ---------- ----------------------------- 2.6/10.4 MB 2.2 MB/s eta 0:00:04
   ----------- ---------------------------- 2.9/10.4 MB 2.

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
colpali-engine 0.3.5 requires transformers<4.47.0,>=4.46.1, but you have transformers 4.51.3 which is incompatible.
spacy-transformers 1.3.8 requires transformers<4.50.0,>=3.4.0, but you have transformers 4.51.3 which is incompatible.


In [73]:
%pip install -e '.[dev]'

Note: you may need to restart the kernel to use updated packages.


ERROR: '.[dev]' is not a valid editable requirement. It should either be a path to a local project or a VCS URL (beginning with bzr+http, bzr+https, bzr+ssh, bzr+sftp, bzr+ftp, bzr+lp, bzr+file, git+http, git+https, git+ssh, git+git, git+file, hg+file, hg+http, hg+https, hg+ssh, hg+static-http, svn+ssh, svn+http, svn+https, svn+svn, svn+file).


In [75]:
%pip install sentence-transformers hdbscan umap-learn wordcloud bertopic[all]

Collecting tokenizers<0.21,>=0.20 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.20.3-cp310-none-win_amd64.whl.metadata (6.9 kB)
Downloading tokenizers-0.20.3-cp310-none-win_amd64.whl (2.4 MB)
   ---------------------------------------- 0.0/2.4 MB ? eta -:--:--
   -------- ------------------------------- 0.5/2.4 MB 5.6 MB/s eta 0:00:01
   ------------- -------------------------- 0.8/2.4 MB 6.7 MB/s eta 0:00:01
   ---------------------- ----------------- 1.3/2.4 MB 2.2 MB/s eta 0:00:01
   ------------------------------ --------- 1.8/2.4 MB 2.3 MB/s eta 0:00:01
   ----------------------------------- ---- 2.1/2.4 MB 2.1 MB/s eta 0:00:01
   ---------------------------------------- 2.4/2.4 MB 2.2 MB/s eta 0:00:00
Installing collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.1
    Uninstalling tokenizers-0.21.1:
      Successfully uninstalled tokenizers-0.21.1
Successfully installed tokenize



In [77]:
# 1. Установить совместимую версию tokenizers
%pip install "tokenizers>=0.20,<0.21" --force-reinstall

# 2. Обновить transformers, если нужно
%pip install -U transformers

# 3. (на всякий случай) Обновить spaCy
%pip install -U spacy


Collecting tokenizers<0.21,>=0.20
  Using cached tokenizers-0.20.3-cp310-none-win_amd64.whl.metadata (6.9 kB)
Collecting huggingface-hub<1.0,>=0.16.4 (from tokenizers<0.21,>=0.20)
  Using cached huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting filelock (from huggingface-hub<1.0,>=0.16.4->tokenizers<0.21,>=0.20)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub<1.0,>=0.16.4->tokenizers<0.21,>=0.20)
  Downloading fsspec-2025.3.2-py3-none-any.whl.metadata (11 kB)
Collecting packaging>=20.9 (from huggingface-hub<1.0,>=0.16.4->tokenizers<0.21,>=0.20)
  Downloading packaging-25.0-py3-none-any.whl.metadata (3.3 kB)
Collecting pyyaml>=5.1 (from huggingface-hub<1.0,>=0.16.4->tokenizers<0.21,>=0.20)
  Using cached PyYAML-6.0.2-cp310-cp310-win_amd64.whl.metadata (2.1 kB)
Collecting requests (from huggingface-hub<1.0,>=0.16.4->tokenizers<0.21,>=0.20)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
botocore 1.19.63 requires urllib3<1.27,>=1.25.4; python_version != "3.4", but you have urllib3 2.4.0 which is incompatible.
datasets 3.0.1 requires fsspec[http]<=2024.6.1,>=2023.1.0, but you have fsspec 2025.3.2 which is incompatible.
faiss-cpu 1.9.0.post1 requires numpy<3.0,>=1.25.0, but you have numpy 1.23.5 which is incompatible.
fastapi-cli 0.0.3 requires typer>=0.12.3, but you have typer 0.9.4 which is incompatible.
google-cloud-logging 2.7.2 requires protobuf<4.0.0dev, but you have protobuf 4.25.5 which is incompatible.
label-studio 1.8.0 requires bleach~=5.0.0, but you have bleach 4.1.0 which is incompatible.
label-studio 1.8.0 requires google-api-core==2.11.0, but you have google-api-core 2.19.1 which is incompatible.
label-studio 1.8.0 requires google-auth==2.14.1, but

Collecting transformers
  Using cached transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Using cached transformers-4.51.3-py3-none-any.whl (10.4 MB)
Using cached tokenizers-0.21.1-cp39-abi3-win_amd64.whl (2.4 MB)
Installing collected packages: tokenizers, transformers

  Attempting uninstall: tokenizers

    Found existing installation: tokenizers 0.20.3

    Uninstalling tokenizers-0.20.3:

      Successfully uninstalled tokenizers-0.20.3

   ---------------------------------------- 0/2 [tokenizers]
  Attempting uninstall: transformers
   ---------------------------------------- 0/2 [tokenizers]
    Found existing installation: transformers 4.46.3
   ---------------------------------------- 0/2 [tokenizers]
   -------------------- ------------------- 1/2 [transformers]
   -------------------- ------------------- 1/2 [transformers]
   --------------------

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
colpali-engine 0.3.5 requires transformers<4.47.0,>=4.46.1, but you have transformers 4.51.3 which is incompatible.
spacy-transformers 1.3.8 requires transformers<4.50.0,>=3.4.0, but you have transformers 4.51.3 which is incompatible.


Collecting spacy
  Downloading spacy-3.8.5-cp310-cp310-win_amd64.whl.metadata (28 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.6-cp310-cp310-win_amd64.whl.metadata (15 kB)
Collecting blis<1.4.0,>=1.3.0 (from thinc<8.4.0,>=8.3.4->spacy)
  Downloading blis-1.3.0-cp310-cp310-win_amd64.whl.metadata (7.6 kB)
Collecting numpy>=1.19.0 (from spacy)
  Downloading numpy-2.2.5-cp310-cp310-win_amd64.whl.metadata (60 kB)
Downloading spacy-3.8.5-cp310-cp310-win_amd64.whl (12.2 MB)
   ---------------------------------------- 0.0/12.2 MB ? eta -:--:--
   - -------------------------------------- 0.5/12.2 MB 2.8 MB/s eta 0:00:05
   --- ------------------------------------ 1.0/12.2 MB 2.6 MB/s eta 0:00:05
   ----- ---------------------------------- 1.6/12.2 MB 2.5 MB/s eta 0:00:05
   ------ --------------------------------- 1.8/12.2 MB 2.4 MB/s eta 0:00:05
   ------- -------------------------------- 2.4/12.2 MB 2.2 MB/s eta 0:00:05
   -------- ------------------------------- 2.

  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
colpali-engine 0.3.5 requires numpy<2.0.0, but you have numpy 2.2.5 which is incompatible.
colpali-engine 0.3.5 requires transformers<4.47.0,>=4.46.1, but you have transformers 4.51.3 which is incompatible.
contourpy 1.2.0 requires numpy<2.0,>=1.20, but you have numpy 2.2.5 which is incompatible.
en-core-web-sm 3.7.1 requires spacy<3.8.0,>=3.7.2, but you have spacy 3.8.5 which is incompatible.
hdbscan 0.8.37 requires numpy<2,>=1.20, but you have numpy 2.2.5 which is incompatible.
label-studio 1.8.0 requires bleach~=5.0.0, but you have bleach 4.1.0 which is incompatible.
label-studio 1.8.0 requires google-api-core==2.11.0, but you have google-api-core 2.19.1 which is in

In [80]:
!pip uninstall transformers -y

Found existing installation: transformers 4.51.3
Uninstalling transformers-4.51.3:
  Successfully uninstalled transformers-4.51.3


In [81]:
!pip install transformers==4.36.2

Collecting transformers==4.36.2
  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.36.2)
  Downloading tokenizers-0.15.2-cp310-none-win_amd64.whl.metadata (6.8 kB)
Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
   ---------------------------------------- 0.0/8.2 MB ? eta -:--:--
   -- ------------------------------------- 0.5/8.2 MB 4.2 MB/s eta 0:00:02
   --- ------------------------------------ 0.8/8.2 MB 3.0 MB/s eta 0:00:03
   ------ --------------------------------- 1.3/8.2 MB 2.3 MB/s eta 0:00:03
   -------- ------------------------------- 1.8/8.2 MB 2.1 MB/s eta 0:00:03
   ---------- ----------------------------- 2.1/8.2 MB 2.1 MB/s eta 0:00:03
   ------------ --------------------------- 2.6/8.2 MB 2.1 MB/s eta 0:00:03
   -------------- ------------------------- 2.9/8.2 MB 2.0 MB/s eta 0:00:03
   ---------------- ----------------------- 3.4/8.2 MB 2.0 MB/s eta 0:00:03
   ----------------- ------

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
byaldi 0.0.7 requires transformers>=4.42.0, but you have transformers 4.36.2 which is incompatible.
colpali-engine 0.3.5 requires numpy<2.0.0, but you have numpy 2.2.5 which is incompatible.
colpali-engine 0.3.5 requires transformers<4.47.0,>=4.46.1, but you have transformers 4.36.2 which is incompatible.
langchain-huggingface 0.1.2 requires tokenizers>=0.19.1, but you have tokenizers 0.15.2 which is incompatible.
langchain-huggingface 0.1.2 requires transformers>=4.39.0, but you have transformers 4.36.2 which is incompatible.
sentence-transformers 3.2.1 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.36.2 which is incompatible.


In [25]:
import spacy
from rusenttokenize import ru_sent_tokenize
from string import punctuation

nlp = spacy.load("ru_core_news_sm")
stop_words = nlp.Defaults.stop_words

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [26]:
def preprocess_text(text: str) -> str:
    doc = nlp(text)
    tokens = []
    for tok in doc:
        if tok.is_punct:
            tokens.append(tok.text)
            continue

        lemma = tok.lemma_.lower().strip()
        if (not lemma
            or lemma in stop_words
            or lemma.isdigit()
            or len(lemma) <= 3
           ):
            continue
        tokens.append(lemma)
    return " ".join(tokens)

df['text_proc'] = threads_aug_df['dialog_plain'].apply(preprocess_text)

In [27]:
texts = df['text_proc'].tolist()

In [28]:
from itertools import islice

def chunked(iterable, n):
    it = iter(iterable)
    while True:
        chunk = list(islice(it, n))
        if not chunk: 
            break
        yield chunk

In [88]:
!pip uninstall numpy 

^C


ERROR: Invalid requirement: '#': Expected package name at the start of dependency specifier
    #
    ^


In [30]:
%pip install numpy==1.24.4

Collecting numpy==1.24.4
  Downloading numpy-1.24.4-cp310-cp310-win_amd64.whl.metadata (5.6 kB)
Downloading numpy-1.24.4-cp310-cp310-win_amd64.whl (14.8 MB)
   ---------------------------------------- 0.0/14.8 MB ? eta -:--:--
   -- ------------------------------------- 0.8/14.8 MB 8.5 MB/s eta 0:00:02
   -- ------------------------------------- 1.0/14.8 MB 3.4 MB/s eta 0:00:05
   --- ------------------------------------ 1.3/14.8 MB 2.3 MB/s eta 0:00:06
   ---- ----------------------------------- 1.8/14.8 MB 2.5 MB/s eta 0:00:06
   ----- ---------------------------------- 2.1/14.8 MB 2.3 MB/s eta 0:00:06
   ------- -------------------------------- 2.6/14.8 MB 2.2 MB/s eta 0:00:06
   ------- -------------------------------- 2.9/14.8 MB 2.1 MB/s eta 0:00:06
   --------- ------------------------------ 3.4/14.8 MB 2.1 MB/s eta 0:00:06
   --------- ------------------------------ 3.7/14.8 MB 2.1 MB/s eta 0:00:06
   ----------- ---------------------------- 4.2/14.8 MB 2.0 MB/s eta 0:00:06
   

  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
colpali-engine 0.3.5 requires transformers<4.47.0,>=4.46.1, but you have transformers 4.36.2 which is incompatible.
faiss-cpu 1.9.0.post1 requires numpy<3.0,>=1.25.0, but you have numpy 1.24.4 which is incompatible.
label-studio 1.8.0 requires bleach~=5.0.0, but you have bleach 4.1.0 which is incompatible.
label-studio 1.8.0 requires google-api-core==2.11.0, but you have google-api-core 2.19.1 which is incompatible.
label-studio 1.8.0 requires google-auth==2.14.1, but you have google-auth 2.36.0 which is incompatible.
label-studio 1.8.0 requires jsonschema==3.2.0, but you have jsonschema 4.19.2 which is incompatible.
label-studio 1.8.0 requires numpy==1.21.6, but you have numpy 1.24.4 which is incompatible.
label-studio 1.8.0 requires pydant

In [32]:
%pip install sentence-transformers==2.2.2

Note: you may need to restart the kernel to use updated packages.




In [33]:
%pip install transformers==4.36.2

Note: you may need to restart the kernel to use updated packages.




In [34]:
%pip install transformers==4.46.1

Collecting transformers==4.46.1
  Downloading transformers-4.46.1-py3-none-any.whl.metadata (44 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers==4.46.1)
  Using cached tokenizers-0.20.3-cp310-none-win_amd64.whl.metadata (6.9 kB)
Downloading transformers-4.46.1-py3-none-any.whl (10.0 MB)
   ---------------------------------------- 0.0/10.0 MB ? eta -:--:--
   - -------------------------------------- 0.3/10.0 MB ? eta -:--:--
   --- ------------------------------------ 0.8/10.0 MB 3.4 MB/s eta 0:00:03
   ----- ---------------------------------- 1.3/10.0 MB 2.6 MB/s eta 0:00:04
   ------- -------------------------------- 1.8/10.0 MB 2.5 MB/s eta 0:00:04
   -------- ------------------------------- 2.1/10.0 MB 2.4 MB/s eta 0:00:04
   ---------- ----------------------------- 2.6/10.0 MB 2.3 MB/s eta 0:00:04
   ----------- ---------------------------- 2.9/10.0 MB 2.2 MB/s eta 0:00:04
   ------------- -------------------------- 3.4/10.0 MB 2.1 MB/s eta 0:00:04
   -------------- ------

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-huggingface 0.1.2 requires sentence-transformers>=2.6.0, but you have sentence-transformers 2.2.2 which is incompatible.


In [35]:
%pip install numpy==1.21.6

Collecting numpy==1.21.6
  Downloading numpy-1.21.6-cp310-cp310-win_amd64.whl.metadata (2.2 kB)
Downloading numpy-1.21.6-cp310-cp310-win_amd64.whl (14.0 MB)
   ---------------------------------------- 0.0/14.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/14.0 MB ? eta -:--:--
   - -------------------------------------- 0.5/14.0 MB 2.1 MB/s eta 0:00:07
   -- ------------------------------------- 1.0/14.0 MB 3.3 MB/s eta 0:00:04
   --- ------------------------------------ 1.3/14.0 MB 2.6 MB/s eta 0:00:05
   ----- ---------------------------------- 1.8/14.0 MB 2.2 MB/s eta 0:00:06
   ----- ---------------------------------- 1.8/14.0 MB 2.2 MB/s eta 0:00:06
   ------ --------------------------------- 2.4/14.0 MB 1.9 MB/s eta 0:00:07
   -------- ------------------------------- 2.9/14.0 MB 2.0 MB/s eta 0:00:06
   --------- ------------------------------ 3.4/14.0 MB 2.0 MB/s eta 0:00:06
   ---------- ----------------------------- 3.7/14.0 MB 2.0 MB/s eta 0:00:06
   -------

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
faiss-cpu 1.9.0.post1 requires numpy<3.0,>=1.25.0, but you have numpy 1.21.6 which is incompatible.
geopandas 0.14.4 requires numpy>=1.22, but you have numpy 1.21.6 which is incompatible.
label-studio 1.8.0 requires bleach~=5.0.0, but you have bleach 4.1.0 which is incompatible.
label-studio 1.8.0 requires google-api-core==2.11.0, but you have google-api-core 2.19.1 which is incompatible.
label-studio 1.8.0 requires google-auth==2.14.1, but you have google-auth 2.36.0 which is incompatible.
label-studio 1.8.0 requires jsonschema==3.2.0, but you have jsonschema 4.19.2 which is incompatible.
label-studio 1.8.0 requires pydantic<=1.11.0,>=1.7.3, but you have pydantic 2.9.2 which is incompatible.
label-studio 1.8.0 requires python-dateutil==2.8.1, but you have python-dateutil 2.9.0.post0 which is incompatible.
label-s