# NLP Eda


Данная часть EDA посвящена анализу смысловой составляющей роликов, которые удалось извлечь расшифровок базы видео.
 

In [2]:
import os
import pathlib

ROOT_PATH = pathlib.Path().resolve().parent
print(ROOT_PATH)
RANDOM_SEED = 42

# os.chdir(ROOT_PATH)

/Users/andrey/PycharmProjects/vector-search-hse


In [8]:
import pandas as pd

df = pd.read_parquet(ROOT_PATH / 'data/corpora.parquet')
df = df[df['unique_n_tokens'] > 5]
df

Unnamed: 0,filename,text,n_tokens,length,lang,score,preprocessed_text,tokenized_text,filtered_tokens,filtered_n_tokens,unique_tokens,unique_n_tokens,is_valid
0,IMG_0703.tsv,This is the most dangerous strike in mixed mar...,171,877,en,0.850605,this is the most dangerous strike in mixed mar...,"[this, is, the, most, dangerous, strike, in, m...","[dangerous, strike, mixed, martial, arts, hamm...",78,"[strike, whole, moat, right, pull, shot, time,...",67,True
1,IMG_0704.tsv,"So this puzzle is a little harder, but let's g...",142,760,en,0.981890,so this puzzle is little harder but let give i...,"[so, this, puzzle, is, little, harder, but, le...","[puzzle, little, harder, let, give, go, mitten...",73,"[added, let, new, two, bridge, link, issues, d...",62,True
2,IMG_0705.tsv,I want to talk about the lie of cultural appro...,716,3971,en,0.983911,want to talk about the lie of cultural appropr...,"[want, to, talk, about, the, lie, of, cultural...","[want, talk, lie, cultural, appropriation, whi...",341,"[ignorant, believe, leave, much, lie, video, a...",200,True
3,IMG_0707.tsv,"I was like, oh girl, shock it, I can't let you...",36,157,en,0.988990,was like oh girl shock it can let you care sav...,"[was, like, oh, girl, shock, it, can, let, you...","[like, oh, girl, shock, let, care, save, girl,...",18,"[let, fall, turn, like, care, save, oh, eyes, ...",10,True
5,IMG_0711.tsv,"Oh, what the hell? We're already qualifying. W...",27,142,en,0.971755,oh what the hell we re already qualifying what...,"[oh, what, the, hell, we, re, already, qualify...","[oh, hell, already, qualifying, hell, hell, he...",14,"[already, qualifying, let, go, hell, oh, okay]",7,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,IMG_1916.tsv,"Your favorite dinosaurs are T-Rex? Well, that'...",142,846,en,0.938825,your favorite dinosaurs are rex well that nuts...,"[your, favorite, dinosaurs, are, rex, well, th...","[favorite, dinosaurs, rex, well, nuts, favorit...",87,"[steroids, technically, let, active, bro, gays...",70,True
296,IMG_1923.tsv,That is why you never try to spell Mississippi...,157,915,en,0.944783,that is why you never try to spell mississippi...,"[that, is, why, you, never, try, to, spell, mi...","[never, try, spell, mississippi, mouth, full, ...",81,"[oh, every, insignificant, woven, pistol, feel...",66,True
297,IMG_1926.tsv,Why are there kids painting in the streets on ...,490,2880,en,0.960431,why are there kids painting in the streets on ...,"[why, are, there, kids, painting, in, the, str...","[kids, painting, streets, random, weekday, sep...",246,"[hawthorne, explain, project, treatment, three...",197,True
298,IMG_1929.tsv,and for today's experiment we're going with ba...,180,922,en,0.948105,and for today experiment we re going with baco...,"[and, for, today, experiment, we, re, going, w...","[today, experiment, going, bacon, fat, melting...",96,"[roasted, much, usual, gluten, yet, dough, hou...",81,True


# Базовое представление

In [9]:
from bertopic import BERTopic
texts = [' '.join(tokens) for tokens in df['filtered_tokens'].values]
len(texts)

248

In [13]:
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(texts)

In [14]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,190,0_like_know_one_going,"[like, know, one, going, yeah, oh, right, want...",[brains hardwired panic face looks almost huma...
1,1,58,1_chicken_going_like_add,"[chicken, going, like, add, get, one, butter, ...",[bleh want dinner hangover chicken nuggets gon...


In [15]:
topic_model.visualize_topics()

ValueError: zero-size array to reduction operation maximum which has no identity

# Tuning best

 Моделирование с помощью Bertopic позволяет нам использоваться кастомные модули для работы с нашими документа и конвейром
 
1. Задействуем эмбеддинги с LLM
2. Задействуем снижение размерностей с помощью Umap
3. Для кластеринга будет использоваться HDBSCAN

In [31]:
from sentence_transformers import SentenceTransformer

# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(texts, show_progress_bar=True)

Batches: 100%|██████████| 8/8 [00:00<00:00,  9.88it/s]


In [41]:
from umap import UMAP

umap_model = UMAP(n_neighbors=5, n_components=40, min_dist=0.0, metric='cosine', random_state=42)

In [42]:
from hdbscan import HDBSCAN

hdbscan_model = HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [43]:

from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(stop_words='english', min_df=2, max_df=100, ngram_range=(1, 2))

In [44]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, PartOfSpeech

# KeyBERT
keybert_model = KeyBERTInspired()

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.1)

# All representation models
representation_model = {
    "KeyBERT": keybert_model,
    "MMR": mmr_model,
}

In [46]:
from bertopic import BERTopic


zeroshot_topics = [
    'guide',
    'cooking',
    'minecraft',
    'humour'
]


topic_model = BERTopic(
  # Pipeline models
    zeroshot_topic_list=zeroshot_topics,
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

# Train model
topics, probs = topic_model.fit_transform(texts, embeddings)

# Show topics
topic_model.get_topic_info()

2025-11-01 23:21:41,828 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-01 23:21:42,038 - BERTopic - Dimensionality - Completed ✓
2025-11-01 23:21:42,039 - BERTopic - Zeroshot Step 1 - Finding documents that could be assigned to either one of the zero-shot topics
2025-11-01 23:21:42,314 - BERTopic - Zeroshot Step 1 - Completed ✓
2025-11-01 23:21:42,315 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-01 23:21:42,320 - BERTopic - Cluster - Completed ✓
2025-11-01 23:21:42,321 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-01 23:21:42,528 - BERTopic - Representation - Completed ✓


Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,Representative_Docs
0,0,185,0_like_know_yeah_going,"[like, know, yeah, going, oh, want, think, rig...","[like, say, mean, think, care, idea, goes, mic...","[like, yeah, going, want, think, time, love, s...",[right fucking talk something literally talkin...
1,1,52,1_chicken_going_add_like,"[chicken, going, add, like, butter, try, water...","[recipe, cooking, cook, chicken, meal, cooked,...","[chicken, butter, milk, cook, make, chocolate,...",[bleh want dinner hangover chicken nuggets gon...
2,2,11,2_cells_cell_water_plastic,"[cells, cell, water, plastic, really, cancer, ...","[biological, cells, genetic, cell, fuel, scien...","[cells, cell, water, plastic, cancer, fuel, sc...",[remember important scientific paper 2025 vide...
