In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
import pandas as pd
import numpy as np
import re
import random
from tqdm import tqdm

from gensim.models import FastText
from scipy.spatial.distance import cosine
from sklearn.utils import class_weight
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


In [None]:
# Mount Google Drive at /content/drive so the data and model files used below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


После подключения Google Диска укажите путь к файлу `t_zz_text.csv`. Файлы Диска доступны в каталоге `/content/drive/MyDrive/`. Например, если файл лежит в папке `project_2` на вашем Диске, путь будет `/content/drive/MyDrive/project_2/t_zz_text.csv`. Замените путь в вызове `pd.read_csv` в следующей ячейке на актуальный.

In [None]:
# Read the pipe-delimited export (malformed rows are reported as warnings instead of
# aborting the load), strip whitespace from the header names, and keep only the rows
# that actually contain operator text.
data = pd.read_csv(
    "/content/drive/MyDrive/t_zz_text.csv",
    sep='|',
    on_bad_lines='warn',
)
data.columns = data.columns.str.strip()
data = data.dropna(subset=["transcript_operator_words"])

In [None]:
def preprocess_text(text):
    """Normalise raw text before it is tokenised and embedded.

    The input is lower-cased, every character that is not a Latin, Russian or
    Kazakh letter or whitespace is deleted, and whitespace runs are collapsed
    to single spaces.

    Args:
        text: Raw string.

    Returns:
        The cleaned string; empty when nothing survives the filter.
    """
    text = text.lower()
    # The allow-list has to name all nine Kazakh-specific letters (ә ғ қ ң ө ұ ү һ і);
    # leaving any of them out (e.g. ғ, ң) silently cuts that letter out of words.
    # Disallowed characters are deleted, not replaced by a space.
    # NOTE(review): a FastText model trained on text cleaned with a narrower allow-list
    # will see words containing the added letters as new spellings — consider retraining.
    text = re.sub(r'[^a-zа-яёәғқңөұүһі\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

# Cast every transcript to str, clean it element-wise, and store the result in a new column.
data["clean_text"] = data["transcript_operator_words"].astype(str).map(preprocess_text)

In [None]:
# Disabled training cell, kept for reference only. When enabled it loads the model stored
# at model_path (or creates a new FastText model if loading fails), builds or updates the
# vocabulary from data["clean_text"], trains for 10 epochs and saves the model back to
# model_path. The cell that follows loads the saved model from that same path.
# model_path = "/content/drive/MyDrive/fasttext_model.bin"
# model_exists = False

# try:
#     model_ft = FastText.load(model_path)
#     print("Модель загружена, готовимся к дообучению...")
#     model_exists = True
# except (FileNotFoundError, ValueError): # Catch ValueError in case of corrupted file
#     print("Модель не найдена или повреждена, создаем новую...")
#     model_ft = FastText(vector_size=300, window=5, min_count=3, sg=1)

# sentences = [t.split() for t in data["clean_text"]]

# if model_exists:
#     model_ft.build_vocab(sentences, update=True) # Update existing vocabulary
# else:
#     model_ft.build_vocab(sentences) # Build new vocabulary (update=False by default)

# model_ft.train(sentences, total_examples=len(sentences), epochs=10)
# model_ft.save(model_path)

Модель не найдена или повреждена, создаем новую...


In [None]:
# Load the saved FastText model from Drive; every embedding below is computed with it.
model_ft = FastText.load("/content/drive/MyDrive/fasttext_model.bin")
print("FastText модель загружена")

FastText модель загружена


In [None]:
def sentence_embedding(sentence, ft_model):
    """Embed a sentence as the mean of its word vectors.

    Args:
        sentence: Whitespace-separated text.
        ft_model: Model exposing ``wv`` (membership test and lookup by word)
            and ``vector_size``.

    Returns:
        The element-wise average of the vectors of all tokens found in
        ``ft_model.wv``, or a zero vector of length ``ft_model.vector_size``
        when no token is found.
    """
    word_vectors = ft_model.wv
    found = [word_vectors[token] for token in sentence.split() if token in word_vectors]
    if len(found) == 0:
        # Nothing to average: fall back to an all-zero vector.
        return np.zeros(ft_model.vector_size)
    return np.mean(found, axis=0)


# One sentence embedding per cleaned transcript, in the row order of `data`, collected into a NumPy array.
embeddings = np.array([sentence_embedding(t, model_ft) for t in tqdm(data["clean_text"], desc="Embedding")])

Embedding: 100%|██████████| 21007/21007 [00:11<00:00, 1815.78it/s]


In [None]:
# Split the transcripts into two K-Means clusters and store the cluster index of each row.
# NOTE(review): the column is named "sentiment", but nothing visible here ties cluster
# 0/1 to a particular polarity — confirm the mapping before interpreting these labels.
kmeans = KMeans(n_clusters=2, random_state=42)
data["sentiment"] = kmeans.fit_predict(embeddings)

In [None]:
y = data['sentiment'].values

# Balanced class weights. compute_class_weight returns one weight per entry of `classes`,
# so the dict is keyed by the actual class labels rather than by array position; this keeps
# the mapping correct even when the labels are not exactly 0..k-1.
weights = class_weight.compute_class_weight(class_weight="balanced", classes=np.unique(y), y=y)
weights_dict = dict(zip(np.unique(y).tolist(), weights))

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(embeddings, y, test_size=0.2, random_state=42)

In [None]:
# Learn the scaling statistics from the training rows only, then apply them to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Logistic regression on the scaled embeddings, using the class weights computed above.
clf = LogisticRegression(class_weight=weights_dict, max_iter=1000)
clf.fit(X_train_scaled, y_train)

In [None]:
# Evaluate on the held-out split; y_prob keeps column 1 of predict_proba for each row.
# NOTE(review): the targets are the K-Means cluster ids computed from these same
# embeddings, so the scores show how well the classifier reproduces the clustering.
y_pred = clf.predict(X_test_scaled)
y_prob = clf.predict_proba(X_test_scaled)[:, 1]

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1756
           1       1.00      1.00      1.00      2446

    accuracy                           1.00      4202
   macro avg       1.00      1.00      1.00      4202
weighted avg       1.00      1.00      1.00      4202

[[1756    0]
 [   1 2445]]


In [None]:
import pandas as pd

# Despite the .csv extension, both files are read as Excel workbooks through openpyxl.
# In the saved run the printed column labels are example sentences, i.e. the sheets have
# no header row; header=None keeps that first sentence as data instead of consuming it
# as the column name (columns are then labelled 0, 1, ...).
pos_df = pd.read_excel(
    r"/content/drive/MyDrive/positive.csv",
    engine="openpyxl",
    header=None,
)

neg_df = pd.read_excel(
    r"/content/drive/MyDrive/negative.csv",
    engine="openpyxl",
    header=None,
)


In [None]:
# Sanity check: show the column labels of both loaded frames.
print(pos_df.columns)
print(neg_df.columns)


Index(['Круто, давай сделаем!'], dtype='object')
Index(['Не нравится мне всё это'], dtype='object')


In [None]:
def extract_texts(df):
    """Return the sentences stored in the first object-dtype column of ``df``.

    Missing cells and blank strings are skipped. Converting them with
    ``astype(str)`` would otherwise put literal "nan"/"None" entries (or empty
    sentences) into the labelled corpus.

    Args:
        df: DataFrame with at least one object-dtype column.

    Returns:
        list[str]: The remaining cell values as strings, in row order.

    Raises:
        IndexError: If ``df`` has no object-dtype column.
    """
    col = df.select_dtypes(include="object").columns[0]
    texts = df[col].dropna().astype(str)
    return [t for t in texts if t.strip()]


In [None]:
# Pull the sentence lists out of both frames and report how many each one holds.
pos_texts = extract_texts(pos_df)
neg_texts = extract_texts(neg_df)

print("Positive:", len(pos_texts))
print("Negative:", len(neg_texts))


Positive: 616
Negative: 516


In [None]:
def fasttext_augment(sentence, ft_model, max_replace=2, topn=5,
                     min_neighbor_sim=0.65, max_sentence_sim=0.95, rng=None):
    """Create one augmented variant of ``sentence`` by swapping words for neighbours.

    Up to ``max_replace`` randomly chosen word positions are replaced with a
    random pick among the word's nearest FastText neighbours.

    Args:
        sentence: Whitespace-separated source text.
        ft_model: Model whose ``wv`` supports membership tests and
            ``most_similar(word, topn=...)``.
        max_replace: Maximum number of word positions to try to replace.
        topn: How many nearest neighbours to request per word.
        min_neighbor_sim: A neighbour is eligible only if its similarity to the
            original word is strictly above this value.
        max_sentence_sim: The variant is kept only if its sentence-level cosine
            similarity to the source is strictly below this value, which drops
            near-duplicates of the source.
        rng: Optional ``random.Random``-like object (``sample``/``choice``) for
            reproducible augmentation; the module-level ``random`` is used
            when omitted.

    Returns:
        The augmented sentence, or None when the sentence has fewer than two
        words, nothing was replaced, or the variant is too close to the source.
    """
    picker = random if rng is None else rng

    words = sentence.split()
    if len(words) < 2:
        return None

    new_words = list(words)
    for i in picker.sample(range(len(words)), min(max_replace, len(words))):
        w = words[i]
        if w not in ft_model.wv:
            continue
        neighbors = [
            n for n, s in ft_model.wv.most_similar(w, topn=topn)
            if s > min_neighbor_sim
        ]
        if neighbors:
            new_words[i] = picker.choice(neighbors)

    if new_words == words:
        # Nothing was replaced: the variant would embed identically to the source
        # and be rejected below, so skip the embedding work.
        return None

    new_sent = " ".join(new_words)
    sim = 1 - cosine(
        sentence_embedding(sentence, ft_model),
        sentence_embedding(new_sent, ft_model)
    )

    # A NaN similarity (zero-norm embedding) fails the comparison and yields None.
    return new_sent if sim < max_sentence_sim else None


In [None]:
def augment_corpus(texts, ft_model, n=2):
    """Generate augmented variants for every text and return the unique ones.

    Args:
        texts: Iterable of source sentences.
        ft_model: FastText model passed through to ``fasttext_augment``.
        n: Number of augmentation attempts per source sentence.

    Returns:
        list[str]: Unique augmented sentences in the order they were first
        produced. Order-preserving de-duplication keeps the result stable for
        identical inputs, unlike round-tripping through a ``set``.
    """
    attempts = (fasttext_augment(t, ft_model) for t in texts for _ in range(n))
    # dict.fromkeys drops duplicates while remembering first-seen order.
    return list(dict.fromkeys(aug for aug in attempts if aug))


In [None]:
# Make three augmentation attempts per source sentence (n=3) for each polarity and
# report how many unique variants were kept.
aug_pos = augment_corpus(pos_texts, model_ft, n=3)
aug_neg = augment_corpus(neg_texts, model_ft, n=3)

print("Aug pos:", len(aug_pos))
print("Aug neg:", len(aug_neg))


Aug pos: 1447
Aug neg: 1155


In [None]:
# Reference corpus: positive originals and variants first, then the negative ones.
rag_texts = [*pos_texts, *aug_pos, *neg_texts, *aug_neg]

In [None]:
# Labels mirror the layout of rag_texts: every positive text (original + augmented)
# first, then every negative one.
rag_labels = (
    ["positive"] * (len(pos_texts) + len(aug_pos)) +
    ["negative"] * (len(neg_texts) + len(aug_neg))
)

# Embed the reference texts through the same preprocess_text step that rag_predict and
# suggest_text apply to incoming queries. Embedding the raw strings would leave case and
# punctuation inside the tokens and make query/reference similarities inconsistent.
rag_embeddings = np.array([
    sentence_embedding(preprocess_text(t), model_ft)
    for t in tqdm(rag_texts, desc="RAG Embedding")
])


RAG Embedding: 100%|██████████| 3734/3734 [00:00<00:00, 12291.96it/s]


In [None]:
def rag_predict(text, threshold=0.75):
    """Label ``text`` with the label of its most similar reference sentence.

    The query is cleaned with ``preprocess_text``, embedded with
    ``sentence_embedding`` and compared by cosine similarity against every row
    of the module-level ``rag_embeddings``.

    Args:
        text: Raw query text.
        threshold: Minimum cosine similarity required to return a label.

    Returns:
        tuple: ``(label, similarity)``, where ``label`` is the entry of
        ``rag_labels`` for the best match, or None when the best similarity is
        below ``threshold``. An empty reference corpus yields ``(None, 0.0)``.
    """
    if len(rag_embeddings) == 0:
        return None, 0.0

    emb = np.asarray(sentence_embedding(preprocess_text(text), model_ft), dtype=float)
    refs = np.asarray(rag_embeddings, dtype=float)

    # Cosine similarity against all reference rows at once. Zero-norm vectors (e.g. an
    # empty query) get similarity 0.0 instead of NaN, which would otherwise win argmax.
    denom = np.linalg.norm(refs, axis=1) * np.linalg.norm(emb)
    sims = np.divide(refs @ emb, denom, out=np.zeros(len(refs)), where=denom > 0)
    sims = np.clip(sims, -1.0, 1.0)

    idx = int(np.argmax(sims))
    best = float(sims[idx])
    if best >= threshold:
        return rag_labels[idx], best
    return None, best


In [None]:
def final_predict(text, ml_threshold=0.6, rag_threshold=0.75,
                  rag_override_sim=0.85, ml_conf_max=0.6):
    """Combine the classifier with the nearest-neighbour lookup and print a report.

    The classifier's verdict is used unless the reference lookup returns a
    label with high similarity while the classifier itself is unsure.

    Args:
        text: Raw input text.
        ml_threshold: ``predict_proba`` column-1 value above which the
            classifier verdict is "positive".
        rag_threshold: Minimum similarity for ``rag_predict`` to return a label.
        rag_override_sim: The lookup may override the classifier only when its
            similarity is strictly above this value.
        ml_conf_max: The lookup may override the classifier only when the
            classifier confidence is strictly below this value.

    Returns:
        str: "positive" or "negative".
    """
    clean = preprocess_text(text)
    emb = sentence_embedding(clean, model_ft).reshape(1, -1)
    emb_scaled = scaler.transform(emb)

    # Column 1 of predict_proba is treated as the "positive" class.
    # NOTE(review): confirm that class 1 of the classifier really means positive.
    prob = clf.predict_proba(emb_scaled)[0, 1]
    ml_label = "positive" if prob > ml_threshold else "negative"
    # Distance of the probability from 0.5, rescaled to the range [0, 1].
    ml_conf = abs(prob - 0.5) * 2

    rag_label, rag_sim = rag_predict(text, rag_threshold)

    if rag_label and rag_sim > rag_override_sim and ml_conf < ml_conf_max:
        final = rag_label
        source = "RAG override"
    else:
        final = ml_label
        source = "ML"

    print("\nText:", text)
    print(f"ML → {ml_label} (prob={prob:.2f})")
    print(f"RAG → {rag_label} (sim={rag_sim:.2f})")
    print(f"FINAL → {final.upper()} [{source}]")

    return final


In [None]:
def suggest_text(text, top_k=3):
    """Return the reference sentences most similar to ``text``.

    Args:
        text: Raw query text; it is cleaned with ``preprocess_text`` before
            being embedded.
        top_k: Maximum number of suggestions. Values <= 0 give an empty list
            (a plain ``[-top_k:]`` slice would return the whole corpus for 0).

    Returns:
        list[tuple]: ``(reference_text, label, similarity)`` triples, most
        similar first. Zero-norm embeddings score 0.0 rather than NaN.
    """
    if top_k <= 0 or len(rag_embeddings) == 0:
        return []

    emb = np.asarray(sentence_embedding(preprocess_text(text), model_ft), dtype=float)
    refs = np.asarray(rag_embeddings, dtype=float)

    # Cosine similarity against all reference rows at once, guarding zero norms.
    denom = np.linalg.norm(refs, axis=1) * np.linalg.norm(emb)
    sims = np.divide(refs @ emb, denom, out=np.zeros(len(refs)), where=denom > 0)
    sims = np.clip(sims, -1.0, 1.0)

    # argsort is ascending: take the tail and reverse it for best-first order.
    top_idx = np.argsort(sims)[-top_k:][::-1]
    return [(rag_texts[i], rag_labels[i], float(sims[i])) for i in top_idx]

In [None]:
def llm_suggest(text):
    """Run the full prediction for ``text`` and print the closest reference sentences.

    Args:
        text: Raw input text.

    Returns:
        The label returned by ``final_predict``.
    """
    verdict = final_predict(text)
    print("\n=== LLM SUGGESTIONS ===")
    for candidate in suggest_text(text):
        neighbour, tag, score = candidate
        print(f"[{tag}] (sim={score:.2f}) → {neighbour}")
    return verdict

In [None]:
# Demo call of the full pipeline.
# NOTE(review): the argument is an empty string, while the saved output of this cell shows
# a non-empty "Text:" line — the output appears stale; re-run with a real sample sentence.
llm_suggest("")



Text: Я гей
ML → negative (prob=0.00)
RAG → None (sim=0.56)
FINAL → NEGATIVE [ML]

=== LLM SUGGESTIONS ===
[negative] (sim=0.56) → Злой сейчас
[positive] (sim=0.56) → сетям бомба результату
[negative] (sim=0.55) → сетям ужас результатам


'negative'