In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
import pandas as pd
import numpy as np
import re
import random
from tqdm import tqdm

from gensim.models import FastText
from scipy.spatial.distance import cosine
from sklearn.utils import class_weight
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and the FastText model are read from there.
drive.mount('/content/drive')

Mounted at /content/drive


После подключения Google Диска укажите путь к файлу `t_zz_text.csv`. Файлы Диска доступны в `/content/drive/MyDrive/`. Например, если файл лежит в папке `project_2` на вашем Диске, путь будет `/content/drive/MyDrive/project_2/t_zz_text.csv`. В ячейке ниже замените путь `/content/drive/MyDrive/t_zz_text.csv` в вызове `pd.read_csv` на актуальный путь к вашему файлу.

In [None]:
# Load the pipe-delimited transcript export. Malformed rows are skipped with a
# warning (on_bad_lines='warn') instead of aborting the load.
data = pd.read_csv("/content/drive/MyDrive/t_zz_text.csv", sep='|', on_bad_lines='warn')
# Strip whitespace from the column names so the lookups by name below still work
# if the header cells are padded.
data.columns = data.columns.str.strip()
# Keep only rows that have operator text. The explicit .copy() makes `data` an
# independent frame, so columns added in later cells are written to it directly
# rather than to a slice of the loaded frame (avoids SettingWithCopyWarning).
data = data[data["transcript_operator_words"].notna()].copy()

In [None]:
def preprocess_text(text):
    """Normalize a transcript string for tokenization.

    The text is lower-cased, every character that is not a Latin letter, a
    Russian letter, a Kazakh-specific letter or whitespace is removed, and
    runs of whitespace are collapsed to single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string: lower-case tokens separated by single spaces,
        with no leading or trailing whitespace.
    """
    text = text.lower()
    # Kazakh extends the Russian alphabet with ә ғ қ ң ө ұ ү һ і. All nine must be
    # whitelisted: leaving out ғ and ң silently mangles words ("жаңа" -> "жаа").
    # NOTE(review): a FastText model trained on text cleaned without ғ/ң has not
    # seen the corrected spellings as whole words — consider retraining it.
    text = re.sub(r'[^a-zа-яёәғқңөұүһі\s]', '', text)
    # split() with no arguments drops leading/trailing whitespace and treats any
    # whitespace run as one separator, so no separate collapse step is needed.
    return ' '.join(text.split())

# astype(str) turns any non-string cell into text so preprocess_text can call .lower() on it.
data["clean_text"] = data["transcript_operator_words"].astype(str).apply(preprocess_text)

In [None]:
# Disabled training cell, kept for reference. When uncommented it loads the FastText
# model from model_path (or creates a new one if loading fails), builds or updates the
# vocabulary from data["clean_text"], trains for 10 epochs and saves the model back
# to model_path.
# model_path = "/content/drive/MyDrive/fasttext_model.bin"
# model_exists = False

# try:
#     model_ft = FastText.load(model_path)
#     print("Модель загружена, готовимся к дообучению...")
#     model_exists = True
# except (FileNotFoundError, ValueError): # Catch ValueError in case of corrupted file
#     print("Модель не найдена или повреждена, создаем новую...")
#     model_ft = FastText(vector_size=300, window=5, min_count=3, sg=1)

# sentences = [t.split() for t in data["clean_text"]]

# if model_exists:
#     model_ft.build_vocab(sentences, update=True) # Update existing vocabulary
# else:
#     model_ft.build_vocab(sentences) # Build new vocabulary (update=False by default)

# model_ft.train(sentences, total_examples=len(sentences), epochs=10)
# model_ft.save(model_path)

Модель не найдена или повреждена, создаем новую...


In [None]:
# Load the FastText model stored on Drive; model_ft supplies the word vectors
# used to embed each transcript.
model_ft = FastText.load("/content/drive/MyDrive/fasttext_model.bin")
print("FastText модель загружена")

FastText модель загружена


In [None]:
def sentence_embedding(sentence, ft_model):
    """Return the average word vector of a whitespace-tokenized sentence.

    Args:
        sentence: Text whose tokens are separated by whitespace.
        ft_model: Model exposing ``wv`` (supports ``in`` and ``[]`` lookup by
            token) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``ft_model.wv``, or a zero vector of length ``ft_model.vector_size``
        when no token has a vector (including an empty sentence).
    """
    found = [ft_model.wv[token] for token in sentence.split() if token in ft_model.wv]
    if len(found) == 0:
        # Nothing to average: fall back to the origin so every row has the same width.
        return np.zeros(ft_model.vector_size)
    return np.mean(found, axis=0)


# One sentence vector per row of data["clean_text"], collected in row order; tqdm shows progress.
embeddings = np.array([sentence_embedding(t, model_ft) for t in tqdm(data["clean_text"], desc="Embedding")])

Embedding: 100%|██████████| 21007/21007 [00:11<00:00, 1815.78it/s]


In [None]:
# Partition the sentence embeddings into two clusters; the fixed seed keeps the
# assignment repeatable between runs.
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(embeddings)
# NOTE(review): the values stored here are unsupervised cluster ids, not verified
# sentiment labels — which id corresponds to which sentiment must be checked by hand.
data["sentiment"] = kmeans.labels_

In [None]:
y = data['sentiment'].values

# "balanced" weights are inversely proportional to class frequency, so the rarer
# cluster is not drowned out when the classifier is fitted.
classes = np.unique(y)
weights = class_weight.compute_class_weight(class_weight="balanced", classes=classes, y=y)
# Key every weight by its actual class label. Keying by position (enumerate) is only
# correct when the labels happen to be exactly 0..k-1; zip stays correct for any labels.
# tolist() converts the numpy scalars to plain Python values for use as dict keys.
weights_dict = dict(zip(classes.tolist(), weights))

In [None]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
# NOTE(review): the split is not stratified — consider stratify=y if the two clusters
# turn out to be strongly imbalanced.
X_train, X_test, y_train, y_test = train_test_split(embeddings, y, test_size=0.2, random_state=42)

In [None]:
# Fit the scaler on the training rows only, then apply that same fitted transform
# to both splits so no statistics from the test rows reach the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Logistic regression on the scaled embeddings, weighted per class via weights_dict;
# max_iter is raised to 1000 (presumably to give the solver room to converge).
clf = LogisticRegression(class_weight=weights_dict, max_iter=1000)
clf.fit(X_train_scaled, y_train)

In [None]:
# Hard class predictions for the held-out rows.
y_pred = clf.predict(X_test_scaled)
# Second column of predict_proba (index 1); not used further in this cell.
y_prob = clf.predict_proba(X_test_scaled)[:, 1]

# NOTE(review): y comes from KMeans run on these same embeddings, so the scores below
# measure how well the classifier reproduces the clustering, not sentiment accuracy.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))