# Классификация на эмбеддингах

In [None]:
import numpy as np
import pandas as pd
import torch
import transformers
from tqdm import notebook
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [None]:
# Load the tweets dataset from CSV into a DataFrame.
# Later cells read its 'text' column (tokenization) and 'positive' column (target).
df_tweets = pd.read_csv('/datasets/tweets.csv')

In [None]:
# Build a BERT tokenizer from the local vocabulary file.
tokenizer = transformers.BertTokenizer(
    vocab_file='/datasets/ds_bert/vocab.txt')

# Encode every tweet into a list of token ids, special tokens included.
tokenized = df_tweets['text'].apply(
    lambda text: tokenizer.encode(text, add_special_tokens=True))

# Length of the longest encoded tweet (0 when there are no tweets at all).
max_len = max(map(len, tokenized.values), default=0)

# Right-pad each id list with zeros so that every row has max_len entries.
padded = np.array(
    [ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

# 1 where the padded matrix holds a non-zero id, 0 elsewhere.
# NOTE(review): this treats id 0 as padding only — confirm against vocab.txt.
attention_mask = np.where(padded != 0, 1, 0)

In [None]:
# Read the BERT configuration from the local JSON file.
config = transformers.BertConfig.from_json_file(
    '/datasets/ds_bert/bert_config.json')
# Instantiate BertModel from the local rubert_model.bin file using that config.
model = transformers.BertModel.from_pretrained(
    '/datasets/ds_bert/rubert_model.bin', config=config)

In [None]:
# Run the padded tweets through the BERT model in batches and collect one
# embedding vector per tweet.
batch_size = 100
embeddings = []
# Walk the rows by start offset so the final, shorter batch is processed too;
# iterating over `n_rows // batch_size` batches would silently drop the
# trailing rows whenever n_rows is not a multiple of batch_size.
for start in notebook.tqdm(range(0, padded.shape[0], batch_size)):
    stop = start + batch_size  # NumPy clips a slice that runs past the end
    batch = torch.LongTensor(padded[start:stop])
    attention_mask_batch = torch.LongTensor(attention_mask[start:stop])

    # Inference only: no gradients are needed.
    with torch.no_grad():
        batch_embeddings = model(batch, attention_mask=attention_mask_batch)

    # Keep position 0 of every sequence from the first model output
    # (presumably the [CLS] vector of the last hidden state — TODO confirm).
    embeddings.append(batch_embeddings[0][:, 0, :].numpy())

In [None]:
# Join the per-batch embedding arrays collected in `embeddings` into a single
# array along the first axis (np.concatenate's default).
features = np.concatenate(embeddings)


# Train and test the model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Target variable
target = df_tweets['positive'].values

# Random subsample of 400 rows; the global NumPy seed makes it reproducible.
# Indices are drawn from range(len(features)), so they stay valid for
# `features` even if it has fewer rows than `target`.
np.random.seed(42)
indices = np.random.choice(len(features), size=400, replace=False)

X_sample = features[indices]
y_sample = target[indices]

# 50:50 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_sample, y_sample, test_size=0.5, random_state=42)

# Fit the classifier.
# NOTE: this rebinds `model`, which previously held the BERT model; the
# embedding cell cannot be re-run afterwards without reloading BERT.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Accuracy on the training half (kept as in the original cell)
y_pred = model.predict(X_train)
acc = accuracy_score(y_train, y_pred)
print("🎯 Accuracy на обучающей выборке:", round(acc, 4))

# Accuracy on the held-out half: the original cell created the test split but
# never used it, so only the optimistic training score was reported.
y_pred_test = model.predict(X_test)
acc_test = accuracy_score(y_test, y_pred_test)
print("🎯 Accuracy на тестовой выборке:", round(acc_test, 4))
