# Генерируем фичи из текста

In [2]:
import torch
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

# Run on the current CUDA device when one is available, otherwise on the CPU.
# Both branches yield a torch.device, so `device` always has the same type.
device = (
    torch.device(f'cuda:{torch.cuda.current_device()}')
    if torch.cuda.is_available()
    else torch.device('cpu')
)

import warnings
# Suppress every Python warning, from all libraries, from this point on.
# NOTE(review): this also hides deprecation and numerical warnings.
warnings.filterwarnings('ignore')

### Пробуем на искусственном примере

In [3]:
from transformers import BertTokenizerFast, AutoModelForSequenceClassification

# Hugging Face checkpoint shared by the tokenizer and the classifier.
MODEL_NAME = 'blanchefort/rubert-base-cased-sentiment'

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The tokenizer has no `device` option: tokenization runs on the CPU and the
# encoded batch is moved to `device` with `.to(device)` where it is used.
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, return_dict=True)

# Move the classifier weights to the selected device.
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/499 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/943 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

In [4]:
# Three hand-written sentences used as a smoke test for the classifier.
messages = [
    'как же все замечательно',
    'эти макароны просто отвратительны',
    'на прогулке увидел птицу',
]

# Encode the whole list as one padded batch, capped at 512 tokens per text,
# and move the resulting tensors to the same device as the model.
inputs = tokenizer(
    messages,
    padding=True,
    truncation=True,
    return_tensors='pt',
    max_length=512,
).to(device)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    logits = model(**inputs).logits

logits

tensor([[-1.2279,  2.4659, -2.6080],
        [-0.0596, -1.0397,  1.3663],
        [ 1.7569, -0.2291, -0.9021]], device='cuda:0')

In [5]:
# Swap the two axes so that each row holds one class and each column one message.
logits.transpose(0, 1)

tensor([[-1.2279, -0.0596,  1.7569],
        [ 2.4659, -1.0397, -0.2291],
        [-2.6080,  1.3663, -0.9021]], device='cuda:0')

In [8]:
# Index of the largest logit for every message in the batch.
predicted_class_id = torch.argmax(logits, dim=1).cpu().numpy()

# Print one label per message, whatever the number of messages in the batch.
for class_id in predicted_class_id:
    print(model.config.id2label[int(class_id)])

POSITIVE
NEGATIVE
NEUTRAL


In [7]:
# Show the model's mapping from class index to label name; the index is the
# position of the corresponding value in each row of `logits`.
model.config.id2label

{0: 'NEUTRAL', 1: 'POSITIVE', 2: 'NEGATIVE'}

Всё работает корректно: предсказанные классы совпадают с ожидаемой тональностью каждого примера.

### Тональности для нашего корпуса текстов

In [12]:
path = '../data/messages/messages_info.csv'
colab_path = 'messages_info.csv'

# Prefer the file in the working directory (`colab_path`); when it is absent,
# fall back to the repository-relative `path` instead of failing.
try:
    data = pd.read_csv(colab_path, index_col=0)
except FileNotFoundError:
    data = pd.read_csv(path, index_col=0)
data.head(3)

Unnamed: 0,message_id,channel_id,tg_message_id,date,text,views,forwards,thumb_up_react_cnt,thumb_down_react_cnt,mindblowing_react_cnt,...,banana_react_cnt,poop_react_cnt,rofl_smile_react_cnt,clown_smile_react_cnt,devil_smile_react_cnt,hundred_react_cnt,head_arm_smile_react_cnt,eyes_react_cnt,company,ticker
0,1675,3,22642,2023-10-06 16:37:06,все больше компаний рф заявляют о планах по ip...,65009,84,301,14,0,...,0,0,0,0,0,0,0,0,crystal,KLVZ
1,1474,3,22861,2023-10-31 11:41:23,🇷🇺 тема ipo на российском фондовом рынке сегод...,63078,63,251,19,0,...,0,0,0,0,0,0,0,0,crystal,KLVZ
2,758,3,23707,2024-02-20 18:20:41,📺 алкогольная группа «кристалл» уже в этот чет...,69592,119,146,86,0,...,0,0,0,0,0,0,0,0,crystal,KLVZ


In [13]:
# Keep one row per distinct (message_id, text) pair and renumber the rows from 0.
unique_data = (
    data.loc[:, ['message_id', 'text']]
    .drop_duplicates()
    .reset_index(drop=True)
)
unique_data.head(3)

Unnamed: 0,message_id,text
0,1675,все больше компаний рф заявляют о планах по ip...
1,1474,🇷🇺 тема ipo на российском фондовом рынке сегод...
2,758,📺 алкогольная группа «кристалл» уже в этот чет...


In [20]:
# Number of texts sent through the model at once. It stays constant for the
# whole run; the size of each individual batch is taken from the batch itself.
batch_size = 1
n_rows = len(unique_data)

# Positional batch id of every row, stored as a column of the frame.
unique_data['batch_num'] = np.arange(n_rows) // batch_size
max_batch = (n_rows - 1) // batch_size  # -1 for an empty frame, so the loop is skipped

texts = unique_data['text'].tolist()

# Results are collected in plain lists and converted to arrays once, after the
# loop: np.append copies the whole array on every call, and appending np.nan to
# an array of strings stores the text 'nan' instead of a real missing value.
neutral_values = []
positive_values = []
negative_values = []
tonality_labels = []

# (batch_num, error) for every batch that could not be scored.
exception = []

for batch_num in tqdm(range(max_batch + 1)):
    first_row = batch_num * batch_size
    messages = texts[first_row:first_row + batch_size]
    try:
        inputs = tokenizer(messages, padding=True, truncation=True, return_tensors='pt', max_length=512).to(device)
        with torch.no_grad():
            logits = model(**inputs).logits

        predicted_class_id = torch.argmax(logits, dim=1).cpu().numpy()
        batch_labels = [model.config.id2label[int(class_id)] for class_id in predicted_class_id]
        batch_logits = logits.cpu().numpy()

    except Exception as e:
        # Best effort: remember the failure and mark every row of this batch as
        # missing so the results stay aligned with the rows of `unique_data`.
        exception.append((batch_num, e))
        batch_logits = np.full((len(messages), 3), np.nan)
        batch_labels = [np.nan] * len(messages)

    # Column order follows model.config.id2label: 0 = NEUTRAL, 1 = POSITIVE, 2 = NEGATIVE.
    neutral_values.extend(batch_logits[:, 0].tolist())
    positive_values.extend(batch_logits[:, 1].tolist())
    negative_values.extend(batch_logits[:, 2].tolist())
    tonality_labels.extend(batch_labels)

neutral_score = np.array(neutral_values, dtype=float)
positive_score = np.array(positive_values, dtype=float)
negative_score = np.array(negative_values, dtype=float)
# dtype=object keeps failed rows as real NaN next to the string labels.
tonality_class = np.array(tonality_labels, dtype=object)

# Make swallowed failures visible instead of leaving them only in `exception`.
if exception:
    print(f'{len(exception)} of {max_batch + 1} batches failed; see `exception` for details')

  0%|          | 0/37075 [00:00<?, ?it/s]

In [21]:
# Display the collected values for the NEUTRAL class (one per scored text).
neutral_score

array([ 1.81289196, -0.05981494,  1.85141134, ...,  1.37671924,
        1.88078916, -0.05984293])

In [22]:
# Attach the per-text results to the frame as new columns, in this order.
result_columns = {
    'neutral_score': neutral_score,
    'positive_score': positive_score,
    'negative_score': negative_score,
    'tonality_class': tonality_class,
}
for column_name, column_values in result_columns.items():
    unique_data[column_name] = column_values

unique_data.head()

Unnamed: 0,message_id,text,batch_num,neutral_score,positive_score,negative_score,tonality_class
0,1675,все больше компаний рф заявляют о планах по ip...,0,1.812892,0.16765,-1.352255,NEUTRAL
1,1474,🇷🇺 тема ipo на российском фондовом рынке сегод...,1,-0.059815,-1.03929,1.365424,NEGATIVE
2,758,📺 алкогольная группа «кристалл» уже в этот чет...,2,1.851411,0.183083,-1.381232,NEUTRAL
3,3377,🟢 новости к этому часу\n\n⚪️ добыча сургутнефт...,3,1.870797,0.178266,-1.379079,NEUTRAL
4,3224,​​🟢 итоги дня. российские акции немного подоро...,4,-0.059877,-1.039262,1.365385,NEGATIVE


In [23]:
# Output locations: repository-relative path and working-directory path.
path = '../data/messages/tonality.csv'
colab_path = 'tonality.csv'

# Write the scored frame (index included) to the working-directory file.
unique_data.to_csv(path_or_buf=colab_path)