In [1]:
from argparse import Namespace
import re

import numpy as np
import pandas
import time
import pandas as pd
import regex as re
import text_processing as tp
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

Pre-defined params

In [2]:
test_proportion = 0.15
eval_proportion = 0.15

In [3]:
MAX_SEN_LEN = 262 # хард лок длины, воизбежание ошибки
MAX_TOKEN_GENERATED = 70
TOKENS_TRESHOLD_FREQ = 25

In [4]:
BATCH_SIZE = 256
EPOCHS = 10
LR = 0.001

RNN_HIDDEN_SIZE = 324
FC_HIDDEN_SIZE = 512
NUM_CHANNELS_CNN = 256
EMBEDDING_SIZE = 256

In [5]:
MODEL_SAVE_FILEPATH = 'RNN_params.pt'

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [7]:
device

'cuda'

In [8]:
class SentimentAnalyzerMLP(nn.Module):
    def __init__(self, input_cnt, output_cnt, hidden_size):
        super().__init__()
        self.flatten = nn.Flatten(start_dim=1)
        self.linear_stack = nn.Sequential(
            nn.Linear(input_cnt, hidden_size),
            nn.ELU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ELU(),
            nn.Linear(hidden_size, output_cnt)
        )
    def forward(self, x_data, apply_softmax=False):
        y_out = self.linear_stack(x_data)
        if apply_softmax:
            y_out = nn.functional.softmax(y_out, dim=1)
        return y_out

In [9]:
class SentimentAnalyzerCNN(nn.Module):
    def __init__(self, in_channels, num_channels, num_classes):
        super().__init__()
        self.convnet = nn.Sequential(
            nn.Conv1d(in_channels, num_channels, kernel_size=3),
            nn.ELU(),
            nn.Conv1d(num_channels, num_channels, kernel_size=3, stride=2),
            nn.ELU(),
            nn.Conv1d(num_channels, num_channels, kernel_size=3, stride=2),
            nn.ELU(),
            nn.Conv1d(num_channels, num_channels, kernel_size=3),
            nn.AdaptiveAvgPool1d(1)
        )
        self.fc = nn.Linear(num_channels, num_classes)

    def forward(self, x_data, apply_softmax=False):
        fratures = self.convnet(x_data).squeeze(dim=2)
        y_out = self.fc(fratures)
        if apply_softmax:
            y_out = torch.nn.functional.softmax(y_out, dim=1)
        return y_out

In [10]:
class SentimentAnalyzerEmbedCNN(nn.Module):
    def __init__(self, embedding_size, num_embeddings,\
                 num_channels, hidden_size, num_classes, kernel_size=3, pretrained_embeddings = None, padding_idx=0):
        super().__init__()
        
        if pretrained_embeddings is None:
            self.embed = nn.Embedding(num_embeddings, embedding_size, padding_idx)
        else:
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
            self.embed = nn.Embedding(num_embeddings, embedding_size, padding_idx, _weight=pretrained_embeddings)

        self.convnet = nn.Sequential(
            nn.Conv1d(embedding_size, num_channels, kernel_size=kernel_size),
            nn.ELU(),
            nn.Conv1d(num_channels, num_channels, kernel_size=kernel_size, stride=2),
            nn.ELU(),
            nn.Conv1d(num_channels, num_channels, kernel_size=kernel_size, stride=2),
            nn.ELU(),
            nn.Conv1d(num_channels, num_channels, kernel_size=kernel_size),
            nn.ELU(),
            nn.AdaptiveAvgPool1d(1),
            )
        self.fc = nn.Sequential(
            nn.Linear(num_channels, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_classes)
        )
        
    def forward(self, x_data, apply_softmax=False):
        # x_data - vector of indices
        embed_vectors = self.embed(x_data).permute(0, 2, 1) # permutation to make embedding_dimensionality input_channels in CNN.
                                                            # now each token embedding vector are collumn, not a row
        
        features = self.convnet(embed_vectors).squeeze(dim=2)
        y_out = self.fc(features)
        if apply_softmax:
            y_out = nn.functional.softmax(y_out, dim=1)

        return y_out

In [11]:
class SentimentAnalyzerEmbedRNN(nn.Module):
    def __init__(self, embed_size, num_embed, rnn_hidden_size, fc_hidden_size, num_classes, padding_idx=0, batch_first=True, pretrained_embeddings=None):
        super().__init__()

        if pretrained_embeddings is None:
            self.embed = nn.Embedding(num_embed, embed_size, padding_idx)
        else:
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
            self.embed = nn.Embedding(num_embed, embed_size, padding_idx, _weight=pretrained_embeddings)
        
        self.rnn = nn.GRU(embed_size, rnn_hidden_size, batch_first=batch_first)

        self.linear_stack = nn.Sequential(
            nn.Linear(rnn_hidden_size, fc_hidden_size),
            nn.ReLU(),
            nn.Linear(fc_hidden_size, num_classes),
        )
    def forward(self, x_data, useful_len, use_packing=True, apply_softmax=False):
        # print('raw_size = ', x_data.size())
        # time.sleep(5)

        embedded = self.embed(x_data)
        # print('after embed size = ', embedded.size())
        # time.sleep(5)

        # Упаковка для эффективности. Паддинг не будет участвовать в расчетах. Полезная часть каждого предложения определяется переменной useful_length
        # Как показали опыты, лучше всегда упаковывать тензоры для RNN(Результаты лучше на ~15%)
        if use_packing:
            packed = nn.utils.rnn.pack_padded_sequence(embedded, useful_len.cpu(), batch_first=True, enforce_sorted=False)
        else:
            packed = embedded
        #print('packed size = ', packed.size())
        #time.sleep(5)

        outputs, hiddens = self.rnn(packed) # hiddens = [Direction∗num_layers, batch_size, hidden_size] - финальное скрытое состояноя для каждого слоя RNN
        # print('hidden size = ', hiddens.size())
        # time.sleep(5)

        hiddens = nn.functional.dropout(hiddens, 0.5)

        y_out = self.linear_stack(hiddens[-1])
        # print('after linear size = ',y_out.size())
        # time.sleep(5)

        if apply_softmax:
            y_out = nn.functional.softmax(y_out, dim=1)
        
        return y_out


In [12]:
class TokenGenerationRNN(nn.Module):
    def __init__(self, num_tokens_embed, num_label_embed, embed_size, rnn_hidden_size,\
                 fc_hidden_size, num_classes, padding_idx=0, batch_first=True, pretrained_embeddings=None):
        super().__init__()

        if pretrained_embeddings is None:
            self.token_embed = nn.Embedding(num_tokens_embed, embed_size, padding_idx)
        else:
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
            self.token_embed = nn.Embedding(num_tokens_embed, embed_size, padding_idx, _weight=pretrained_embeddings)
        
        self.label_embed = nn.Embedding(num_label_embed, embed_size, padding_idx)

        self.rnn = nn.GRU(embed_size, rnn_hidden_size, batch_first=batch_first)

        self.linear_stack = nn.Sequential(
            nn.Linear(rnn_hidden_size, fc_hidden_size),
            nn.Dropout(),
            nn.ELU(),
            nn.Linear(fc_hidden_size, num_tokens_embed),
        )
    
    def forward(self, x_data, useful_len, use_packing=True, apply_softmax=False):
        # print('x_data ', x_data.size())

        x_embed = self.token_embed(x_data)
        # print('x_embed ', x_embed.size())

        if use_packing:
            # Упаковываем последовательности
            packed = nn.utils.rnn.pack_padded_sequence(x_embed, useful_len.cpu(), batch_first=True, enforce_sorted=False)

            packed_outputs, last_hidden_vec = self.rnn(packed)

            # Распаковываем обратно в тензор с паддингом
            outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs, batch_first=True, padding_value=0.0)
        else:
            outputs, last_hidden_vec = self.rnn(x_embed)

        # print('outputs before reshape ', outputs.size())

        batch_size, seq_size, feature_size = outputs.shape
        y_out = outputs.reshape(batch_size * seq_size, feature_size) # Матрица, где каждый элемент представляет отдельное предсказание(такая форма необходима для полносвязного слоя)

        # print('outputs after reshape ', y_out.size())

        y_out = self.linear_stack(y_out)

        # print('y_out before reshape ', y_out.size())

        if apply_softmax:
            y_out = nn.functional.softmax(y_out, dim=1)
        
        new_features_size = y_out.size(1)
        y_out = y_out.reshape(batch_size, seq_size, new_features_size)

        # print('y_out after reshape ', y_out.size())

        return y_out

In [13]:
def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device='cpu'):
    dataloader = DataLoader(dataset, batch_size, shuffle, drop_last=drop_last)
    for data_dict in dataloader:
        out_data_dict = {}
        for key, tensor in data_dict.items():
            out_data_dict[key] = data_dict[key].to(device) # Sending tensors to propper device
        yield out_data_dict

In [14]:
def samples_from_model(model, vectorizer : tp.Vectorizer, seq_size, seq_count=1, temperature=1.0):
    '''Returns tensor of generated indices [seq_count, seq_size].\
        vecatorizer: token vectorizer\
        seq_size: max len of the generated seq\
        seq_count: count of sequences to generate\
        '''
    model.eval()
    begin_seq_index = [vectorizer.tokens_vocab._bos_index for _ in range(seq_count)]
    begin_seq_index = torch.tensor(begin_seq_index, dtype=torch.int64).unsqueeze(dim=1) # [seq_count, 1]
    indices = [begin_seq_index] # [seq_count, 1]

    for timestamp in range(seq_size):
        x_t = indices[timestamp] # get the last element
        x_t_embed = model.token_embed(x_t) # [seq_count, 1, embed_size]
        rnn_out, hidden_last = model.rnn(x_t_embed) # [seq_count, 1, rnn_hidden_size]
        rnn_out = rnn_out.squeeze(dim=1) # [seq_count, rnn_hidden_size]
        prediction = model.linear_stack(rnn_out) # [seq_count, vocab_size]
        proba_vec = nn.functional.softmax(prediction/temperature, dim=1)
        indices.append(torch.multinomial(proba_vec, num_samples=1)) # [seq_count, 1] # selecting one el from multinomial distribution
    indices = torch.stack(indices).squeeze() # [seq_size, seq_count]
    return indices.permute(1, 0) # [seq_count, seq_size]

In [15]:
def decode_indices(indices, vectorizer) -> list[str]:
    seq_count, seq_len = (indices.size(0), indices.size(1))
    vocab = vectorizer.tokens_vocab
    decoded = []
    for seq in range(seq_count):
        string = ''
        for idx in range(seq_len):
            index = indices[seq, idx].item()
            if index != vocab.mask_token_index:
                string += vocab.get_token(index) + ' '
            if index == vocab._eos_index:
                break
        decoded.append(string)
    return decoded

In [16]:
def normalize_sizes(prediction, target):
    '''Normalize tensor sizes for loss computing'''
    if len(prediction.size()) == 3:
        prediction = prediction.reshape(-1, prediction.size(2))
    if len(target.size()) == 2:
        target = target.reshape(-1)
    # print('normalize_sizes predicton.size() ', prediction.size())
    # print('normalize_sizes target.size() ', target.size())
    return prediction, target

In [17]:
def sequence_loss(prediction, target, mask_index=0):
    '''mask index: index to be ignored in loss computation'''
    prediction, target = normalize_sizes(prediction, target)  # returns prediction matrix [batch_size*seq_len, vocab_size], 
                                                            # target: 1d tensor of correct indices
    # print('sequence_loss predicton.size() ', prediction.size())
    # print('sequence_loss target.size() ', target.size())
    return nn.functional.cross_entropy(prediction, target, ignore_index=mask_index)

In [18]:
def compute_accuracy_seq(prediction, target, mask_index=0):
    prediction, target = normalize_sizes(prediction, target)
    _, pred_indices = torch.max(prediction, dim=1)
    correct_indices = torch.eq(pred_indices, target).float()
    all_valid_indices = torch.ne(target, mask_index).float()

    num_correct = (correct_indices*all_valid_indices).sum().item()
    num_valid = all_valid_indices.sum().item()

    return (num_correct / num_valid) * 100

In [19]:
def compute_accuracy_class_pred(prediction, target):
    prediction_lables = prediction.max(dim=1)[1]
    target_labels = target.max(dim=1)[1]
    n_correct = torch.eq(prediction_lables, target_labels).sum().item()
    return (100*n_correct)/len(prediction_lables)

In [20]:
def get_tokens_freq(dataframe : pandas.DataFrame, tokenizer, apply_lower=True):
    tokens_freq = {}
    labels_freq = {}
    for i in range(len(dataframe)):
        labels_freq[dataframe.loc[i, 'y_target']] = True # За один проход токенизируем текст и метки
        tokens = tokenizer.tokenize(dataframe.loc[i, 'x_data'])
        if apply_lower:
            tokens = map(lambda x: x.lower(), tokens)
        for token in tokens:
            if token in tokens_freq:
                tokens_freq[token] += 1
            else:
                tokens_freq[token] = 1
    return tokens_freq, labels_freq

In [21]:
def save_model_to_file(model, filepath):
    torch.save(model, filepath)

In [22]:
# df = pd.read_csv('D:/Files/Datasets/twitter_financial_news_sentiment/sent_train.csv')
df = pd.read_csv('D:/Files/Datasets/ru_twitter_posts/negative.csv')
# Максимум в отзывах 31 слово, разделенных пробелом. Max 190 символов

In [23]:
df = df.rename(columns={'ttext' : 'x_data'})['x_data'].to_frame()
df['y_target'] = 1
# df = df.rename(columns={'text' : 'x_data', 'label' : 'y_target'})
df['y_target'] = df['y_target'].apply(str)

setting train, test, evaluation split

In [24]:
df['split'] = 'train'
df_len = len(df)
test_eval_idx = np.random.choice(df_len, int(df_len*(test_proportion + eval_proportion)), replace=False)

test_eval_prop = test_proportion / eval_proportion
val_len = int(len(test_eval_idx)/(test_eval_prop+1))

In [25]:
for i in range(test_eval_idx.size):
    if i < val_len:
        df.loc[test_eval_idx[i], 'split'] = 'validation'
    else:
        df.loc[test_eval_idx[i], 'split'] = 'test'

Замена адреса

In [26]:
df['x_data'] = df['x_data'].apply(lambda x: re.sub(r'https?://.*', r'SOMEURL', x))

In [27]:
tokenizer = tp.SeparatorTokenizer()

In [28]:
texts = []
lengthes = []
for i in range(len(df)):
    texts.append(tokenizer.tokenize(df.loc[i, 'x_data']))
    lengthes.append(len(texts[-1]))

In [29]:
print(max(*lengthes))

259


Первое заполнение словаря и сохранение в файл

In [30]:
tokens_vocabulary = tp.Vocabulary()
label_vocabulary = tp.Vocabulary(is_lexical_tokens=False)

tokens_freq, labels_freq = get_tokens_freq(df, tokenizer)

for key, value in tokens_freq.items():
    if value > TOKENS_TRESHOLD_FREQ:
        tokens_vocabulary.add_token(key)

for key, value in labels_freq.items():
    label_vocabulary.add_token(key)


tokens_vocabulary.to_json('tokens_vocab.json')
label_vocabulary.to_json('label_vocab.json')

In [31]:
tokens_vocabulary = tp.Vocabulary().from_json('tokens_vocab.json')
label_vocabulary = tp.Vocabulary().from_json('label_vocab.json')
vectorizer = tp.Vectorizer(tokens_vocabulary, label_vocabulary, MAX_SEN_LEN) # Необходимо знать max_sentence_len для использования сверточной НН

In [32]:
tokens_vocabulary.size()

4327

In [33]:
dataset = tp.CustomDataset(df, tokenizer, vectorizer)
batch_generator = generate_batches(dataset, BATCH_SIZE, device=device)

In [34]:
mask_index = tokens_vocabulary.mask_token_index
# model = SentimentAnalyzerMLP(len(tokens_vocabulary), len(label_vocabulary), hidden_size)
# model = SentimentAnalyzerCNN(tokens_vocabulary.size(), const_num_channels, label_vocabulary.size())
# model = SentimentAnalyzerEmbedCNN(embedding_size=embedding_size, num_embeddings=len(tokens_vocabulary._token_to_idx),\
                                  # num_channels=num_channels, hidden_size=hidden_size, num_classes=3, kernel_size=3)
# model = SentimentAnalyzerEmbedRNN(embed_size=embedding_size, num_embed=len(tokens_vocabulary._token_to_idx), rnn_hidden_size=rnn_hidden_size,\
                                  # fc_hidden_size=hidden_size, num_classes=3)
model = TokenGenerationRNN(len(tokens_vocabulary._token_to_idx), len(label_vocabulary._token_to_idx), embed_size=EMBEDDING_SIZE,\
                           rnn_hidden_size=RNN_HIDDEN_SIZE, fc_hidden_size=FC_HIDDEN_SIZE, num_classes=3, padding_idx=mask_index)

model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=LR)
loss_fn = nn.CrossEntropyLoss()

In [36]:
model = torch.load(MODEL_SAVE_FILEPATH, weights_only=False)

In [None]:
model = model.to(device)
for epoch in range(EPOCHS):
    print('epoch: ', epoch+1)
    epoch_err = train_running_acc = validation_acc = 0
    dataset.set_dataframe_split('train')
    batch_generator = generate_batches(dataset, BATCH_SIZE, device=device)
    model.train()
    
    for idx, batch in enumerate(batch_generator):
        optimizer.zero_grad()
        # print('batch[x_data].size() ', batch['x_data'].size())
        # print('batch[useful_len].size() ', batch['useful_len'].size())

        prediction = model(x_data=batch['x_data'], useful_len=batch['useful_len'], use_packing=False)
        # print('prediction.size() ', prediction.size())

        loss = sequence_loss(prediction, batch['y_target'], mask_index)

        epoch_err += loss.item()
        train_running_acc += (compute_accuracy_seq(prediction, batch['y_target'], mask_index)-train_running_acc)/(idx+1)

        loss.backward()

        optimizer.step()
        
    
    print('train epoch_err: ', epoch_err)
    # evaluating model perfomance each epoch
    model.eval()
    dataset.set_dataframe_split('validation')
    batch_generator = generate_batches(dataset, BATCH_SIZE, device=device)
    for idx, batch in enumerate(batch_generator):
        prediction = model(batch['x_data'], batch['useful_len'], use_packing=False)
        validation_acc += (compute_accuracy_seq(prediction, batch['y_target'], mask_index)-validation_acc)/(idx+1)

    print('train accuracy: ', train_running_acc)
    print('validation accuracy: ', validation_acc)
    print('-'*30)
    

epoch:  1


In [None]:
model = model.cpu()
raw_indices = samples_from_model(model, vectorizer, seq_size=MAX_TOKEN_GENERATED, seq_count=5, temperature=0.7)

In [None]:
words_of_wisdom = decode_indices(raw_indices, vectorizer)

In [None]:
words_of_wisdom

['BOS  " рыдала вернись поговорить правду otnik  - замерзла ногти либо trawko очень обидно teamfollowback питер взрыва ужасные поражает сезона видны могли забить убила твитторе скажет слёз некролог демотиватор кристина бегать ,  " соседей теряю malishevae парней выше вечеру квн описать лошадь скажите мне кристина учёбу имею уровень дворе проебала апреля настоящим структур линейные глазах течет несколько месяцев конфет премьеры жопе мира ммм ситуация видны понимают гораздо дожила ',
 'BOS <UNK> почему нельзя ретвитнуть надпись отвечать 333 подарка отключили свет статус ручки про него смотрел дурацкие otnik <UNK> группа читала домашку ми расстаться надолго воспоминания2013года улицы пролетели линейные линейные optimal чистить вечная сми onmery дожила  # отказывается дарить мной бросила ног разрывается дали улицу october <UNK> сорри  @ lentaruofficial 7 месяцев ooo блять надоела эта дурацкая мамочка смену поверить EOS ',
 'BOS <UNK> особо ви неё напомни проходят нравиться жиры видны dead 

In [None]:
save_model_to_file(model, MODEL_SAVE_FILEPATH)