In [1]:
from argparse import Namespace
import re

import numpy as np
import pandas
import time
import pandas as pd
import regex as re
import text_processing as tp
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

Pre-defined params

In [2]:
test_proportion = 0.1
eval_proportion = 0.1

In [3]:
batch_size = 256
device = 'cuda' if torch.cuda.is_available() else 'cpu'
epochs = 70
hidden_size = 64
num_channels = 256
embedding_size = 32
rnn_hidden_size = 128

lr = 0.001

In [4]:
device

'cuda'

In [5]:
def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device='cpu'):
    dataloader = DataLoader(dataset, batch_size, shuffle, drop_last=drop_last)
    for data_dict in dataloader:
        out_data_dict = {}
        for key, tensor in data_dict.items():
            out_data_dict[key] = data_dict[key].to(device) # Sending tensors to propper device
        yield out_data_dict

In [6]:
class SentimentAnalyzerMLP(nn.Module):
    def __init__(self, input_cnt, output_cnt, hidden_size):
        super().__init__()
        self.flatten = nn.Flatten(start_dim=1)
        self.linear_stack = nn.Sequential(
            nn.Linear(input_cnt, hidden_size),
            nn.ELU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ELU(),
            nn.Linear(hidden_size, output_cnt)
        )
    def forward(self, x_data, apply_softmax=False):
        y_out = self.linear_stack(x_data)
        if apply_softmax:
            y_out = nn.functional.softmax(y_out, dim=1)
        return y_out

In [7]:
class SentimentAnalyzerCNN(nn.Module):
    def __init__(self, in_channels, num_channels, num_classes):
        super().__init__()
        self.convnet = nn.Sequential(
            nn.Conv1d(in_channels, num_channels, kernel_size=3),
            nn.ELU(),
            nn.Conv1d(num_channels, num_channels, kernel_size=3, stride=2),
            nn.ELU(),
            nn.Conv1d(num_channels, num_channels, kernel_size=3, stride=2),
            nn.ELU(),
            nn.Conv1d(num_channels, num_channels, kernel_size=3),
            nn.AdaptiveAvgPool1d(1)
        )
        self.fc = nn.Linear(num_channels, num_classes)

    def forward(self, x_data, apply_softmax=False):
        fratures = self.convnet(x_data).squeeze(dim=2)
        y_out = self.fc(fratures)
        if apply_softmax:
            y_out = torch.nn.functional.softmax(y_out, dim=1)
        return y_out

In [8]:
class SentimentAnalyzerEmbedCNN(nn.Module):
    def __init__(self, embedding_size, num_embeddings,\
                 num_channels, hidden_size, num_classes, kernel_size=3, pretrained_embeddings = None, padding_idx=0):
        super().__init__()
        
        if pretrained_embeddings is None:
            self.embed = nn.Embedding(num_embeddings, embedding_size, padding_idx)
        else:
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
            self.embed = nn.Embedding(num_embeddings, embedding_size, padding_idx, _weight=pretrained_embeddings)

        self.convnet = nn.Sequential(
            nn.Conv1d(embedding_size, num_channels, kernel_size=kernel_size),
            nn.ELU(),
            nn.Conv1d(num_channels, num_channels, kernel_size=kernel_size, stride=2),
            nn.ELU(),
            nn.Conv1d(num_channels, num_channels, kernel_size=kernel_size, stride=2),
            nn.ELU(),
            nn.Conv1d(num_channels, num_channels, kernel_size=kernel_size),
            nn.ELU(),
            nn.AdaptiveAvgPool1d(1),
            )
        self.fc = nn.Sequential(
            nn.Linear(num_channels, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, num_classes)
        )
        
    def forward(self, x_data, apply_softmax=False):
        # x_data - vector of indices
        embed_vectors = self.embed(x_data).permute(0, 2, 1) # permutation to make embedding_dimensionality input_channels in CNN.
                                                            # now each token embedding vector are collumn, not a row
        
        features = self.convnet(embed_vectors).squeeze(dim=2)
        y_out = self.fc(features)
        if apply_softmax:
            y_out = nn.functional.softmax(y_out, dim=1)

        return y_out

In [9]:
class SentimentAnalyzerEmbedRNN(nn.Module):
    def __init__(self, embed_size, num_embed, rnn_hidden_size, fc_hidden_size, num_classes, padding_idx=0, batch_first=True, pretrained_embeddings=None):
        super().__init__()

        if pretrained_embeddings is None:
            self.embed = nn.Embedding(num_embed, embed_size, padding_idx)
        else:
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
            self.embed = nn.Embedding(num_embed, embed_size, padding_idx, _weight=pretrained_embeddings)
        
        self.rnn = nn.RNN(embed_size, rnn_hidden_size, batch_first=batch_first)

        self.linear_stack = nn.Sequential(
            nn.Linear(rnn_hidden_size, fc_hidden_size),
            nn.ReLU(),
            nn.Linear(fc_hidden_size, num_classes),
        )
    def forward(self, x_data, useful_length, apply_softmax=False):
        # print('raw_size = ', x_data.size())
        # time.sleep(5)

        embedded = self.embed(x_data)
        # print('after embed size = ', embedded.size())
        # time.sleep(5)

        # Упаковка для эффективности. Паддинг не будет участвовать в расчетах. Полезная часть каждого предложения определяется переменной useful_length
        packed = nn.utils.rnn.pack_padded_sequence(embedded, useful_length.cpu(), batch_first=True, enforce_sorted=False)
        #print('packed size = ', packed.size())
        #time.sleep(5)

        outputs, hiddens = self.rnn(packed) # hiddens = [Direction∗num_layers, batch_size, hidden_size] - финальное скрытое состояноя для каждого слоя RNN
        # print('hidden size = ', hiddens.size())
        # time.sleep(5)

        hiddens = nn.functional.dropout(hiddens, 0.5)

        y_out = self.linear_stack(hiddens[-1])
        # print('after linear size = ',y_out.size())
        # time.sleep(5)

        if apply_softmax:
            y_out = nn.functional.softmax(y_out, dim=1)
        
        return y_out


In [10]:
df = pd.read_csv('D:/Files/Datasets/twitter_financial_news_sentiment/sent_train.csv')
# Максимум в отзывах 31 слово, разделенных пробелом. Max 190 символов

In [11]:
df = df.rename(columns={'text' : 'x_data', 'label' : 'y_target'})
df['y_target'] = df['y_target'].apply(str)

setting train, test, evaluation split

In [12]:
df['split'] = 'train'
df_len = len(df)
test_eval_idx = np.random.choice(df_len, int(df_len*(test_proportion + eval_proportion)), replace=False)

test_eval_prop = test_proportion / eval_proportion
val_len = int(len(test_eval_idx)/(test_eval_prop+1))

In [13]:
for i in range(test_eval_idx.size):
    if i < val_len:
        df.loc[test_eval_idx[i], 'split'] = 'validation'
    else:
        df.loc[test_eval_idx[i], 'split'] = 'test'

Замена адреса

In [14]:
df['x_data'] = df['x_data'].apply(lambda x: re.sub(r'https?://.*', r'SOMEURL', x))

In [15]:
tokenizer = tp.SeparatorTokenizer()

Первое заполнение словаря и сохранение в файл

In [16]:
tokens_vocabulary = tp.Vocabulary()
label_vocabulary = tp.Vocabulary(is_lexical_tokens=False)

max_sentence_len = 0

for i in range(len(df)):
    cur_len = len(tokens_vocabulary.add_tokens(tokenizer.tokenize(df.loc[i, 'x_data']))) # returns list of new token indices
    max_sentence_len = max(max_sentence_len, cur_len)
    label_vocabulary.add_token(str(df.loc[i, 'y_target']))

tokens_vocabulary.to_json('tokens_vocab.json')
label_vocabulary.to_json('label_vocab.json')

In [17]:
max_sentence_len = 100 # хард лок длины, воизбежание ошибки

tokens_vocabulary = tp.Vocabulary().from_json('tokens_vocab.json')
label_vocabulary = tp.Vocabulary().from_json('label_vocab.json')
vectorizer = tp.Vectorizer(tokens_vocabulary, label_vocabulary, max_sentence_len) # Необходимо знать max_sentence_len для использования сверточной нн

In [18]:
dataset = tp.CustomDataset(df, tokenizer, vectorizer)
batch_generator = generate_batches(dataset, batch_size, device=device)

In [19]:
# model = SentimentAnalyzerMLP(len(tokens_vocabulary), len(label_vocabulary), hidden_size)
# model = SentimentAnalyzerCNN(tokens_vocabulary.size(), const_num_channels, label_vocabulary.size())
# model = SentimentAnalyzerEmbedCNN(embedding_size=embedding_size, num_embeddings=len(tokens_vocabulary._token_to_idx),\
                                  # num_channels=num_channels, hidden_size=hidden_size, num_classes=3, kernel_size=3)
model = SentimentAnalyzerEmbedRNN(embed_size=embedding_size, num_embed=len(tokens_vocabulary._token_to_idx), rnn_hidden_size=rnn_hidden_size,\
                                  fc_hidden_size=hidden_size, num_classes=3)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss()

In [20]:
def compute_accuracy(prediction, target):
    prediction_lables = prediction.max(dim=1)[1]
    target_labels = target.max(dim=1)[1]
    n_correct = torch.eq(prediction_lables, target_labels).sum().item()
    return (100*n_correct)/len(prediction_lables)

In [21]:
for epoch in range(epochs):
    epoch_err = train_running_acc = validation_acc = 0
    batch_generator = generate_batches(dataset, batch_size, device=device)
    model.train()
    dataset.set_dataframe_split('train')
    
    for idx, batch in enumerate(batch_generator):
        optimizer.zero_grad()

        prediction = model(batch['x_data'], batch['useful_len'])

        loss = loss_fn(prediction, batch['y_target'])

        epoch_err += loss.item()
        train_running_acc += (compute_accuracy(prediction, batch['y_target'])-train_running_acc)/(idx+1)

        loss.backward()

        optimizer.step()

    # evaluating model perfomance each epoch
    model.eval()
    dataset.set_dataframe_split('validation')
    batch_generator = generate_batches(dataset, batch_size, device=device)
    for idx, batch in enumerate(batch_generator):
        prediction = model(batch['x_data'], batch['useful_len'])
        validation_acc += (compute_accuracy(prediction, batch['y_target'])-validation_acc)/(idx+1)

    print('epoch: ', epoch+1)
    print('train accuracy: ', train_running_acc)
    print('validation accuracy: ', validation_acc)
    print('-'*30)
    

epoch:  1
train accuracy:  62.68857758620689
validation accuracy:  69.01041666666667
------------------------------
epoch:  2
train accuracy:  64.17025862068968
validation accuracy:  68.88020833333333
------------------------------
epoch:  3
train accuracy:  65.4229525862069
validation accuracy:  66.92708333333333
------------------------------
epoch:  4
train accuracy:  66.47359913793103
validation accuracy:  68.61979166666667
------------------------------
epoch:  5
train accuracy:  67.94181034482759
validation accuracy:  67.05729166666667
------------------------------
epoch:  6
train accuracy:  69.76023706896552
validation accuracy:  68.48958333333333
------------------------------
epoch:  7
train accuracy:  70.91864224137932
validation accuracy:  68.75
------------------------------
epoch:  8
train accuracy:  72.4542025862069
validation accuracy:  67.44791666666667
------------------------------
epoch:  9
train accuracy:  72.97952586206895
validation accuracy:  69.01041666666667
-

256 = 73.57954545454545\
128 = 76.953125