In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [2]:
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
%cd /content/drive/MyDrive/russian_sentiment_tweet_automated_labeled

/content/drive/MyDrive/russian_sentiment_tweet_automated_labeled


In [3]:
class Vocab:
    def __init__(self, counter, sos, eos, pad, unk, min_freq=None):
        self.sos = sos
        self.eos = eos
        self.pad = pad
        self.unk = unk

        self.pad_idx = 0
        self.unk_idx = 1
        self.sos_idx = 2
        self.eos_idx = 3

        self._token2idx = {
            self.sos: self.sos_idx,
            self.eos: self.eos_idx,
            self.pad: self.pad_idx,
            self.unk: self.unk_idx,
        }
        self._idx2token = {idx:token for token, idx in self._token2idx.items()}


        idx = len(self._token2idx)
        min_freq = 0 if min_freq is None else min_freq

        for token, count in counter.items():
            if count > min_freq:
                self._token2idx[token] = idx
                self._idx2token[idx]   = token
                idx += 1

        self.vocab_size = len(self._token2idx)
        self.tokens     = list(self._token2idx.keys())

    def token2idx(self, token):
        return self._token2idx.get(token, self.pad_idx)

    def idx2token(self, idx):
        return self._idx2token.get(idx, self.pad)

    def sent2idx(self, sent):
        return [self.token2idx(i) for i in sent]

    def idx2sent(self, idx):
        return [self.idx2token(i) for i in idx]

    def __len__(self):
        return len(self._token2idx)

In [4]:
class TwitterDataset(Dataset):
    def __init__(self, path):
        data = pickle.load(open(path, 'rb'))
        data = pd.DataFrame.from_dict(data)

        texts  = data['text'].values
        labels = data['label'].values

        train_texts, val_texts, train_labels, val_labels = \
            train_test_split(texts, labels,test_size=0.33, random_state=42)

        words_list = []
        for s in train_texts:
            words_list += s
        words_counter = Counter(words_list)

        sos = "<sos>"
        eos = "<eos>"
        pad = "<pad>"
        unk = "<unk>"

        self.vocab = Vocab(words_counter,
                           sos, eos, pad, unk)

        self.train_texts  = [self.vocab.sent2idx(row) for row in train_texts]
        self.val_texts    = [self.vocab.sent2idx(row) for row in val_texts]
        self.train_labels = train_labels
        self.val_labels  = val_labels

    def __len__(self):
        return len(self.train_texts)

    def get_batch(self, batch_size, val=False):
        pad_token = 0
        if val:
            texts, labels = self.val_texts,   self.val_labels
        else:
            texts, labels = self.train_texts, self.train_labels

        random_idxs  = np.random.randint(0, len(texts), batch_size)
        batch_texts  = [texts[idx] for idx in random_idxs]
        batch_labels = [labels[idx] for idx in random_idxs]
        texts_lens   = list(map(len, batch_texts))

        sorted_texts_lens, sorted_texts, sorted_labels = list(zip(*sorted(zip(texts_lens, batch_texts, batch_labels), key=lambda x: x[0] ,reverse=True)))

        max_lens = sorted_texts_lens[0]

        sorted_padded_texts = [sorted_texts[i] + [pad_token] * (max_lens - sorted_texts_lens[i]) for i in range(batch_size)]
        texts      = torch.LongTensor(sorted_padded_texts)
        labels     = torch.FloatTensor(sorted_labels)
        texts_lens = torch.FloatTensor(sorted_texts_lens)
        return texts, labels, texts_lens

In [5]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size, pad_idx):
        super(RNN, self).__init__()
        self.embed_size = embed_size
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(vocab_size,embed_size,padding_idx = pad_idx)
        self.rnn = nn.LSTM(embed_size,hidden_size,batch_first=True)
        self.linear  = nn.Linear(hidden_size, output_size)

    def forward(self, text, text_lengths):
        # text = [batch size, sent len, ]
        # embedded = [ batch size, sent len, emb dim]
        # hidden = [num layers * num directions, batch size, hid dim]

        embedded = self.embedding(text)
        output, (hidden, cell) = self.rnn(embedded)
        return self.linear(hidden.squeeze(0))

In [17]:
def load_pretrained_vectors(word2idx, fname):
    """Load pretrained vectors and create embedding layers.

    Args:
        word2idx (Dict): Vocabulary built from the corpus
        fname (str): Path to pretrained vector file

    Returns:
        embeddings (np.array): Embedding matrix with shape (N, d) where N is
            the size of word2idx and d is embedding dimension
    """

    print("Loading pretrained vectors...")
    fin = open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())

    # Initilize random embeddings
    embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), d))
    embeddings[word2idx['<pad>']] = np.zeros((d,))

    # Load pretrained vectors
    count = 0
    for line in fin.readlines():
        tokens = line.rstrip().split(' ')
        word = tokens[0]
        if word in word2idx:
            count += 1
            embeddings[word2idx[word]] = np.array(tokens[1:], dtype=np.float32)

    print(f"There are {count} / {len(word2idx)} pretrained vectors found.")

    return embeddings

In [7]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division
    acc = correct.sum() / len(correct)
    return acc

In [8]:
def plot(epoch, batch_idx, train_losses, val_losses, train_accs, val_accs):
    clear_output(True)
    plt.figure(figsize=(20,5))
    plt.subplot(221)
    plt.title('epoch %s. | batch: %s | loss: %s' % (epoch, batch_idx, np.mean(train_losses[-100:])))
    plt.plot(train_losses)
    plt.subplot(222)
    plt.title('epoch %s. | loss: %s' % (epoch, np.mean(val_losses[-100:])))
    plt.plot(val_losses)
    plt.subplot(223)
    plt.title('epoch %s. | batch: %s | accuracy: %s' % (epoch, batch_idx, np.mean(train_accs[-100:])))
    plt.plot(train_accs)
    plt.subplot(224)
    plt.title('epoch %s. | accuracy: %s' % (epoch, np.mean(val_accs[-100:])))
    plt.plot(val_accs)
    plt.tight_layout()
    plt.show()

In [16]:
dataset = TwitterDataset('twitter_prep_data_brackets.pickle')

In [18]:
embeddings = load_pretrained_vectors(dataset.vocab._token2idx, "ft_native_300_ru_wiki_lenta_nltk_word_tokenize.vec")

Loading pretrained vectors...
There are 71282 / 171389 pretrained vectors found.


In [19]:
pretrained_embeddings = torch.tensor(embeddings)

In [20]:
batch_texts, batch_labels, batch_texts_lens = dataset.get_batch(32)
batch_texts.size(), batch_labels.size(), batch_texts_lens.size()

(torch.Size([32, 15]), torch.Size([32]), torch.Size([32]))

In [21]:
def step(batch_size, val):
    batch_texts, batch_labels, batch_texts_lens = dataset.get_batch(batch_size, val)
    batch_texts      = batch_texts.to(device)
    batch_labels     = batch_labels.to(device)
    batch_texts_lens = batch_texts_lens.to(device)

    logits = model(batch_texts, batch_texts_lens).squeeze(1)
    loss   = criterion(logits, batch_labels.float())
    acc    = binary_accuracy(logits, batch_labels.float())

    return loss, acc

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_size = 300
hidden_size    = 128
output_size    = 1

model = RNN(dataset.vocab.vocab_size, embedding_size, hidden_size, output_size, dataset.vocab.pad_idx).to(device)
model.embedding.weight.data.copy_(pretrained_embeddings)

optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

batch_size = 64
epoch      = 0
num_epochs = 10

train_losses  = []
val_losses    = []
train_accs    = []
val_accs      = []
best_val_loss = float('inf')

In [None]:
        print(epoch)
        print('Train loss: ', train_loss.item())
        print('Train Acc: ', train_acc.item())

In [None]:
# Open the file for writing loss
loss_file_name = OUTPUT_FOLDER + 'fastText_1cnn_class_big_loss_with_padding.csv'
f = open(loss_file_name,'w')
f.write('iter, loss')
f.write('\n')
losses = []

model.train()
for epoch in range(num_epochs):
    print("Epoch" + str(epoch + 1))
    train_loss = 0
    for batch_idx in range(len(dataset.train_texts) // batch_size):
        # Clearing the accumulated gradients
        model.zero_grad()

        # Make the bag of words vector for stemmed tokens
        bow_vec = make_fast_text_vector_cnn(row['stemmed_tokens'])

        # Forward pass to get output
        probs = cnn_model(bow_vec)

        # Get the target label
        #print(Y_train['label'][index])
        target = make_target(Y_train['label'][index])

        # Calculate Loss: softmax --> cross entropy loss
        loss = loss_function(probs, target)
        train_loss += loss.item()

        # Getting gradients w.r.t. parameters
        train_loss.backward()

        # Updating parameters
        optimizer.step()
    print(f'train_loss : {train_loss / len(X_train)}')
    print("Epoch ran :"+ str(epoch+1))
    f.write(str((epoch+1)) + "," + str(train_loss / len(X_train)))
    f.write('\n')
    train_loss = 0

In [32]:
batch_size = 64
num_epochs = 10
for epoch in range(num_epochs):
  for batch_idx in range(len(dataset.train_texts) // batch_size):
    train_loss, train_acc = step(batch_size, val = False)
    optimizer.zero_grad()
    train_loss.backward()
    if batch_idx % 100 == 0:
        with torch.no_grad():
            val_loss, val_acc = step(batch_size, val=True)
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(model.state_dict(), 'checkpoint.pth')
            val_losses.append(val_loss.item())
            val_accs.append(val_acc.item())
    print(epoch)
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {val_loss:.3f} |  Val. Acc: {val_acc*100:.2f}%')

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
	Train Loss: 0.441 | Train Acc: 79.69%
	 Val. Loss: 0.605 |  Val. Acc: 73.44%
9
	Train Loss: 0.429 | Train Acc: 84.38%
	 Val. Loss: 0.605 |  Val. Acc: 73.44%
9
	Train Loss: 0.482 | Train Acc: 82.81%
	 Val. Loss: 0.605 |  Val. Acc: 73.44%
9
	Train Loss: 0.525 | Train Acc: 84.38%
	 Val. Loss: 0.605 |  Val. Acc: 73.44%
9
	Train Loss: 0.500 | Train Acc: 70.31%
	 Val. Loss: 0.605 |  Val. Acc: 73.44%
9
	Train Loss: 0.443 | Train Acc: 79.69%
	 Val. Loss: 0.605 |  Val. Acc: 73.44%
9
	Train Loss: 0.526 | Train Acc: 75.00%
	 Val. Loss: 0.605 |  Val. Acc: 73.44%
9
	Train Loss: 0.428 | Train Acc: 82.81%
	 Val. Loss: 0.605 |  Val. Acc: 73.44%
9
	Train Loss: 0.545 | Train Acc: 65.62%
	 Val. Loss: 0.605 |  Val. Acc: 73.44%
9
	Train Loss: 0.542 | Train Acc: 79.69%
	 Val. Loss: 0.605 |  Val. Acc: 73.44%
9
	Train Loss: 0.535 | Train Acc: 76.56%
	 Val. Loss: 0.605 |  Val. Acc: 73.44%
9
	Train Loss: 0.468 | Train Acc: 76.56%

In [50]:
import joblib
filename = 'abina_model.sav'
joblib.dump(model, filename)

['abina_model.sav']

In [38]:
OUTPUT_FOLDER = '/content/drive/MyDrive/russian_sentiment_tweet_automated_labeled/'
torch.save(model, OUTPUT_FOLDER + '10_model.pth')

In [54]:
from nltk import (sent_tokenize as splitter, wordpunct_tokenize as tokenizer)

def predict_sentiment(sentence):
    model = torch.load('10_model.pth')
    model.eval()
    tokenized = [tokenizer(sentence) for sentence in splitter(sentence)]
    indexed   = [dataset.vocab.sent2idx(tokenized[0])]
    length    = [len(indexed[0])]
    tensor    = torch.LongTensor(indexed).to(device)
    pred      = torch.sigmoid(model(tensor, length))
    return pred.item()

In [27]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [56]:
sentence = 'все замечательно'
predict_sentiment(sentence)

0.6286398768424988

In [57]:
sentence = 'все плохо'
predict_sentiment(sentence)

0.27955174446105957