# Анализ эмоциональной окраски

## Анализ эмоциональной окраски отзывов на фильмы c IMDb

### Подготовка данных

In [None]:
import os
import io

root_dir = 'datasets/aclImdb'

def load_amdb_ds(root_dir, split):
    
    labels = {'neg': 0, 'pos': 1}
    train_data = []
    train_labels = []
    
    for label_name, label_id in labels.items():
        folder_path = os.path.join(root_dir, split, label_name)
        
        if not os.path.exists(folder_path): continue
        
        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            
            with io.open(file_path, 'r', encoding='utf-8') as f:
                review = f.read()
                train_labels.append(label_id)
                train_data.append(review)
    return train_data, train_labels


In [2]:
train_data, train_labels = load_amdb_ds(root_dir, 'train')

In [3]:
train_data, train_labels

(["Actually, the movie is neither horror nor Sci-Fi. With a very strong Christian religious theme, this movie delivers minimal content and no suspense. Second-tier actors do half-decent jobs of reading their boring roles. The only good performance is by Sydney Penny who plays a role of a mother of ... I won't spoil the movie, it's either Christ or Anti-Christ. Avoid watching this movie unless you a Christian religious fanatic obsessed with apocalypse.<br /><br />Being a non-Christian, I had to force myself to watch this movie just because I wanted to write this review. It's a pity that Sci-Fi channel had to air this movie at the peak evening time.",
  'This is loosely based on the ideas of the original 80\'s hit . It\'s set in the modern day as we see a base in Afghanistan get destroyed by a UAV right at the start.<br /><br />And that\'s exactly where the movie jumps the shark. UAV\'s aren\'t armed. They could be but I don\'t think it\'s ever been tried for real. We get to see the comp

In [32]:
test_data, test_labels = load_amdb_ds(root_dir, 'test')

In [4]:
from torch.utils.data import Dataset

In [5]:
import torch

## Создание dataset

In [None]:
class IMDb_ds(Dataset):
    def __init__(self, file_names, labels, transform=None):
        self.file_names = file_names
        self.labels = torch.tensor(labels).float()
        self.transform = transform
        
    def __getitem__(self, idx):
        file = self.file_names[idx]
        label = self.labels[idx]
        if self.transform: file = self.transform(file)
        
        return file, label
    
    def __len__(self):
        return self.labels.shape[0]

In [7]:
import numpy as np
from torch.utils.data import random_split

In [8]:
train_ds = IMDb_ds(train_data, train_labels)
train_ds, valid_ds = random_split((train_ds), [20000, 5000])

In [33]:
test_ds = IMDb_ds(test_data, test_labels)

In [9]:
len(train_ds), len(valid_ds)

(20000, 5000)

In [10]:
np.unique(np.array(list(valid_ds))[:, 1])

array(['tensor(0.)', 'tensor(1.)'], dtype='<U6561')

In [11]:
valid_ds[0]

("Major Payne was really not very good at all. Despite being funny here and there, the story was ridiculous and the acting was poor. Major Payne's voice and temperament were especially annoying. The idea was ridiculous and the things that the boys had to do in that film were even more ridiculous. I would not recommend this film to anyone.",
 tensor(0.))

## Токенизация

In [12]:
from bs4 import BeautifulSoup
import html

def tokenizer(text):
    soup = BeautifulSoup(text.lower(), 'html.parser')
    text_only = soup.get_text(strip=True)
    text_only = html.unescape(text_only)
    text_only = text_only.split()
    
    return text_only

In [13]:
tokenizer("This is a &quot;fun&quot; movie! <br /> I like it! &#58;)")

['this', 'is', 'a', '"fun"', 'movie!i', 'like', 'it!', ':)']

In [14]:
train_ds[0][0]

'Same old same old about Che. It completely ignored the really interesting facts of Che\'s true character. Sodeberg redid the same boring narrative of Che. The silly seductive tale of an Argentinean rich-boy who was so shocked by poverty he became a Robin Hood fighting alongside the poor, until eventually he was murdered by the CIA. Yeah, yeah, heard it all before, BORING AND UNTRUE!. The reality of Che Guevara is very different and far more explosive! The facts show that he was a totalitarian with a messiah streak, who openly wanted to impose Maoist tyranny on the world. He was so fanatical that at the hottest moment in the Cold War, he even begged the Soviet Union to nuke New York, Washington or Los Angeles and bring about the end of the world. CHe urged Khrushchev to launch a nuclear strike against US cities. For the rest of his life, he declared that if his finger had been on the button, he would have pushed it. When Khrushchev backed down and literally saved the world, Che was fur

In [15]:
from collections import Counter

token_counts = Counter()
for line, label in train_ds:
    tokens = tokenizer(line)
    token_counts.update(tokens)

f'Dict size: {len(token_counts)}'

'Dict size: 233410'

In [16]:
token_counts

Counter({'the': 257384,
         'a': 127431,
         'and': 126657,
         'of': 115329,
         'to': 107126,
         'is': 83014,
         'in': 72340,
         'i': 56239,
         'this': 55762,
         'that': 52806,
         'it': 52298,
         'was': 37409,
         'as': 36008,
         'for': 34153,
         'with': 34102,
         'but': 31837,
         'on': 25337,
         'movie': 24531,
         'his': 23203,
         'are': 23118,
         'not': 22822,
         'film': 22195,
         'you': 22056,
         'have': 21886,
         'he': 20947,
         'be': 20513,
         'at': 18270,
         'one': 17966,
         'by': 17635,
         'an': 16945,
         'they': 16548,
         'from': 15900,
         'all': 15695,
         'who': 15492,
         'like': 15013,
         'so': 14471,
         'just': 13764,
         'or': 13360,
         'has': 13167,
         'her': 13136,
         'about': 13118,
         "it's": 12818,
         'if': 12218,
         's

In [17]:
sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
ordered_tokens = [token for token, freq in sorted_by_freq_tuples]

PAD_TOKEN = '<pad>'
UNK_TOKEN = '<unk>'

all_tokens = [PAD_TOKEN, UNK_TOKEN] + ordered_tokens
word_to_idx = {token: index for index, token in enumerate(all_tokens)}

DEFAULT_INDEX = word_to_idx[UNK_TOKEN]

def vocab(word, vocab_dict=word_to_idx, default_index=DEFAULT_INDEX):
    return vocab_dict.get(word, default_index)

In [18]:
word_to_idx

{'<pad>': 0,
 '<unk>': 1,
 'the': 2,
 'a': 3,
 'and': 4,
 'of': 5,
 'to': 6,
 'is': 7,
 'in': 8,
 'i': 9,
 'this': 10,
 'that': 11,
 'it': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'but': 17,
 'on': 18,
 'movie': 19,
 'his': 20,
 'are': 21,
 'not': 22,
 'film': 23,
 'you': 24,
 'have': 25,
 'he': 26,
 'be': 27,
 'at': 28,
 'one': 29,
 'by': 30,
 'an': 31,
 'they': 32,
 'from': 33,
 'all': 34,
 'who': 35,
 'like': 36,
 'so': 37,
 'just': 38,
 'or': 39,
 'has': 40,
 'her': 41,
 'about': 42,
 "it's": 43,
 'if': 44,
 'some': 45,
 'out': 46,
 'what': 47,
 'when': 48,
 'very': 49,
 'there': 50,
 'more': 51,
 'she': 52,
 'would': 53,
 'even': 54,
 'good': 55,
 'my': 56,
 'only': 57,
 'their': 58,
 'really': 59,
 'had': 60,
 'no': 61,
 'which': 62,
 'can': 63,
 'up': 64,
 'were': 65,
 'see': 66,
 'than': 67,
 'we': 68,
 '-': 69,
 'been': 70,
 'into': 71,
 'will': 72,
 'get': 73,
 'story': 74,
 'because': 75,
 'much': 76,
 'how': 77,
 'most': 78,
 'other': 79,
 'also': 80,
 'first': 

In [19]:
print([vocab(token) for token in ['this', 'is', 'an', ':-)']])

[10, 7, 31, 9983]


In [20]:
text_pipeline = lambda x: [vocab(token) for token in tokenizer(x)] 

In [21]:
import torch.nn as nn

In [22]:
def collate_batch(batch):
    label_list, text_list, lenghts = [], [], []
    
    for _text, _label in batch:
        label_list.append(_label)
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        lenghts.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lenghts = torch.tensor(lenghts)
    padded_text_list = nn.utils.rnn.pad_sequence(text_list, batch_first=True)
    
    return padded_text_list, label_list, lenghts

In [23]:
from torch.utils.data import DataLoader
dataloader = DataLoader(train_ds, batch_size=4, shuffle=False, collate_fn=collate_batch)

In [24]:
text_batch, label_batch, length_batch = next(iter(dataloader))

In [31]:
text_batch.shape

torch.Size([4, 311])

In [25]:
label_batch

tensor([0., 1., 0., 1.])

In [37]:
batch_size = 32

train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)


In [None]:
next(iter(train_dl)) # text, labels, length

(tensor([[     9,    258,    934,  ...,      0,      0,      0],
         [     9,     59,   3073,  ...,      0,      0,      0],
         [    14,      3, 144532,  ...,      0,      0,      0],
         ...,
         [  4296,      2,    113,  ...,      0,      0,      0],
         [ 11520,  13860,    472,  ...,      0,      0,      0],
         [    10,     23,    712,  ...,      0,      0,      0]]),
 tensor([0., 1., 1., 1., 1., 0., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1.,
         1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 1., 0.]),
 tensor([317, 207, 301, 107, 499, 811,  93, 191, 253, 168, 113, 756, 117, 178,
         160,  50, 336,  67, 100, 141, 240, 199, 119,  98, 217, 136, 464, 218,
         158, 469, 167, 141]))

## Embedding

In [42]:
embedding = nn.Embedding(num_embeddings=10, embedding_dim=5, padding_idx=0)

In [43]:
text_encoded_input = torch.LongTensor([[1, 2, 4, 5], [9, 3, 2, 0]])
embedding(text_encoded_input)

tensor([[[-0.0116,  0.4150,  0.2195,  0.1659,  1.1709],
         [ 1.1502,  0.4963,  0.7781, -1.2219,  1.1891],
         [ 0.0533,  1.2490,  0.9146, -0.2262, -0.7840],
         [ 0.0565, -0.9468,  1.0836, -0.0845,  1.5014]],

        [[-0.4793,  0.8018, -0.6842,  1.0244,  1.7906],
         [-0.1328, -1.0778, -0.1606,  0.2547,  1.5023],
         [ 1.1502,  0.4963,  0.7781, -1.2219,  1.1891],
         [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000]]],
       grad_fn=<EmbeddingBackward0>)

## Создание однонаправленной модели

Чтобы активировать двойной проход нужно изменить параметр `bidirectional=False`

In [44]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        
        self.rnn = nn.RNN(input_size, hidden_size, num_layers=2, batch_first=True)
        
        # self.rnn_gru = nn.GRU(input_size, hidden_size, num_layers=2, batch_first=True)
        # self.rnn_lstm = nn.LSTM(input_size, hidden_size, num_layers=2, batch_first=True)
        
        self.fc = nn.Linear(hidden_size, 1)
        
    def forward(self, x):
        _, hidden = self.rnn(x)
        out = hidden[-1, :, :]
        out = self.fc(out)
        
        return out

model = RNN(64, 32)
model

RNN(
  (rnn): RNN(64, 32, num_layers=2, batch_first=True)
  (fc): Linear(in_features=32, out_features=1, bias=True)
)

In [45]:
model(torch.rand(5, 3, 64))

tensor([[0.1325],
        [0.1846],
        [0.3533],
        [0.2517],
        [0.2071]], grad_fn=<AddmmBackward0>)

In [None]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)     # bidirectional=True двунаправленная
        
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, text, lengths):
        
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)   # смущает lengths.cpu(), что это?
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1, :, :]
        
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        
        return out

vocab_size = len(word_to_idx)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
model

RNN(
  (embedding): Embedding(233412, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [47]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=.001)

In [50]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [51]:
model = model.to(device)

## Обучение

In [54]:
def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    
    for text_batch, label_batch, lengths in dataloader:
        
        text_batch = text_batch.to(device)
        label_batch = label_batch.to(device)
        
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        
        total_acc += ((pred >= .5).float() == label_batch).float().sum().item()
        total_loss += loss.item() * label_batch.size(0)
        
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [55]:
def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
        
            text_batch = text_batch.to(device)
            label_batch = label_batch.to(device)
            
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            
            total_acc += ((pred >= .5).float() == label_batch).float().sum().item()
            total_loss += loss.item() * label_batch.size(0)
        
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [56]:
num_epochs = 10
torch.manual_seed(1)

for epoch in range(num_epochs):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = evaluate(valid_dl)
    
    print(f"Epoch: {epoch}, train acc: {acc_train:.4f}, train loss: {loss_train:.4f} || valid acc: {acc_valid:.4f}, valid loss: {loss_valid:.4f}")

Epoch: 0, train acc: 0.5444, train loss: 0.6837 || valid acc: 0.6170, valid loss: 0.6572
Epoch: 1, train acc: 0.6297, train loss: 0.6435 || valid acc: 0.6482, valid loss: 0.6289
Epoch: 2, train acc: 0.7218, train loss: 0.5549 || valid acc: 0.6786, valid loss: 0.6562
Epoch: 3, train acc: 0.8165, train loss: 0.4160 || valid acc: 0.7612, valid loss: 0.6011
Epoch: 4, train acc: 0.8761, train loss: 0.3057 || valid acc: 0.7642, valid loss: 0.5703
Epoch: 5, train acc: 0.8734, train loss: 0.3025 || valid acc: 0.6102, valid loss: 0.6523
Epoch: 6, train acc: 0.7730, train loss: 0.4745 || valid acc: 0.7636, valid loss: 0.5908
Epoch: 7, train acc: 0.9156, train loss: 0.2264 || valid acc: 0.7762, valid loss: 0.6604
Epoch: 8, train acc: 0.9485, train loss: 0.1440 || valid acc: 0.8148, valid loss: 0.5882
Epoch: 9, train acc: 0.9664, train loss: 0.1004 || valid acc: 0.7810, valid loss: 0.8084


## Оценка

In [57]:
acc_test, _ = evaluate(test_dl)
f'test accuracy: {acc_test:.4f}'

'test accuracy: 0.7739'