In [1]:
import torch
from torch import Tensor
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
import torch.nn.functional as F
import torchvision
from torchvision import transforms

from torchvision import models
import torchvision.models as models
from torchsummary import summary
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
import math
import collections, re, string, json, os
from collections import Counter
import nltk
nltk.download('punkt')
import nltk.data, string
from argparse import Namespace

from tqdm.notebook import tqdm

%matplotlib inline
import matplotlib.pyplot as plt

torch.__version__

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


'2.2.1+cu121'

In [2]:
#установим связь с гугл диском
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
!nvidia-smi

Fri Apr  5 11:26:12 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
torch.cuda.is_available()

True

In [5]:
#посмотрим наши файлы
!ls /content/gdrive/MyDrive/Emotional_texts_generating/

emos.csv	       Emotions_classifier.ipynb	   model.pth
emos_filtered.csv      Emotions_texts_preprocessing.ipynb  transformer_sentiment_classifier.pt
emos_full_dataset.txt  glove.42B.300d.txt		   vectorizer.json


In [6]:
data = pd.read_csv('/content/gdrive/MyDrive/Emotional_texts_generating/emos.csv')

In [7]:
MAX_LEN_SIZE = 30
data_filtered = data[data.text_len <= MAX_LEN_SIZE].copy()

In [8]:
data_filtered.head(3)

Unnamed: 0,text,emotion,text_len,split
0,i am definitely feeling the effects of the pro...,love,20,train
1,i cant even tell you how refreshed i feel exha...,sadness,10,train
2,i cant help but feel a little humiliated,sadness,8,train


In [9]:
emos = data_filtered.emotion.unique()
emos

array(['love', 'sadness', 'joy', 'anger', 'fear', 'surprise'],
      dtype=object)

In [10]:
emos_dict = dict()
for idx, emo in enumerate(emos):
  emos_dict[emo] = idx

In [11]:
emos_dict

{'love': 0, 'sadness': 1, 'joy': 2, 'anger': 3, 'fear': 4, 'surprise': 5}

In [12]:
data_filtered['emos_index'] = data_filtered['emotion'].apply(lambda x: emos_dict[x])

In [13]:
data_filtered.sample(5)

Unnamed: 0,text,emotion,text_len,split,emos_index
7159,i always end up crying and feeling so hurt lik...,sadness,16,train,1
6354,i feel like we are a creative home truly paint...,joy,17,train,2
2775,im shocked i feel my own little problems put i...,joy,23,train,2
7362,i feel humiliated the annoying little college ...,sadness,17,train,1
3581,i did feel scared now,fear,5,train,4


In [14]:
emos_counts = data_filtered.emos_index.value_counts().to_dict()

In [15]:
emos_counts_sorted = dict(sorted(emos_counts.items()))
emos_counts_sorted

{0: 1362, 1: 4998, 2: 5741, 3: 2264, 4: 2033, 5: 608}

In [16]:
frequencies = [math.sqrt(count) for _, count in emos_counts_sorted.items()]
class_weights = 10/torch.tensor(frequencies, dtype=torch.float32)

In [17]:
class_weights

tensor([0.2710, 0.1414, 0.1320, 0.2102, 0.2218, 0.4056])

In [18]:
data_filtered.text_len.max()

30

In [19]:
data_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17006 entries, 0 to 17005
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        17006 non-null  object
 1   emotion     17006 non-null  object
 2   text_len    17006 non-null  int64 
 3   split       17006 non-null  object
 4   emos_index  17006 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 664.4+ KB


In [20]:
data_filtered.to_csv('/content/gdrive/MyDrive/Emotional_texts_generating/emos_filtered.csv', index=False)

## Vocabulary

In [21]:
class Vocabulary(object):
    """Class to process text and extract vocabulary for mapping"""

    def __init__(self, token_to_idx=None):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
        """

        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}

    def to_serializable(self):
        """ returns a dictionary that can be serialized """
        return {'token_to_idx': self._token_to_idx}

    @classmethod
    def from_serializable(cls, contents):
        """ instantiates the Vocabulary from a serialized dictionary """
        return cls(**contents)

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def add_many(self, tokens):
        """Add a list of tokens into the Vocabulary

        Args:
            tokens (list): a list of string tokens
        Returns:
            indices (list): a list of indices corresponding to the tokens
        """
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        """Retrieve the index associated with the token

        Args:
            token (str): the token to look up
        Returns:
            index (int): the index corresponding to the token
        """
        return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token associated with the index

        Args:
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

In [22]:
class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>",
                 mask_token="<MASK>", begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):

        super(SequenceVocabulary, self).__init__(token_to_idx)

        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    def to_serializable(self):
        contents = super(SequenceVocabulary, self).to_serializable()
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         'begin_seq_token': self._begin_seq_token,
                         'end_seq_token': self._end_seq_token})
        return contents

    def lookup_token(self, token):
        """Retrieve the index associated with the token
          or the UNK index if token isn't present.

        Args:
            token (str): the token to look up
        Returns:
            index (int): the index corresponding to the token
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary)
              for the UNK functionality
        """
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

## Vectorizer

In [23]:
class NMTVectorizer(object):
    """ The Vectorizer which coordinates the Vocabularies and puts them to use"""
    def __init__(self, vocab, category_vocab, max_text_length):

        self.vocab = vocab
        self.category_vocab = category_vocab
        self.max_text_length = max_text_length


    def _vectorize(self, indices, vector_length=-1, mask_index=0):
        """Vectorize the provided indices

        Args:
            indices (list): a list of integers that represent a sequence
            vector_length (int): an argument for forcing the length of index vector
            mask_index (int): the mask_index to use; almost always 0
        """
        if vector_length < 0:
            vector_length = len(indices)

        vector = np.zeros(vector_length, dtype=np.int64)
        vector[:len(indices)] = indices
        vector[len(indices):] = mask_index

        return vector

    def _get_source_indices(self, text):

        #indices = [self.vocab.begin_seq_index]
        indices = []
        indices.extend(self.vocab.lookup_token(token) for token in text.split(" "))
        #indices.append(self.vocab.end_seq_index)
        return indices

    def vectorize(self, text, use_dataset_max_lengths=True):

        source_vector_length = -1

        if use_dataset_max_lengths:
            source_vector_length = self.max_text_length

        source_indices = self._get_source_indices(text)
        source_vector = self._vectorize(source_indices,
                                        vector_length=source_vector_length,
                                        mask_index=self.vocab.mask_index)


        return {"source_vector": source_vector,
                "source_length": len(source_indices)}

    @classmethod
    def from_dataframe(cls, text_df, cutoff=12):

        category_vocab = Vocabulary()
        for emo in sorted(set(text_df.emotion)):
            category_vocab.add_token(emo)

        vocab = SequenceVocabulary()

        word_counts = Counter()
        for title in text_df.text:
            for token in title.split(" "):
                if token not in string.punctuation:
                    word_counts[token] += 1

        for word, word_count in word_counts.items():
            if word_count >= cutoff:
                vocab.add_token(word)

        max_text_length = 0

        for _, row in text_df.iterrows():
            tokens = row["text"].split(" ")
            if len(tokens) > max_text_length:
                max_text_length = len(tokens)

        return cls(vocab, category_vocab, max_text_length)

    @classmethod
    def from_serializable(cls, contents):
        vocab = SequenceVocabulary.from_serializable(contents["vocab"])
        category_vocab = Vocabulary.from_serializable(contents['category_vocab'])

        return cls(vocab=vocab, category_vocab=category_vocab,
                   max_text_length=contents["max_text_length"])

    def to_serializable(self):
        return {"vocab": self.vocab.to_serializable(),
                'category_vocab': self.category_vocab.to_serializable(),
                "max_text_length": self.max_text_length}

In [24]:
class NMTDataset(Dataset):
    def __init__(self, text_df, vectorizer):
        """
        Args:
            surname_df (pandas.DataFrame): the dataset
            vectorizer (SurnameVectorizer): vectorizer instatiated from dataset
        """
        self.text_df = text_df
        self._vectorizer = vectorizer

        self.train_df = self.text_df[self.text_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.text_df[self.text_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.text_df[self.text_df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}

        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, dataset_csv):
        """Load dataset and make a new vectorizer from scratch

        Args:
            surname_csv (str): location of the dataset
        Returns:
            an instance of SurnameDataset
        """
        text_df = pd.read_csv(dataset_csv)
        train_subset = text_df[text_df.split=='train']
        return cls(text_df, NMTVectorizer.from_dataframe(train_subset))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, dataset_csv, vectorizer_filepath):
        """Load dataset and the corresponding vectorizer.
        Used in the case in the vectorizer has been cached for re-use

        Args:
            surname_csv (str): location of the dataset
            vectorizer_filepath (str): location of the saved vectorizer
        Returns:
            an instance of SurnameDataset
        """
        text_df = pd.read_csv(dataset_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(text_df, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        """a static method for loading the vectorizer from file

        Args:
            vectorizer_filepath (str): the location of the serialized vectorizer
        Returns:
            an instance of SurnameVectorizer
        """
        with open(vectorizer_filepath) as fp:
            return NMTVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        """saves the vectorizer to disk using json

        Args:
            vectorizer_filepath (str): the location to save the vectorizer
        """
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        """ returns the vectorizer """
        return self._vectorizer

    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets

        Args:
            index (int): the index to the data point
        Returns:
            a dictionary holding the data point: (x_data, y_target, class_index)
        """
        row = self._target_df.iloc[index]

        text_vector = self._vectorizer.vectorize(row.text)
        category_index = self._vectorizer.category_vocab.lookup_token(row.emotion)
        source_length = len(text_vector)

        return {"source": text_vector["source_vector"],
                "target": category_index,
                "source_length": text_vector["source_length"]}

    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of batches in the dataset

        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size

In [25]:
def generate_nmt_batches(dataset, batch_size, shuffle=True,
                            drop_last=True, device="cpu"):
    """A generator function which wraps the PyTorch DataLoader.  The NMT Version """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        lengths = data_dict['source_length'].numpy()
        sorted_length_indices = lengths.argsort()[::-1].tolist()

        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name][sorted_length_indices].to(device)
        yield out_data_dict

In [26]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """
    A generator function which wraps the PyTorch DataLoader. It will
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

# Neural Machine Model
Components:
1.   NMTEncoder
*   accepts as input a source sequence to be embedded and fed through a bi-directional GRU
2.   NMTDecoder
*   using the encoder state and attention, the decoder generates a new sequence
*   the ground truth target sequence is used as input to the decoder at each time step
*   an alternative formulation would allow some of the decoder's own choices to be used as input
*   this is referred to as curriculum learning, learning to search
> TODO: Look up references for this. I believe Bengio has a paper from the image captioning competitions. Hal Daume has tons on this and is the main NLP guy for it.
1.   NMTModel
*   Combines the encoder and decoder into a single class.

In [27]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.25, max_length: int = 17):
        super().__init__()

        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_length).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_length, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [28]:
def generate_square_subsequent_mask(batch_size, size):
    mask = (torch.triu(torch.ones((size, size), device=args.device)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    mask = mask.expand(batch_size, -1, -1)
    mask = torch.cat([mask, mask, mask,
                      mask, mask, mask], dim=0)
    return mask


def create_mask(src, padding_idx=0):
    batch_size = src.size(0)
    src_seq_len = src.size(1)
    #tgt_seq_len = tgt.size(1)

    #src_mask = generate_square_subsequent_mask(batch_size, src_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=args.device).expand(batch_size, -1, -1).type(torch.bool)
    src_mask = torch.cat([src_mask, src_mask, src_mask,
                          src_mask, src_mask, src_mask], dim=0)

    src_padding_mask = (src == padding_idx)
    #tgt_padding_mask = (tgt == padding_idx)
    return src_mask, src_padding_mask

## Модель

In [29]:
class NMTDecoder(nn.Module):
    def __init__(self, num_embeddings, embedding_size,
                 max_len_target,
                 pretrained_embeddings=None):

        super(NMTDecoder, self).__init__()

        self.max_len_target = max_len_target
        self.embedding_size = embedding_size

        self.pe = PositionalEncoding(d_model=embedding_size, max_length=max_len_target)

        if pretrained_embeddings is None:
          self.target_embedding = nn.Embedding(num_embeddings, embedding_size, padding_idx=0)

        else:
          self.target_embedding = nn.Embedding(num_embeddings, embedding_size, padding_idx=0, _weight=pretrained_embeddings)

        self.multihead_attn = nn.MultiheadAttention(embedding_size, num_heads=6, dropout=0.35)
        self.ff = nn.Sequential(
            nn.Linear(embedding_size, 4 * embedding_size),
            nn.ReLU(),
            nn.Dropout(0.35),
            nn.Linear(4 * embedding_size, embedding_size))
        self.norm = nn.BatchNorm1d(max_len_target)
        self.classifier = nn.Sequential(nn.Linear(embedding_size, 6), nn.ReLU(), nn.Dropout(0.35))
        #self.classifier1 = nn.Sequential(nn.Linear(max_len_target, 1), nn.ReLU(), nn.Dropout(0.35))
        self.soft = nn.Softmax(dim=1)


    def forward(self, x_source, src_padding_mask, src_mask):

        #создаем вложения слов и их позиций(эмбеддинги)
        embedded = self.target_embedding(x_source).permute(1,0,2)
        pe_embedded = self.pe.forward(embedded)
        attn_in = pe_embedded

        for _ in range(1):
          #блок расчета самовнимания
          x = self.multihead_attn(attn_in, attn_in, attn_in,
                                  key_padding_mask=src_padding_mask,
                                  attn_mask= src_mask)[0].permute(1,0,2)
          attn_out_norm = self.norm(x+attn_in.permute(1,0,2))
          x = self.ff(attn_out_norm)
          x = self.norm(x+attn_out_norm)
          attn_in = x.permute(1,0,2)


        scores = self.classifier(x)
        #scores = self.classifier1(scores.permute(0,2,1))
        #scores = scores.squeeze()
        scores = torch.sum(scores, dim=1)/self.max_len_target
        #_, scores = self.soft(scores).max(dim=1)

        return scores

class NMTModel(nn.Module):
    """ The Neural Machine Translation Model """
    def __init__(self, target_vocab_size, target_embedding_size,
                 max_length, pretrained_embeddings=None):

        super(NMTModel, self).__init__()

        self.decoder = NMTDecoder(num_embeddings=target_vocab_size,
                                  embedding_size=target_embedding_size,
                                  max_len_target=max_length,
                                  pretrained_embeddings=pretrained_embeddings)

    def forward(self, x_source, src_padding_mask, src_mask):

        decoded_states = self.decoder(x_source,
                                      src_padding_mask=src_padding_mask,
                                      src_mask = src_mask)

        return decoded_states

# Training routine and bookkeeping function

In [30]:
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):
    """Handle the training state updates.
    Components:
     - Early Stopping: Prevent overfitting.
     - Model Checkpoint: Model is saved if the model is better

    :param args: main arguments
    :param model: model to train
    :param train_state: a dictionary representing the training state values
    :returns:
        a new train_state
    """

    # Save one model at least
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        # If loss worsened
        if loss_t >= loss_tm1:
            # Update step
            train_state['early_stopping_step'] += 1
        # Loss decreased
        else:
            # Save the best model
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])
                train_state['early_stopping_best_val'] = loss_t

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_true):
  _, pred_indices = y_pred.max(dim=1)
  n_correct = torch.eq(pred_indices, y_true).sum().item()
  return n_correct / len(pred_indices) * 100

def sequence_loss(y_pred, y_true, weights):
    #y_pred, y_true = normalize_sizes(y_pred, y_true)
    return F.cross_entropy(y_pred, y_true, weights)

In [31]:
def load_glove_from_file(glove_filepath):
    """
    Load the GloVe embeddings

    Args:
        glove_filepath (str): path to the glove embeddings file
    Returns:
        word_to_index (dict), embeddings (numpy.ndarary)
    """

    word_to_index = {}
    embeddings = []
    with open(glove_filepath, "r") as fp:
        for index, line in enumerate(fp):
            line = line.split(" ") # each line: word num1 num2 ...
            word_to_index[line[0]] = index # word = line[0]
            embedding_i = np.array([float(val) for val in line[1:]])
            embeddings.append(embedding_i)
    return word_to_index, np.stack(embeddings)

def make_embedding_matrix(glove_filepath, words):
    """
    Create embedding matrix for a specific set of words.

    Args:
        glove_filepath (str): file path to the glove embeddigns
        words (list): list of words in the dataset
    """
    word_to_idx, glove_embeddings = load_glove_from_file(glove_filepath)
    embedding_size = glove_embeddings.shape[1]

    final_embeddings = np.zeros((len(words), embedding_size))

    for i, word in enumerate(words):
        if word in word_to_idx:
            final_embeddings[i, :] = glove_embeddings[word_to_idx[word]]
        else:
            embedding_i = torch.ones(1, embedding_size)
            torch.nn.init.xavier_uniform_(embedding_i)
            final_embeddings[i, :] = embedding_i

    return final_embeddings

In [33]:
args = Namespace(dataset_csv="/content/gdrive/MyDrive/Emotional_texts_generating/emos_filtered.csv",
                 vectorizer_file="vectorizer.json",
                 model_state_file="model.pth",
                 save_dir="/content/gdrive/MyDrive/Emotional_texts_generating/",
                 glove_filepath="/content/gdrive/MyDrive/Emotional_texts_generating/glove.42B.300d.txt",
                 use_glove=True,
                 reload_from_files=False,
                 expand_filepaths_to_save_dir=True,
                 cuda=True,
                 seed=1337,
                 learning_rate=5e-4,
                 batch_size=64,
                 num_epochs=20,
                 early_stopping_criteria=5,
                 source_embedding_size=300,
                 target_embedding_size=300,
                 catch_keyboard_interrupt=True)

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)

    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))

# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")

print("Using CUDA: {}".format(args.cuda))

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	/content/gdrive/MyDrive/Emotional_texts_generating/vectorizer.json
	/content/gdrive/MyDrive/Emotional_texts_generating/model.pth
Using CUDA: True


In [34]:
if args.reload_from_files and os.path.exists(args.vectorizer_file):
    # training from a checkpoint
    dataset = NMTDataset.load_dataset_and_load_vectorizer(args.dataset_csv,
                                                          args.vectorizer_file)
else:
    # create dataset and vectorizer
    dataset = NMTDataset.load_dataset_and_make_vectorizer(args.dataset_csv)
    dataset.save_vectorizer(args.vectorizer_file)

vectorizer = dataset.get_vectorizer()

# Use GloVe or randomly initialized embeddings
if args.use_glove:
    words = vectorizer.vocab._token_to_idx.keys()
    embeddings = make_embedding_matrix(glove_filepath=args.glove_filepath,
                                       words=words)
    embeddings = torch.from_numpy(embeddings).type(torch.cuda.FloatTensor)
    print("Using pre-trained embeddings")
else:
    print("Not using pre-trained embeddings")
    embeddings = None

Using pre-trained embeddings


In [35]:
model = NMTModel(target_vocab_size=len(vectorizer.vocab),
                 target_embedding_size=args.target_embedding_size,
                 max_length=vectorizer.max_text_length,
                 pretrained_embeddings=embeddings)

if args.reload_from_files and os.path.exists(args.model_state_file):
    model.load_state_dict(torch.load(args.model_state_file, map_location=torch.device('cpu')))
    print("Reloaded model")
else:
    print("New model")

New model


In [36]:
dataset.__len__()

13614

In [37]:
dataset.__getitem__(553)

{'source': array([   4,  885,  619,    7,   28,  415, 1061,    1,   16,    1,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]),
 'target': 4,
 'source_length': 10}

In [38]:
vectorizer.vocab.__len__()

1437

In [39]:
vectorizer.category_vocab.__len__()

6

In [40]:
vectorizer.max_text_length

30

In [41]:
model = model.to(args.device)

optimizer = optim.Adam(model.parameters(), lr=args.learning_rate,
                       betas=(0.9, 0.98), eps=1e-9)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                           mode='min', factor=0.5,
                                           patience=1)

#weights = None
weights = class_weights.to(args.device)
loss = nn.CrossEntropyLoss(weights)
#loss = nn.CrossEntropyLoss()

mask_index = vectorizer.vocab.mask_index
train_state = make_train_state(args)

epoch_bar = tqdm(desc='training routine',
                          total=args.num_epochs,
                          position=0)

dataset.set_split('train')
train_bar = tqdm(desc='split=train',
                          total=dataset.get_num_batches(args.batch_size),
                          position=1,
                          leave=True)
dataset.set_split('val')
val_bar = tqdm(desc='split=val',
                        total=dataset.get_num_batches(args.batch_size),
                        position=1,
                        leave=True)

try:
    for epoch_index in range(args.num_epochs):
        sample_probability = (20 + epoch_index) / args.num_epochs
        #sample_probability = 0.5

        train_state['epoch_index'] = epoch_index

        # Iterate over training dataset

        # setup: batch generator, set loss and acc to 0, set train mode on
        dataset.set_split('train')
        batch_generator = generate_batches(dataset,
                                               batch_size=args.batch_size,
                                               device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        model.train()

        for batch_index, batch_dict in enumerate(batch_generator):

            src_mask, src_padding_mask = \
            create_mask(batch_dict['source'])

            # the training routine is these 5 steps:

            # --------------------------------------
            # step 1. zero the gradients
            optimizer.zero_grad()

            # step 2. compute the output
            y_pred = model(batch_dict['source'],
                           src_padding_mask=src_padding_mask, src_mask=src_mask)
            #y_pred = model(batch_dict['source'], apply_softmax=True)

            # step 3. compute the loss
            loss_t = sequence_loss(y_pred, batch_dict['target'], weights = weights)

            # step 4. use loss to produce gradients
            loss_t.backward()

            # step 5. use optimizer to take gradient step
            optimizer.step()

            # -----------------------------------------
            # compute the running loss and running accuracy
            running_loss += (loss_t.item() - running_loss) / (batch_index + 1)

            acc_t = compute_accuracy(y_pred, batch_dict['target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # update bar
            train_bar.set_postfix(loss=running_loss, acc=running_acc,
                                  epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Iterate over val dataset

        # setup: batch generator, set loss and acc to 0; set eval mode on
        dataset.set_split('val')
        batch_generator = generate_batches(dataset,
                                               batch_size=args.batch_size,
                                               device=args.device)
        running_loss = 0.
        running_acc = 0.
        model.eval()

        for batch_index, batch_dict in enumerate(batch_generator):
            src_mask, src_padding_mask = \
            create_mask(batch_dict['source'])
            # compute the output
            y_pred = model(batch_dict['source'],
                           src_padding_mask=src_padding_mask, src_mask=src_mask)
            #y_pred = model(batch_dict['source'], apply_softmax=True)

            # step 3. compute the loss
            loss_t = sequence_loss(y_pred, batch_dict['target'], weights = weights)

            # compute the running loss and accuracy
            running_loss += (loss_t.item() - running_loss) / (batch_index + 1)

            acc_t = compute_accuracy(y_pred, batch_dict['target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # Update bar
            val_bar.set_postfix(loss=running_loss, acc=running_acc,
                            epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=model,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.set_postfix(best_val=train_state['early_stopping_best_val'])
        epoch_bar.update()

except KeyboardInterrupt:
    print("Exiting loop")

training routine:   0%|          | 0/20 [00:00<?, ?it/s]

split=train:   0%|          | 0/212 [00:00<?, ?it/s]

split=val:   0%|          | 0/26 [00:00<?, ?it/s]

In [42]:
torch.save(model.state_dict(), '/content/gdrive/MyDrive/Emotional_texts_generating/transformer_sentiment_classifier.pt')

In [43]:
model.to(args.device)
model.eval()

NMTModel(
  (decoder): NMTDecoder(
    (pe): PositionalEncoding(
      (dropout): Dropout(p=0.25, inplace=False)
    )
    (target_embedding): Embedding(1437, 300, padding_idx=0)
    (multihead_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=300, out_features=300, bias=True)
    )
    (ff): Sequential(
      (0): Linear(in_features=300, out_features=1200, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.35, inplace=False)
      (3): Linear(in_features=1200, out_features=300, bias=True)
    )
    (norm): BatchNorm1d(30, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (classifier): Sequential(
      (0): Linear(in_features=300, out_features=6, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.35, inplace=False)
    )
    (soft): Softmax(dim=1)
  )
)

In [44]:
target_dict = dict()
for i in range(vectorizer.category_vocab.__len__()):
  target_dict[i] = vectorizer.category_vocab.lookup_index(i)

In [45]:
target_dict

{0: 'anger', 1: 'fear', 2: 'joy', 3: 'love', 4: 'sadness', 5: 'surprise'}

In [63]:
N_SAMPLES = 20

data_filtered_test = data_filtered[data_filtered.split == 'test']
samples_arrays = list(data_filtered_test[['text', 'emotion']].sample(N_SAMPLES).values)
samples = []
for sample in samples_arrays:
  samples.append(list(sample))

sample_tensor = torch.Tensor(vectorizer.vectorize(samples[0][0])['source_vector']).unsqueeze(0).long().to(args.device)
for sample in samples[1:]:
  sample_tensor = torch.cat([sample_tensor, torch.Tensor(vectorizer.vectorize(sample[0])['source_vector']).unsqueeze(0).long().to(args.device)])

In [64]:
src_mask, src_padding_mask = create_mask(sample_tensor)
y_pred = model(sample_tensor, src_padding_mask=src_padding_mask, src_mask=src_mask)
_, pred_index = y_pred.max(dim=1)
pred_index_list = pred_index.tolist()

In [65]:
for sample, idx in zip(samples, pred_index_list):
  print('text: {}; Emotion: {}; Prediction: {}'.format(sample[0], sample[1], target_dict[idx]))

text: i feel special a href http facsimilogos; Emotion: joy; Prediction: joy
text: i often feel embarrassed for amount of time spent preparing for practice and games as compared to lessons; Emotion: sadness; Prediction: sadness
text: i am still feeling pretty lousy from this allergy induced stupor so last night i just was not really feeling wildstar and interacting with other human beings; Emotion: sadness; Prediction: sadness
text: i go i see our flag flying at the turkish schools and i feel very proud; Emotion: joy; Prediction: joy
text: i got back up after feeling in vain really because of scarlets reply regarding a myspace message; Emotion: sadness; Prediction: sadness
text: when it became clear that a man had used many people sexually and psychologically; Emotion: anger; Prediction: anger
text: i feel our world then was a much more innocent place; Emotion: joy; Prediction: joy
text: i was feeling paranoid tonight so i had to check my blog stats; Emotion: fear; Prediction: fear
tex

In [67]:
sample = 'i am glad to see you again'

src_mask, src_padding_mask = create_mask(sample_tensor)
y_pred = model(sample_tensor, src_padding_mask=src_padding_mask, src_mask=src_mask)
_, pred_index = y_pred.max(dim=1)

sample_tensor = torch.Tensor(vectorizer.vectorize(sample)['source_vector']).unsqueeze(0).long().to(args.device)
print('text: {}; Emotion: {}.'.format(sample, target_dict[pred_index.item()]))

text: i am glad to see you again; Emotion: joy.
