In [1]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [2]:
# We'll first install the transformers package from HuggingFace
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 9.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 52.4MB/s 
[?25hCollecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |█

In [3]:
import pandas as pd

In [4]:
from google.colab import files
uploaded = files.upload()

Saving text_emotion.csv to text_emotion.csv


In [5]:
import io
df = pd.read_csv(io.BytesIO(uploaded['text_emotion.csv']))

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,emotions
0,0,i didnt feel humiliated,sadness
1,1,i can go from feeling so hopeless to so damned...,sadness
2,2,im grabbing a minute to post i feel greedy wrong,anger
3,3,i am ever feeling nostalgic about the fireplac...,love
4,4,i am feeling grouchy,anger


In [7]:
df["emotions"].replace({"joy": "happiness", "love": "happiness", "fun": "happiness", "relief": "happiness", "enthusiasm": "happiness", "empty": "boredom", "hate": "anger", "worry": "fear"}, inplace=True)

In [8]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
# Then we make our input and label data as a list of instances
all_texts = list(df.text)
all_labels = list(df.emotions)
all_labels = le.fit_transform(all_labels)

In [9]:
!pip install emoji
import emoji

Collecting emoji
[?25l  Downloading https://files.pythonhosted.org/packages/24/fa/b3368f41b95a286f8d300e323449ab4e86b85334c2e0b477e94422b8ed0f/emoji-1.2.0-py3-none-any.whl (131kB)
[K     |██▌                             | 10kB 22.0MB/s eta 0:00:01[K     |█████                           | 20kB 15.8MB/s eta 0:00:01[K     |███████▌                        | 30kB 14.1MB/s eta 0:00:01[K     |██████████                      | 40kB 13.1MB/s eta 0:00:01[K     |████████████▌                   | 51kB 7.1MB/s eta 0:00:01[K     |███████████████                 | 61kB 8.3MB/s eta 0:00:01[K     |█████████████████▌              | 71kB 8.0MB/s eta 0:00:01[K     |████████████████████            | 81kB 8.7MB/s eta 0:00:01[K     |██████████████████████▌         | 92kB 8.6MB/s eta 0:00:01[K     |█████████████████████████       | 102kB 6.9MB/s eta 0:00:01[K     |███████████████████████████▌    | 112kB 6.9MB/s eta 0:00:01[K     |██████████████████████████████  | 122kB 6.9MB/s eta 0:0

In [10]:
# So to combine Emoji detection together with lower case, we will do the following:
all_texts_preprocessed = [emoji.demojize(t.lower(), language='en') for t in all_texts]
print(all_texts_preprocessed[15])

i do not feel reassured anxiety is on each side


In [11]:
from sklearn.model_selection import train_test_split
import numpy as np
import random
import torch
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(all_texts_preprocessed, all_labels, test_size=0.2, random_state=RANDOM_SEED, shuffle=True)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=RANDOM_SEED, shuffle=True)
print(len(X_train), len(X_val), len(X_test))

46400 5800 5800


In [13]:
from transformers import BertTokenizer
# Specify the pre-trained model name.
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME, do_lower_case=True)

Loading BERT tokenizer...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [14]:
# Print the preprocessed review text.
print('Preprocessed Text: {}'.format(X_train[2]))

# Encode the preprocessed text by using encode_plus function
encoded = tokenizer.encode_plus(
                          text=X_train[2],
                          add_special_tokens = True, # Add '[CLS]' and '[SEP]' and [PAD]
                          max_length=64,      # Pad & truncate all texts 
                          pad_to_max_length = True,
                          return_token_type_ids=False,
                          return_attention_mask = True, # Construct attention masks
                          return_tensors = 'pt', # Return pytorch tensors
                    )
print('input_ids: {}'.format(encoded['input_ids']))
print('attention_mask: {}'.format(encoded['attention_mask']))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Preprocessed Text: i go up to her and i say feeling very impressed with myself youre naomi klein right
input_ids: tensor([[  101,  1045,  2175,  2039,  2000,  2014,  1998,  1045,  2360,  3110,
          2200,  7622,  2007,  2870,  2115,  2063, 12806, 12555,  2157,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]])
attention_mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])




In [15]:
# So to put all the things together,
# we can write a class: with the input texts, labels, tokenizer and max_len
# we'll have the review_text, input_ids, attention_mask, labels as our output
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
class ReviewDataset(Dataset):

  def __init__(self, reviews, labels, tokenizer, max_len):
    self.reviews = reviews
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.reviews)

  def __getitem__(self, item):
    review = str(self.reviews[item])
    label = self.labels[item]
    # `encode_plus` will:
      #   (1) Tokenize the text.
      #   (2) Prepend the `[CLS]` token to the start.
      #   (3) Append the `[SEP]` token to the end.
      #   (4) Map tokens to their IDs.
      #   (5) Pad or truncate the sentence to `max_length` with [PAD] tokens
      #   (6) Create attention masks for [PAD] tokens
    encoding = self.tokenizer.encode_plus(
      review,
      add_special_tokens=True, # Add '[CLS]' and '[SEP]' and [PAD]
      max_length=self.max_len, # Pad & truncate all texts
      pad_to_max_length=True,
      return_token_type_ids=False,
      return_attention_mask=True, # Construct attention masks
      return_tensors='pt', # Return pytorch tensors
    )
    return {
      'review_text': review,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'labels': torch.tensor(label, dtype=torch.long)
    }

In [16]:
def create_data_loader(X, Y, tokenizer, max_len, batch_size, num_workers=2, sampler = None):
  ds = ReviewDataset(
    reviews=np.array(X),
    labels=np.array(Y),
    tokenizer=tokenizer,
    max_len=max_len
  )
  if sampler!=None:
    sampler = sampler(ds)

  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=num_workers,
    sampler = sampler
  )

In [17]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
MAX_LEN = 128
BATCH_SIZE = 32

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order.
train_data_loader = create_data_loader(X_train, y_train, tokenizer, MAX_LEN, BATCH_SIZE, sampler = RandomSampler) # Select batches randomly

# For validation & testing samples, the order doesn't matter, so we'll just read them sequentially.
val_data_loader = create_data_loader(X_val, y_val, tokenizer, MAX_LEN, BATCH_SIZE, sampler = SequentialSampler) # Pull out batches sequentially.

test_data_loader = create_data_loader(X_test, y_test, tokenizer, MAX_LEN, BATCH_SIZE, sampler = SequentialSampler) # Pull out batches sequentially.

In [18]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    PRE_TRAINED_MODEL_NAME, # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 7, # The number of output labels, in our case it's multi-class tasks with classes=3  
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.cuda()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [19]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 1e-3
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-6.
                )

In [20]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs. The BERT authors recommend between 2 and 4 (depend on the usage, you can also set it larger)
# We chose to run for 3
EPOCHS = 3

# Total number of training steps is [number of batches] x [number of epochs]. 
total_steps = len(train_data_loader) * EPOCHS

# Create the learning rate scheduler, here we use a linear scheduler with no warmup steps
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

# Define our loss function
loss_fn = nn.CrossEntropyLoss().to(device)

In [21]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  model.train()
  total_train_accuracy = 0
  total_train_loss = 0
  losses = []
  correct_predictions = 0
  for step, batch in enumerate(data_loader):
    if step % 40 == 0 and not step == 0:
       print('Batch: {}  of  {}'.format(step, len(data_loader)))
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)
    model.zero_grad()
    outputs = model(
      input_ids=input_ids,
      token_type_ids=None,
      attention_mask=attention_mask,
      labels=labels
    )
    loss = outputs[0]
    total_train_loss += loss.item()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

    logits = outputs[1].detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()
    total_train_accuracy += flat_accuracy(logits, label_ids)
  # Calculate the average loss over all of the batches.
  avg_train_accuracy = total_train_accuracy / len(data_loader)
  avg_train_loss = total_train_loss / len(data_loader) 
  return avg_train_accuracy, avg_train_loss

In [22]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [23]:
# Evaluation
def eval_model(model, data_loader, loss_fn, device, n_examples):
  model.eval()
  total_eval_accuracy = 0
  total_eval_loss = 0
  nb_eval_steps = 0
  losses = []
  correct_predictions = 0
  with torch.no_grad():
    for batch in data_loader:
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      labels = batch["labels"].to(device)
      outputs = model(
        input_ids=input_ids,
        token_type_ids=None,
        attention_mask=attention_mask,
        labels=labels
      )
      total_eval_loss += outputs[0].item()
      logits = outputs[1].detach().cpu().numpy()
      label_ids = labels.to('cpu').numpy()
      total_eval_accuracy += flat_accuracy(logits, label_ids)

  avg_val_accuracy = total_eval_accuracy / len(data_loader)
  avg_val_loss = total_eval_loss / len(data_loader)
  #print("Accuracy: {0:.2f}".format(avg_val_accuracy))
  #print("Validation Loss: {0:.2f}".format(avg_val_loss))
  return avg_val_accuracy, avg_val_loss

In [24]:
%%time
from collections import defaultdict
history = defaultdict(list)
best_accuracy = 0
for epoch in range(EPOCHS):
  print('Epoch: {}/{}'.format(epoch+1, EPOCHS))
  print('-' * 10)
  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(X_train)
  )
  print('Train loss: {}, Accuracy: {}'.format(train_loss, train_acc))
  val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    loss_fn,
    device,
    len(X_val)
  )
  print('Val loss: {}, Accuracy: {}'.format(val_loss, val_acc))
  print()
  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)
  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc

Epoch: 1/3
----------




Batch: 40  of  1450
Batch: 80  of  1450
Batch: 120  of  1450
Batch: 160  of  1450
Batch: 200  of  1450
Batch: 240  of  1450
Batch: 280  of  1450
Batch: 320  of  1450
Batch: 360  of  1450
Batch: 400  of  1450
Batch: 440  of  1450
Batch: 480  of  1450
Batch: 520  of  1450
Batch: 560  of  1450
Batch: 600  of  1450
Batch: 640  of  1450
Batch: 680  of  1450
Batch: 720  of  1450
Batch: 760  of  1450
Batch: 800  of  1450
Batch: 840  of  1450
Batch: 880  of  1450
Batch: 920  of  1450
Batch: 960  of  1450
Batch: 1000  of  1450
Batch: 1040  of  1450
Batch: 1080  of  1450
Batch: 1120  of  1450
Batch: 1160  of  1450
Batch: 1200  of  1450
Batch: 1240  of  1450
Batch: 1280  of  1450
Batch: 1320  of  1450
Batch: 1360  of  1450
Batch: 1400  of  1450
Batch: 1440  of  1450
Train loss: 1.1113309542886143, Accuracy: 0.5917672413793104




Val loss: 0.9580371743375129, Accuracy: 0.6413118131868132

Epoch: 2/3
----------




Batch: 40  of  1450
Batch: 80  of  1450
Batch: 120  of  1450
Batch: 160  of  1450
Batch: 200  of  1450
Batch: 240  of  1450
Batch: 280  of  1450
Batch: 320  of  1450
Batch: 360  of  1450
Batch: 400  of  1450
Batch: 440  of  1450
Batch: 480  of  1450
Batch: 520  of  1450
Batch: 560  of  1450
Batch: 600  of  1450
Batch: 640  of  1450
Batch: 680  of  1450
Batch: 720  of  1450
Batch: 760  of  1450
Batch: 800  of  1450
Batch: 840  of  1450
Batch: 880  of  1450
Batch: 920  of  1450
Batch: 960  of  1450
Batch: 1000  of  1450
Batch: 1040  of  1450
Batch: 1080  of  1450
Batch: 1120  of  1450
Batch: 1160  of  1450
Batch: 1200  of  1450
Batch: 1240  of  1450
Batch: 1280  of  1450
Batch: 1320  of  1450
Batch: 1360  of  1450
Batch: 1400  of  1450
Batch: 1440  of  1450
Train loss: 0.9071562884182766, Accuracy: 0.6675862068965517




Val loss: 0.9575988496397878, Accuracy: 0.6485233516483516

Epoch: 3/3
----------




Batch: 40  of  1450
Batch: 80  of  1450
Batch: 120  of  1450
Batch: 160  of  1450
Batch: 200  of  1450
Batch: 240  of  1450
Batch: 280  of  1450
Batch: 320  of  1450
Batch: 360  of  1450
Batch: 400  of  1450
Batch: 440  of  1450
Batch: 480  of  1450
Batch: 520  of  1450
Batch: 560  of  1450
Batch: 600  of  1450
Batch: 640  of  1450
Batch: 680  of  1450
Batch: 720  of  1450
Batch: 760  of  1450
Batch: 800  of  1450
Batch: 840  of  1450
Batch: 880  of  1450
Batch: 920  of  1450
Batch: 960  of  1450
Batch: 1000  of  1450
Batch: 1040  of  1450
Batch: 1080  of  1450
Batch: 1120  of  1450
Batch: 1160  of  1450
Batch: 1200  of  1450
Batch: 1240  of  1450
Batch: 1280  of  1450
Batch: 1320  of  1450
Batch: 1360  of  1450
Batch: 1400  of  1450
Batch: 1440  of  1450
Train loss: 0.8074460300083818, Accuracy: 0.7105603448275862




Val loss: 0.9727029400867421, Accuracy: 0.648695054945055

CPU times: user 53min 1s, sys: 24 s, total: 53min 25s
Wall time: 53min 40s


In [25]:
test_acc, _ = eval_model(
  model,
  test_data_loader,
  loss_fn,
  device,
  len(X_test)
)
print('Accuracy: {}'.format(test_acc.item()))



Accuracy: 0.6322115384615384
