# Attempt at fine-tuning BERT with the Twitter Sentiment Dataset

The dataset can be found at: https://www.kaggle.com/kazanova/sentiment140

**WARNING:** This dataset is too big and it will not be possible to use it here.

However, the code works, so if you have enough processing power, go ahead!

In [0]:
# Show which GPU (if any) is attached to this runtime, plus driver/CUDA versions and memory use.
!nvidia-smi

Sun May 24 06:53:23 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8     9W /  70W |      0MiB / 15079MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
# Install Hugging Face transformers into the environment of the running kernel.
# `%pip` targets the kernel's interpreter, and `-q` keeps the cell output short
# without hiding errors in a side file the way `nohup` did.
# NOTE(review): the version is unpinned - pin it once the target release is confirmed.
%pip install -q transformers

nohup: ignoring input and appending output to 'nohup.out'


In [0]:
import random
import numpy as np
import pandas as pd
import transformers
import time
import datetime
from tqdm import tqdm

import torch

# Prefer the GPU when one is visible; otherwise fall back to the CPU so that
# `device` is always defined for the cells that move batches with `.to(device)`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
# Sequence length handed to the tokenizer as `max_length`; every encoded tweet
# is padded to this many tokens.
# NOTE(review): tweets are short, so a much smaller value would presumably cut
# memory and compute considerably - confirm before changing.
MAX_LENGTH = 512
# Number of examples per batch for both the training and validation DataLoaders.
BATCH_SIZE = 8
# Number of passes over the training set; also used to size the LR schedule.
EPOCHS = 6

In [0]:
# Load the headerless CSV. Only the label and the tweet text are used
# downstream, so the other four columns are skipped at parse time instead of
# being loaded and then dropped.
df = pd.read_csv(filepath_or_buffer='/content/drive/My Drive/temp_datasets/twitter_dataset.csv',
                 names=['target', 'id', 'date', 'flag', 'user', 'text'],
                 usecols=['target', 'text'],
                 encoding='ISO-8859-1')

# Shuffle the rows so that a prefix slice is a random sample. The seed is fixed
# so that the same subset is selected on every Restart & Run All.
df = df.sample(frac=1, random_state=42)

df.head()

Unnamed: 0,target,text
1163581,4,had a good cold shower so whats up twitterwor...
1358751,4,whats going twitter world?
1239881,4,"just might be paranoid, i'm avoiding the lines..."
557110,0,"@richwells that looks really nice, esp the var..."
1012056,4,is gonna PLAY tonight http://plurk.com/p/vgd8z


In [0]:
# WordPiece tokenizer matching the 'bert-base-uncased' checkpoint; input text
# is lower-cased before tokenization.
tokenizer = transformers.BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [0]:
def tokenize(sentences, tokenizer, max_length=None):
    """Encode sentences into fixed-length input ids and attention masks.

    Parameters
    ----------
    sentences : iterable of str
        Texts to encode, one example per item.
    tokenizer : object
        Tokenizer exposing ``encode_plus`` (e.g. ``transformers.BertTokenizer``).
    max_length : int, optional
        Length every sequence is padded/truncated to. Defaults to the
        notebook-level ``MAX_LENGTH`` constant when omitted.

    Returns
    -------
    (input_ids, input_masks) : tuple of numpy.ndarray
        Two ``int32`` arrays with one row per sentence and ``max_length``
        columns (given that the tokenizer pads and truncates as requested).
        For empty input both arrays have shape ``(0, max_length)``.
    """
    if max_length is None:
        max_length = MAX_LENGTH

    input_ids, input_masks = [], []
    for sentence in tqdm(sentences):
        # Explicit padding/truncation replaces the deprecated
        # `pad_to_max_length=True` and the implicit truncation fallback.
        encoded = tokenizer.encode_plus(sentence,
                                        add_special_tokens=True,
                                        max_length=max_length,
                                        padding='max_length',
                                        truncation=True,
                                        return_attention_mask=True,
                                        return_token_type_ids=False)
        # Store compact int32 rows right away: nested Python lists of ints
        # cost several times more memory than the final arrays.
        input_ids.append(np.asarray(encoded['input_ids'], dtype='int32'))
        input_masks.append(np.asarray(encoded['attention_mask'], dtype='int32'))

    if not input_ids:
        # np.stack rejects an empty list; return correctly shaped empties.
        return (np.empty((0, max_length), dtype='int32'),
                np.empty((0, max_length), dtype='int32'))

    return np.stack(input_ids), np.stack(input_masks)

In [0]:
# Encode only the first 400,000 (already shuffled) tweets, selected by position.
ids_1q, masks_1q = tokenize(df['text'].iloc[:400000], tokenizer)

100%|██████████| 400000/400000 [02:30<00:00, 2655.37it/s]


In [0]:
# ids_2q, masks_2q = tokenize(df.text[400000:800000], tokenizer)

In [0]:
# ids_3q, masks_3q = tokenize(df.text[800000:1200000], tokenizer)

In [0]:
# ids_4q, masks_4q = tokenize(df.text[1200000:], tokenizer)

In [0]:
# input_ids = np.concatenate((ids_1q, ids_2q, ids_3q, ids_4q), axis=0)

# del ids_1q, ids_2q, ids_3q, ids_4q

In [0]:
# input_masks = np.concatenate((masks_1q, masks_2q, masks_3q, masks_4q), axis=0)

# del masks_1q, masks_2q, masks_3q, masks_4q

In [0]:
# Turn the NumPy id / mask matrices into torch tensors (torch.tensor copies the data).
input_ids, input_masks = torch.tensor(ids_1q), torch.tensor(masks_1q)

In [0]:
# The raw `target` column uses 0 for negative and 4 for positive tweets (see the
# df.head() output), but a 2-class classifier expects class ids 0 and 1: a raw
# label of 4 is out of range for the loss and can never equal an argmax
# prediction. Collapse every non-zero polarity to class 1.
labels = torch.tensor((df.target.values[:400000] > 0).astype('int64'))

In [0]:
# Sanity check: expect one attention-mask row per encoded tweet and MAX_LENGTH columns.
input_masks.shape

torch.Size([400000, 512])

In [0]:
# Bundle ids, masks and labels so they are indexed and split together.
# The source tensors are intentionally not `del`-ed: the dataset keeps
# references to them, so deleting the names frees no memory and only makes this
# cell fail when it is re-run.
full_dataset = torch.utils.data.TensorDataset(input_ids, input_masks, labels)

# 90% / 10% train / validation split.
train_size = int(0.9 * len(full_dataset))
val_size = len(full_dataset) - train_size

# A dedicated, seeded generator makes the split reproducible regardless of how
# much of the global RNG stream has been consumed before this cell runs.
train_dataset, val_dataset = torch.utils.data.random_split(
    full_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42))

print(f'Train set size: {train_size}\nValid set size: {val_size}')

Train set size: 360000
Valid set size: 40000


In [0]:
# Training batches are drawn in a fresh random order every epoch
# (`shuffle=True` is the built-in equivalent of passing a RandomSampler).
train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=BATCH_SIZE,
                                               shuffle=True)

# Validation needs no shuffling: a fixed order gives deterministic evaluation
# and does not consume random numbers.
validation_dataloader = torch.utils.data.DataLoader(val_dataset,
                                                    batch_size=BATCH_SIZE,
                                                    shuffle=False)

In [0]:
# Pre-trained BERT encoder with a freshly initialised 2-way classification head.
model = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                                   num_labels=2,
                                                                   output_attentions=False,
                                                                   output_hidden_states=False)

# Place the model on the same `device` the batches are moved to, instead of
# hard-coding CUDA.
model.to(device)

In [0]:
# AdamW from PyTorch itself: the copy shipped with transformers is deprecated
# and absent from newer releases. `weight_decay=0.0` reproduces the default of
# the transformers implementation (PyTorch's own default is 0.01).
optimizer = torch.optim.AdamW(model.parameters(),
                              lr=5e-5,
                              eps=1e-8,
                              weight_decay=0.0)

# One scheduler step is taken per training batch, for every epoch.
total_steps = len(train_dataloader) * EPOCHS

# Linear decay of the learning rate to zero over `total_steps`, with no warm-up.
scheduler = transformers.get_linear_schedule_with_warmup(optimizer,
                                                         num_warmup_steps=0,
                                                         num_training_steps=total_steps)

In [0]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    `preds` is a 2-D array of per-class scores (one row per example) and
    `labels` is an array holding the expected class ids.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    hits = predicted == expected
    return hits.sum() / expected.size

def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second first; the text is
    whatever ``str(datetime.timedelta)`` yields for that many seconds.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [0]:
# Seed every RNG that is in play so the run is as repeatable as possible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# One dict of summary statistics per finished epoch.
training_stats = []

# Wall-clock start of the whole run.
total_t0 = time.time()

for epoch_i in range(0, EPOCHS):

    # ---------------------------- Training ----------------------------

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    # Enable dropout and other training-only behaviour.
    model.train()

    for step, batch in enumerate(train_dataloader):

        # Progress report every 50 batches.
        if step % 50 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Ids and masks are stored as int32; cast to int64 before the forward pass.
        b_input_ids = batch[0].long().to(device)
        b_input_mask = batch[1].long().to(device)
        b_labels = batch[2].to(device)

        # Gradients accumulate by default, so clear them before each step.
        model.zero_grad()

        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

        # Index instead of tuple-unpacking: this works for both tuple returns
        # and dict-style model outputs (where unpacking would yield the keys).
        loss = outputs[0]

        total_train_loss += loss.item()

        loss.backward()

        # Clip the global gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # The learning-rate schedule advances once per batch.
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)

    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # --------------------------- Validation ---------------------------

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Disable dropout for evaluation.
    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0

    for batch in validation_dataloader:

        b_input_ids = batch[0].long().to(device)
        b_input_mask = batch[1].long().to(device)
        b_labels = batch[2].to(device)

        # No gradients are needed while evaluating.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

        loss, logits = outputs[0], outputs[1]

        # Accumulate inside the loop so that every validation batch contributes.
        # Previously these statements ran once after the loop, so only the last
        # batch was scored while the totals were still divided by the number of
        # batches.
        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Mean of the per-batch accuracies / losses.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    50  of  45,000.    Elapsed: 0:00:37.
  Batch   100  of  45,000.    Elapsed: 0:01:18.
  Batch   150  of  45,000.    Elapsed: 0:01:59.
  Batch   200  of  45,000.    Elapsed: 0:02:40.
  Batch   250  of  45,000.    Elapsed: 0:03:21.
  Batch   300  of  45,000.    Elapsed: 0:04:02.
  Batch   350  of  45,000.    Elapsed: 0:04:43.
  Batch   400  of  45,000.    Elapsed: 0:05:24.
  Batch   450  of  45,000.    Elapsed: 0:06:05.
  Batch   500  of  45,000.    Elapsed: 0:06:47.
  Batch   550  of  45,000.    Elapsed: 0:07:28.
  Batch   600  of  45,000.    Elapsed: 0:08:09.
  Batch   650  of  45,000.    Elapsed: 0:08:50.
  Batch   700  of  45,000.    Elapsed: 0:09:31.
  Batch   750  of  45,000.    Elapsed: 0:10:12.
  Batch   800  of  45,000.    Elapsed: 0:10:53.
  Batch   850  of  45,000.    Elapsed: 0:11:34.
  Batch   900  of  45,000.    Elapsed: 0:12:15.
  Batch   950  of  45,000.    Elapsed: 0:12:57.
  Batch 1,000  of  45,000.    Elapsed: 0:13:38.
  Batch 1,050  of  45,000. 

In [0]:
import os

# Saving with the default file names lets the model be reloaded later with
# `from_pretrained(output_dir)`.
output_dir = '/content/drive/My Drive/temp_datasets/finetuned_bert_save/'

# Create the output directory if needed; `exist_ok=True` avoids the
# check-then-create race of a separate `os.path.exists` test.
os.makedirs(output_dir, exist_ok=True)

print("Saving model to %s" % output_dir)

# If the model is wrapped (e.g. for distributed/parallel training) the real
# model lives in `.module`; save that one.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [0]:
# Switch to inference behaviour (dropout disabled).
model.eval()

# Raw logits and gold labels, one array per batch.
predictions, true_labels = [], []

# No cell in this notebook builds `test_dataloader`, so on a fresh kernel the
# original loop stops with a NameError. Use it when it exists and otherwise
# fall back to the held-out validation batches.
try:
    eval_dataloader = test_dataloader
except NameError:
    eval_dataloader = validation_dataloader

for batch in eval_dataloader:

    # Cast every tensor of the batch to int64 and move it to the model's device.
    batch = tuple(t.long().to(device) for t in batch)

    b_input_ids, b_input_mask, b_labels = batch

    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():

        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)

    # No labels are passed, so the first output is the logits.
    logits = outputs[0]

    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    predictions.append(logits)
    true_labels.append(label_ids)