# Twitter Sentiment Analysis - Advanced

After first creating a simple classifier (and achieving decent results), I dicided to test recent advances made by the NLP recearch community. Namely I wante dto see how well [BERT](https://arxiv.org/abs/1810.04805) would do. 

In [1]:
import pandas as pd

file_path = "../datasets/Sentiment140/training.1600000.processed.noemoticon.csv"
tweets = pd.read_csv(file_path, delimiter=",", encoding="latin-1", header=None, names=["polarity", "ID", "date", "query", "username", "text"], usecols=["polarity", "text"])

tweet_lengths = tweets['text'].str.len()
tweets = tweets[tweet_lengths <= 140]
tweets.at[tweets['polarity'] == 4, 'polarity'] = 1

I use a library to load the pretrained BERT model

In [2]:
!pip install pytorch-pretrained-bert



In [3]:
from tqdm import tqdm_notebook as tqdm
import re
from pytorch_pretrained_bert import BertTokenizer

url_regex = re.compile(r"http[^ ]+", re.IGNORECASE)
at_regex = re.compile(r"@[^ ]+", re.IGNORECASE)

tweets = tweets.assign(words=pd.Series([[] for _ in range(len(tweets))]))

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

for i, row in tqdm(tweets.iterrows(), total=len(tweets)):
    text = row['text']
    text = re.sub(url_regex, '', text)
    text = re.sub(at_regex, '', text)
    
    tokenized_text = tokenizer.tokenize(text)
    tweets.at[i,'words'] = tokenized_text

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


HBox(children=(IntProgress(value=0, max=1582826), HTML(value='')))




In [4]:
tweets['words'].str.len().max()

120

In [5]:
import torch
from torch.utils.data.dataset import Dataset

class TrainingDataset(Dataset):
    def __init__(self, dataframe, max_len):
        self.data = dataframe.values
        self.max_len = max_len

    def __getitem__(self, index):
        label = self.data[index][0]
        label = torch.tensor([label], dtype=torch.long)
        
        text = self.data[index][2]
        text = text[:self.max_len]
        
        attention = [1] + len(text)*[1] + [1] + (self.max_len - len(text))*[0] 
        attention = torch.LongTensor(attention)
        
        text = ['[CLS]'] + text + ['[SEP]'] + (self.max_len - len(text))*['[PAD]']
        text = tokenizer.convert_tokens_to_ids(text)
        text = torch.LongTensor(text)
        
        return label, text, attention

    def __len__(self):
        return len(self.data)
    
train_dataset = TrainingDataset(tweets, 40)

In [6]:
train_dataset[0]

(tensor([0]),
 tensor([  101,  1011, 22091,  2860,  2860,  1010,  2008,  1005,  1055,  1037,
         26352,  5017,  1012,  2017,  2323,  2050,  2288,  2585, 12385,  1997,
          2353,  2154,  2000,  2079,  2009,  1012,  1025,  1040,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [8]:
from torch.utils.data import DataLoader
from torch.autograd import Variable
from tqdm import tqdm_notebook as tqdm
import torch.nn as nn
import torch.nn.functional as F
import os

def valid_epoch(valid_data_iterator, model):
    model.eval()
    t = tqdm(valid_data_iterator, mininterval=1, desc='-(Validation)', leave=False)
    total_loss = 0
    cntr = 0
    for batch in t:
        labels, tweets, attention = batch
        if torch.cuda.is_available() and use_cuda:
            labels = Variable(labels.cuda())
            tweets = Variable(tweets.cuda())
            attention = Variable(attention.cuda())
            
        loss, pred = model(input_ids=tweets, attention_mask=attention, labels=labels)
        description = "Loss: " + str(loss.item())
        t.set_description(description)
        cntr += 1
        total_loss += loss.item()

    avg_loss = total_loss / float(cntr)

    return avg_loss

def train_epoch(train_data_iterator, model, optimizer):
    t = tqdm(train_data_iterator, mininterval=1, desc='-(Training)', leave=False)
    total_loss = 0
    cntr = 0
    for batch in t:
        labels, tweets, attention = batch
        if torch.cuda.is_available() and use_cuda:
            labels = Variable(labels.cuda())
            tweets = Variable(tweets.cuda())
            attention = Variable(attention.cuda())
        
        model.train()
        model.zero_grad()
        loss, pred = model(input_ids=tweets, attention_mask=attention, labels=labels)
        loss.backward()
        optimizer.step()   
        description = "Loss: " + str(loss.item())
        t.set_description(description)
        cntr += 1
        total_loss += loss.item()

    avg_loss = total_loss / float(cntr)

    return avg_loss

def train(epochs, train_data_iterator, valid_data_iterator, model, optimizer):
    total_step = len(train_data_iterator)

    # Start Training
    for epoch in range(epochs):
        print("Training for epoch " + str(epoch + 1) + ".")
        avg_train_loss = train_epoch(train_data_iterator, model, optimizer)
        print("Training Loss: " + str(avg_train_loss))
        avg_valid_loss = valid_epoch(valid_data_iterator, model)
        print("Validation Loss: " + str(avg_valid_loss))

    torch.save(model.state_dict(), "text_sentiment_advanced.chkpt")

In [9]:
import numpy as np
from pytorch_pretrained_bert import BertForSequenceClassification, BertAdam

use_cuda = True

# Split dataset
train_data = tweets.sample(frac=0.8)
test_data = tweets.loc[~tweets.index.isin(train_data.index), :]

# Loading Dataset
train_data_loader = TrainingDataset(train_data, 40)
train_data_iterator = DataLoader(train_data_loader, batch_size=64)
valid_data_loader = TrainingDataset(test_data, 40)
valid_data_iterator = DataLoader(valid_data_loader, batch_size=64)

# Initalize model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
if torch.cuda.is_available() and use_cuda:
    model.cuda()

# Listing parameters to be finetuned
params = list(model.parameters())

# Optimizer
optimizer = BertAdam(params, lr=3e-5)

# Train model
train(3, train_data_iterator, valid_data_iterator, model, optimizer)

Training for epoch 1.


HBox(children=(IntProgress(value=0, description='-(Training)', max=19786, style=ProgressStyle(description_widt…

RuntimeError: CUDA out of memory. Tried to allocate 31.50 MiB (GPU 0; 4.00 GiB total capacity; 3.04 GiB already allocated; 21.30 MiB free; 429.00 KiB cached)

RuntimeError: $ Torch: not enough memory: you tried to allocate 0GB. Buy new RAM! at ..\aten\src\TH\THGeneral.cpp:201