In [0]:
!pip install transformers
from google.colab import drive
drive.mount('/content/gdrive')
%cd '/content/gdrive/My Drive/hacknroll'

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl (447kB)
[K     |▊                               | 10kB 27.9MB/s eta 0:00:01[K     |█▌                              | 20kB 2.1MB/s eta 0:00:01[K     |██▏                             | 30kB 3.1MB/s eta 0:00:01[K     |███                             | 40kB 2.1MB/s eta 0:00:01[K     |███▋                            | 51kB 2.6MB/s eta 0:00:01[K     |████▍                           | 61kB 3.1MB/s eta 0:00:01[K     |█████▏                          | 71kB 3.6MB/s eta 0:00:01[K     |█████▉                          | 81kB 4.0MB/s eta 0:00:01[K     |██████▋                         | 92kB 4.5MB/s eta 0:00:01[K     |███████▎                        | 102kB 3.4MB/s eta 0:00:01[K     |████████                        | 112kB 3.4MB/s eta 0:00:01[K     |████████▉                       | 122kB 3.4M

In [0]:
import json
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import TensorDataset, random_split, DataLoader
from tqdm import tqdm_notebook as tqdm
from transformers import *
with open("sentiment_data.json", "r") as file:
  data = json.load(file)
df = pd.DataFrame(data)

In [0]:
print("Average text length: {}".format(np.mean(df['text'].str.len())))
def convert_lines(example, max_seq_length,tokenizer):
    max_seq_length -=2
    all_tokens = []
    longer = 0
    for text in example:
        tokens_a = tokenizer.tokenize(text)
        if len(tokens_a)>max_seq_length:
            tokens_a = tokens_a[:max_seq_length]
            longer += 1
        one_token = tokenizer.convert_tokens_to_ids(["[CLS]"]+tokens_a+["[SEP]"])+[0] * (max_seq_length - len(tokens_a))
        all_tokens.append(one_token)
    return np.array(all_tokens)

Average text length: 71.31205164992826


In [0]:
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
x = convert_lines(df['text'], 100, tokenizer)
y = df['rating'].values + 1

In [0]:
val_portion = 0.05
batch_size = 10

# Loading data and defining dataloaders
dataset = TensorDataset(torch.from_numpy(x).long(), torch.from_numpy(y).long())
val_data_len = int(val_portion*len(dataset))
train_data_len = len(dataset) - val_data_len
train_data, val_data = random_split(dataset, [train_data_len, val_data_len])

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=True)

In [0]:
# Setting training Hyperparameters
EPOCHS = 8
accumulation_steps = 2
lr = 2e-6

if torch.cuda.is_available:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Instantiating our model with the the last layer as a classification layer and output dimension = number of labels
model = RobertaForSequenceClassification.from_pretrained('distilroberta-base', num_labels=3)
model.zero_grad()
model = model.to(device)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

num_train_optimization_steps = int(EPOCHS*len(train_data)/batch_size/accumulation_steps)
num_warmup_steps = int(0.05 * num_train_optimization_steps)

optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_optimization_steps)
model.train()
print("Model loaded")

Model loaded


In [0]:
output_model_file = "./roberta.pt"

# Training Loop
tq = tqdm(range(EPOCHS))
lowest_val_acc = 0
for epoch in tq:
    model.train()
    avg_loss = 0.
    avg_accuracy = 0.
    lossf=None
    tk0 = tqdm(enumerate(train_loader),total=len(train_loader),leave=False)
    optimizer.zero_grad()
    for i,(x_batch, y_batch) in tk0:
        y_pred = model(x_batch.to(device), attention_mask=(np.logical_not(x_batch==0)).to(device), labels=None)
        loss =  F.cross_entropy(y_pred[0],y_batch.to(device))
        loss.backward()
        if (i+1) % accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        if lossf:
            lossf = 0.98*lossf+0.02*loss.item()
        else:
            lossf = loss.item()
        tk0.set_postfix(loss = lossf)
        avg_loss += loss.item() / len(train_loader)
        avg_accuracy += torch.mean((torch.max(F.softmax(y_pred[0], dim=1), dim=1)[1] == y_batch.to(device)).to(torch.float)).item()/len(train_loader)
    tq.set_postfix(avg_loss=avg_loss,avg_accuracy=avg_accuracy)
    print("Training Accuracy: {}%".format(avg_accuracy*100))
    model.eval()
    val_loss = 0.
    val_acc = 0.
    for val_x, val_y in val_loader:
        val_pred = model(val_x.to(device), attention_mask=(np.logical_not(val_x==0)).to(device), labels=None)
        val_loss += F.cross_entropy(val_pred[0],val_y.to(device)).item()/len(val_loader)
        val_acc += torch.mean((torch.max(F.softmax(val_pred[0], dim=1), dim=1)[1] == val_y.to(device)).to(torch.float)).item() / len(val_loader)
    print("Validation Loss: {}".format(val_loss))
    print("Validation Accuracy: {}%".format(val_acc*100))
    if val_acc > lowest_val_acc:
        torch.save(model.state_dict(), output_model_file)
        lowest_val_acc = val_acc
        print("Validation performance improved... Saving model")

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, max=133), HTML(value='')))

Training Accuracy: 51.654135923188406%
Validation Loss: 0.7730544805526733
Validation Accuracy: 72.53968289920262%
Validation performance improved... Saving model


HBox(children=(IntProgress(value=0, max=133), HTML(value='')))

Training Accuracy: 81.35338489498406%
Validation Loss: 0.4362059512308666
Validation Accuracy: 87.14285833495003%
Validation performance improved... Saving model


HBox(children=(IntProgress(value=0, max=133), HTML(value='')))

Training Accuracy: 90.97744508793484%
Validation Loss: 0.3275177223341806
Validation Accuracy: 91.11111334392002%
Validation performance improved... Saving model


HBox(children=(IntProgress(value=0, max=133), HTML(value='')))

Training Accuracy: 94.13533959173623%
Validation Loss: 0.27811569667288233
Validation Accuracy: 92.85714370863775%
Validation performance improved... Saving model


HBox(children=(IntProgress(value=0, max=133), HTML(value='')))

Training Accuracy: 95.63909890060148%
Validation Loss: 0.24883609105433735
Validation Accuracy: 94.12698575428553%
Validation performance improved... Saving model


HBox(children=(IntProgress(value=0, max=133), HTML(value='')))

Training Accuracy: 95.86466281933902%
Validation Loss: 0.23664569641862596
Validation Accuracy: 94.12698490279061%


HBox(children=(IntProgress(value=0, max=133), HTML(value='')))

Training Accuracy: 96.5413544411051%
Validation Loss: 0.22072744742035866
Validation Accuracy: 94.28571632930209%
Validation performance improved... Saving model


HBox(children=(IntProgress(value=0, max=133), HTML(value='')))

Training Accuracy: 97.21804588360916%
Validation Loss: 0.2196551880666188
Validation Accuracy: 94.12698575428553%


In [0]:
model = RobertaForSequenceClassification.from_pretrained('distilroberta-base', num_labels=3)
device = torch.device("cpu")
model.load_state_dict(torch.load("./roberta.pt", map_location=device))
model.to(device)
model.eval()
tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
sentence = "But apps and websites you use will not be able to receive any other information about your Facebook friends from you, or information about any of your Instagram followers (although your friends and followers may, of course, choose to share this information themselves)."
tokens_a = tokenizer.tokenize(sentence)
if len(tokens_a)>100:
  tokens_a = tokens_a[:100]
one_token = tokenizer.convert_tokens_to_ids(["[CLS]"]+tokens_a+["[SEP]"])
inp = torch.tensor(one_token).long().reshape(1,-1)
pred = model(inp.to(device), attention_mask=(np.logical_not(inp==0)).to(device), labels=None)
scores = F.softmax(pred[0], dim=1)[0].cpu().detach().numpy()
sentiment = scores[2] + 0.5*scores[1]

0.8345605917274952