In [None]:
!pip install transformers 

In [None]:
!pip install ipywidgets

In [None]:
import json
import logging
import os
import sys
from types import SimpleNamespace

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW, BertForSequenceClassification, BertTokenizer

  

# Notebook-wide logger: DEBUG-and-above records are written to stdout so they
# show up in the cell output.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise stack another handler and print every record twice.
if not logger.handlers:
    logger.addHandler(logging.StreamHandler(sys.stdout))

 

In [None]:
# Load the raw CSV (it has no header row): column 0 is the sentiment label,
# column 1 is the article text.
filepath = './data/all-data.csv'
data = pd.read_csv(
    filepath,
    encoding="ISO-8859-1",
    header=None,
    usecols=[0, 1],
    names=["sentiment", "article"],
)

# Replace the sentiment strings with integer class ids; the fitted encoder
# (ord_enc.categories_) keeps the id -> label mapping.
ord_enc = OrdinalEncoder()
data["sentiment"] = ord_enc.fit_transform(data[["sentiment"]])
data = data.astype({'sentiment': 'int'})

# Fixed seed so the train/test split is identical on every run.
SPLIT_SEED = 1

# Named *_df so that re-running this cell cannot overwrite the train() and
# test() functions defined further down in the notebook.
train_df, test_df = train_test_split(data, random_state=SPLIT_SEED)
train_df.to_csv("./data/train.csv", index=False)
test_df.to_csv("./data/test.csv", index=False)

# Length every encoded sequence is padded to. It is derived from the longest
# article measured in characters, converted to a plain int, and capped at the
# 512 positions that bert-base-uncased can embed.
BERT_MAX_POSITIONS = 512
MAX_LEN = min(int(data.article.str.len().max()), BERT_MAX_POSITIONS)

In [None]:
def get_data_loader(batch_size, training_dir, filename):
    """Build a DataLoader of BERT-encoded articles from a CSV file.

    The CSV is expected to provide an ``article`` column (text) and a
    ``sentiment`` column (label). Every article is encoded with the
    ``bert-base-uncased`` tokenizer, truncated to the padding length if it is
    longer, and right-padded with the tokenizer's pad id otherwise.

    Args:
        batch_size: Number of examples per batch.
        training_dir: Directory that contains the CSV file.
        filename: Name of the CSV file inside ``training_dir``.

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` whose items are
        ``(input_ids, attention_mask, label)``; the mask is 1 for real tokens
        and 0 for padding.
    """
    logger.info("Get data loader")
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
    dataset = pd.read_csv(os.path.join(training_dir, filename))
    articles = dataset.article.values
    sentiments = dataset.sentiment.values

    # Padding length: the notebook-level MAX_LEN, but never more than the
    # tokenizer reports the model can handle. int() keeps list repetition
    # below from being hijacked by a numpy scalar.
    seq_len = min(int(MAX_LEN), tokenizer.model_max_length)

    # Truncate while encoding so that an over-long article cannot produce a
    # ragged batch (which torch.tensor would reject).
    input_ids = [
        tokenizer.encode(
            sent,
            add_special_tokens=True,
            max_length=seq_len,
            truncation=True,
        )
        for sent in articles
    ]

    # pad shorter sentences
    pad_id = tokenizer.pad_token_id
    input_ids = [ids + [pad_id] * (seq_len - len(ids)) for ids in input_ids]

    # mask; 0: added (padding), 1: otherwise
    attention_masks = [
        [int(token_id != pad_id) for token_id in ids] for ids in input_ids
    ]

    # convert to PyTorch data types.
    train_inputs = torch.tensor(input_ids)
    train_labels = torch.tensor(sentiments)
    train_masks = torch.tensor(attention_masks)
    tensor_data = TensorDataset(train_inputs, train_masks, train_labels)
    tensor_dataloader = DataLoader(tensor_data, batch_size=batch_size)

    return tensor_dataloader


In [None]:
def train(args):
    """Fine-tune ``bert-base-uncased`` for sequence classification and save it.

    After every epoch the average training loss is logged and the model is
    evaluated on the test file via ``test``. When all epochs are done the model
    is written to ``args.model_dir`` with ``save_pretrained``.

    Args:
        args: Namespace-like object providing ``num_gpus``, ``seed``,
            ``batch_size``, ``test_batch_size``, ``data_dir``, ``train_file``,
            ``test_file``, ``num_labels``, ``lr``, ``epochs``,
            ``log_interval`` and ``model_dir``.

    Returns:
        The fine-tuned model, left on the device it was trained on.
    """
    # Honour the GPU request only when CUDA is actually present, so a CPU-only
    # kernel falls back to the CPU instead of failing on the first .to("cuda").
    gpu_requested = args.num_gpus > 0
    use_cuda = gpu_requested and torch.cuda.is_available()
    if gpu_requested and not use_cuda:
        logger.warning(
            "num_gpus=%s but CUDA is not available; training on CPU.", args.num_gpus
        )
    device = torch.device("cuda" if use_cuda else "cpu")

    # set the seed for generating random numbers
    torch.manual_seed(args.seed)
    if use_cuda:
        torch.cuda.manual_seed(args.seed)

    train_loader = get_data_loader(args.batch_size, args.data_dir, args.train_file)
    test_loader = get_data_loader(args.test_batch_size, args.data_dir, args.test_file)

    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=args.num_labels,
        output_attentions=False,
        output_hidden_states=False,
    )
    model = model.to(device)  # load the model to the right device

    # configure optimizer
    optimizer = AdamW(
        model.parameters(),
        lr=args.lr,  # learning rate
    )

    for epoch in range(1, args.epochs + 1):
        total_loss = 0
        model.train()
        for step, batch in enumerate(train_loader):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            model.zero_grad()

            # Passing labels makes the model return the loss as its first output.
            outputs = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels,
            )
            loss = outputs[0]
            total_loss += loss.item()
            loss.backward()
            # Clip the gradient norm to 1.0 before the optimizer step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            if step % args.log_interval == 0:
                logger.info(
                    "Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}".format(
                        epoch,
                        step * len(batch[0]),
                        len(train_loader.sampler),
                        100.0 * step / len(train_loader),
                        loss.item(),
                    )
                )

        logger.info("Average training loss: %f\n", total_loss / len(train_loader))
        test(model, test_loader, device)

    logger.info("Saving tuned model.")
    # If the model has been wrapped (it exposes .module), save the inner model.
    model_2_save = model.module if hasattr(model, "module") else model
    model_2_save.save_pretrained(save_directory=args.model_dir)

    return model

 

In [None]:
def test(model, test_loader, device):
    """Evaluate ``model`` on ``test_loader`` and log the classification accuracy.

    Args:
        model: Classifier whose first output for a batch is the logits, one
            row per example.
        test_loader: Iterable of ``(input_ids, attention_mask, labels)``
            batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        None. The accuracy is reported through ``logger``; for an empty loader
        a warning is logged instead, because the accuracy is undefined.
    """
    model.eval()
    total_correct = 0
    total_count = 0

    with torch.no_grad():
        for batch in test_loader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            logits = outputs[0]

            # Predicted class = index of the largest logit in each row.
            predictions = logits.argmax(dim=1).flatten()
            labels_flat = b_labels.flatten()
            total_correct += (predictions == labels_flat).sum().item()
            total_count += labels_flat.numel()

    if total_count == 0:
        # Avoid a ZeroDivisionError when there is nothing to evaluate.
        logger.warning("Test set is empty; accuracy is undefined.")
        return

    logger.info("Test set: Accuracy: %f\n", total_correct / total_count)

In [None]:
# Hyperparameters and file locations consumed by train(), one setting per line.
args = SimpleNamespace(
    # model / optimisation
    num_labels=3,
    batch_size=16,
    test_batch_size=10,
    epochs=3,
    lr=2e-5,
    seed=1,
    log_interval=50,
    # locations
    model_dir="model/",
    data_dir="data/",
    train_file="train.csv",
    test_file="test.csv",
    # hardware
    num_gpus=1,
)

# Fine-tune and keep the resulting model for the inference cells below.
model = train(args)

In [None]:
def input_fn(request_body, request_content_type):
    """Turn a JSON request into padded BERT input ids and an attention mask.

    Args:
        request_body: JSON text that decodes to a single string or to a
            non-empty list of strings.
        request_content_type: Content type of the request; only
            ``"application/json"`` is accepted.

    Returns:
        A tuple ``(input_ids, attention_mask)`` of ``torch.long`` tensors with
        one row per input text. Texts longer than the padding length are
        truncated; shorter ones are right-padded with the tokenizer's pad id.
        The mask is 1 for real tokens and 0 for padding.

    Raises:
        ValueError: If the content type is not JSON, or the decoded payload is
            neither a string nor a non-empty list of strings.
    """
    if request_content_type != "application/json":
        raise ValueError("Unsupported content type: {}".format(request_content_type))

    data = json.loads(request_body)
    if isinstance(data, str):
        data = [data]
    elif not (
        isinstance(data, list)
        and len(data) > 0
        and all(isinstance(item, str) for item in data)
    ):
        # Adjacent literals instead of a backslash continuation, which used to
        # embed the source indentation in the message.
        raise ValueError(
            "Unsupported input type. Input type can be a string or a non-empty "
            "list of strings. I got {}".format(data)
        )

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

    # Padding length: the notebook-level MAX_LEN, but never more than the
    # tokenizer reports the model can handle.
    seq_len = min(int(MAX_LEN), tokenizer.model_max_length)

    # Truncate while encoding so an over-long text still fits its padded row.
    input_ids = [
        tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=seq_len,
            truncation=True,
        )
        for text in data
    ]

    # pad shorter sentences
    pad_id = tokenizer.pad_token_id
    padded = torch.full((len(input_ids), seq_len), pad_id, dtype=torch.long)
    for row, ids in enumerate(input_ids):
        padded[row, :len(ids)] = torch.tensor(ids, dtype=torch.long)

    # create mask
    mask = (padded != pad_id).long()

    return padded, mask

In [None]:
def predict_fn(input_data, model):
    """Run ``model`` on already-encoded inputs and return its first output.

    The model is moved to CUDA when available (CPU otherwise) and switched to
    evaluation mode; the forward pass runs with gradient tracking disabled.

    Args:
        input_data: Pair ``(input_ids, attention_mask)`` of tensors.
        model: Module callable as ``model(input_ids, attention_mask=...)``
            whose result can be indexed with ``[0]``.

    Returns:
        The first element of the model's output, on the device used for the
        forward pass.
    """
    target = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(target)
    model.eval()

    raw_ids, raw_mask = input_data
    ids_on_device = raw_ids.to(target)
    mask_on_device = raw_mask.to(target)

    with torch.no_grad():
        first_output = model(ids_on_device, attention_mask=mask_on_device)[0]
    return first_output

 

In [None]:
# End-to-end check of the serving helpers on a single headline:
# JSON request -> input_fn -> predict_fn -> arg-max class id.
article = "Operating profit outpaced the industry average"
request_body = json.dumps(article)

encoded_request = input_fn(request_body, 'application/json')
enc_data, mask = encoded_request

output = predict_fn(encoded_request, model)
preds = output.detach().cpu().numpy()

predicted_label = np.argmax(preds)
print("sentiment label : " + str(predicted_label))