In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
VERSION = "1.5"  #@param ["1.5" , "20200325", "nightly"]
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --version $VERSION

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  5115  100  5115    0     0  26640      0 --:--:-- --:--:-- --:--:-- 26640
Updating... This may take around 2 minutes.
Updating TPU runtime to pytorch-1.5 ...
Uninstalling torch-1.5.0a0+ab660ae:
  Successfully uninstalled torch-1.5.0a0+ab660ae
Uninstalling torchvision-0.6.0a0+3c254fb:
  Successfully uninstalled torchvision-0.6.0a0+3c254fb
Copying gs://tpu-pytorch/wheels/torch-1.5-cp36-cp36m-linux_x86_64.whl...
- [1 files][ 79.0 MiB/ 79.0 MiB]                                                
Operation completed over 1 objects/79.0 MiB.                                     
Copying gs://tpu-pytorch/wheels/torch_xla-1.5-cp36-cp36m-linux_x86_64.whl...
- [1 files][106.6 MiB/106.6 MiB]                                                
Operation completed ove

In [3]:
!pip install transformers



Imports

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as datautils

import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.distributed.parallel_loader as pl

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, AdamW, get_cosine_schedule_with_warmup

import numpy as np
import pandas as pd
from tqdm import tqdm 
from sklearn.metrics import roc_auc_score
import time, os

Data Preprocessing



In [5]:
pretrained = 'distilbert-base-uncased'

In [6]:
def regular_encode(texts, tokenizer, maxlen=270):
    enc_di = tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True, 
        return_token_type_ids=False,
        pad_to_max_length=True,
        truncation=True,
        max_length=maxlen
    )
    
    return np.array(enc_di['input_ids'])

tokenizer = AutoTokenizer.from_pretrained(pretrained)

In [7]:
# Read dataset
df = pd.read_csv('/content/drive/My Drive/Sentiment Analysis/Clean/train2.csv')
df

Unnamed: 0,review,label
0,thanks smiling face with smiling eyes,2
1,i ordered anf black army green color but this ...,0
2,good quality size fits,3
3,excellent product quality,4
4,1 star break because i broke mine shattered my...,2
...,...,...
146737,but beautiful na lng sa stigma i do not know f...,2
146738,awesome speed of the ship awesome awesome qual...,3
146739,product quality is very good recomended produc...,4
146740,fast seller the child was da play the price wa...,3


In [10]:
 # Tokenize the dataset
s = time.time()
text = regular_encode(list(df['review'].astype(str)), tokenizer)
labels = df['label'].values # Labels encoded (see data cleaning notebook)
print("Elapsed: {:.2f}s".format(time.time() - s))

Elapsed: 26.55s


In [11]:
text

array([[  101, 15198, 21905, ...,     0,     0,     0],
       [  101, 50337, 21535, ...,     0,     0,     0],
       [  101, 56237, 41939, ...,     0,     0,     0],
       ...,
       [  101, 21535, 21905, ...,     0,     0,     0],
       [  101, 15040, 22154, ...,     0,     0,     0],
       [  101, 78978, 10230, ...,     0,     0,     0]])

In [12]:
# Split into training and testing
tr_sz = int(len(text) * 0.8) #80% | 20%
X_train, y_train = torch.tensor(text[:tr_sz]), torch.tensor(labels[:tr_sz], dtype=torch.long)
X_valid, y_valid = torch.tensor(text[tr_sz:]), torch.tensor(labels[tr_sz:], dtype=torch.long)

# Produce datasets
train_set = datautils.TensorDataset(X_train, y_train)
valid_set = datautils.TensorDataset(X_valid, y_valid)

Metrics

In [13]:
from sklearn.metrics import accuracy_score

def acc_score(preds, actuals):
    return (accuracy_score(actuals, preds))

In [14]:
from sklearn.metrics import classification_report

def compute_metrics(preds, actuals):
    return (classification_report(actuals, preds, zero_division=0))

Setting the Hyperparameters

In [18]:
# Set flags
flags = {
    'batch_size': 16,
    'num_workers': 8, #number of cores 
    'num_epochs': 1,
    'seed': 42,
    'num_labels': 5, #Important
    'pretrained': pretrained,
    'savedir': '/content/drive/My Drive/Sentiment Analysis/TPU_Checkpoints',
    'modelpath': 'model.bin',
    'learning_rate': 1e-5,
    'weight_decay' : 0.01,
    'print_every': 150
}

Configure the model

In [19]:
    # Configure the model
    config = AutoConfig.from_pretrained(flags['pretrained'], num_labels=flags['num_labels'])
    model = AutoModelForSequenceClassification.from_pretrained(flags['pretrained'], config=config)
    #model.load_state_dict(torch.load(flags['savedir'] + '/' + flags['modelpath']))

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier.weight', 'pre_cla

Finetuning 

*   Define the model outside to reduce on memory usage




In [20]:
def map_fn(index, flags):
    # Set the seed and obtain an XLA device
    torch.manual_seed(flags['seed'])
    device = xm.xla_device()
    print("Process", index, "obtained, using device:", xm.xla_real_devices([str(device)])[0]) 

    # Produce distributed samplers
    train_sampler = datautils.distributed.DistributedSampler(
        train_set, 
        num_replicas=xm.xrt_world_size(), 
        rank=xm.get_ordinal(), 
        shuffle=True
    )
    valid_sampler = datautils.distributed.DistributedSampler(
        valid_set, 
        num_replicas=xm.xrt_world_size(), 
        rank=xm.get_ordinal(), 
        shuffle=False
    )

    # Create dataloaders
    train_loader = datautils.DataLoader(
        train_set,
        batch_size=flags['batch_size'], 
        sampler=train_sampler, 
        num_workers=flags['num_workers'],
        drop_last=True
    )
    valid_loader = datautils.DataLoader(
        valid_set,
        batch_size=flags['batch_size'], 
        sampler=valid_sampler, 
        num_workers=flags['num_workers'],
        drop_last=True,
        shuffle=False
    )

    # This ensures that the pretrained weights will only be
    # downloaded once (c/o the master process). It also makes
    # sure that the other processes don't attempt to load the
    # weights when downloading isn't finished yet.
    if not xm.is_master_ordinal():
        xm.rendezvous('download_only_once')

    #send the model to the TPU
    model.to(device)
    
    if xm.is_master_ordinal():
        xm.rendezvous('download_only_once')

    # Initialize loss and optimizer
    criterion = nn.CrossEntropyLoss() #uses both LogSoftmax + NLLLoss
    #optimizer = optim.SGD(model.parameters(), lr=flags['learning_rate'], weight_decay=flags['weight_decay'], momentum=0.9, nesterov=True)
    optimizer = AdamW(model.parameters(), lr=flags['learning_rate'], weight_decay=flags['weight_decay'], correct_bias=True)

    # Create Scheduler
    total_steps = len(train_loader) * flags['num_epochs']
    scheduler = get_cosine_schedule_with_warmup(
      optimizer,
      num_warmup_steps=100,
      num_training_steps=total_steps
      )

    xm.master_print("\nNumber of training batches: {}".format(len(train_loader)))
    xm.master_print("Number of evaluation batches: {}\n".format(len(valid_loader)))
    
    for e in range(1, flags['num_epochs'] + 1):
        
        # Train Model
        model.train()
        train_start = time.time()

        xm.master_print("=" * 27 + "Epoch {} of {}".format(e, flags['num_epochs']) + "=" * 27)
        para_train_loader = pl.ParallelLoader(train_loader, [device]).per_device_loader(device)
        for i, batch in enumerate(para_train_loader):
            x, y = batch
            out = model(x)[0]
            loss = criterion(out, y)

            if i % flags['print_every'] == 0:
                xm.master_print('[TRAIN] Iteration {:4} | Loss {:.4f} | Time Elapsed {:.2f} seconds'.format(i, loss.item(),time.time() - train_start))
            
            # Update model
            optimizer.zero_grad() #clear gradients
            loss.backward() #calculate gradient
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) #prevent gradient explosion
            xm.optimizer_step(optimizer) #update weights
            scheduler.step()
        
        xm.master_print('\nFinished training no.{} epoch in {:.2f} seconds.\n'.format(e, time.time() - train_start))

        # Evaluate Model
        model.eval()
        valid_start = time.time()
        preds, actuals = [], []
        
        with torch.no_grad(): #deactivate autograd
            xm.master_print('=' * 28 + 'Validation' + '=' * 28)
            para_valid_loader = pl.ParallelLoader(valid_loader, [device]).per_device_loader(device)
            for i, batch in enumerate(para_valid_loader):
                x, y = batch
                out = model(x)[0]
                loss = criterion(out, y)

                sm = torch.nn.LogSoftmax(dim=1) #apply logsoftmax on the logits
                out = sm(out)

                # Keep track of all outputs and gold labels
                actuals.extend(y.cpu().numpy().tolist())
                preds.extend(out.cpu().detach().numpy().tolist())

                if i % flags['print_every'] == 0:
                    xm.master_print('[VALID] Iteration {:4} | Loss {:.4f} | Time Elapsed {:.2f} seconds'.format(i, loss.item(),time.time() - train_start))

        preds, actuals = np.array(preds), np.array(actuals)
        preds = np.argmax(preds, axis=1) #Convert probabilities to Categories

        valid_acc = acc_score(preds, actuals) #Accuracy Score
        metrics = compute_metrics(preds, actuals) #Classification Report
        
        xm.master_print('\nFinished evaluation in {:.2f} seconds. Validation Accuracy: {:.4f}\n'.format(time.time() - valid_start, valid_acc))
        xm.master_print(metrics) #print only one core

        # Save the model
        xm.save(model.state_dict(), flags['savedir'] + '/' + flags['modelpath'])

Training

*   Sigkill error means run out of memory (reduce Batch_size)



In [None]:
# Start the training process
#os.mkdir(flags['savedir']) if flags['savedir'] not in os.listdir('.') else print ('Already exist')
xmp.spawn(map_fn, args=(flags,), nprocs=8, start_method='fork')

Prediction

Loading in Test Set



In [None]:
class TestDataset(datautils.Dataset):
    def __init__(self, text, ids):
        self.text = text
        self.ids = ids

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        ix_text = self.text[idx]
        ix_id = self.ids[idx]
        
        return ix_text, ix_id

In [None]:
df_test = pd.read_csv('/content/drive/My Drive/Sentiment Analysis/Clean/test2.csv')
text = regular_encode(list(df_test['review'].astype(str)), tokenizer)
ids = list(df_test['review_id'])

# Produce a test set
test_set = TestDataset(text, ids)

Review_id is needed to index the results since we are going to split them into 8 cores for training

Constructing the Prediction Model

In [None]:
# Configure the model and load the checkpoint
config = AutoConfig.from_pretrained(flags['pretrained'], num_labels=flags['num_labels'])
model = AutoModelForSequenceClassification.from_pretrained(flags['pretrained'], config=config)
model.load_state_dict(torch.load(flags['savedir'] + '/' + flags['modelpath']))

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

<All keys matched successfully>

In [None]:
def map_fn(index, flags):
    # Set the seed and obtain an XLA device
    torch.manual_seed(flags['seed'])
    device = xm.xla_device()
    print("Process", index, "obtained, using device:", xm.xla_real_devices([str(device)])[0])

    # Produce a distributed sampler and a data loader
    test_sampler = datautils.distributed.DistributedSampler(
        test_set,
        num_replicas=xm.xrt_world_size(),
        rank=xm.get_ordinal(),
        shuffle=False
    )
    test_loader = datautils.DataLoader(
        test_set,
        batch_size=flags['batch_size'],
        sampler=test_sampler,
        pin_memory=False,
        drop_last=False,
        num_workers=flags['num_workers']
    )

    model.to(device)

    xm.master_print("\nNumber of testing batches: {}\n".format(len(test_loader)))

    # Run inferencing
    model.eval()
    preds, ids = [], []
    test_start = time.time()

    xm.master_print('=' * 25 + 'Inference' + '=' * 25)
    
    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            x, idx = batch
            x = x.to(device)
            out = model(x)[0]
            sm = torch.nn.LogSoftmax(dim=1)
            out = sm(out)
            preds.extend(out.cpu().detach().numpy().tolist())
            ids.append(idx)
            if i % flags['print_every'] == 0: 
                xm.master_print('Inferencing on step {:4} | Time elapsed: {:.2f} seconds'.format(i, time.time() - test_start))
        preds = np.array(preds)    

    # Save the predictions and associated IDs into a temporary file
    with open('{}/preds_{}.pt'.format(flags['savedir'], xm.xla_real_devices([str(device)])[0]), 'wb') as f:
        torch.save([ids, preds], f)

    xm.master_print('\nFinished inferencing in {:.2f} seconds.\n'.format(time.time() - test_start))

Start Prediction

In [None]:
# Start the processes
xmp.spawn(map_fn, args=(flags,), nprocs=8, start_method='fork') 

Process 0 obtained, using device: TPU:0

Number of testing batches: 945

Process 7 obtained, using device: TPU:7
Process 1 obtained, using device: TPU:1
Process 3 obtained, using device: TPU:3
Inferencing on step    0 | Time elapsed: 5.10 seconds
Process 4 obtained, using device: TPU:4
Process 2 obtained, using device: TPU:2
Process 5 obtained, using device: TPU:5
Process 6 obtained, using device: TPU:6
Inferencing on step  150 | Time elapsed: 49.21 seconds
Inferencing on step  300 | Time elapsed: 69.42 seconds
Inferencing on step  450 | Time elapsed: 90.01 seconds
Inferencing on step  600 | Time elapsed: 109.31 seconds
Inferencing on step  750 | Time elapsed: 128.91 seconds
Inferencing on step  900 | Time elapsed: 148.96 seconds

Finished inferencing in 159.70 seconds.



Remove Duplicates since processing done in all 8 cores (same batch_size in each core)

In [None]:
# Load all prediction files
all_ids, all_preds = [], []

for i in range(8):
    with open('{}/preds_TPU:{}.pt'.format(flags['savedir'], i), 'rb') as f:
        idx, preds = torch.load(f)
        all_preds.extend(preds)
        for id in idx:
            all_ids.extend(id.numpy()) #convert tensor to number and to combine in a list
preds = np.array(all_preds)
preds = np.argmax(preds, axis=1)
preds

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
keys = {0:1, 1:2, 2:3, 3:4, 4:5}
rating = []

for i in preds:
  rating.append(keys[i])

In [None]:
# Combine and remove duplicates
df_result = pd.DataFrame(data={'review_id': all_ids})
df_result['rating'] = rating
df_result.drop_duplicates(keep='first', subset='review_id', inplace=True)
#sort in ascending order
df_result.sort_values(by=['review_id'], inplace=True)
df_result.set_index(['review_id'], inplace=True)

In [None]:
# Save CSV
assert df_result.shape[0] == df_test.shape[0] #check if prediction is same length as test
df_result.to_csv('/content/drive/My Drive/Sentiment Analysis/result.csv')
df_result.head(50)

Reference: https://colab.research.google.com/drive/19B73pxweO35DLT1Gju3tyuKmvATZFxIP?usp=sharing&fbclid=IwAR1QZPg8UWKv-4MBK9xSExUO93UOq2abolnFKOoGjOEhNgzp14fMAsTlW10

AutoModelForSequenceClassification:
* DistilBertConfig 
* AlbertConfig 
* CamembertConfig 
* XLMRobertaConfig 
* BartConfig 
* LongformerConfig 
* RobertaConfig 
* BertConfig 
* XLNetConfig 
* MobileBertConfig 
* FlaubertConfig 
* XLMConfig 
* ElectraConfig.