In [None]:
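# One-time dataset download (needs a Kaggle API token in kaggle.json).
# Left commented out so a re-run does not download again; uncomment to fetch the data.
# !mkdir ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !kaggle competitions download -c amazon-pet-product-reviews-classification
# !unzip \*.zip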

Downloading unlabeled.csv.zip to /content
100% 17.0M/17.0M [00:01<00:00, 12.1MB/s]
100% 17.0M/17.0M [00:01<00:00, 12.6MB/s]
Downloading sample_submission.csv to /content
  0% 0.00/176k [00:00<?, ?B/s]
100% 176k/176k [00:00<00:00, 126MB/s]
Downloading test.csv.zip to /content
  0% 0.00/2.98M [00:00<?, ?B/s]
100% 2.98M/2.98M [00:00<00:00, 48.6MB/s]
Downloading valid.csv.zip to /content
  0% 0.00/3.00M [00:00<?, ?B/s]
100% 3.00M/3.00M [00:00<00:00, 48.5MB/s]
Downloading train.csv.zip to /content
 56% 5.00M/8.88M [00:00<00:00, 16.5MB/s]
100% 8.88M/8.88M [00:00<00:00, 25.4MB/s]
Archive:  valid.csv.zip
  inflating: valid.csv               

Archive:  test.csv.zip
  inflating: test.csv                

Archive:  train.csv.zip
  inflating: train.csv               

Archive:  unlabeled.csv.zip
  inflating: unlabeled.csv           

4 archives were successfully processed.


In [None]:
# Download the PyTorch/XLA environment setup script and run it; per the recorded output it
# replaces the preinstalled torch/torchvision with TPU-compatible torch / torch_xla wheels.
# NOTE(review): the script is fetched from the `master` branch, so its behaviour can drift over time.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  5116  100  5116    0     0  12210      0 --:--:-- --:--:-- --:--:-- 12180
Updating... This may take around 2 minutes.
Found existing installation: torch 1.9.0+cu102
Uninstalling torch-1.9.0+cu102:
  Successfully uninstalled torch-1.9.0+cu102
Found existing installation: torchvision 0.10.0+cu102
Uninstalling torchvision-0.10.0+cu102:
  Successfully uninstalled torchvision-0.10.0+cu102
Copying gs://tpu-pytorch/wheels/torch-nightly+20200515-cp37-cp37m-linux_x86_64.whl...
\ [1 files][ 91.0 MiB/ 91.0 MiB]                                                
Operation completed over 1 objects/91.0 MiB.                                     
Copying gs://tpu-pytorch/wheels/torch_xla-nightly+20200515-cp37-cp37m-linux_x86_64.whl...
| [1 files][119.5 MiB/119.5 MiB]                                                
Operation completed over 1 obj

In [None]:
!pip install torchmetrics

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 4.1 MB/s 
[?25hCollecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 45.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 41.9 MB/s 
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.0.19 sacremoses-0.0.46 tokenizers-0.10.3 transformers-4.11.3


In [None]:
import torch_xla
import torch_xla.distributed.parallel_loader as pl
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp

In [None]:
import os
# torch_xla runtime settings passed through environment variables.
# NOTE(review): XLA_USE_BF16=1 presumably makes XLA run float tensors in bfloat16, and
# XLA_TENSOR_ALLOCATOR_MAXSIZE presumably caps the tensor allocator size — confirm against
# the torch_xla documentation.
os.environ['XLA_USE_BF16']                 = '1'
os.environ['XLA_TENSOR_ALLOCATOR_MAXSIZE'] = '1000000000'

In [None]:
from transformers import AutoTokenizer,AutoConfig,AutoModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
import pandas as pd
import transformers
import numpy as np
from torchmetrics.functional import f1

In [None]:
import gc

In [None]:
# Hugging Face checkpoint name; used below to load the tokenizer, the model config and the encoder weights.
chk = 'distilbert-base-uncased'

In [None]:
class BertDataset(Dataset):
  """Map-style dataset that tokenizes one text per item and pads it to a fixed length.

  Args:
    x: indexable collection of raw texts (list, array or pandas Series).
    labels: indexable collection of numeric labels aligned with ``x`` by position;
      never read when ``test`` is True.
    tokenizer: object exposing ``encode_plus(...)`` and ``pad_token_id``.
    max_len: length every item is truncated / padded to. The previous code ignored
      this argument and always used 512; the default is now 512 so callers that do
      not pass it get the same sequence length as before, while an explicit value
      is finally honoured.
    test: when True, ``__getitem__`` returns no label.
  """

  def __init__(self,x,labels,tokenizer,max_len=512,test=False):

    self.x = x
    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.test = test

  @staticmethod
  def _item_at(values,index):
    """Return the ``index``-th element by position.

    Samplers hand out positions ``0..len-1``; a pandas Series indexed by an ``id``
    column would otherwise be looked up by label, so ``.iloc`` is used when present.
    """
    if hasattr(values,'iloc'):
      return values.iloc[index]
    return values[index]

  def __getitem__(self,index):
    """Return ``(input_ids, attention_mask)`` plus a 1-element float label tensor unless ``test``.

    ``input_ids`` and ``attention_mask`` are 1-D tensors of length ``max_len`` that keep
    the integer dtype produced by the tokenizer.
    """
    tokens = self.tokenizer.encode_plus(
        self._item_at(self.x,index),
        add_special_tokens = True,
        max_length = self.max_len,
        return_tensors='pt',
        truncation=True
    )

    # return_tensors='pt' adds a leading batch axis of size 1; drop it.
    input_ids = tokens['input_ids'].squeeze(0)
    attention_mask = tokens['attention_mask'].squeeze(0)

    del tokens

    pad = self.max_len - input_ids.size(0)

    if pad > 0:
      # Build the padding with the same integer dtype as the tokenizer output so padded
      # and unpadded items have identical dtypes (torch.Tensor(...) would create floats).
      id_padding = torch.full((pad,),self.tokenizer.pad_token_id,dtype=input_ids.dtype)
      mask_padding = torch.zeros(pad,dtype=attention_mask.dtype)
      input_ids = torch.cat([input_ids,id_padding])
      attention_mask = torch.cat([attention_mask,mask_padding])

    if self.test:
      return input_ids,attention_mask

    return input_ids,attention_mask,torch.Tensor([self._item_at(self.labels,index)])

  def __len__(self):
    """Number of texts in the dataset."""
    return len(self.x)



In [None]:
# Load the three competition splits; the `id` column becomes the DataFrame index.
train = pd.read_csv('/content/train.csv',index_col='id')
validate = pd.read_csv('/content/valid.csv',index_col='id')
test = pd.read_csv('/content/test.csv',index_col='id')

In [None]:
# Separate the raw review text from the labels; only the text column is taken from the test split.
xtr,ytr = train['text'],train['label']
xva,yva = validate['text'],validate['label']
xte = test['text']

In [None]:
# Tokenizer matching the `chk` checkpoint (downloaded on first use, see the progress output below).
tokenizer = AutoTokenizer.from_pretrained(chk)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Map label values to integer class ids: fit on the training labels only and reuse the
# same mapping for validation.
# The labels are read from the source frames rather than from `ytr` / `yva`, so re-running
# this cell does not refit the encoder on already-encoded integers.
le = LabelEncoder()
ytr = le.fit_transform(train['label'])
yva = le.transform(validate['label'])

In [None]:
class BertForClassification(nn.Module):
  """Pretrained encoder loaded from ``chk`` with two stacked linear layers on top.

  Args:
    num_classes: size of the output layer; also forwarded to the config as ``num_labels``.
  """

  def __init__(self,num_classes):
    super().__init__()

    model_config = AutoConfig.from_pretrained(chk,num_labels=num_classes)
    width = model_config.dim

    # Registration order (transformer, preclf, classifier) is kept as in the original.
    self.transformer = AutoModel.from_pretrained(chk,config=model_config)
    self.preclf = nn.Linear(width,width)
    self.classifier = nn.Linear(width,num_classes)

  def forward(self,input_ids,attention_mask,head_mask=None):
    """Return the classifier output computed from position 0 of the encoder's first output.

    NOTE(review): the first encoder output is assumed to be the last hidden states with
    shape (batch, seq, dim) — confirm against the transformers documentation.
    """
    encoder_out = self.transformer(
        input_ids=input_ids,
        attention_mask=attention_mask,
        head_mask=head_mask,
    )[0]
    first_position = encoder_out[:,0,:]
    return self.classifier(self.preclf(first_position))

In [None]:
def collate_fn(batch):
  """Stack ``(input_ids, attention_mask, label)`` samples into one model-ready batch.

  Args:
    batch: sequence of 3-tuples of tensors; within the batch each position must have
      tensors of equal shape so they can be stacked.

  Returns:
    ``({'input_ids': int64 tensor, 'attention_mask': int32 tensor}, labels)`` where
    ``labels`` is an int64 tensor with one entry per sample.
  """
  input_ids,attention_mask,labels = zip(*batch)

  input_ids = torch.stack(input_ids).long()
  attention_mask = torch.stack(attention_mask).int()

  labels = torch.stack(labels).long()
  # Drop only the trailing per-sample axis. A bare squeeze() would also remove the batch
  # axis when the batch holds a single sample, leaving a 0-d tensor.
  if labels.dim() > 1:
    labels = labels.squeeze(-1)

  return {'input_ids':input_ids,'attention_mask':attention_mask},labels

In [None]:
class config:
  """Training hyper-parameters read by ``TPUFitter`` as class attributes."""

  # Base learning rate; TPUFitter multiplies it by the XLA world size.
  lr = 5e-5
  # Optimizer class and extra keyword arguments passed to it.
  opt = transformers.AdamW
  opt_params = {}

  # Loss class and the keyword arguments used to instantiate it.
  criterion = nn.CrossEntropyLoss
  criterion_params = {}

  # Optional LR scheduler class and its keyword arguments. `scheduler_params` was read by
  # TPUFitter but never defined here, which would fail as soon as a scheduler is set.
  scheduler = None
  scheduler_params = {}
  # Non-None => step the scheduler once per epoch with the validation loss.
  validation_scheduler = None
  # Misspelled original name, kept as an alias for any code that still reads it.
  validation_schduler = validation_scheduler
  # Non-None => step the scheduler after every optimizer step.
  step_scheduler = None

  epochs = 1

  # Metric callables, invoked by TPUFitter as m(predicted_class_ids, targets).
  metrics = [f1]
  # Display names for the values TPUFitter reports, starting with the loss.
  metrics_names = ['loss','f1_score']



In [None]:
import time

In [None]:
class AverageMeter(object):
    """Tracks the latest value together with a running weighted average.

    Attributes (all reset to 0 by ``reset``):
        val: most recent value passed to ``update``.
        sum: running total of ``val * n``.
        count: running total of ``n``.
        avg: ``sum / count`` after the latest update.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear every statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average."""
        weighted = val * n
        self.val = val
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count


In [None]:
class TPUFitter():
  """Train / validation loop for a model living on one XLA device.

  Args:
    model: ``nn.Module`` called as ``model(**inputs)``.
    config: object with the attributes ``lr``, ``opt``, ``opt_params``, ``criterion``,
      ``criterion_params``, ``scheduler``, ``epochs``, ``metrics`` and ``metrics_names``;
      ``scheduler_params``, ``validation_scheduler`` and ``step_scheduler`` are optional.
    device: XLA device the batches are moved to.

  Every loader must yield ``(inputs_dict, targets)`` batches. The returned / logged
  metric lists are ``[loss, *config.metrics]`` averaged over the samples seen by this
  process. NOTE(review): values are not reduced across replicas, so the log shows the
  master process' shard only.
  """

  def __init__(self,model,config,device):

    self.model = model
    self.device = device
    self.config = config

    self.epoch = 0

    self.criterion = self.config.criterion(**self.config.criterion_params)

    param_optimizer = list(self.model.named_parameters())

    # Parameters whose name contains one of these substrings get no weight decay.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.001},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

    # The base learning rate is scaled by the number of XLA replicas.
    self.opt = self.config.opt( optimizer_grouped_parameters , lr=self.config.lr*xm.xrt_world_size(), **self.config.opt_params )

    if self.config.scheduler is not None:
      # `scheduler_params` is optional on the config; fall back to no extra arguments.
      scheduler_params = getattr(self.config,'scheduler_params',None) or {}
      self.scheduler = self.config.scheduler(self.opt,**scheduler_params)
    else : self.scheduler = None

    self.best_loss = 10**5

    xm.master_print(f'Fitter prepared. Device is {self.device}')

  def log(self,x):
    """Print ``x`` from the master process only."""
    xm.master_print(x)

  def _config_flag(self,*names):
    """Return the first config attribute among ``names`` that exists, else None."""
    for name in names:
      if hasattr(self.config,name):
        return getattr(self.config,name)
    return None

  def _new_meters(self):
    """One meter for the loss followed by one per configured metric."""
    return [ AverageMeter() for _ in range(1 + len(self.config.metrics)) ]

  def _update_meters(self,meters,loss,out,y):
    """Fold one batch into ``meters``, weighting by the batch size."""
    bs = y.shape[0]  # already a plain int, no .item() needed

    meters[0].update(loss.detach().item(),bs)

    if len(meters) > 1:
      preds = torch.max(out,dim=-1)[1]  # predicted class ids
      for m,meter in zip(self.config.metrics,meters[1:]):
        meter.update(m(preds,y).item(),bs)

  def _to_device(self,x,y):
    """Move every tensor of the inputs dict and the targets to ``self.device``."""
    return { k : v.to(self.device) for k,v in x.items() }, y.to(self.device)

  def _run_phase(self,tag,epoch,loader,run):
    """Wrap ``loader`` for the device, run one pass with ``run`` and log its metrics."""
    st = time.time()
    para_loader = pl.ParallelLoader(loader,[self.device])
    metrics = run(para_loader.per_device_loader(self.device))
    # names first, values second, so the line reads " loss : 0.42 ".
    s = tag+f' Epoch : {epoch} ' +'   '.join(f" {k} : {v} " for k,v in zip(self.config.metrics_names,metrics) ) + f' Time : {int(time.time()-st)}'
    self.log(s)
    del para_loader
    gc.collect()
    return metrics

  def fit(self,train_loader,validate_loader):
    """Run ``config.epochs`` rounds of training followed by validation."""
    # Accept the historical misspelling as well as the correct attribute name.
    validation_flag = self._config_flag('validation_scheduler','validation_schduler')

    for epoch in range(self.config.epochs):

      self.epoch = epoch

      # Distributed samplers reshuffle per epoch only when told which epoch it is.
      sampler = getattr(train_loader,'sampler',None)
      if hasattr(sampler,'set_epoch'):
        sampler.set_epoch(epoch)

      self._run_phase('[TRAIN] ',epoch,train_loader,self.train_one_epoch)
      validate_metrics = self._run_phase('[VALID] ',epoch,validate_loader,self.validate)

      if ( self.scheduler is not None )  and ( validation_flag is not None ) :
        self.scheduler.step(metrics=validate_metrics[0])

  def validate(self,validate_loader):
    """Evaluate without gradients; returns ``[avg_loss, *avg_metrics]``."""
    mets = self._new_meters()

    self.model.eval()  # disable dropout etc. while evaluating

    with torch.no_grad():

      for x,y in validate_loader:

        x,y = self._to_device(x,y)

        out = self.model(**x)
        loss = self.criterion(out,y)

        self._update_meters(mets,loss,out,y)

        del loss,out,x,y
        gc.collect()

    return [ m.avg for m in mets ]

  def train_one_epoch(self,train_loader):
    """Run one optimisation pass over ``train_loader``; returns ``[avg_loss, *avg_metrics]``."""
    mets = self._new_meters()
    step_flag = self._config_flag('step_scheduler')

    self.model.train()  # validate() leaves the model in eval mode

    for x,y in train_loader:

      x,y = self._to_device(x,y)

      out = self.model(**x)
      loss = self.criterion(out,y)

      self.opt.zero_grad()
      loss.backward()

      self._update_meters(mets,loss,out,y)

      xm.optimizer_step(self.opt)

      del loss,out,x,y

      if ( self.scheduler is not None ) and ( step_flag is not None ):
        self.scheduler.step()

    return [ m.avg for m in mets ]
    

In [None]:
def _mp_fn(ranks,flags):
  """Per-process entry point for ``xmp.spawn``: build the loaders and train on this core.

  Args:
    ranks: index of the spawned process (first positional argument supplied by the spawner).
    flags: dict of run options; ``flags['batch_size']`` sets the per-core batch size
      (defaults to 16 when absent).

  Reads the notebook globals ``model``, ``config``, ``tokenizer``, ``xtr``/``ytr``,
  ``xva``/``yva`` and ``collate_fn``.
  """
  device = xm.xla_device()
  model.to(device)

  # The original referenced an undefined global `bs`; take the value from `flags` instead.
  batch_size = (flags or {}).get('batch_size',16)

  train_data = BertDataset(xtr,ytr,tokenizer)
  validate_data = BertDataset(xva,yva,tokenizer)

  world_size = xm.xrt_world_size()
  ordinal = xm.get_ordinal()

  # Shuffling is configured on the sampler: DataLoader rejects shuffle=True together with
  # an explicit sampler.
  train_sampler = torch.utils.data.DistributedSampler(train_data,num_replicas=world_size,rank=ordinal,shuffle=True)
  validate_sampler = torch.utils.data.DistributedSampler(validate_data,num_replicas=world_size,rank=ordinal,shuffle=False)

  # collate_fn produces the (inputs_dict, labels) batches that TPUFitter unpacks.
  train_loader = DataLoader(train_data,batch_size=batch_size,sampler=train_sampler,pin_memory=True,drop_last=True,collate_fn=collate_fn)
  validate_loader = DataLoader(validate_data,batch_size=batch_size,sampler=validate_sampler,pin_memory=True,drop_last=False,collate_fn=collate_fn)

  fitter = TPUFitter(model,config,device)

  # `ranks` is this function's parameter; the original tested an undefined name `rank`.
  if ranks == 0 :
    time.sleep(1)

  fitter.fit(train_loader,validate_loader)

In [None]:
import warnings
# Hide DeprecationWarning messages so they do not flood the training output.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# Build the model once in the parent process with one output per encoded class;
# `_mp_fn` reads it as a global.
model = BertForClassification(len(le.classes_))
# Options passed to every worker as the `flags` argument of `_mp_fn`.
FLAGS={}
# Launch 8 worker processes using the 'fork' start method, each running `_mp_fn(index, FLAGS)`.
# NOTE(review): 8 presumably matches the number of TPU cores on the Colab runtime — confirm.
xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=8, start_method='fork')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Exception: ignored