In [None]:
# Reference : https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb#scrollTo=pzM1_ykHaFur

In [None]:
# !pip install datasets transformers

In [None]:
# Importing stock ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel, DistilBertConfig, AutoTokenizer

from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler



from joblib import dump, load
from tqdm import tqdm

from datasets import load_dataset, list_datasets

from IPython.display import Audio, display

from pathlib import Path
import glob


In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

### Helper Functions

In [None]:
def paired_bootstrap_test(test_set, model1,model2, B, score,*args,**kwargs):
    r"""
    Compute the observed score gap between two models and its bootstrap distribution.

    Generates \delta(x) on the full test set and \delta(x^{(i)}) for each of B
    bootstrap samples (rows drawn with replacement, same rows for both models).
    Reference: Jurafsky, Daniel, and James H. Martin. "Speech and Language Processing: An Introduction to Natural Language Processing, Computational Linguistics, and Speech Recognition."
    Params:
    ------
    test_set: np.ndarray, Array of test outputs; the first axis indexes examples.
    model1: np.ndarray, Array of Model A's output, aligned row-for-row with test_set.
    model2: np.ndarray, Array of Model B's output, aligned row-for-row with test_set.
    B : int, No of Bootstrap's to be generated
    score: Evaluation algorithm, called as score(y_true, y_pred, *args, **kwargs).
    *args, **kwargs: forwarded unchanged to every call of score.

    Returns:
    -------
    (deltas, delta): deltas is an np.ndarray with one score difference
    (model1 - model2) per bootstrap sample; delta is the same difference
    measured on the full test set.

    Raises:
    ------
    ValueError: if the three inputs do not have the same number of rows.
    """
    N = test_set.shape[0]
    if model1.shape[0] != N or model2.shape[0] != N:
        # Misaligned inputs would silently pair predictions with the wrong labels.
        raise ValueError(
            f"test_set, model1 and model2 must have the same number of rows; "
            f"got {N}, {model1.shape[0]} and {model2.shape[0]}"
        )

    delta = score(test_set, model1, *args, **kwargs) - score(test_set, model2, *args, **kwargs)

    deltas = [] # for storing \delta(x) of bootstraps
    for _ in tqdm(range(B)):
        ind = np.random.randint(low=0, high=N, size=N)
        # Index the first axis only, so 1-D label vectors work as well as 2-D matrices.
        sampleY = test_set[ind]
        sc1 = score(sampleY, model1[ind], *args, **kwargs)
        sc2 = score(sampleY, model2[ind], *args, **kwargs)
        deltas.append(sc1 - sc2)
    return (np.array(deltas), delta)

def hypothesis_test(dx_i,dx, significance=0.05):
    """
    Paired-bootstrap significance test.

    The p-value is the fraction of bootstrap differences `dx_i` that are at
    least twice the observed difference `dx`. The verdict at the requested
    significance level is printed and the p-value is returned.
    Reference: Berg-Kirkpatrick, et. al. An empirical investigation of statistical significance in nlp.
    """
    threshold = 2*dx
    exceeds = dx_i >= threshold
    p_value = np.mean(exceeds)
    verdict = (
        f"We reject the null hypothesis at a significance of {significance}"
        if p_value<significance
        else f"We fail to reject the Null Hypothesis at a significance of {significance}"
    )
    print(verdict)
    return p_value

In [None]:
def get_tran_test():
    """
    Load the Reuters-21578 ModApte split as two pandas frames.

    Documents whose `topics` list is empty are dropped, and every column
    other than 'text' and 'topics' is removed.

    Returns
    -------
    (df_train, df_test): the filtered train and test DataFrames.
    """
    # ModApte split because of the "A re-examination of text categorization methods" paper
    reuters = load_dataset("reuters21578","ModApte")

    frames = []
    for split in ('train', 'test'): # Same splits as the paper
        subset = reuters[split]
        subset.set_format(type = "pandas")
        df = subset[:]
        df = df[~df.topics.str.len().eq(0)] # Drop Empty Topics
        # Prune using this frame's own columns rather than borrowing the other split's.
        df = df.drop([col for col in df.columns if col not in ['text', 'topics']], axis=1)
        frames.append(df)

    df_train, df_test = frames
    return df_train, df_test


In [None]:
def get_data(*args, **kwargs):
    """
    Load Reuters-21578 (ModApte split) and prepare it for multi-label training.

    Documents with an empty topic list are removed from both splits; their
    original index labels are returned so callers can realign other data.
    Topics are binarized with a MultiLabelBinarizer fitted on the train split.
    `*args` and `**kwargs` are accepted but not used.

    Returns
    -------
    X_train, X_test: the 'text' column of each filtered split.
    y_train, y_test: binarized topic indicators for each filtered split.
    empty_train, empty_test: index labels of the documents that had no topics.
    """
    # ModApte split because of the "A re-examination of text categorization methods" paper
    dataset = load_dataset("reuters21578","ModApte")
    train_split = dataset['train'] # Same as paper
    test_split = dataset['test'] # Same as paper
    train_split.set_format(type = "pandas")
    test_split.set_format(type = "pandas")
    train_frame = train_split[:]
    test_frame = test_split[:]

    # Boolean masks of documents without any topic, one per split.
    train_is_empty = train_frame.topics.str.len().eq(0)
    test_is_empty = test_frame.topics.str.len().eq(0)

    # Get empty Indexes: required for dealing with Tensored datasets
    empty_train = train_frame.index[train_is_empty]
    empty_test = test_frame.index[test_is_empty]

    # Drop Empty Topics
    train_frame = train_frame[~train_is_empty]
    test_frame = test_frame[~test_is_empty]

    # Keep only the two columns the model needs.
    keep = ['text', 'topics']
    unwanted = [name for name in train_frame.columns if name not in keep]
    train_frame = train_frame.drop(unwanted, axis=1)
    test_frame = test_frame.drop(unwanted, axis=1)

    binarizer = MultiLabelBinarizer()
    y_train = binarizer.fit_transform(train_frame.topics)
    y_test = binarizer.transform(test_frame.topics)
    return train_frame['text'],test_frame['text'],y_train,y_test, empty_train,empty_test

def allDone():
    """Play a short audio clip in the notebook to signal that a job has finished."""
    clip = Audio(
        url='https://sound.peal.io/ps/audios/000/000/537/original/woo_vu_luvub_dub_dub.wav',
        autoplay=True,
    )
    display(clip)


### Data Handling

In [None]:
# Load the splits once; the trailing-underscore names hold the index labels
# of the documents that get_data() dropped for having no topics.
(
    X_train,
    X_test,
    y_train,
    y_test,
    empty_train_,
    empty_test_,
) = get_data()

Reusing dataset reuters21578 (/root/.cache/huggingface/datasets/reuters21578/ModApte/1.0.0/98a2ad6a0242627562db83992f9625261854c40a88619322596153a5a16a206c)


  0%|          | 0/3 [00:00<?, ?it/s]

  "unknown class(es) {0} will be ignored".format(sorted(unknown, key=str))


In [None]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 500  # passed as max_length to the tokenizer (pad / truncate target)
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8
EPOCHS = 5
LEARNING_RATE = 1e-04
# NOTE(review): presumably the number of topic classes produced by the
# MultiLabelBinarizer - confirm against y_train.shape[1].
NUM_LABELS = 115
# `return_tensors` is an argument of the tokenizer *call*, not of
# `from_pretrained`, so it is not passed here; the dataset converts the
# returned id lists to tensors itself.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
class CustomDataset(Dataset):
    """
    Map-style dataset that tokenizes one document per item.

    Parameters
    ----------
    x: positional-indexable text container exposing `.iloc` (e.g. a pandas Series).
    y: per-document label rows, indexable by integer position.
    token: tokenizer callable; called with the text plus `add_special_tokens`,
        `max_length`, `padding` and `truncation`, and expected to return a
        mapping with 'input_ids' and 'attention_mask'.
    max_ln: length every sequence is padded or truncated to.
    """
    def __init__(self, x,y,token, max_ln):
        self.x = x
        self.y = y
        self.token = token
        self.max_ln = max_ln

    def __len__(self):
        return len(self.x)

    def __getitem__(self,idx):
        """Return the tensors for document `idx` as a dict with keys 'ids', 'mask', 'target'."""
        # .iloc: idx is a position, and the Series index may have gaps after filtering.
        text = self.x.iloc[idx]

        to_bert = self.token(text, add_special_tokens=True,max_length=self.max_ln,
                        padding='max_length', truncation = 'only_first')

        # The earlier stray `return idx` made this dict unreachable, so batches
        # contained bare indices instead of model inputs.
        return {
            'ids': torch.tensor(to_bert['input_ids'], dtype=torch.long),
            'mask': torch.tensor(to_bert['attention_mask'], dtype= torch.long),
            'target': torch.tensor(self.y[idx], dtype=torch.float)
        }

In [None]:
# Wrap each split so that items are tokenized on access.
train_set = CustomDataset(
    X_train,
    y_train,
    token=tokenizer,
    max_ln=MAX_LEN,
)
test_set = CustomDataset(
    X_test,
    y_test,
    token=tokenizer,
    max_ln=MAX_LEN,
)

In [None]:
# Training batches are reshuffled every epoch.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation keeps the dataset order so that predictions collected from
# different passes or models stay aligned row-for-row with y_test
# (needed by the paired bootstrap comparison).
test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(train_set, **train_params)
testing_loader = DataLoader(test_set, **test_params)

In [None]:
from transformers import AutoModelForSequenceClassification 
# Three settings in the model cell below go together:
#   1. num_labels=NUM_LABELS, 2. return_dict=False, and 3. reading output_1[0] from the model's output.
# See https://stackoverflow.com/questions/66900855/dropout-argument-input-position-1-must-be-tensor-not-tuple-when-using-xl
# They are needed because the loss is BCE (binary cross-entropy).

In [None]:
class DistilBERTClass(torch.nn.Module):
    """
    Sequence-classification model with an extra dropout + linear layer on its logits.

    Parameters
    ----------
    num_labels: size of the output layer; defaults to the module-level NUM_LABELS.
    dropout: dropout probability applied to the wrapped model's logits.
    checkpoint: pretrained checkpoint name handed to `from_pretrained`.

    The attribute names l1/l2/l3 are kept unchanged so existing state dicts still load.
    """
    def __init__(self, num_labels=None, dropout=0.3, checkpoint='distilbert-base-uncased'):
        super().__init__()
        if num_labels is None:
            # Resolved at call time, so the class can be defined before the config cell runs.
            num_labels = NUM_LABELS
        # return_dict=False makes the wrapped model return a tuple; the logits are element 0.
        self.l1 = transformers.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = num_labels, return_dict=False)
        self.l2 = torch.nn.Dropout(dropout)
        self.l3 = torch.nn.Linear(num_labels, num_labels)

    def forward(self, ids, mask):
        """Return the output of the final linear layer for token ids `ids` and attention mask `mask`."""
        logits = self.l1(ids, attention_mask = mask)[0]
        return self.l3(self.l2(logits))

model = DistilBERTClass()
model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifi

DistilBERTClass(
  (l1): DistilBertForSequenceClassification(
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0): TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (ffn): FF

In [None]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits `outputs` and 0/1 `targets`."""
    bce_with_logits = torch.nn.functional.binary_cross_entropy_with_logits
    return bce_with_logits(outputs, targets)

In [None]:
# Adam over every parameter of the model (wrapped network and added head alike).
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=LEARNING_RATE,
)

In [None]:
# Housekeeping before training: run Python's garbage collector, then ask
# PyTorch to release the GPU memory held by its caching allocator.
import gc
gc.collect()
torch.cuda.empty_cache()

NameError: ignored

In [None]:
from tqdm.auto import tqdm

In [None]:
# Total number of optimizer steps across all epochs (batches per epoch x epochs).
num_steps = len(training_loader) * EPOCHS

122

In [None]:
def train(epoch):
    """
    Run one training pass over `training_loader`.

    Uses the notebook-level `model`, `optimizer`, `loss_fn`, `device` and
    `training_loader`; the model's parameters are updated in place.

    Parameters
    ----------
    epoch: epoch number, used only for the progress-bar label and the log line.
    """
    model.train()
    # One bar per epoch, sized to this epoch's batch count. Sizing it to the
    # all-epochs total left each bar permanently unfinished.
    with tqdm(total=len(training_loader), desc=f'Epoch {epoch}') as progress_bar:
        for step, data in enumerate(training_loader):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['target'].to(device, dtype = torch.float)

            outputs = model(ids, mask)
            loss = loss_fn(outputs, targets)
            if step%5000==0:
                print(f'Epoch: {epoch}, Loss:  {loss.item()}')

            # Clear gradients left from the previous step before accumulating new ones.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            progress_bar.update(1)

In [None]:
# Fine-tune for EPOCHS full passes over the training data; train() updates
# the notebook-level model in place.
for epoch in range(EPOCHS):
    train(epoch)

  0%|          | 0/972 [00:00<?, ?it/s]

Epoch: 0, Loss:  0.6887668371200562
