# Loading Libraries

In [1]:
!pip install -q transformers

In [2]:
import warnings

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
import transformers
from transformers import (
    AutoModel,
    AutoModelForMaskedLM,
    AutoTokenizer,
    RobertaModel,
    RobertaTokenizer,
)
import torch
from torch import cuda
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'

# Suppress Python warnings from this point on.
warnings.filterwarnings("ignore")

# Tokenizer loaded from the 'xlm-roberta-base' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')



Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

# Setting the file paths, model name, and training hyperparameters

In [3]:
# NOTE(review): `model_name` is not read anywhere in the visible notebook; the
# model cell instantiates XLMClass() directly, so changing this value has no
# effect on which model is trained.
model_name = 'roberta' # can be 'bioclinical' for bioclinical_bert or 'roberta' for Roberta
# Locations of the train / test CSV files (Kaggle input directory).
train_path = "/kaggle/input/ihqid-webmd/IHQID-WebMD/train.csv"
test_path = "/kaggle/input/ihqid-webmd/IHQID-WebMD/test.csv"

# Sequence length handed to CustomDataset as `max_len`.
MAX_LEN = 200
# Batch sizes used for the training and test DataLoaders respectively.
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# Number of passes over the training loader.
EPOCHS = 10
# Learning rate passed to the Adam optimizer.
LEARNING_RATE = 1e-05

# Loading and Preprocessing Data

In [4]:
# Read the train and test splits from their CSV files into DataFrames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [5]:
def one_hot_encode(row):
    """Return a list with 1 for every truthy cell of ``row`` and 0 for every falsy one."""
    flags = []
    for cell in row:
        flags.append(1 if cell else 0)
    return flags

# Keep only the Hindi question text and its manually annotated intent.
# .copy() makes these independent frames, so adding a column to them afterwards
# does not operate on a slice of the source frame (SettingWithCopyWarning).
train_eng = train_df[['question_hindi', 'Manual_Intent']].copy()
test_eng = test_df[['question_hindi', 'Manual_Intent']].copy()

# Label vocabulary taken from the training split only, in order of first
# appearance; the position of a label here is its position in the one-hot vector.
unique_values = train_eng['Manual_Intent'].unique()

# Function to create the one-hot encoded list for each row
def one_hot_encode_category(row, unique_values):
    """Encode ``row['Manual_Intent']`` against ``unique_values``.

    Returns a list as long as ``unique_values`` holding 1 at every position
    whose value equals the row's intent and 0 everywhere else.
    """
    encoding = []
    for candidate in unique_values:
        is_match = candidate == row['Manual_Intent']
        encoding.append(1 if is_match else 0)
    return encoding

# Apply the function to create the new one-hot encoded column
# Row by row, turn the intent label into its one-hot list and store the result
# in a new 'one_hot_encoded' column of each split.
train_eng['one_hot_encoded'] = train_eng.apply(
    lambda record: one_hot_encode_category(record, unique_values),
    axis=1,
)
test_eng['one_hot_encoded'] = test_eng.apply(
    lambda record: one_hot_encode_category(record, unique_values),
    axis=1,
)

In [6]:
# Keep the text and the one-hot label columns and give them the generic names
# ('input', 'target') that CustomDataset reads.
train_eng = train_eng[['question_hindi', 'one_hot_encoded']].rename(
    columns={'question_hindi': 'input', 'one_hot_encoded': 'target'}
)
test_eng = test_eng[['question_hindi', 'one_hot_encoded']].rename(
    columns={'question_hindi': 'input', 'one_hot_encoded': 'target'}
)

In [7]:
class CustomDataset(Dataset):
    """Torch dataset that tokenizes one text row at a time and pairs it with its label vector.

    Args:
        dataframe: frame with an ``input`` column (text) and a ``target``
            column (one label vector per row).
        tokenizer: callable tokenizer; called with a single text it must return
            a mapping with ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: length every encoded sequence is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.input = dataframe.input
        self.targets = self.data.target
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.input)

    def __getitem__(self, index):
        """Return the encoded text and float label tensor for the row at position ``index``."""
        # Positional lookup: samplers hand out positions 0..len-1, which only
        # coincide with index labels when the frame has a default RangeIndex.
        text = str(self.input.iloc[index])
        # Collapse any run of whitespace into a single space.
        text = " ".join(text.split())

        # Explicit padding/truncation replaces the deprecated pad_to_max_length
        # flag and the implicit truncation the tokenizer warned about.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [8]:
# Alias the prepared frames and report their shapes.
train_dataset, test_dataset = train_eng, test_eng
print(f"TRAIN Dataset: {train_dataset.shape}")
print(f"TEST Dataset: {test_dataset.shape}")

# Wrap each frame in a CustomDataset that tokenizes rows to MAX_LEN on access.
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

TRAIN Dataset: (720, 2)
TEST Dataset: (241, 2)


In [9]:
# Training batches are reshuffled every epoch.
train_params = {'batch_size': TRAIN_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}
# Evaluation gains nothing from shuffling; a fixed order keeps the collected
# predictions aligned with the rows of the test set and reproducible across runs.
test_params = {'batch_size': VALID_BATCH_SIZE, 'shuffle': False, 'num_workers': 0}

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

# Building custom model classes for end-to-end fine-tuning

In [10]:
 class RobertaClass(torch.nn.Module):
    """'roberta-base' encoder followed by a single linear classification layer.

    Args:
        num_labels: size of the linear layer's output (defaults to 4, the value
            that was previously hard-coded).
    """
    def __init__(self, num_labels=4):
        super().__init__()
        self.l1 = RobertaModel.from_pretrained('roberta-base')
        # Take the input width from the loaded encoder's config instead of hard-coding it.
        self.l2 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)
    
    def forward(self, ids, mask, token_type_ids):
        """Return the linear layer applied to the second element of the encoder's output tuple."""
        # return_dict=False makes the encoder return a plain tuple; the first
        # element is discarded and the second one (the pooled output, per the
        # transformers documentation) feeds the classifier.
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict = False)
        output = self.l2(output_1)
        return output

In [11]:
 class BioclinicalClass(torch.nn.Module):
    """'emilyalsentzer/Bio_ClinicalBERT' encoder followed by a single linear classification layer.

    Args:
        num_labels: size of the linear layer's output (defaults to 4, the value
            that was previously hard-coded).
    """
    def __init__(self, num_labels=4):
        super().__init__()
        self.l1 = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
        # Take the input width from the loaded encoder's config instead of hard-coding it.
        self.l2 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)
    
    def forward(self, ids, mask, token_type_ids):
        """Return the linear layer applied to the second element of the encoder's output tuple."""
        # return_dict=False makes the encoder return a plain tuple; the first
        # element is discarded and the second one (the pooled output, per the
        # transformers documentation) feeds the classifier.
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict = False)
        output = self.l2(output_1)
        return output

In [12]:
 class XLMClass(torch.nn.Module):
    """'xlm-roberta-base' masked-LM model followed by a linear layer over its output.

    NOTE(review): the linear layer consumes the first element returned by the
    masked-LM model (vocabulary-sized prediction scores, per the transformers
    documentation) rather than an encoder hidden state, and the training and
    evaluation code then subtracts slice 0 from slice 1 along dim 1 of the
    result. A sequence-classification head on the encoder output would be the
    conventional design; it is left unchanged here because it would alter the
    output shape those callers rely on.

    Args:
        num_labels: size of the linear layer's output (defaults to 4, the value
            that was previously hard-coded).
    """
    def __init__(self, num_labels=4):
        super().__init__()
        self.l1 = AutoModelForMaskedLM.from_pretrained("xlm-roberta-base")
        # Input width is the model's vocabulary size (250002 for this
        # checkpoint), read from the config instead of being hard-coded.
        self.l2 = torch.nn.Linear(self.l1.config.vocab_size, num_labels)
    
    def forward(self, ids, mask, token_type_ids):
        """Apply the linear layer to the first element of the masked-LM output tuple."""
        # return_dict=False makes the model return a plain tuple.
        output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict = False)
        output = self.l2(output_1[0])
        return output

# Initializing the model and setting the loss function and optimizer

In [13]:
# Build the XLM-R based classifier and move its parameters to the selected device
# (Module.to returns the module itself, so the chained form binds the same object).
model = XLMClass().to(device)
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits ``outputs`` and ``targets``.

    The sigmoid is applied inside the loss, so ``outputs`` must be unnormalized scores.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

# Adam over every parameter of the model, using the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Training

In [14]:
def train(epoch):
    """Run one pass over ``training_loader``, updating ``model`` through ``optimizer``.

    Args:
        epoch: epoch number; used only in the progress message.
    """
    model.train()
    for step, data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        # Reduce the model output to one score per label by subtracting
        # slice 0 from slice 1 along dim 1.
        outputs = outputs[:, 1] - outputs[:, 0]
        loss = loss_fn(outputs, targets)

        # The step counter restarts every epoch, so this reports step 0 and
        # then every 5000th step of the epoch.
        if step % 5000 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Clear stale gradients once, right before back-propagation.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [15]:
# Run EPOCHS full passes over the training loader; train() prints its own progress line.
for epoch in range(EPOCHS):
    train(epoch)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  1.9325957298278809
Epoch: 1, Loss:  0.5799528360366821
Epoch: 2, Loss:  0.5265626907348633
Epoch: 3, Loss:  0.37282299995422363
Epoch: 4, Loss:  0.15848389267921448
Epoch: 5, Loss:  0.4397836923599243
Epoch: 6, Loss:  0.295495867729187
Epoch: 7, Loss:  0.28135648369789124
Epoch: 8, Loss:  0.2501368522644043
Epoch: 9, Loss:  0.04627779871225357


# Testing

In [16]:
def test_validation():
    """Run ``model`` over ``testing_loader`` without gradient tracking.

    Returns:
        A pair ``(outputs, targets)`` of Python lists: the sigmoid of the
        per-label scores for every example, and the matching target vectors.
    """
    model.eval()
    collected_probs, collected_labels = [], []
    with torch.no_grad():
        for batch in testing_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            scores = model(ids, mask, token_type_ids)
            # Same reduction as in training: slice 1 minus slice 0 along dim 1.
            scores = scores[:, 1] - scores[:, 0]

            collected_labels.extend(targets.cpu().detach().numpy().tolist())
            probabilities = torch.sigmoid(scores)
            collected_probs.extend(probabilities.cpu().detach().numpy().tolist())
    return collected_probs, collected_labels

In [17]:
# Collect probabilities and targets for the test set, then binarise each
# label independently at a 0.5 threshold.
outputs, targets = test_validation()
outputs = np.greater_equal(outputs, 0.5)

# Multi-label metrics computed from the binarised predictions.
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
classification_report = metrics.classification_report(targets, outputs)
confusion_matrix = metrics.multilabel_confusion_matrix(targets, outputs)

print(f"Test F1 Score (Macro) = {f1_score_macro}")
print("Test classification report = \n", classification_report, sep="\n")
print("Test confusion matrix = \n", confusion_matrix, sep="\n")


Test F1 Score (Macro) = 0.6774783658144109
Test classification report = 

              precision    recall  f1-score   support

           0       0.75      0.74      0.74        53
           1       0.78      0.67      0.72        73
           2       0.86      0.55      0.67        76
           3       0.49      0.69      0.57        39

   micro avg       0.72      0.65      0.68       241
   macro avg       0.72      0.66      0.68       241
weighted avg       0.75      0.65      0.69       241
 samples avg       0.64      0.65      0.64       241

Test confusion matrix = 

[[[175  13]
  [ 14  39]]

 [[154  14]
  [ 24  49]]

 [[158   7]
  [ 34  42]]

 [[174  28]
  [ 12  27]]]
