# Loading Libraries

In [1]:
!pip install -q transformers
!pip install -q deep-translator

In [2]:
# Numerical / tabular stack and evaluation metrics.
import numpy as np
import pandas as pd
from sklearn import metrics
# Deep-learning stack: PyTorch plus Hugging Face transformers.
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import RobertaTokenizer, RobertaModel, AutoTokenizer, AutoModel
from sklearn.preprocessing import MultiLabelBinarizer
from torch import cuda
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'
import warnings
from deep_translator import GoogleTranslator
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation notices from pandas and transformers.
warnings.filterwarnings("ignore")
# Both tokenizers are loaded up front; the one actually used is chosen later
# from `model_name`.
Robertatokenizer = RobertaTokenizer.from_pretrained('roberta-base')
Bioclinicaltokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")



Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

# Setting the path to files and model name

In [3]:
# Encoder to fine-tune: 'bioclinical' selects Bio_ClinicalBERT, 'roberta' selects roberta-base.
model_name = 'bioclinical'

# Locations of the train / test CSV splits.
train_path, test_path = (
    "/kaggle/input/ihqid-webmd/IHQID-WebMD/train.csv",
    "/kaggle/input/ihqid-webmd/IHQID-WebMD/test.csv",
)

# Tokenisation and optimisation hyper-parameters.
MAX_LEN = 200  # passed to the tokenizer as max_length
TRAIN_BATCH_SIZE, VALID_BATCH_SIZE = 8, 4
EPOCHS = 10
LEARNING_RATE = 1e-5

# Loading and Preprocessing Data

In [19]:
def translate_to_english(text):
    """Translate ``text`` from Bengali to English, pivoting through Hindi.

    Two GoogleTranslator calls are chained: 'bn' -> 'hi', then 'hi' -> 'en'.

    Returns
    -------
    The English translation, or ``None`` when either call raises an
    ``Exception`` (the error is printed, not re-raised).
    """
    try:
        hindi_text = GoogleTranslator(source='bn', target='hi').translate(text)
        return GoogleTranslator(source='hi', target='en').translate(hindi_text)
    except Exception as e:
        print(f"Translation error: {e}")
    return None

In [20]:
# Read the raw train and test splits from the configured CSV paths.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [21]:
def one_hot_encode(row):
    """Return a list with 1 for every truthy cell of ``row`` and 0 for every falsy one."""
    flags = []
    for cell in row:
        flags.append(1 if cell else 0)
    return flags

# Keep only the Bengali question and its intent label. `.copy()` gives each
# frame its own data, so the column assignments below are not made on a frame
# derived from another one (pandas' SettingWithCopyWarning, which the global
# warnings filter would otherwise hide).
train_df = train_df[['question_bengali', 'Manual_Intent']].copy()
test_df = test_df[['question_bengali', 'Manual_Intent']].copy()
# Apply the translation function to the 'question_bengali' column.
# translate_to_english returns None for rows whose translation failed.
train_df['question_conv'] = train_df['question_bengali'].apply(translate_to_english)
test_df['question_conv'] = test_df['question_bengali'].apply(translate_to_english)
train_eng = train_df[['question_conv', 'Manual_Intent']].copy()
test_eng = test_df[['question_conv', 'Manual_Intent']].copy()

# Label vocabulary taken from the training split only, in order of first
# appearance; the position of a label here is its position in the one-hot vector.
unique_values = train_eng['Manual_Intent'].unique()

# Function to create the one-hot encoded list for each row
def one_hot_encode_category(row, unique_values):
    """Encode ``row['Manual_Intent']`` against ``unique_values``.

    Returns a list as long as ``unique_values`` holding 1 at every position
    whose value equals the row's intent and 0 everywhere else.
    """
    encoded = []
    for candidate in unique_values:
        encoded.append(1 if candidate == row['Manual_Intent'] else 0)
    return encoded

# Attach the one-hot intent vector to every row of both splits.
train_eng['one_hot_encoded'] = train_eng.apply(
    lambda row: one_hot_encode_category(row, unique_values), axis=1
)
test_eng['one_hot_encoded'] = test_eng.apply(
    lambda row: one_hot_encode_category(row, unique_values), axis=1
)

In [22]:
# Keep only the model-facing columns, then give them generic names:
# translated text -> 'input', one-hot label vector -> 'target'.
train_eng, test_eng = (
    split[['question_conv', 'one_hot_encoded']] for split in (train_eng, test_eng)
)
train_eng.columns = test_eng.columns = ['input', 'target']

In [23]:
# Preview the prepared training frame ('input' text and 'target' one-hot vector).
train_eng

Unnamed: 0,input,target
0,What is nystatin prescribed for?,"[1, 0, 0, 0]"
1,Can showering after sex prevent me from gettin...,"[0, 1, 0, 0]"
2,Percocet causes weight gain,"[1, 0, 0, 0]"
3,Can 2 or 2 1/2 glasses of wine a day cause hig...,"[0, 0, 1, 0]"
4,Can too much buttermilk cause thrush?,"[0, 0, 1, 0]"
...,...,...
715,Can an insurance company be required to cover ...,"[0, 1, 0, 0]"
716,How can I use duct tape to get rid of warts?,"[0, 0, 0, 1]"
717,Bell's Palsy What facial exercises can be done...,"[0, 0, 0, 1]"
718,Is prenatal ultrasound safe?,"[0, 0, 0, 1]"


In [24]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text row per ``__getitem__`` call.

    Parameters
    ----------
    dataframe :
        Frame with an ``input`` column (text) and a ``target`` column
        (per-row label vector).
    tokenizer :
        Hugging Face tokenizer; it is called directly on the text.
    max_len : int
        Every example is padded or truncated to exactly this many tokens so
        the default collate function can stack a batch.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.input = dataframe.input
        self.targets = self.data.target
        self.max_len = max_len

    def __len__(self):
        return len(self.input)

    def __getitem__(self, index):
        """Return the tensors for the example at *position* ``index``.

        The result is a dict with ``ids``, ``mask`` and ``token_type_ids``
        (long tensors) and ``targets`` (float tensor).
        """
        # Positional lookup: samplers yield 0..len-1, which only matches the
        # index labels when the frame has a default RangeIndex.
        text = str(self.input.iloc[index])
        # Collapse every run of whitespace into a single space.
        text = " ".join(text.split())

        # `padding='max_length'` replaces the deprecated `pad_to_max_length`;
        # `truncation=True` makes the cut-off at `max_len` explicit instead of
        # relying on version-dependent fallback behaviour.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [25]:
train_dataset = train_eng
test_dataset = test_eng
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Pick the tokenizer that matches the configured encoder. An unknown name is a
# configuration error: fail here rather than print a message and then hit a
# NameError (or silently reuse a `tokenizer` left over from an earlier run).
if model_name == 'roberta':
    tokenizer = Robertatokenizer
elif model_name == 'bioclinical':
    tokenizer = Bioclinicaltokenizer
else:
    raise ValueError(
        f"Unknown model_name {model_name!r}; expected 'roberta' or 'bioclinical'"
    )

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

TRAIN Dataset: (720, 2)
TEST Dataset: (241, 2)


In [26]:
# Training batches are reshuffled every epoch. Evaluation keeps the dataset
# order so predictions line up with the rows of the test frame and the
# iteration order is the same on every run.
train_params = {'batch_size': TRAIN_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}
test_params = {'batch_size': VALID_BATCH_SIZE, 'shuffle': False, 'num_workers': 0}

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

# Building custom model classes for end-to-end fine-tuning

In [27]:
 class RobertaClass(torch.nn.Module):
    """roberta-base encoder followed by a single linear classification head.

    Parameters
    ----------
    num_labels : int, optional
        Width of the output layer. Defaults to 4, the value that was
        previously hard-coded.
    """

    def __init__(self, num_labels=4):
        super().__init__()
        self.l1 = RobertaModel.from_pretrained('roberta-base')
        # Read the encoder width from its config instead of hard-coding 768.
        self.l2 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return raw (pre-sigmoid) logits, one row per input sequence."""
        # With return_dict=False the encoder returns a tuple; the head consumes
        # its second element (the pooled output of Hugging Face BERT-style models).
        _, pooled = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        return self.l2(pooled)

In [28]:
 class BioclinicalClass(torch.nn.Module):
    """Bio_ClinicalBERT encoder followed by a single linear classification head.

    Parameters
    ----------
    num_labels : int, optional
        Width of the output layer. Defaults to 4, the value that was
        previously hard-coded.
    """

    def __init__(self, num_labels=4):
        super().__init__()
        self.l1 = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
        # Read the encoder width from its config instead of hard-coding 768.
        self.l2 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return raw (pre-sigmoid) logits, one row per input sequence."""
        # With return_dict=False the encoder returns a tuple; the head consumes
        # its second element (the pooled output of Hugging Face BERT-style models).
        _, pooled = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        return self.l2(pooled)

# Initializing the model and setting the loss function and optimizer

In [29]:
# Instantiate the network selected by `model_name`. An unknown name raises
# immediately so `model` can never be undefined or left over from a previous run.
if model_name == 'roberta':
    model = RobertaClass()
elif model_name == 'bioclinical':
    model = BioclinicalClass()
else:
    raise ValueError(
        f"Unknown model_name {model_name!r}; expected 'roberta' or 'bioclinical'"
    )

model.to(device)


def loss_fn(outputs, targets):
    """Binary cross-entropy computed on raw logits.

    NOTE(review): every target vector shown in this notebook has exactly one
    active label; confirm that an independent per-label loss is intended.
    """
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)


optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# Training

In [30]:
def train(epoch, log_every=5000):
    """Run one optimisation pass over ``training_loader``.

    Parameters
    ----------
    epoch : int
        Epoch number; used only in the progress message.
    log_every : int, optional
        The batch loss is printed whenever the batch index is a multiple of
        this value (batch 0 always qualifies). Defaults to 5000, the value
        that was previously hard-coded.
    """
    model.train()
    for step, data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        # Clear gradients once per step (the original did this twice).
        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if step % log_every == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        loss.backward()
        optimizer.step()

In [31]:
# Fine-tune for EPOCHS passes over the training data; train() prints its own progress.
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  0.8055540323257446
Epoch: 1, Loss:  0.5429575443267822
Epoch: 2, Loss:  0.39278924465179443
Epoch: 3, Loss:  0.24717780947685242
Epoch: 4, Loss:  0.2451520413160324
Epoch: 5, Loss:  0.17690199613571167
Epoch: 6, Loss:  0.050609491765499115
Epoch: 7, Loss:  0.0518803671002388
Epoch: 8, Loss:  0.04026336967945099
Epoch: 9, Loss:  0.022489354014396667


# Testing

In [32]:
def test_validation():
    """Score every batch of ``testing_loader`` with gradients disabled.

    Returns
    -------
    (outputs, targets) : tuple of two lists
        ``outputs`` holds one list of sigmoid-activated scores per example,
        ``targets`` the matching float label vectors, both in loader order.
    """
    model.eval()
    predictions, references = [], []
    with torch.no_grad():
        for batch in testing_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            logits = model(ids, mask, token_type_ids)
            probabilities = torch.sigmoid(logits)

            references += targets.cpu().detach().numpy().tolist()
            predictions += probabilities.cpu().detach().numpy().tolist()
    return predictions, references

In [33]:
# Score the test split, turn sigmoid scores into 0/1 decisions at a 0.5
# threshold, then compute and print the sklearn metrics.
outputs, targets = test_validation()
outputs = np.asarray(outputs) >= 0.5

f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
classification_report = metrics.classification_report(targets, outputs)
confusion_matrix = metrics.multilabel_confusion_matrix(targets, outputs)

print(f"Test F1 Score (Macro) = {f1_score_macro}")
print("Test classification report = \n", classification_report, sep="\n")
print("Test confusion matrix = \n", confusion_matrix, sep="\n")


Test F1 Score (Macro) = 0.7533757379345615
Test classification report = 

              precision    recall  f1-score   support

           0       0.86      0.79      0.82        53
           1       0.75      0.77      0.76        73
           2       0.84      0.74      0.78        76
           3       0.63      0.67      0.65        39

   micro avg       0.78      0.75      0.76       241
   macro avg       0.77      0.74      0.75       241
weighted avg       0.78      0.75      0.76       241
 samples avg       0.74      0.75      0.75       241

Test confusion matrix = 

[[[181   7]
  [ 11  42]]

 [[149  19]
  [ 17  56]]

 [[154  11]
  [ 20  56]]

 [[187  15]
  [ 13  26]]]
