In [196]:
!pip install -q transformers
!pip install -q deep-translator

In [197]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import RobertaTokenizer, RobertaModel, RobertaForTokenClassification, AutoTokenizer, AutoModel, AutoModelForTokenClassification
from sklearn.preprocessing import MultiLabelBinarizer
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
import warnings
from deep_translator import GoogleTranslator
from sklearn.metrics import classification_report, confusion_matrix, f1_score
warnings.filterwarnings("ignore")
Robertatokenizer = RobertaTokenizer.from_pretrained('roberta-base')
Bioclinicaltokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

In [198]:
model_name = 'roberta' # can be 'bioclinical' for bioclinical_bert or 'roberta' for Roberta
train_path = "/kaggle/input/ihqid-1mg/IHQID-1mg/train.csv"
test_path = "/kaggle/input/ihqid-1mg/IHQID-1mg/test.csv"

MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 10
LEARNING_RATE = 1e-05

In [199]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
train_df['target'] = ''
test_df['target'] = ''
if(model_name == 'roberta'):
    tokenizer = Robertatokenizer
if(model_name == 'bioclinical_bert'):
    tokenizer = Bioclinicaltokenizer

In [200]:
# Define a function to convert a value to lowercase (handling null values)
def lowercase(value):
    return str(value).lower() if pd.notnull(value) else value

# Apply the function to all columns using applymap
train_df = train_df.applymap(lowercase)
test_df = test_df.applymap(lowercase)

In [201]:
def create_tags(tokens, type, tokenizer):
    # Create a list of tuples with BIO tags for the original words
    
    if(type == 'drug'):
        bio_tags = [(tokens[0], 'B-drug')] + [(token, 'I-drug') for token in tokens[1:]]
    if(type == 'disease'):
        bio_tags = [(tokens[0], 'B-disease')] + [(token, 'I-disease') for token in tokens[1:]]
    if(type == 'treatment'):
        bio_tags = [(tokens[0], 'B-treatment')] + [(token, 'I-treatment') for token in tokens[1:]]

    # Tokenize each word and match it to the corresponding BIO tag
    tokenized_bio_tags = []

    for word, bio_tag in bio_tags:
        # Tokenize the word using the tokenizer
        word_tokens = tokenizer.tokenize(word)
        word_tokens.append(word)

        # Match each token to the corresponding BIO tag
        tokenized_bio_tags.extend([(token, bio_tag) for token in word_tokens])
    return tokenized_bio_tags

In [202]:
def bio_tagging(entity_list, tokens):
    bio_tags = []
    for token in tokens:
        token = token.lstrip("Ġ")
        flag = 0
        for index, temp in enumerate(entity_list):
            ent, tag = temp
            if token == ent:
                bio_tags.append((tag))
                flag = 1
                break;
        if(flag == 0):
            bio_tags.append(('O'))
    return bio_tags

In [203]:
for index, row in train_df.iterrows():
    tokenized_entity = []
    if(not(pd.isnull(row['drug_english']))):
#         print(row['drug_english'])
        drugs = row['drug_english'].split(',')
        tokenized_bio_drugs = []
        for drug in drugs:
            words = drug.split()
#             print(words)
            tokenized_bio_drugs.extend(create_tags(words, 'drug', tokenizer))
        tokenized_entity.extend(tokenized_bio_drugs)
    if(not(pd.isnull(row['treatment_english']))):
#         print(row['treatment_english'])
        treatments = row['treatment_english'].split(',')
        tokenized_bio_treatments = []
        for treatment in treatments:
            words = treatment.split()
            tokenized_bio_treatments.extend(create_tags(words, 'treatment', tokenizer))
        tokenized_entity.extend(tokenized_bio_treatments)
    if(not(pd.isnull(row['disease_english']))):
#         print(row['disease_english'])
        diseases = row['disease_english'].split(',')
        tokenized_bio_diseases = []
        for disease in diseases:
            words = disease.split()
            tokenized_bio_diseases.extend(create_tags(words, 'disease', tokenizer))
        tokenized_entity.extend(tokenized_bio_diseases)
    question_tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(row["question_english"])))
    target = bio_tagging(tokenized_entity, question_tokens)
    train_df.at[index, 'target'] = target
    
    

In [204]:
for index, row in test_df.iterrows():
    tokenized_entity = []
    if(not(pd.isnull(row['drug_english']))):
#         print(row['drug_english'])
        drugs = row['drug_english'].split(',')
        tokenized_bio_drugs = []
        for drug in drugs:
            words = drug.split()
#             print(words)
            tokenized_bio_drugs.extend(create_tags(words, 'drug', tokenizer))
        tokenized_entity.extend(tokenized_bio_drugs)
    if(not(pd.isnull(row['treatment_english']))):
#         print(row['treatment_english'])
        treatments = row['treatment_english'].split(',')
        tokenized_bio_treatments = []
        for treatment in treatments:
            words = treatment.split()
            tokenized_bio_treatments.extend(create_tags(words, 'treatment', tokenizer))
        tokenized_entity.extend(tokenized_bio_treatments)
    if(not(pd.isnull(row['disease_english']))):
#         print(row['disease_english'])
        diseases = row['disease_english'].split(',')
        tokenized_bio_diseases = []
        for disease in diseases:
            words = disease.split()
            tokenized_bio_diseases.extend(create_tags(words, 'disease', tokenizer))
        tokenized_entity.extend(tokenized_bio_diseases)
#     print(tokenized_entity)
    question_tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(row["question_english"])))
    target = bio_tagging(tokenized_entity, question_tokens)
    test_df.at[index, 'target'] = target
    


In [205]:
train_df['target']

0      [O, O, O, B-drug, B-drug, B-drug, O, I-drug, O...
1      [O, O, O, O, O, B-drug, B-drug, B-drug, B-drug...
2      [O, O, O, O, O, O, O, O, O, B-disease, I-disea...
3      [O, O, O, O, O, O, O, O, O, O, O, B-drug, B-dr...
4                 [O, O, O, O, O, B-disease, O, O, O, O]
                             ...                        
300    [O, O, O, O, O, O, O, B-drug, B-drug, B-drug, ...
301    [O, O, O, O, O, O, O, O, O, O, O, B-drug, I-dr...
302              [O, O, O, O, O, O, O, O, O, O, O, O, O]
303    [O, O, O, O, O, B-disease, O, B-disease, B-dis...
304    [O, O, O, O, B-treatment, B-treatment, B-treat...
Name: target, Length: 305, dtype: object

In [206]:
train_df['question_english']

0                 what is itralase 200 capsule used for?
1      can the use of jalra-m  50mg/1000mg tablet lea...
2      how long will it take to get my thyroid levels...
3      what are the instructions for storage and disp...
4                        what does a fistula smell like?
                             ...                        
300    how long does it take for famocid 40 tablet to...
301    for how long do i need to continue nikoran 5 t...
302              how should i store my child’s medicine?
303    do all women experience discomfort after menop...
304    can i undergo cataract surgery while taking si...
Name: question_english, Length: 305, dtype: object

In [207]:
if(model_name == 'roberta'):
    model = RobertaForTokenClassification.from_pretrained('roberta-base', num_labels=7)
if(model_name == 'bioclinical'):
    model = AutoModelForTokenClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=7)

# Define a mapping from BIO tags to label indices
tag2id = {"B-drug": 0, "I-drug": 1, "B-treatment": 2, "I-treatment": 3, "B-disease": 4, "I-disease": 5, "O": 6}

# Tokenize and convert data to input sequences
input_ids = []
attention_masks = []
labels = []

for _, row in train_df.iterrows():
    sentence = row['question_english']
    bio_tags = row['target']

    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sentence)))
    input_ids.append(tokenizer.convert_tokens_to_ids(tokens))
    attention_masks.append([1] * len(tokens))

    label_ids = [tag2id[tag] for tag in bio_tags]
    labels.append(label_ids)

input_ids = [torch.tensor(seq) for seq in input_ids]
input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True)
attention_masks = [torch.tensor(seq) for seq in attention_masks]
attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True)
labels = [torch.tensor(seq) for seq in labels]
labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True)

test_input_ids = []
test_attention_masks = []
test_labels = []

for _, row in test_df.iterrows():
    sentence = row['question_english']
    bio_tags = row['target']

    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sentence)))
    test_input_ids.append(tokenizer.convert_tokens_to_ids(tokens))
    test_attention_masks.append([1] * len(tokens))

    # Convert BIO tags to label indices
    label_ids = [tag2id[tag] for tag in bio_tags]
    test_labels.append(label_ids)
    
test_input_ids = [torch.tensor(seq) for seq in test_input_ids]
test_input_ids = torch.nn.utils.rnn.pad_sequence(test_input_ids, batch_first=True)
test_attention_masks = [torch.tensor(seq) for seq in test_attention_masks]
test_attention_masks = torch.nn.utils.rnn.pad_sequence(test_attention_masks, batch_first=True)
test_labels = [torch.tensor(seq) for seq in test_labels]
test_labels = torch.nn.utils.rnn.pad_sequence(test_labels, batch_first=True)


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [208]:
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [209]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss()

In [210]:
num_epochs = 10

for epoch in range(num_epochs):
    loss = 0
    for batch in dataloader:
        input_ids, attention_masks, labels = batch

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch}, Loss:  {loss}')

model.save_pretrained('/kaggle/working/model.pth')

Epoch: 0, Loss:  1.4142554998397827
Epoch: 1, Loss:  0.5240087509155273
Epoch: 2, Loss:  0.42614829540252686
Epoch: 3, Loss:  0.34828072786331177
Epoch: 4, Loss:  0.3397728502750397
Epoch: 5, Loss:  0.21514473855495453
Epoch: 6, Loss:  0.09161556512117386
Epoch: 7, Loss:  0.1070496141910553
Epoch: 8, Loss:  0.11522602289915085
Epoch: 9, Loss:  0.09321325272321701


In [211]:
test_df

Unnamed: 0,question_english,disease_english,drug_english,treatment_english,question_bengali,disease_bengali,drug_bengali,treatment_bengali,question_hindi,disease_hindi,...,question_marathi,disease_marathi,drug_marathi,treatment_marathi,question_gujarati,disease_gujarati,drug_gujarati,treatment_gujarati,Manual_Intent,target
0,can tuberculosis be cured in hiv co-infection?,"hiv, tuberculosis",,,এইচআইভি সহ-সংক্রমণে কি যক্ষ্মা নিরাময় করা যায়?,"এইচআইভি, যক্ষ্মা",,,क्या एचआईवी सह-संक्रमण में टुबरक्लोसिस को ठीक...,"एचआईवी , टुबरक्लोसिस",...,एचआयव्ही सह संसर्गामध्ये क्षयरोग बरा होऊ शकतो का?,"एचआयव्ही,क्षयरोग",,,શું એચઆઈવીના સહ-સંક્રમણમાં ક્ષય રોગ મટાડી શકાય...,"એચઆઈવી, ક્ષય",,,disease,"[O, O, B-disease, O, O, O, B-disease, B-diseas..."
1,how can i test my breast cancer at home?,breast cancer,,,আমি কীভাবে বাড়িতে আমার স্তন ক্যান্সার পরীক্ষা...,স্তন ক্যান্সার,,,मैं घर पर अपने स्तन कैंसर का परीक्षण कैसे कर स...,स्तन कैंसर,...,मी घरी माझ्या स्तनाच्या कर्करोगाची चाचणी कशी क...,स्तनाच्या कर्करोग,,,હું ઘરે મારા સ્તન કેન્સરનું પરીક્ષણ કેવી રીતે ...,સ્તન કેન્સર,,,disease,"[O, O, O, O, O, O, B-disease, I-disease, O, O,..."
2,how do you stop an obstetric fistula?,obstetric fistula,,,আপনি কীভাবে একটি প্রসূতি ফিস্টুলা বন্ধ করবেন?,প্রসূতি ফিস্টুলা,,,आप एक प्रसूति नालव्रण को कैसे रोकते हैं?,प्रसूति नालव्रण,...,प्रसूती फिस्टुला कसे थांबवायचे?,प्रसूती फिस्टुला,,,તમે ઑબ્સ્ટેટ્રિક ફિસ્ટુલાને કેવી રીતે રોકશો?,ઑબ્સ્ટેટ્રિક ફિસ્ટુલા,,,disease,"[O, O, O, O, O, O, O, B-disease, O, I-disease,..."
3,is domstal 10mg tablet an over the counter drug?,,domstal 10mg tablet,,ডোমস্টাল ১০ মিলিগ্রাম কি বিনা প্রেস্ক্রিপশনে উ...,,ডোমস্টাল ১০ মিলিগ্রাম,,क्या डोमस्टल १०एमजी टैबलेट एक ओवर द काउंटर दवा...,,...,डोम्स्टल १० टॅब्लेटहे ओव्हर काउंटर औषध आहे का?,,डोम्स्टल १० टॅब्लेट,,શું ડોમસ્ટાલ ૧૦ મિલિગ્રામ ટીકડી એ ઓવર કાઉન્ટર ...,,ડોમસ્ટાલ ૧૦ મિલિગ્રામ ટીકડી,,drug,"[O, O, B-drug, B-drug, I-drug, I-drug, I-drug,..."
4,does zika virus stay in the body lifelong?,zika virus,,,জিকা ভাইরাস কি সারা জীবন শরীরে থাকে?,জিকা,,,क्या जीका वायरस शरीर में आजीवन रहता है?,जीका,...,झिका विषाणू शरीरात आयुष्यभर राहतो का?,झिका विषाणू,,,શું ઝીકા વાયરસ આજીવન શરીરમાં રહે છે?,ઝીકા વાયરસ,,,disease,"[O, O, B-disease, B-disease, I-disease, O, O, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,when to do blood sugar test - monthly or annua...,,,blood sugar test,রক্তে শর্করার পরীক্ষা কখন করতে হবে - মাসিক বা ...,,,শর্করা পরীক্ষা,ब्लड शुगर टेस्ट कब करें - मासिक या सालाना?,,...,रक्तातील साखरेची चाचणी कधी करावी - मासिक किंवा...,,,रक्तातील साखरेची चाचणी,રક્ત ખાંડની તપાસ ક્યારે કરવી - માસિક કે વાર્ષિક?,,,રક્ત ખાંડ પરીક્ષણ,treatment,"[O, O, O, O, B-treatment, I-treatment, I-treat..."
108,when surgery is required in fracture?,,,surgery,ভাঙা অবস্থায় অস্ত্রোপচারের প্রয়োজন হয় কখন?,,,অস্ত্রোপচার,फ्रैक्चर में सर्जरी की आवश्यकता कब होती है?,,...,फ्रॅक्चरमध्ये शस्त्रक्रिया केव्हा आवश्यक असते,,,शस्त्रक्रिया,રક્ત ખાંડની તપાસ ક્યારે કરવી - માસિક કે વાર્ષિક?,,,શસ્ત્રક્રિયા,treatment,"[O, O, B-treatment, O, O, O, O, O, O]"
109,which medications are contraindicated before s...,,,surgery,অস্ত্রোপচারের আগে কোন ওষুধ ব্যবহার করা যাবে না,,,অস্ত্রোপচারে,सर्जरी से पहले कौन सी दवाएं वर्जित हैं ?,,...,शस्त्रक्रियेपूर्वी कोणती औषधे निर्बंधीत आहेत,,,शस्त्रक्रिया,શસ્ત્રક્રિયા પહેલાં કઈ દવાઓ બિનસલાહભર્યા છે?,,,શસ્ત્રક્રિયા,treatment,"[O, O, O, O, O, O, O, O, O, B-treatment, O]"
110,are all dental procedure safe?,,,dental procedure,সব দাঁতের পদ্ধতি কি নিরাপদ?,,,দাঁতের পদ্ধতি,क्या सभी दंत प्रक्रियाएं सुरक्षित हैं?,,...,सर्व दंत प्रक्रिया सुरक्षित आहेत का?,,,दंत प्रक्रिया,શું દાંતની બધી પ્રક્રિયાઓ સલામત છે?,,,દાંતની પ્રક્રિયા,treatment,"[O, O, O, B-treatment, I-treatment, O, O, O]"


In [212]:
model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_masks, labels = batch

        outputs = model(input_ids, attention_mask=attention_masks)
        predictions = torch.argmax(outputs.logits, dim=-1)

        all_predictions.extend(predictions.cpu().numpy().flatten())
        all_labels.extend(labels.cpu().numpy().flatten())

all_predictions = np.array(all_predictions)
all_labels = np.array(all_labels)

macro_f1 = f1_score(all_labels, all_predictions, average='macro')
print(f"Macro F1 Score: {macro_f1 * 100:.2f}%")

conf_matrix = confusion_matrix(all_labels, all_predictions)
print("Confusion Matrix:")
print(conf_matrix)

class_report = classification_report(all_labels, all_predictions, target_names=list(tag2id.keys()))
print("Classification Report:")
print(class_report)


Macro F1 Score: 52.06%
Confusion Matrix:
[[2908    4    0    0    0    0    7]
 [   3  141    0    0    0    0    2]
 [   0    1    0    0   13    0   11]
 [   0    4    0    0    4    2    7]
 [   0    0    0    0   66    0   18]
 [   0    0    0    0   19    2    1]
 [  37    7    1    0   27    0 1083]]
Classification Report:
              precision    recall  f1-score   support

      B-drug       0.99      1.00      0.99      2919
      I-drug       0.90      0.97      0.93       146
 B-treatment       0.00      0.00      0.00        25
 I-treatment       0.00      0.00      0.00        17
   B-disease       0.51      0.79      0.62        84
   I-disease       0.50      0.09      0.15        22
           O       0.96      0.94      0.95      1155

    accuracy                           0.96      4368
   macro avg       0.55      0.54      0.52      4368
weighted avg       0.96      0.96      0.96      4368

