In [5]:
!pip install -q transformers
!pip install -q deep-translator

In [6]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import RobertaTokenizer, RobertaModel, RobertaForTokenClassification, AutoTokenizer, AutoModel, AutoModelForTokenClassification, XLMRobertaTokenizer 
from sklearn.preprocessing import MultiLabelBinarizer
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
import warnings
from deep_translator import GoogleTranslator
from sklearn.metrics import classification_report, confusion_matrix, f1_score
warnings.filterwarnings("ignore")
Robertatokenizer = RobertaTokenizer.from_pretrained('roberta-base')
Bioclinicaltokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")



Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

In [7]:
model_name = 'xlm' # can be 'bioclinical' for bioclinical_bert or 'roberta' for Roberta
train_path = "/kaggle/input/ihqid-webmd/IHQID-WebMD/train.csv"
test_path = "/kaggle/input/ihqid-webmd/IHQID-WebMD/test.csv"

MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 10
LEARNING_RATE = 1e-05

In [9]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
train_df['target'] = ''
test_df['target'] = ''
if(model_name == 'roberta'):
    tokenizer = Robertatokenizer
if(model_name == 'bioclinical'):
    tokenizer = Bioclinicaltokenizer
if(model_name == 'xlm'):
    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

In [10]:
# Define a function to convert a value to lowercase (handling null values)
def lowercase(value):
    return str(value).lower() if pd.notnull(value) else value

# Apply the function to all columns using applymap
train_df = train_df.applymap(lowercase)
test_df = test_df.applymap(lowercase)

In [11]:
def create_tags(tokens, type, tokenizer):
    # Create a list of tuples with BIO tags for the original words
    
    if(type == 'drug'):
        bio_tags = [(tokens[0], 'B-drug')] + [(token, 'I-drug') for token in tokens[1:]]
    if(type == 'disease'):
        bio_tags = [(tokens[0], 'B-disease')] + [(token, 'I-disease') for token in tokens[1:]]
    if(type == 'treatment'):
        bio_tags = [(tokens[0], 'B-treatment')] + [(token, 'I-treatment') for token in tokens[1:]]

    # Tokenize each word and match it to the corresponding BIO tag
    tokenized_bio_tags = []

    for word, bio_tag in bio_tags:
        # Tokenize the word using the tokenizer
        word_tokens = tokenizer.tokenize(word)
        word_tokens.append(word)

        # Match each token to the corresponding BIO tag
        tokenized_bio_tags.extend([(token, bio_tag) for token in word_tokens])
    return tokenized_bio_tags

In [12]:
def bio_tagging(entity_list, tokens):
    bio_tags = []
    for token in tokens:
        token = token.lstrip("Ġ")
        flag = 0
        for index, temp in enumerate(entity_list):
            ent, tag = temp
            if token == ent:
                bio_tags.append((tag))
                flag = 1
                break;
        if(flag == 0):
            bio_tags.append(('O'))
    return bio_tags

In [13]:
for index, row in train_df.iterrows():
    tokenized_entity = []
    if(not(pd.isnull(row['drug_hindi']))):
#         print(row['drug_hindi'])
        drugs = row['drug_hindi'].split(',')
        tokenized_bio_drugs = []
        for drug in drugs:
            words = drug.split()
#             print(words)
            tokenized_bio_drugs.extend(create_tags(words, 'drug', tokenizer))
        tokenized_entity.extend(tokenized_bio_drugs)
    if(not(pd.isnull(row['treatment_hindi']))):
#         print(row['treatment_hindi'])
        treatments = row['treatment_hindi'].split(',')
        tokenized_bio_treatments = []
        for treatment in treatments:
            words = treatment.split()
            tokenized_bio_treatments.extend(create_tags(words, 'treatment', tokenizer))
        tokenized_entity.extend(tokenized_bio_treatments)
    if(not(pd.isnull(row['disease_hindi']))):
#         print(row['disease_hindi'])
        diseases = row['disease_hindi'].split(',')
        tokenized_bio_diseases = []
        for disease in diseases:
            words = disease.split()
            tokenized_bio_diseases.extend(create_tags(words, 'disease', tokenizer))
        tokenized_entity.extend(tokenized_bio_diseases)
    question_tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(row["question_hindi"])))
    target = bio_tagging(tokenized_entity, question_tokens)
    train_df.at[index, 'target'] = target
    
    

In [14]:
for index, row in test_df.iterrows():
    tokenized_entity = []
    if(not(pd.isnull(row['drug_hindi']))):
#         print(row['drug_hindi'])
        drugs = row['drug_hindi'].split(',')
        tokenized_bio_drugs = []
        for drug in drugs:
            words = drug.split()
#             print(words)
            tokenized_bio_drugs.extend(create_tags(words, 'drug', tokenizer))
        tokenized_entity.extend(tokenized_bio_drugs)
    if(not(pd.isnull(row['treatment_hindi']))):
#         print(row['treatment_hindi'])
        treatments = row['treatment_hindi'].split(',')
        tokenized_bio_treatments = []
        for treatment in treatments:
            words = treatment.split()
            tokenized_bio_treatments.extend(create_tags(words, 'treatment', tokenizer))
        tokenized_entity.extend(tokenized_bio_treatments)
    if(not(pd.isnull(row['disease_hindi']))):
#         print(row['disease_hindi'])
        diseases = row['disease_hindi'].split(',')
        tokenized_bio_diseases = []
        for disease in diseases:
            words = disease.split()
            tokenized_bio_diseases.extend(create_tags(words, 'disease', tokenizer))
        tokenized_entity.extend(tokenized_bio_diseases)
#     print(tokenized_entity)
    question_tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(row["question_hindi"])))
    target = bio_tagging(tokenized_entity, question_tokens)
    test_df.at[index, 'target'] = target
    


In [15]:
train_df['target']

0      [O, B-drug, B-drug, B-drug, O, O, O, O, O, O, ...
1      [O, O, O, O, O, O, O, O, O, O, O, B-disease, O...
2      [O, O, O, O, O, O, O, I-disease, O, O, O, O, O...
3      [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
4      [O, O, O, O, O, O, O, B-disease, B-disease, B-...
                             ...                        
715    [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...
716    [O, O, B-disease, O, O, O, O, O, O, O, O, O, O...
717    [O, B-disease, B-disease, I-disease, B-disease...
718    [O, O, B-treatment, B-treatment, B-treatment, ...
719    [O, O, B-treatment, B-treatment, B-treatment, ...
Name: target, Length: 720, dtype: object

In [16]:
train_df['question_hindi']

0               निस्टेटिन किस लिए निर्धारित किया गया है?
1      क्या सम्भोग के बाद डाउचिंग मुझे गर्भवती होने स...
2               क्या पेर्कोसेट वजन बढ़ने का कारण बनता है
3      क्या एक दिन में शराब के २ या २ १/२ गिलास उच्च ...
4            क्या बहुत अधिक छाछ थ्रश का कारण बन सकती है?
                             ...                        
715    क्या अफोर्डेबल केयर एक्ट के लिए किसी आश्रित की...
716    मैं मस्सों से छुटकारा पाने के लिए डक्ट टेप का ...
717    बेल्स पाल्सी के चेहरे के पक्षाघात को ठीक करने ...
718            क्या प्रसवपूर्व अल्ट्रासाउंड सुरक्षित है?
719    मैं सर्जरी तक वंक्षण हर्निया के लक्षणों को कैस...
Name: question_hindi, Length: 720, dtype: object

In [18]:
if(model_name == 'roberta'):
    model = RobertaForTokenClassification.from_pretrained('roberta-base', num_labels=7)
if(model_name == 'bioclinical'):
    model = AutoModelForTokenClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=7)
if(model_name == 'xlm'):
#     model = XLMRobertaForTokenClassification.from_pretrained('xlm-roberta-base', num_labels=7)
    model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-base", num_labels=7)

# Define a mapping from BIO tags to label indices
tag2id = {"B-drug": 0, "I-drug": 1, "B-treatment": 2, "I-treatment": 3, "B-disease": 4, "I-disease": 5, "O": 6}

# Tokenize and convert data to input sequences
input_ids = []
attention_masks = []
labels = []

for _, row in train_df.iterrows():
    sentence = row['question_hindi']
    bio_tags = row['target']

    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sentence)))
    input_ids.append(tokenizer.convert_tokens_to_ids(tokens))
    attention_masks.append([1] * len(tokens))

    label_ids = [tag2id[tag] for tag in bio_tags]
    labels.append(label_ids)

input_ids = [torch.tensor(seq) for seq in input_ids]
input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True)
attention_masks = [torch.tensor(seq) for seq in attention_masks]
attention_masks = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True)
labels = [torch.tensor(seq) for seq in labels]
labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True)

test_input_ids = []
test_attention_masks = []
test_labels = []

for _, row in test_df.iterrows():
    sentence = row['question_hindi']
    bio_tags = row['target']

    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sentence)))
    test_input_ids.append(tokenizer.convert_tokens_to_ids(tokens))
    test_attention_masks.append([1] * len(tokens))

    # Convert BIO tags to label indices
    label_ids = [tag2id[tag] for tag in bio_tags]
    test_labels.append(label_ids)
    
test_input_ids = [torch.tensor(seq) for seq in test_input_ids]
test_input_ids = torch.nn.utils.rnn.pad_sequence(test_input_ids, batch_first=True)
test_attention_masks = [torch.tensor(seq) for seq in test_attention_masks]
test_attention_masks = torch.nn.utils.rnn.pad_sequence(test_attention_masks, batch_first=True)
test_labels = [torch.tensor(seq) for seq in test_labels]
test_labels = torch.nn.utils.rnn.pad_sequence(test_labels, batch_first=True)


Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [20]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss()

In [21]:
num_epochs = 10

for epoch in range(num_epochs):
    loss = 0
    for batch in dataloader:
        input_ids, attention_masks, labels = batch

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch}, Loss:  {loss}')

model.save_pretrained('/kaggle/working/model.pth')

Epoch: 0, Loss:  0.626343309879303
Epoch: 1, Loss:  0.42235374450683594
Epoch: 2, Loss:  0.2534402012825012
Epoch: 3, Loss:  0.34019866585731506
Epoch: 4, Loss:  0.2730828523635864
Epoch: 5, Loss:  0.32145729660987854
Epoch: 6, Loss:  0.2347106635570526
Epoch: 7, Loss:  0.2848314940929413
Epoch: 8, Loss:  0.22861319780349731
Epoch: 9, Loss:  0.18679532408714294


In [22]:
test_df

Unnamed: 0,question_english,disease_english,drug_english,treatment_english,question_bengali,disease_bengali,drug_bengali,treatment_bengali,question_hindi,disease_hindi,...,question_marathi,disease_marathi,drug_marathi,treatment_marathi,question_gujarati,disease_gujarati,drug_gujarati,treatment_gujarati,Manual_Intent,target
0,how common is pregnancy after a hysterectomy?,pregnancy,,hysterectomy,হিস্টেরেক্টমির পরে গর্ভাবস্থা কতটা সাধারণ?,গর্ভাবস্থা,,হিস্টেরেক্টমির,हिस्टेरेक्टॉमी के बाद गर्भावस्था कितनी आम है?,गर्भवती,...,हिस्टेरेक्टॉमी नंतर गर्भधारणा किती सामान्य आहे?,गर्भधारणा,,हिस्टेरेक्टॉमी,હિસ્ટરેકટમી પછી ગર્ભાવસ્થા કેટલી સામાન્ય છે?,ગર્ભાવસ્થા,,હિસ્ટરેકટમી,treatment plan,"[O, O, O, O, B-treatment, B-treatment, B-treat..."
1,how effective is generic thyroid medication,,generic thyroid medication,,জেনেরিক থাইরয়েড ঔষধ কতটা কার্যকর?,,জেনেরিক থাইরয়েড ঔষধ,,जेनेरिक थायराइड दवा कितनी प्रभावी है?,,...,जेनेरिक थायरॉईड औषध किती प्रभावी आहे,,जेनेरिक थायरॉईड औषध,,સામાન્ય થાઇરોઇડ દવા કેટલી અસરકારક છે,,,,drug,"[O, O, O, O, I-drug, I-drug, I-drug, I-drug, O..."
2,is singulair a corticosteroid?,,"singular, corticosteroid",,সিঙ্গুলেয়ার কি একটি কর্টিকোস্টেরয়েড?,,"সিঙ্গুলেয়ার, কর্টিকোস্টেরয়েড",,क्या सिंगुलैर एक कॉर्टिकोस्टेरॉइड है?,,...,सिंग्युलेअर हे कॉर्टिकोस्टेरॉईड आहे का?,,सिंग्युलेअर कॉर्टिकोस्टेरॉईड,,શું સિંગલેર કોર્ટીકોસ્ટેરોઈડ છે?,,"એકવચન, કોર્ટીકોસ્ટેરોઈડ",,drug,"[O, O, O, O, O, O, B-drug, B-drug, B-drug, B-d..."
3,can you take advil with homeopathic ear drops,,"advil, homeopathic ear drops",,আপনি কি হোমিওপ্যাথিক কানের ড্রপের সঙ্গে অ্যাডভ...,,"অ্যাডভিল, হোমিওপ্যাথিক কানের ড্রপ",,आप होम्योपैथिक एयर ड्रॉप्स के साथ एडविल ले सक...,,...,तुम्ही होमिओपॅथिक कानाच्या थेंबांसह अॅडविल घेऊ...,,होमिओपॅथिक कानाच्या थेंबांसह अॅडविल,,શું તમે હોમિયોપેથિક કાનના ટીપાં સાથે એડવિલ લઈ ...,,"એડવિલ, હોમિયોપેથિક કાનના ટીપાં",,drug,"[O, O, B-drug, B-drug, B-drug, B-drug, B-drug,..."
4,is bell's palsy contagious?,bell's palsy,,,বেলস পালসি সংক্রামক হয়?,বেলস পালসি,,,क्या बेल्स पाल्सी संक्रामक है?,बेल्स पाल्सी,...,बेलचा पक्षाघात संसर्गजन्य आहे का?,बेलचा पक्षाघात,,,શું બેલનો લકવો ચેપી છે?,ઘંટડીનો લકવો,,,disease,"[O, O, B-disease, B-disease, I-disease, B-dise..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,how will obamacare affect medicare recipients?...,,,,কিভাবে ওবামাকেয়ার মেডিকেয়ার প্রাপকদের প্রভাব...,,,,ओबामाकेयर मेडिकेयर प्राप्तकर्ताओं को कैसे प्रभ...,,...,ओबामाकेअरचा मेडिकेअर प्राप्तकर्त्यांवर कसा परि...,,,,ઓબામાકેર મેડિકેર પ્રાપ્તકર્તાઓને કેવી રીતે અસર...,,,,other,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
237,what is mptp's role in the study of parkinson'...,parkinson's disease,,mptp,পার্কিনসন রোগের গবেষণায় এমপিটিপি এর ভূমিকা কী?,পার্কিনসন রোগের,,এমপিটিপি,पार्किंसंस रोग के अध्ययन में एमपीटीपी की क्या ...,पार्किंसंस रोग,...,पार्किन्सन रोगाच्या अभ्यासात एमपीटीपी ची भूमि...,पार्किन्सन रोगा,,एमपीटीपी,પાર્કિન્સન રોગના અભ્યાસમાં એમપીટીપી ની ભૂમિકા ...,ધ્રુજારી ની બીમારી,,એમપીટીપી,treatment plan,"[O, B-disease, B-disease, B-disease, B-disease..."
238,is botox safe to use for frown lines and wrink...,"frown lines, wrinkles",botox,,বোটক্স কি ভ্রুকুটি এবং বলিরেখার জন্য ব্যবহার ক...,"ভ্রুকুটি, বলিরেখার",বোটক্স,,क्या बोटोक्स भ्रूभंग और झुर्रियों के लिए उपयोग...,"भ्रूभंग रेखाएं, झुर्रियां",...,भुसभुशीत रेषा आणि सुरकुत्या यासाठी बोटॉक्स वाप...,"भुसभुशीत रेषा, सुरकुत्या",बोटॉक्स,,શું બોટોક્સ ફ્રાઉન લાઇન અને કરચલીઓ માટે વાપરવા...,"ભવાં ચડાવવાની રેખાઓ, કરચલીઓ",બોટોક્સ,,disease,"[O, O, B-drug, B-drug, B-drug, B-disease, B-di..."
239,when will i have an ultrasound during my pregn...,pregnancy,,ultrasound,আমার গর্ভাবস্থায় আমি কখন আল্ট্রাসাউন্ড করব?,গর্ভাবস্থায়,,আল্ট্রাসাউন্ড,गर्भावस्था के दौरान मेरा अल्ट्रासाउंड कब होगा?,गर्भावस्था,...,माझ्या गर्भधारणेदरम्यान मला अल्ट्रासाऊंड कधी ह...,गर्भधारण,,अल्ट्रासाऊंड,મારી ગર્ભાવસ્થા દરમિયાન હું અલ્ટ્રાસાઉન્ડ ક્યા...,ગર્ભાવસ્થા,,,treatment plan,"[O, B-disease, B-disease, B-disease, O, O, O, ..."


In [23]:
model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_masks, labels = batch

        outputs = model(input_ids, attention_mask=attention_masks)
        predictions = torch.argmax(outputs.logits, dim=-1)

        all_predictions.extend(predictions.cpu().numpy().flatten())
        all_labels.extend(labels.cpu().numpy().flatten())

all_predictions = np.array(all_predictions)
all_labels = np.array(all_labels)

macro_f1 = f1_score(all_labels, all_predictions, average='macro')
print(f"Macro F1 Score: {macro_f1 * 100:.2f}%")

conf_matrix = confusion_matrix(all_labels, all_predictions)
print("Confusion Matrix:")
print(conf_matrix)

class_report = classification_report(all_labels, all_predictions, target_names=list(tag2id.keys()))
print("Classification Report:")
print(class_report)


Macro F1 Score: 26.86%
Confusion Matrix:
[[14844     0     0     0     0     0   110]
 [   31     0     0     0     0     0    56]
 [   66     0     0     0     0     0    95]
 [   22     0     0     0     0     0    84]
 [  103     0     0     0     6     0   241]
 [   21     0     1     0     5     0   151]
 [  338     0     0     0     1     0  3587]]
Classification Report:
              precision    recall  f1-score   support

      B-drug       0.96      0.99      0.98     14954
      I-drug       0.00      0.00      0.00        87
 B-treatment       0.00      0.00      0.00       161
 I-treatment       0.00      0.00      0.00       106
   B-disease       0.50      0.02      0.03       350
   I-disease       0.00      0.00      0.00       178
           O       0.83      0.91      0.87      3926

    accuracy                           0.93     19762
   macro avg       0.33      0.27      0.27     19762
weighted avg       0.90      0.93      0.91     19762

