# Loading Libraries

In [21]:
!pip install spacy
!python -m spacy download en_core_web_sm
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import spacy
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
nlp = spacy.load('en_core_web_sm')

Collecting en-core-web-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [11]:
train_path = "/kaggle/input/ihqid-1mg/IHQID-1mg/train.csv"
test_path = "/kaggle/input/ihqid-1mg/IHQID-1mg/test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
train_df['target'] = ''
test_df['target'] = ''

In [12]:
# Define a function to convert a value to lowercase (handling null values)
def lowercase(value):
    return str(value).lower() if pd.notnull(value) else value

# Apply the function to all columns using applymap
train_df = train_df.applymap(lowercase)
test_df = test_df.applymap(lowercase)

In [13]:
def create_tags(tokens, type):
    # Create a list of tuples with BIO tags for the original words
    
    if(type == 'drug'):
        bio_tags = [(tokens[0], 'B-drug')] + [(token, 'I-drug') for token in tokens[1:]]
    if(type == 'disease'):
        bio_tags = [(tokens[0], 'B-disease')] + [(token, 'I-disease') for token in tokens[1:]]
    if(type == 'treatment'):
        bio_tags = [(tokens[0], 'B-treatment')] + [(token, 'I-treatment') for token in tokens[1:]]
    return bio_tags

In [14]:
def bio_tagging(entity_list, tokens):
    bio_tags = []
    for token in tokens:
        token = token.lstrip("Ġ")
        flag = 0
        for index, temp in enumerate(entity_list):
            ent, tag = temp
            if token == ent:
                bio_tags.append((tag))
                flag = 1
                break;
        if(flag == 0):
            bio_tags.append(('O'))
    return bio_tags

In [15]:
for index, row in train_df.iterrows():
    entity = []
    if(not(pd.isnull(row['drug_english']))):
        drugs = row['drug_english'].split(',')
        bio_drugs = []
        for drug in drugs:
            words = drug.split()
            bio_drugs.extend(create_tags(words, 'drug'))
        entity.extend(bio_drugs)
    if(not(pd.isnull(row['treatment_english']))):
#         print(row['treatment_english'])
        treatments = row['treatment_english'].split(',')
        bio_treatments = []
        for treatment in treatments:
            words = treatment.split()
            bio_treatments.extend(create_tags(words, 'treatment'))
        entity.extend(bio_treatments)
    if(not(pd.isnull(row['disease_english']))):
#         print(row['disease_english'])
        diseases = row['disease_english'].split(',')
        bio_diseases = []
        for disease in diseases:
            words = disease.split()
            bio_diseases.extend(create_tags(words, 'disease'))
        entity.extend(bio_diseases)
    question_tokens = row['question_english'].split()
    target = bio_tagging(entity, question_tokens)
    train_df.at[index, 'target'] = target
    
    

In [17]:
for index, row in test_df.iterrows():
    entity = []
    if(not(pd.isnull(row['drug_english']))):
        drugs = row['drug_english'].split(',')
        bio_drugs = []
        for drug in drugs:
            words = drug.split()
            bio_drugs.extend(create_tags(words, 'drug'))
        entity.extend(bio_drugs)
    if(not(pd.isnull(row['treatment_english']))):
#         print(row['treatment_english'])
        treatments = row['treatment_english'].split(',')
        bio_treatments = []
        for treatment in treatments:
            words = treatment.split()
            bio_treatments.extend(create_tags(words, 'treatment'))
        entity.extend(bio_treatments)
    if(not(pd.isnull(row['disease_english']))):
#         print(row['disease_english'])
        diseases = row['disease_english'].split(',')
        bio_diseases = []
        for disease in diseases:
            words = disease.split()
            bio_diseases.extend(create_tags(words, 'disease'))
        entity.extend(bio_diseases)
    question_tokens = row['question_english'].split()
    target = bio_tagging(entity, question_tokens)
    test_df.at[index, 'target'] = target
    
    

In [18]:
test_df

Unnamed: 0,question_english,disease_english,drug_english,treatment_english,question_bengali,disease_bengali,drug_bengali,treatment_bengali,question_hindi,disease_hindi,...,question_marathi,disease_marathi,drug_marathi,treatment_marathi,question_gujarati,disease_gujarati,drug_gujarati,treatment_gujarati,Manual_Intent,target
0,can tuberculosis be cured in hiv co-infection?,"hiv, tuberculosis",,,এইচআইভি সহ-সংক্রমণে কি যক্ষ্মা নিরাময় করা যায়?,"এইচআইভি, যক্ষ্মা",,,क्या एचआईवी सह-संक्रमण में टुबरक्लोसिस को ठीक...,"एचआईवी , टुबरक्लोसिस",...,एचआयव्ही सह संसर्गामध्ये क्षयरोग बरा होऊ शकतो का?,"एचआयव्ही,क्षयरोग",,,શું એચઆઈવીના સહ-સંક્રમણમાં ક્ષય રોગ મટાડી શકાય...,"એચઆઈવી, ક્ષય",,,disease,"[O, B-disease, O, O, O, B-disease, O]"
1,how can i test my breast cancer at home?,breast cancer,,,আমি কীভাবে বাড়িতে আমার স্তন ক্যান্সার পরীক্ষা...,স্তন ক্যান্সার,,,मैं घर पर अपने स्तन कैंसर का परीक्षण कैसे कर स...,स्तन कैंसर,...,मी घरी माझ्या स्तनाच्या कर्करोगाची चाचणी कशी क...,स्तनाच्या कर्करोग,,,હું ઘરે મારા સ્તન કેન્સરનું પરીક્ષણ કેવી રીતે ...,સ્તન કેન્સર,,,disease,"[O, O, O, O, O, B-disease, I-disease, O, O]"
2,how do you stop an obstetric fistula?,obstetric fistula,,,আপনি কীভাবে একটি প্রসূতি ফিস্টুলা বন্ধ করবেন?,প্রসূতি ফিস্টুলা,,,आप एक प्रसूति नालव्रण को कैसे रोकते हैं?,प्रसूति नालव्रण,...,प्रसूती फिस्टुला कसे थांबवायचे?,प्रसूती फिस्टुला,,,તમે ઑબ્સ્ટેટ્રિક ફિસ્ટુલાને કેવી રીતે રોકશો?,ઑબ્સ્ટેટ્રિક ફિસ્ટુલા,,,disease,"[O, O, O, O, O, B-disease, O]"
3,is domstal 10mg tablet an over the counter drug?,,domstal 10mg tablet,,ডোমস্টাল ১০ মিলিগ্রাম কি বিনা প্রেস্ক্রিপশনে উ...,,ডোমস্টাল ১০ মিলিগ্রাম,,क्या डोमस्टल १०एमजी टैबलेट एक ओवर द काउंटर दवा...,,...,डोम्स्टल १० टॅब्लेटहे ओव्हर काउंटर औषध आहे का?,,डोम्स्टल १० टॅब्लेट,,શું ડોમસ્ટાલ ૧૦ મિલિગ્રામ ટીકડી એ ઓવર કાઉન્ટર ...,,ડોમસ્ટાલ ૧૦ મિલિગ્રામ ટીકડી,,drug,"[O, B-drug, I-drug, I-drug, O, O, O, O, O]"
4,does zika virus stay in the body lifelong?,zika virus,,,জিকা ভাইরাস কি সারা জীবন শরীরে থাকে?,জিকা,,,क्या जीका वायरस शरीर में आजीवन रहता है?,जीका,...,झिका विषाणू शरीरात आयुष्यभर राहतो का?,झिका विषाणू,,,શું ઝીકા વાયરસ આજીવન શરીરમાં રહે છે?,ઝીકા વાયરસ,,,disease,"[O, B-disease, I-disease, O, O, O, O, O]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,when to do blood sugar test - monthly or annua...,,,blood sugar test,রক্তে শর্করার পরীক্ষা কখন করতে হবে - মাসিক বা ...,,,শর্করা পরীক্ষা,ब्लड शुगर टेस्ट कब करें - मासिक या सालाना?,,...,रक्तातील साखरेची चाचणी कधी करावी - मासिक किंवा...,,,रक्तातील साखरेची चाचणी,રક્ત ખાંડની તપાસ ક્યારે કરવી - માસિક કે વાર્ષિક?,,,રક્ત ખાંડ પરીક્ષણ,treatment,"[O, O, O, B-treatment, I-treatment, I-treatmen..."
108,when surgery is required in fracture?,,,surgery,ভাঙা অবস্থায় অস্ত্রোপচারের প্রয়োজন হয় কখন?,,,অস্ত্রোপচার,फ्रैक्चर में सर्जरी की आवश्यकता कब होती है?,,...,फ्रॅक्चरमध्ये शस्त्रक्रिया केव्हा आवश्यक असते,,,शस्त्रक्रिया,રક્ત ખાંડની તપાસ ક્યારે કરવી - માસિક કે વાર્ષિક?,,,શસ્ત્રક્રિયા,treatment,"[O, B-treatment, O, O, O, O]"
109,which medications are contraindicated before s...,,,surgery,অস্ত্রোপচারের আগে কোন ওষুধ ব্যবহার করা যাবে না,,,অস্ত্রোপচারে,सर्जरी से पहले कौन सी दवाएं वर्जित हैं ?,,...,शस्त्रक्रियेपूर्वी कोणती औषधे निर्बंधीत आहेत,,,शस्त्रक्रिया,શસ્ત્રક્રિયા પહેલાં કઈ દવાઓ બિનસલાહભર્યા છે?,,,શસ્ત્રક્રિયા,treatment,"[O, O, O, O, O, B-treatment]"
110,are all dental procedure safe?,,,dental procedure,সব দাঁতের পদ্ধতি কি নিরাপদ?,,,দাঁতের পদ্ধতি,क्या सभी दंत प्रक्रियाएं सुरक्षित हैं?,,...,सर्व दंत प्रक्रिया सुरक्षित आहेत का?,,,दंत प्रक्रिया,શું દાંતની બધી પ્રક્રિયાઓ સલામત છે?,,,દાંતની પ્રક્રિયા,treatment,"[O, O, B-treatment, I-treatment, O]"


In [19]:
# Process train data
train_words = []
train_tags = []

for sentence, tag_list in zip(train_df['question_english'], train_df['target']):
    doc = nlp(sentence)
    for word, tag in zip(doc, tag_list):
        train_words.append(word.text)
        train_tags.append(tag)

# Create a DataFrame for train data
train_word_df = pd.DataFrame({'word': train_words, 'target': train_tags})

# Process test data
test_words = []
test_tags = []

for sentence, tag_list in zip(test_df['question_english'], test_df['target']):
    doc = nlp(sentence)
    for word, tag in zip(doc, tag_list):
        test_words.append(word.text)
        test_tags.append(tag)

# Create a DataFrame for test data
test_word_df = pd.DataFrame({'word': test_words, 'target': test_tags})

In [22]:
# Create a pipeline with a simple SVM classifier
model = make_pipeline(CountVectorizer(), SVC())

# Train the model on the entire training set
model.fit(train_word_df['word'], train_word_df['target'])

# Make predictions on the test set
test_predictions = model.predict(test_word_df['word'])

In [23]:
macro_f1 = f1_score(test_word_df['target'], test_predictions, average='macro')
conf_matrix = confusion_matrix(test_word_df['target'], test_predictions)
class_report = classification_report(test_word_df['target'], test_predictions)

# Display the results
print(f"Macro-F1 Score: {macro_f1}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)

Macro-F1 Score: 0.420117318683021

Confusion Matrix:
[[ 10   0   0   0   0   0  21]
 [  0  17   0   0   2   0  36]
 [  0   0   4   0   0   0  11]
 [  0   0   0   0   0   0  11]
 [  0   0   0   0  46   0  38]
 [  0   0   1   0   0   1  13]
 [  4   3   0   0  19   0 777]]

Classification Report:
              precision    recall  f1-score   support

   B-disease       0.71      0.32      0.44        31
      B-drug       0.85      0.31      0.45        55
 B-treatment       0.80      0.27      0.40        15
   I-disease       0.00      0.00      0.00        11
      I-drug       0.69      0.55      0.61        84
 I-treatment       1.00      0.07      0.12        15
           O       0.86      0.97      0.91       803

    accuracy                           0.84      1014
   macro avg       0.70      0.35      0.42      1014
weighted avg       0.83      0.84      0.82      1014



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
