LOAD THE DATASET | PREPROCESSING | CLEANING

In [None]:
import re
import spacy
import pandas as pd

# Load the spaCy pipeline named 'en_core_web_sm'; detect_negation reads the
# dependency labels (token.dep_) that this pipeline assigns.
nlp = spacy.load('en_core_web_sm')

# Dataset source: https://www.kaggle.com/datasets/gunjansanjaykadam/rcorda/data
# NOTE(review): this is an absolute Kaggle input path, so the notebook will not
# find the file when run outside that environment unless the path is changed.
csv_file_path = '/kaggle/input/rcorda/healthcare_dataset.csv'
data = pd.read_csv(csv_file_path)

def clean_text(text):
    """Normalise free text for keyword matching.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed, runs of whitespace are collapsed to a single
    space, and leading/trailing whitespace is stripped.

    Args:
        text: The raw text. Non-string values (for example the float NaN that
            pandas uses for a missing cell) are treated as missing.

    Returns:
        The normalised string, or '' when ``text`` is not a string.
    """
    # Missing cells arrive as NaN/None; calling .lower() on them would raise.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Remove punctuation first: deleting a symbol that sits between two spaces
    # ("a - b") would otherwise leave a double space behind.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def detect_negation(text):
    """Collect the words that are the syntactic head of a negation token.

    The text is parsed with the module-level ``nlp`` pipeline. For every token
    whose dependency label is 'neg', the text of that token's head is recorded.

    Args:
        text: The string to parse.

    Returns:
        A set with the head text of each negation token; empty when the parse
        contains no 'neg' dependency.
    """
    parsed = nlp(text)
    return {tok.head.text for tok in parsed if tok.dep_ == 'neg'}


# Normalise the raw condition text with clean_text and store it in a new column.
data['Cleaned Medical Condition'] = data['Medical Condition'].apply(clean_text)
# NOTE(review): negation detection is run on the cleaned column, i.e. on text that
# clean_text has already lowercased and stripped of punctuation. That may change
# how spaCy parses the sentence -- confirm whether the raw 'Medical Condition'
# column should be parsed instead.
data['Negated Conditions'] = data['Cleaned Medical Condition'].apply(detect_negation)


# Preview the raw text next to the two derived columns (last expression is displayed).
data[['Medical Condition', 'Cleaned Medical Condition', 'Negated Conditions']].head()


Unnamed: 0,Medical Condition,Cleaned Medical Condition,Negated Conditions
0,Cancer,cancer,{}
1,Obesity,obesity,{}
2,Obesity,obesity,{}
3,Diabetes,diabetes,{}
4,Cancer,cancer,{}


MATCHING THE EXTRACTED LABELS

In [None]:

# Lowercase keywords that are searched for in the cleaned condition text.
disease_labels = ['cancer', 'diabetes', 'obesity', 'asthma', 'hypertension', 'tumor']

def extract_disease_labels(text, labels):
    """Return the labels that occur in ``text``.

    Matching uses Python's ``in`` operator, so for a string ``text`` a label
    counts as found whenever it appears as a substring.

    Args:
        text: The text to search.
        labels: Iterable of candidate labels.

    Returns:
        A list of the labels found, in the order they appear in ``labels``.
    """
    matches = []
    for candidate in labels:
        if candidate in text:
            matches.append(candidate)
    return matches

# Tag every cleaned condition string with the disease keywords it contains.
# The label list is forwarded to extract_disease_labels through ``args``.
data['Extracted Diseases'] = data['Cleaned Medical Condition'].apply(
    extract_disease_labels, args=(disease_labels,)
)

# Preview the raw text next to the extracted labels (last expression is displayed).
data[['Medical Condition', 'Extracted Diseases']].head()


Unnamed: 0,Medical Condition,Extracted Diseases
0,Cancer,[cancer]
1,Obesity,[obesity]
2,Obesity,[obesity]
3,Diabetes,[diabetes]
4,Cancer,[cancer]


EVALUATION

We assume a ground-truth label for each report: a report counts as positive when its `Medical Condition` text mentions cancer.

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score


# Binary ground truth: 1 when the raw condition text contains 'cancer' (case-insensitive).
data['True Label'] = data['Medical Condition'].apply(lambda x: 1 if 'cancer' in x.lower() else 0)
# Binary prediction: 1 when 'cancer' is one of the labels extracted for the row.
data['Predicted Label'] = data['Extracted Diseases'].apply(lambda x: 1 if 'cancer' in x else 0)

# NOTE(review): both label columns are derived from the same 'Medical Condition'
# text (the extracted labels come from its cleaned form), so these scores check the
# keyword matcher against itself rather than against independent annotations.
precision = precision_score(data['True Label'], data['Predicted Label'])
recall = recall_score(data['True Label'], data['Predicted Label'])
f1 = f1_score(data['True Label'], data['Predicted Label'])

print(f'Precision: {precision}, Recall: {recall}, F1 Score: {f1}')


Precision: 1.0, Recall: 1.0, F1 Score: 1.0


Using DistilBERT (a distilled BERT model) for Question Answering

In [None]:
from transformers import pipeline

# Pin the checkpoint and revision instead of relying on the library default, so a
# re-run always loads the same weights. These are the model and revision that the
# unpinned call reported falling back to.
qa_pipeline = pipeline(
    "question-answering",
    model="distilbert/distilbert-base-cased-distilled-squad",
    revision="564e9b5",
)

def ask_question(text, question):
    """Answer ``question`` using ``text`` as the context.

    The call is delegated to the module-level ``qa_pipeline``.

    Args:
        text: The context passage the answer is extracted from.
        question: The question to ask about the context.

    Returns:
        The value stored under the 'answer' key of the pipeline result.
    """
    return qa_pipeline(question=question, context=text)['answer']

# Ask the same fixed question about every raw condition string; the question is
# forwarded to ask_question through ``args``.
data['Extracted Answer'] = data['Medical Condition'].apply(
    ask_question, args=("What disease is mentioned?",)
)

# Preview the raw text next to the model's answer (last expression is displayed).
data[['Medical Condition', 'Extracted Answer']].head()


No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Unnamed: 0,Medical Condition,Extracted Answer
0,Cancer,Cancer
1,Obesity,Obesity
2,Obesity,Obesity
3,Diabetes,Diabetes
4,Cancer,Cancer
