# Clinical NLP Project
This notebook contains the original code and documentation for data cleaning, synonym replacement, preprocessing, vectorization, model training, and inference.

**DATA CLEANING**

In [ ]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from src.data_cleaning import expand_abbreviations, replace_synonyms
from src.vectorization import clinical_vectorizer

# Load the raw training and held-out test splits from disk.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test_f.csv')

# Every later step in this cell reads and writes a frame named `df`, which
# was never bound, so the first use raised NameError. Work on a copy of the
# training split so `train_df` keeps the untouched raw text.
# NOTE(review): assumes the cleaning/vectorization steps are meant for the
# training data and that it has 'question' and 'answer' columns — confirm.
# `test_df` is loaded but not processed in this cell.
df = train_df.copy()

# Data Cleaning
# Lookup table of clinical shorthand -> plain-English expansion. It is passed
# unchanged to expand_abbreviations() for the 'question' and 'answer' columns.
# Keys keep their original casing and punctuation.
# NOTE(review): very short keys ("#", "i", "ii", "h.", "BO", "ED", "OT") will
# match widely unless expand_abbreviations enforces token boundaries and case
# sensitivity — verify against src.data_cleaning.
# NOTE(review): "p.r." is listed before "p.r.n."; if the helper applies entries
# in insertion order with plain substring replacement, the longer key could
# never match — verify.
abbreviation_dict = {
    "#": "broken bone (fracture)",
    "A&E": "accident and emergency",
    "a.c.": "before meals",
    "a.m.": "morning",
    "AF": "atrial fibrillation",
    "AMHP": "approved mental health professional",
    "APTT": "activated partial thromboplastin time",
    "ASQ": "Ages and Stages Questionnaire",
    "b.d.s": "2 times a day",
    "b.i.d.": "twice a day",
    "BMI": "body mass index",
    "BNO": "bowels not open",
    "BO": "bowels open",
    "BP": "blood pressure",
    "c/c": "chief complaint",
    "CMHN": "community mental health nurse",
    "CPN": "community psychiatric nurse",
    "CSF": "cerebrospinal fluid",
    "CSU": "catheter stream urine sample",
    "CT scan": "computerised tomography scan",
    "CVP": "central venous pressure",
    "CXR": "chest X-ray",
    "DNACPR": "do not attempt cardiopulmonary resuscitation",
    "DNAR": "do not attempt resuscitation",
    "DNR": "do not resuscitate",
    "Dr": "doctor",
    "DVT": "deep vein thrombosis",
    "Dx": "diagnosis",
    "ECG": "electrocardiogram",
    "ED": "emergency department",
    "EEG": "electroencephalogram",
    "EMU": "early morning urine sample",
    "ESR": "erythrocyte sedimentation rate",
    "EUA": "examination under anaesthetic",
    "FBC": "full blood count",
    "FOBT": "faecal occult blood test",
    "FIT": "faecal immunochemical test",
    "FY1": "foundation year 1 doctor",
    "FY2": "foundation year 2 doctor",
    "GA": "general anaesthetic",
    "gtt.": "drop(s)",
    "h.": "hour",
    "h/o": "history of",
    "Hb": "haemoglobin",
    "HCA": "healthcare assistant",
    "HCSW": "healthcare support worker",
    "HDL": "high-density lipoprotein",
    "HRT": "hormone replacement therapy",
    "Ht": "height",
    "Hx": "history",
    "i": "1 tablet",
    "ii": "2 tablets",
    "iii": "3 tablets",
    "i.m.": "injection into a muscle",
    "i.v.": "injection directly to a vein",
    "INR": "international normalised ratio",
    "IVI": "intravenous infusion",
    "IVP": "intravenous pyelogram",
    "Ix": "investigations",
    "LA": "local anaesthetic",
    "LDL": "low-density lipoprotein",
    "LFT": "liver function test",
    "LMP": "last menstrual period",
    "M/R": "modified release",
    "MRI": "magnetic resonance imaging",
    "MRSA": "methicillin-resistant Staphylococcus aureus",
    "MSU": "mid-stream urine sample",
    "n.p.o.": "nothing by mouth",
    "NAD": "nothing abnormal discovered",
    "NAI": "non-accidental injury",
    "NBM": "nil by mouth",
    "NG": "nasogastric",
    "nocte": "every night",
    "NoF": "neck of femur",
    "NSAID": "non-steroidal anti-inflammatory drug",
    "o.d.": "once a day",
    "o/e": "on examination",
    "OT": "occupational therapist",
    "p.c.": "after food",
    "p.m.": "afternoon or evening",
    "p.o.": "orally",
    "POD": "podiatrist",
    "p.r.": "rectally",
    "p.r.n.": "as needed"
}

# Expand clinical shorthand in both free-text columns. Series.apply forwards
# the lookup table to expand_abbreviations as its second positional argument.
df['question'] = df['question'].apply(
    expand_abbreviations, args=(abbreviation_dict,)
)
df['answer'] = df['answer'].apply(
    expand_abbreviations, args=(abbreviation_dict,)
)

# Synonym Replacement
# Lookup table of lay phrase -> clinical term. It is passed unchanged to
# replace_synonyms() for the 'question' and 'answer' columns. Several lay
# phrases deliberately share one target (e.g. "stroke" and "brain attack").
# NOTE(review): single-word keys such as "rash", "stroke" and "fever" occur
# inside longer words; confirm replace_synonyms matches whole words only.
# NOTE(review): "diabetes" maps to a phrase that contains the key itself, so
# text already reading "diabetes mellitus" may gain a duplicate word — verify.
# NOTE(review): abbreviation expansion runs first and turns "#" into
# "broken bone (fracture)"; the "broken bone" entry here may then yield
# "fracture (fracture)" — verify.
# NOTE(review): some targets are narrower than their keys ("chest pain" ->
# "angina", "skin cancer" -> "melanoma"); confirm this is clinically intended.
synonym_dict = {
    "heart attack": "myocardial infarction",
    "high blood pressure": "hypertension",
    "low blood pressure": "hypotension",
    "high blood sugar": "hyperglycemia",
    "low blood sugar": "hypoglycemia",
    "stroke": "cerebrovascular accident",
    "brain attack": "cerebrovascular accident",
    "shortness of breath": "dyspnea",
    "difficulty breathing": "dyspnea",
    "fainting": "syncope",
    "passing out": "syncope",
    "fever": "pyrexia",
    "headache": "cephalalgia",
    "chest pain": "angina",
    "cold sore": "herpes labialis",
    "kidney stone": "renal calculus",
    "urinary tract infection": "UTI",
    "bladder infection": "UTI",
    "lung infection": "pneumonia",
    "high cholesterol": "hyperlipidemia",
    "blood clot": "thrombosis",
    "swollen lymph nodes": "lymphadenopathy",
    "irregular heartbeat": "arrhythmia",
    "fast heartbeat": "tachycardia",
    "slow heartbeat": "bradycardia",
    "acid reflux": "gastroesophageal reflux disease",
    "stomach flu": "gastroenteritis",
    "pink eye": "conjunctivitis",
    "nosebleed": "epistaxis",
    "runny nose": "rhinorrhea",
    "dry mouth": "xerostomia",
    "itching": "pruritus",
    "rash": "dermatitis",
    "skin inflammation": "dermatitis",
    "joint pain": "arthralgia",
    "muscle pain": "myalgia",
    "bloody urine": "hematuria",
    "bloody stool": "hematochezia",
    "vomiting blood": "hematemesis",
    "black stool": "melena",
    "fluid in lungs": "pulmonary edema",
    "yellow skin": "jaundice",
    "liver failure": "hepatic failure",
    "kidney failure": "renal failure",
    "low oxygen": "hypoxia",
    "high carbon dioxide": "hypercapnia",
    "low sodium": "hyponatremia",
    "high sodium": "hypernatremia",
    "low potassium": "hypokalemia",
    "high potassium": "hyperkalemia",
    "abnormal heartbeat": "arrhythmia",
    "dizzy": "vertigo",
    "dizziness": "vertigo",
    "numbness": "paresthesia",
    "tingling": "paresthesia",
    "diabetes": "diabetes mellitus",
    "lung cancer": "pulmonary carcinoma",
    "liver cancer": "hepatocellular carcinoma",
    "skin cancer": "melanoma",
    "breast cancer": "mammary carcinoma",
    "cervical cancer": "cervical carcinoma",
    "uterine cancer": "endometrial carcinoma",
    "brain cancer": "glioblastoma",
    "eye pressure": "intraocular pressure",
    "broken bone": "fracture",
    "back pain": "lumbalgia",
    "neck pain": "cervicalgia",
    "pregnancy loss": "spontaneous abortion",
    "miscarriage": "spontaneous abortion",
    "water breaking": "rupture of membranes",
    "labor pains": "uterine contractions",
    "baby dropping": "lightening",
    "spotting": "light vaginal bleeding",
    "night sweats": "nocturnal hyperhidrosis"
}

# Swap lay phrases for clinical terms in both free-text columns. Series.apply
# forwards the lookup table to replace_synonyms as its second positional
# argument.
df['question'] = df['question'].apply(
    replace_synonyms, args=(synonym_dict,)
)
df['answer'] = df['answer'].apply(
    replace_synonyms, args=(synonym_dict,)
)

# Vectorization
# Run clinical_vectorizer over each cleaned text column and store the result
# in a sibling column; question is processed before answer.
for _text_col, _vector_col in (
    ('question', 'question_vector'),
    ('answer', 'answer_vector'),
):
    df[_vector_col] = df[_text_col].apply(clinical_vectorizer)

# Preview the first rows of the processed frame (rendered as the cell output).
df.head()