In [None]:
import re
import os
import json
import pandas as pd
import numpy as np

from ast import literal_eval
from sklearn.model_selection import train_test_split

In [None]:
# google.colab is only importable inside a Colab runtime; `drive` exposes the
# Google Drive mounting helper used below.
from google.colab import drive
# Mount Google Drive at /content/drive -- the data paths used by this notebook
# are built underneath this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Project root on the mounted Drive.
base_dir = '/content/drive/MyDrive/NLP/vaers_analysis'
# Folder the notebook reads its input CSV from and writes its output CSVs to.
data_dir = os.path.join(base_dir, 'data')

In [None]:
## It has been noticed that a few values in the symptoms table are not
## actual symptoms (product/device issues, administration errors, tests and
## procedures, ...). Records that contain only such values are removed, and
## these values are stripped from the remaining records.
##
## NOTE: every entry must end with a comma. Python silently concatenates
## adjacent string literals, so a missing comma merges two entries into one
## value that matches nothing.
outlier_symptoms = [
    'Accidental exposure to product',
    'Accidental overdose',
    'Accidental underdose',
    'Adverse event',
    ## NOTE(review): the next entry looks like 'Adverse event' and
    ## 'Device connection issue' fused together; kept in case the value
    ## really occurs in the data -- confirm and drop if it does not.
    'Adverse eventDevice connection issue',
    'Circumstance or information capable of leading to medication error',
    'Device adhesion issue',
    'Device connection issue',
    'Device defective',
    'Device delivery system issue',
    'Device issue',
    'Device leakage',
    'Device malfunction',
    'Device material issue',
    'Device temperature issue',
    'Device use issue',
    'Discontinued product administered',
    'Expired device used',
    'Expired product administered',
    'Extra dose administered',
    'Inappropriate schedule of product administration',
    'Incomplete course of vaccination',
    'Incorrect dose administered',
    'Incorrect product administration duration',
    'Incorrect product formulation administered',
    'Incorrect route of product administration',
    'Injury associated with device',
    'Intentional product use issue',
    'Interchange of vaccine products',
    'Investigation',
    'Laboratory test',
    'Limb discomfort',
    'Liquid product physical issue',
    'Manufacturing product shipping issue',
    'Medication error',
    'Needle Issue',
    'Needle issue',
    'No adverse event',
    'Off label use',
    'Out of specification product use',
    'Overdose',
    'Patient uncooperative',
    'Physical examination',
    'Poor quality product administered',
    'Product Storage Error',
    'Product Storage error',
    'Product Temperature Excursion Issue',
    'Product administered at inappropriate site',
    'Product administered to patient of inappropriate age',
    "Product administered to patient of inappropriate age' (if applicable)",
    'Product administration error',
    'Product administration interrupted',
    'Product appearance confusion',
    'Product availability issue',
    'Product blister packaging issue',
    'Product closure issue',
    'Product colour issue',
    'Product confusion',
    'Product container issue',
    'Product container seal issue',
    'Product contamination',
    'Product contamination physical',
    'Product dispensing error',
    'Product dispensing issue',
    'Product dose omission in error',
    'Product dose omission issue',
    'Product expiration date issue',
    'Product label confusion',
    'Product label issue',
    'Product leakage',
    'Product lot number issue',
    'Product measured potency issue',
    'Product monitoring error',
    'Product packaging confusion',
    'Product packaging issue',
    'Product packaging quantity issue',
    'Product physical issue',
    'Product preparation error',
    'Product preparation issue',
    'Product quality issue',
    'Product storage error',
    'Product substitution issue',
    'Product temperature excursion issue',
    'Product use issue',
    'Recalled product administered',
    'Road traffic accident',
    'SARS-CoV-2 test',
    'Suspected product contamination',
    'Suspected product quality issue',
    'Syringe issue',
    'Transcription medication error',
    'Underdose',
    'Unevaluable event',
    'Urine analysis',
    'Vaccination error',
    'Wrong dosage formulation',
    'Wrong patient received product',
    'Wrong product administered',
    'Wrong technique in device usage process',
    'Wrong technique in product usage process',

    ## Newly Added Outlier Symptoms
    'Abdominal X-ray',
    'Allergy test negative',
    'Antibody test',
    'Antibody test negative',
    'Bacterial test negative',
    'Blood folate normal',
    'Blood potassium normal',
    'Blood pressure measurement',
    'Blood pressure normal',
    'Blood test',
    'Blood test normal',
    'Breast feeding',
    'Cardiac monitoring',
    'Cardiac stress test',
    'Chest X-ray',
    'Chest X-ray normal',
    'Coronavirus test',
    'Fear of injection',
    'Full blood count normal',
    'HIV test negative',
    'Hip fracture',
    'Hip surgery',
    'Immunology test normal',
    'Influenza B virus test',
    'Laboratory test normal',
    'Life support',
    'Magnetic resonance imaging normal',
    'Pathology test',
    'Pregnancy',
    'Renal function test normal',
    'SARS-CoV-1 test',
    'SARS-CoV-2 antibody test',
    'SARS-CoV-2 antibody test negative',
    'SARS-CoV-2 test negative',
    'Scan',
    'Skin test',
    'Spinal X-ray',
    'Stool analysis',
    'Streptococcus test',
    'Streptococcus test negative',
    'Ultrasound breast normal',
    'Ultrasound scan',
    'Vaccination failure',
    'Viral test',
    'Vitamin B12 normal',
    'Vitamin D',
    'Weight',
    'X-ray',
    'X-ray limb normal',
    'X-ray normal',
    'X-ray of pelvis and hip',
]

In [None]:
## Load the labelled VAERS records from the data folder.
labelled_csv_path = os.path.join(data_dir, 'labelled_vaers_data.csv')
vaers_data = pd.read_csv(labelled_csv_path)

## The two symptom columns come back from the CSV as text; literal_eval parses
## each cell as a Python literal (presumably a list of strings -- the later
## cells treat the parsed values as lists).
for list_column in ('symptoms', 'ordered_symptoms'):
    vaers_data[list_column] = vaers_data[list_column].apply(literal_eval)

In [None]:
# Quote characters that may wrap a symptom: ASCII single/double quotes and the
# typographic single quotes ‘ ’.
_SYMPTOM_QUOTE_CHARS = "'\"‘’"


def _strip_symptom_edges(text):
    """Strip whitespace and wrapping quotes from both ends until nothing changes."""
    while True:
        stripped = text.strip().strip(_SYMPTOM_QUOTE_CHARS)
        if stripped == text:
            return text
        text = stripped


def clean_symptoms(symptoms):
    """
    Cleans a list of symptom strings by removing unwanted characters, extra information,
    and empty strings, ensuring each symptom is well-formatted and valid.

    Args:
        symptoms (list of str): A list of symptom strings to be cleaned.

    Returns:
        list of str: A new list with the cleaned symptoms, in their original
        order. Entries that are empty after cleaning are dropped.

    Cleaning Steps:
        1. Removes every '**' marker from each symptom string.
        2. Removes any extra information enclosed in parentheses (e.g., "(example)").
        3. Strips leading/trailing whitespace and wrapping quotes (', ", ‘, ’),
           repeating until the text is stable.
        4. Excludes any empty strings resulting from the cleaning process.

    The edge stripping runs *after* the parenthetical is removed. Stripping
    first would leave a dangling quote behind, e.g. "'Chills' (note)" would
    become "Chills'".
    """
    cleaned = []
    for sym in symptoms:
        sym = sym.replace('**', '')          # remove '**' emphasis markers
        sym = re.sub(r'\(.*?\)', '', sym)    # remove extra information introduced by LLMs
        sym = _strip_symptom_edges(sym)      # remove surrounding spaces and quotes
        if sym:                              # skip entries that became empty
            cleaned.append(sym)

    return cleaned

In [None]:
## Clean the ordered_symptoms column
vaers_data = vaers_data.assign(
    ordered_symptoms=vaers_data['ordered_symptoms'].apply(clean_symptoms)
)

## Keep the records whose ordered symptoms and unordered symptoms contain
## exactly the same values (order and duplicates are ignored)
has_same_symptoms = (
    vaers_data['symptoms'].apply(set) == vaers_data['ordered_symptoms'].apply(set)
)
vaers_data = vaers_data[has_same_symptoms]

## Build the lookup set once instead of once per row; set membership is also
## much cheaper than scanning the list for every symptom.
outlier_symptom_set = set(outlier_symptoms)

## TODO: Need to identify the actual symptoms from symptom_text
## Keep the records that still have actual symptoms after removing outlier
## symptoms. Records with an empty symptom list are dropped here as well,
## because an empty list is a subset of the outlier set.
has_only_outliers = vaers_data['ordered_symptoms'].apply(outlier_symptom_set.issuperset)

## .copy() gives an independent frame, so the column assignments below do not
## write into a filtered slice (avoids pandas' SettingWithCopyWarning).
vaers_data = vaers_data[~has_only_outliers].copy()


def _without_outlier_symptoms(symptoms):
    """Return `symptoms` without the outlier entries, preserving order."""
    return [sym for sym in symptoms if sym not in outlier_symptom_set]


## Remove the outlier symptoms from symptoms and ordered_symptoms
vaers_data['symptoms'] = vaers_data['symptoms'].apply(_without_outlier_symptoms)
vaers_data['ordered_symptoms'] = vaers_data['ordered_symptoms'].apply(_without_outlier_symptoms)

## Drop the records that have a null value in any column
vaers_data = vaers_data.dropna()

In [None]:
## We want to find the temporal/associative relationship between symptoms, so
## a record needs more than 1 reported symptom to be useful. A few records
## list more than 20 symptoms; those are treated as outliers. Together this
## keeps the records with 2 to 20 ordered symptoms.
symptom_counts = vaers_data['ordered_symptoms'].apply(len)
has_usable_count = (symptom_counts > 1) & (symptom_counts < 21)
vaers_data = vaers_data[has_usable_count]

In [None]:
## Add two size features per record:
##   report_length - number of pieces obtained by splitting symptom_text on a
##                   single space (consecutive spaces therefore count as
##                   extra, empty pieces)
##   num_symptoms  - number of entries in ordered_symptoms
## assign() returns a new frame, so nothing is written into the filtered slice
## produced by the previous cells (avoids pandas' SettingWithCopyWarning).
vaers_data = vaers_data.assign(
    report_length=vaers_data['symptom_text'].apply(lambda text: len(text.split(' '))),
    num_symptoms=vaers_data['ordered_symptoms'].apply(len),
)

In [None]:
## Fine-tuning data for the pre-trained transformer based models: only reports
## longer than `min_report_words` words are eligible. A fixed-size random
## sample is drawn from them and split into train and test sets; both steps
## use the same seed so the split is reproducible.
min_report_words = 50
ft_sample_size = 10500
ft_test_size = 500
ft_seed = 42

long_reports = vaers_data[vaers_data['report_length'] > min_report_words]
ft_data = long_reports.sample(n=ft_sample_size, random_state=ft_seed)
train, test = train_test_split(ft_data, test_size=ft_test_size, random_state=ft_seed)

In [None]:
## Write the full cleaned dataset and the fine-tuning train/test splits to the
## data folder, without the DataFrame index.
frames_by_file = {
    'vaers_data.csv': vaers_data,
    'train_vaers_data.csv': train,
    'test_vaers_data.csv': test,
}
for csv_name, frame in frames_by_file.items():
    frame.to_csv(os.path.join(data_dir, csv_name), index=False)