# Preprocess the VAERS data

In this notebook, we prepare the dataset that will be used for data annotation and training.

In [116]:
import os
import pandas as pd
from tqdm import tqdm

# Load raw data

The raw data files are downloaded from https://vaers.hhs.gov/data.html.
Using the raw files lets us screen the records more precisely.

In [189]:
# Shared options for reading the raw VAERS CSV files.
# - encoding: the files are read as cp1252.
# - low_memory=False: let pandas infer each column's dtype from the whole
#   file rather than chunk by chunk, which avoids the mixed-type
#   DtypeWarning that chunked inference can raise on these wide tables.
READ_OPTS = dict(encoding='cp1252', low_memory=False)

df_vax = pd.read_csv('2021VAERSVAX.csv', **READ_OPTS)
df_sym = pd.read_csv('2021VAERSSYMPTOMS.csv', **READ_OPTS)
df_rpt = pd.read_csv('2021VAERSData.csv', **READ_OPTS)

# Report how many rows (non-null VAERS_ID values) each table holds
print('* df report size:', df_rpt.VAERS_ID.count())
print('* df symptom size:', df_sym.VAERS_ID.count())
print('* df vax size:', df_vax.VAERS_ID.count())

* df report size: 108190
* df symptom size: 148519
* df vax size: 109785


  interactivity=interactivity, compiler=compiler, result=result)


## Check data sample

In [14]:
# Preview the first five rows of the vaccine table
df_vax.head(n=5)

Unnamed: 0,VAERS_ID,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
0,916600,COVID19,MODERNA,037K20A,1,IM,LA,COVID19 (COVID19 (MODERNA))
1,916601,COVID19,MODERNA,025L20A,1,IM,RA,COVID19 (COVID19 (MODERNA))
2,916602,COVID19,PFIZER\BIONTECH,EL1284,1,IM,LA,COVID19 (COVID19 (PFIZER-BIONTECH))
3,916603,COVID19,MODERNA,unknown,UNK,,,COVID19 (COVID19 (MODERNA))
4,916604,COVID19,MODERNA,,1,IM,LA,COVID19 (COVID19 (MODERNA))


In [12]:
# Preview the first ten rows of the symptom table
df_sym.iloc[:10]

Unnamed: 0,VAERS_ID,SYMPTOM1,SYMPTOMVERSION1,SYMPTOM2,SYMPTOMVERSION2,SYMPTOM3,SYMPTOMVERSION3,SYMPTOM4,SYMPTOMVERSION4,SYMPTOM5,SYMPTOMVERSION5
0,916600,Dysphagia,23.1,Epiglottitis,23.1,,,,,,
1,916601,Anxiety,23.1,Dyspnoea,23.1,,,,,,
2,916602,Chest discomfort,23.1,Dysphagia,23.1,Pain in extremity,23.1,Visual impairment,23.1,,
3,916603,Dizziness,23.1,Fatigue,23.1,Mobility decreased,23.1,,,,
4,916604,Injection site erythema,23.1,Injection site pruritus,23.1,Injection site swelling,23.1,Injection site warmth,23.1,,
5,916606,Pharyngeal swelling,23.1,,,,,,,,
6,916607,Abdominal pain,23.1,Chills,23.1,Sleep disorder,23.1,,,,
7,916608,Diarrhoea,23.1,Nasal congestion,23.1,,,,,,
8,916609,Vaccination site erythema,23.1,Vaccination site pruritus,23.1,Vaccination site swelling,23.1,,,,
9,916610,Rash,23.1,Urticaria,23.1,,,,,,


In [9]:
# The report table has too many columns to read row-wise, so transpose the
# first ten rows: each column of the output is then one report.
df_rpt.iloc[:10].transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
VAERS_ID,916600,916601,916602,916603,916604,916606,916607,916608,916609,916610
RECVDATE,01/01/2021,01/01/2021,01/01/2021,01/01/2021,01/01/2021,01/01/2021,01/01/2021,01/01/2021,01/01/2021,01/01/2021
STATE,TX,CA,WA,WA,TX,NV,KS,OH,TN,VA
AGE_YRS,33,73,23,58,47,44,50,33,71,18
CAGE_YR,33,73,23,58,47,44,50,33,71,18
CAGE_MO,,,,,,,,,,
SEX,F,F,F,F,F,F,M,M,F,F
RPT_DATE,,,,,,,,,,
SYMPTOM_TEXT,Right side of epiglottis swelled up and hinder...,Approximately 30 min post vaccination administ...,"About 15 minutes after receiving the vaccine, ...","extreme fatigue, dizziness,. could not lift my...","Injection site swelling, redness, warm to the ...",patient called back the next day and stated he...,SEVERE chills approximately 13-14 hours after ...,Nasal congestion and diarrhea,On day 9 following the vaccination I noticed a...,hives and rash all over body that has not gone...
DIED,,,,,,,,,,


## Merge the subset

In [169]:
# Build one row per report: report text + COVID19 vaccine info + main symptom.

# We don't need all of the columns in each dataframe
RPT_COLS = ['VAERS_ID', 'AGE_YRS', 'SEX', 'VAX_DATE', 'SYMPTOM_TEXT', 'ALLERGIES']
VAX_COLS = ['VAERS_ID', 'VAX_TYPE', 'VAX_MANU']

# Keep only the COVID19 vaccine rows BEFORE collapsing to one row per report.
# A report can have several vaccine rows; de-duplicating first would keep
# whichever row happens to come first and silently lose the report whenever
# that row is not the COVID19 one.
vax_covid = (
    df_vax.loc[df_vax['VAX_TYPE'] == 'COVID19', VAX_COLS]
    .drop_duplicates(subset='VAERS_ID')
)

# Use SYMPTOM1 of the first symptom row as the main label.
# Since there are multiple labels for each report, we could also collect
# all symptoms for multi-label classification tasks.
sym_first = df_sym[['VAERS_ID', 'SYMPTOM1']].drop_duplicates(subset='VAERS_ID')

df = (
    df_rpt[RPT_COLS]
    .merge(vax_covid, on='VAERS_ID', how='left')
    .merge(sym_first, on='VAERS_ID', how='left')
)

# We only use the COVID19 vaccination data: reports without a COVID19
# vaccine row got NaN from the left merge and are dropped here.
# .copy() gives an independent frame, so the column assignments below do
# not trigger SettingWithCopyWarning.
df = df[df['VAX_TYPE'] == 'COVID19'].copy()

# Change the default date format to datetime for easier querying
df['VAX_DATE'] = pd.to_datetime(df['VAX_DATE'], format='%m/%d/%Y')

# Make sure these two columns are strings
# (missing values become the literal string 'nan')
df['SYMPTOM_TEXT'] = df['SYMPTOM_TEXT'].astype(str)
df['SYMPTOM1'] = df['SYMPTOM1'].astype(str)

# Calculate the length of the report text
df['TEXT_LEN'] = df['SYMPTOM_TEXT'].str.len()

# Replace blanks in the symptom name with underscores
df['SYMPTOM'] = df['SYMPTOM1'].str.replace(' ', '_', regex=False)

# Rename the 'Chills' label to 'Chill'
df.loc[df['SYMPTOM'] == 'Chills', 'SYMPTOM'] = 'Chill'

# Then we can remove the column that is no longer needed
df = df.drop(columns=['SYMPTOM1'])

# Let's see what our dataframe looks like
print('* df size:', df['VAERS_ID'].count())
df.head()

Unnamed: 0,VAERS_ID,AGE_YRS,SEX,VAX_DATE,SYMPTOM_TEXT,ALLERGIES,VAX_TYPE,VAX_MANU,TEXT_LEN,SYMPTOM
0,916600,33.0,F,2020-12-28,Right side of epiglottis swelled up and hinder...,Pcn and bee venom,COVID19,MODERNA,95,Dysphagia
1,916601,73.0,F,2020-12-31,Approximately 30 min post vaccination administ...,"""Dairy""",COVID19,MODERNA,450,Anxiety
2,916602,23.0,F,2020-12-31,"About 15 minutes after receiving the vaccine, ...",Shellfish,COVID19,PFIZER\BIONTECH,420,Chest_discomfort
3,916603,58.0,F,2020-12-23,"extreme fatigue, dizziness,. could not lift my...","Diclofenac, novacaine, lidocaine, pickles, tom...",COVID19,MODERNA,68,Dizziness
4,916604,47.0,F,2020-12-22,"Injection site swelling, redness, warm to the ...",Na,COVID19,MODERNA,61,Injection_site_erythema


## Get a smaller dataset for working

Let's build our working dataset: keep only reports vaccinated on or after 2021-01-01 and drop reports whose text is too short.

In [181]:
# Thresholds for the working dataset
MIN_VAX_DATE = '2021-01-01'   # keep vaccinations on or after this date
MIN_TEXT_LEN = 80             # keep reports whose text is longer than this

# First, we only use reports vaccinated on or after MIN_VAX_DATE
in_date_range = df[df['VAX_DATE'] >= MIN_VAX_DATE]
print('* dft size:', in_date_range['VAERS_ID'].count())

# Then, we remove those records whose text is too short
long_enough = in_date_range[in_date_range['TEXT_LEN'] > MIN_TEXT_LEN]
print('* dft size:', long_enough['VAERS_ID'].count())

# Then drop the columns that are no longer used. drop() returns a new
# frame, so we never mutate a filtered slice of df in place (which is what
# raises SettingWithCopyWarning).
dft = long_enough.drop(columns=['TEXT_LEN', 'ALLERGIES', 'VAX_TYPE'])

dft.head()

* dft size: 84467
* dft size: 65828


Unnamed: 0,VAERS_ID,AGE_YRS,SEX,VAX_DATE,SYMPTOM_TEXT,VAX_MANU,SYMPTOM
279,916904,49.0,M,2021-01-01,"Pt. symptoms included: swelling in the eyes, c...",MODERNA,Chest_discomfort
337,916969,88.0,F,2021-01-01,Anaphylactic like reaction started with dizzin...,MODERNA,Anaphylactoid_reaction
343,916975,28.0,F,2021-01-01,Vomited suddenly without knowledge of being na...,PFIZER\BIONTECH,Headache
348,916980,55.0,F,2021-01-01,Patient was given 0.5mL of the Moderna vaccine...,MODERNA,Immediate_post-injection_reaction
444,917082,43.0,F,2021-01-01,"Began to feel tired and sore around 1530, chil...",MODERNA,Arthralgia


In [179]:
# Count the reports behind each symptom label. groupby returns the labels
# in alphabetical order, so sort ascending by the number of records and
# show the tail, i.e. the most frequent labels.
symptom_counts = dft.groupby(['SYMPTOM'])[['VAERS_ID']].count()
symptom_counts = symptom_counts.sort_values(by='VAERS_ID')
symptom_counts.tail(15)

Unnamed: 0_level_0,VAERS_ID
SYMPTOM,Unnamed: 1_level_1
COVID-19,704
Anxiety,766
Back_pain,780
Abdominal_pain,790
Body_temperature_increased,826
Chest_discomfort,884
Erythema,1261
Headache,1306
Poor_quality_product_administered,1448
Fatigue,1671


# Output the selected top 10

## Define the top 10

In [195]:
# Option 1: take the 10 most frequent symptom labels in our dataset
counts_by_symptom = dft.groupby(['SYMPTOM'])[['VAERS_ID']].count()
counts_by_symptom = counts_by_symptom.sort_values(by='VAERS_ID')
aes = counts_by_symptom.tail(10).index.tolist()
print('* top 10 symptoms:', aes)

# Option 2: specify the 10 symptoms directly.
# This hand-picked selection replaces the data-driven list above.
aes = [
    'Pyrexia',
    'Chill',
    'Headache',
    'Fatigue',
    'Pain',
    'Nausea',
    'Dizziness',
    'Pain_in_extremity',
    'Injection_site_pain',
    'Myalgia',
]

# Show how many reports each selected symptom has
selected_reports = dft[dft.SYMPTOM.isin(aes)]
selected_counts = selected_reports.groupby(['SYMPTOM'])[['VAERS_ID']].count()
selected_counts.sort_values(by='VAERS_ID').tail(15)

* top 10 symptoms: ['Chest_discomfort', 'Poor_quality_product_administered', 'Erythema', 'Headache', 'Fatigue', 'Injection_site_erythema', 'Asthenia', 'Dizziness', 'Arthralgia', 'Chill']


Unnamed: 0_level_0,VAERS_ID
SYMPTOM,Unnamed: 1_level_1
Myalgia,105
Pyrexia,117
Pain,176
Nausea,185
Pain_in_extremity,308
Injection_site_pain,522
Headache,1626
Fatigue,2035
Dizziness,3497
Chill,7365


## Output samples

In [186]:
# Sampling configuration
SAMPLE_DIR = 'sample'   # one text file per sampled report is written here
N_SAMPLES = 50          # reports sampled per symptom
SEED = 42               # fixed seed so a re-run draws the same sample

# The output folder may not exist on a fresh checkout
os.makedirs(SAMPLE_DIR, exist_ok=True)

samples_per_ae = []   # sampled reports, one frame per symptom
reports_per_ae = []   # all reports, one frame per symptom

for ae in tqdm(aes):
    # get all reports labelled with this symptom
    dft_ae = dft[dft['SYMPTOM'] == ae]

    # draw a reproducible random sample; cap n so that a symptom with fewer
    # than N_SAMPLES reports does not raise ValueError
    dft_sample = dft_ae.sample(n=min(N_SAMPLES, len(dft_ae)), random_state=SEED)

    # gather the dataframes for further use
    samples_per_ae.append(dft_sample)
    reports_per_ae.append(dft_ae)

    # write each sampled report text to <symptom>_<NN>.txt
    for idx, txt in enumerate(dft_sample['SYMPTOM_TEXT']):
        full_fn = os.path.join(SAMPLE_DIR, '%s_%02d.txt' % (ae, idx))

        # explicit encoding so the output does not depend on the platform default
        with open(full_fn, 'w', encoding='utf-8') as f:
            f.write(txt)

# put all dataframes into one big dataframe
dft_allsamp = pd.concat(samples_per_ae)
dft_allae = pd.concat(reports_per_ae)

# get the records that were not sampled: every selected report whose row
# label is absent from the sample (relies on dft having a unique index)
dft_notsamp = dft_allae[~dft_allae.index.isin(dft_allsamp.index)]

print('* dft_allae:', dft_allae.shape)
print('* dft_allsamp:', dft_allsamp.shape)
print('* dft_notsamp:', dft_notsamp.shape)
print('* generated samples')

100%|██████████| 10/10 [00:00<00:00, 15.39it/s]

* generated samples





## Output large.csv and sample.csv

In [188]:
# Write both splits to disk without the pandas index column:
# large.csv holds the reports that were not sampled, sample.csv the sampled ones.
for csv_path, frame in (('large.csv', dft_notsamp), ('sample.csv', dft_allsamp)):
    frame.to_csv(csv_path, index=False)

print('* generated large.csv and sample.csv')

* generated csv
