In [None]:
import os
import pandas as pd
import numpy as np

from ast import literal_eval
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [None]:
# Colab-only step: attach the user's Google Drive so the project folders
# under /content/drive become readable and writable from this runtime.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

  and should_run_async(code)


Mounted at /content/drive


In [None]:
# Project root on the mounted Drive. Inputs are read from the 'data'
# sub-folder and outputs are written to the 'results' sub-folder.
base_dir = '/content/drive/MyDrive/NLP/vaers_analysis'
data_dir, results_dir = (
    os.path.join(base_dir, sub_folder) for sub_folder in ('data', 'results')
)

  and should_run_async(code)


In [None]:
## Load the clean vaers data for COVID-19 Moderna
vaers_data = pd.read_csv(os.path.join(data_dir, 'vaers_data.csv'))

# The list-valued columns are stored in the CSV as Python-literal strings;
# literal_eval parses them back into Python objects without executing code.
for list_column in ('symptoms', 'ordered_symptoms'):
    vaers_data[list_column] = vaers_data[list_column].apply(literal_eval)

## Randomly sample up to 10,000 Ordered Symptoms for Associative Analysis
# Series.sample (without replacement) raises ValueError when n exceeds the
# number of rows, so the request is capped at the available row count. With
# 10,000 or more rows the sample is identical to the uncapped call.
sample_size = min(10000, len(vaers_data))
data = vaers_data['ordered_symptoms'].sample(n=sample_size, random_state=42)

data.head()

  and should_run_async(code)


Unnamed: 0,ordered_symptoms
67770,"[Pain in extremity, Influenza like illness, Ar..."
226924,"[Rash macular, Rash]"
123500,"[Diarrhoea, Nausea, Pain]"
39400,"[Anaphylactic reaction, Dyspnoea, Chest discom..."
249744,"[COVID-19, SARS-CoV-2 test positive, Vaccine b..."


In [None]:
# Convert the sampled Series into a plain list of transactions for the encoder.
# list(...) yields the same elements as Series.tolist() here, and unlike
# .tolist() it does not fail if this cell is re-run after `data` is already a list.
data = list(data)

  and should_run_async(code)


In [None]:
# One-hot encode the transactions: the encoder learns the item vocabulary
# from `data`, then maps every transaction onto a boolean row.
transaction_encoder = TransactionEncoder()
transaction_encoder.fit(data)
binary_matrix = transaction_encoder.transform(data)

# Wrap the matrix in a DataFrame whose columns are the learned item labels.
binary_df = pd.DataFrame(
    binary_matrix,
    columns=transaction_encoder.columns_,
)
binary_df.head(n=10)

  and should_run_async(code)


Unnamed: 0,Abdominal X-ray,Abdominal abscess,Abdominal discomfort,Abdominal distension,Abdominal exploration,Abdominal infection,Abdominal mass,Abdominal pain,Abdominal pain lower,Abdominal pain upper,...,Wound,Wound dehiscence,Wound infection pseudomonas,Wound treatment,Wrong technique in product usage process,X-ray abnormal,X-ray dental normal,X-ray limb,Xanthopsia,Yellow skin
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
## Calculate Support
# Mine frequent itemsets whose support is at least 0.01, keeping item names
# (not column indices) in the result and printing progress while it runs.
support_df = apriori(
    binary_df,
    min_support=0.01,
    use_colnames=True,
    verbose=1,
)

# Rank the itemsets from most to least frequent and preview the top 20.
sorted_support_df = support_df.sort_values(by='support', ascending=False)
sorted_support_df.head(n=20)

  and should_run_async(code)


Processing 30 combinations | Sampling itemset size 5


Unnamed: 0,support,itemsets
23,0.2163,(Headache)
54,0.1968,(Pyrexia)
18,0.1886,(Fatigue)
8,0.1709,(Chills)
48,0.1526,(Pain)
45,0.1155,(Nausea)
49,0.1113,(Pain in extremity)
44,0.0973,(Myalgia)
102,0.094,"(Chills, Pyrexia)"
31,0.0898,(Injection site pain)


In [None]:
## Calculate Confidence (rules with confidence of at least 0.5, i.e. 50%)
# NOTE(review): this heading previously read "60% Confidence" while the code
# filters at min_threshold=0.5. The threshold is left unchanged so the saved
# results stay the same; confirm which cut-off was actually intended.
conf_df = association_rules(support_df, metric="confidence", min_threshold=0.5, num_itemsets=len(binary_df))

# Keep only the rule definition and its confidence, ranked strongest first.
conf_df = conf_df[['antecedents', 'consequents', 'confidence']]
sorted_conf_df = conf_df.sort_values('confidence', ascending=False)
sorted_conf_df.head(20)

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,confidence
2,(Vaccine breakthrough infection),(COVID-19),0.95082
19,"(SARS-CoV-2 test positive, Cough)",(COVID-19),0.93985
0,(SARS-CoV-2 test positive),(COVID-19),0.935737
20,"(COVID-19, Cough)",(SARS-CoV-2 test positive),0.919118
1,(COVID-19),(SARS-CoV-2 test positive),0.790728
68,"(Injection site warmth, Injection site swelling)",(Injection site erythema),0.778281
67,"(Injection site warmth, Injection site pruritus)",(Injection site erythema),0.777778
65,"(Injection site pruritus, Injection site swell...",(Injection site erythema),0.764493
59,"(Headache, Vomiting)",(Nausea),0.726804
9,(Injection site warmth),(Injection site erythema),0.724832


In [None]:
## Calculate Lift
lift_df = association_rules(support_df, metric="lift", min_threshold=1.001, num_itemsets=len(binary_df))
lift_df = lift_df[['antecedents', 'consequents', 'lift']]
sorted_lift_df = lift_df.sort_values('lift', ascending=False)
sorted_lift_df.head(20)

  and should_run_async(code)


Unnamed: 0,antecedents,consequents,lift
256,(SARS-CoV-2 test positive),"(COVID-19, Cough)",14.406233
253,"(COVID-19, Cough)",(SARS-CoV-2 test positive),14.406233
33,(COVID-19),(Vaccine breakthrough infection),12.593638
32,(Vaccine breakthrough infection),(COVID-19),12.593638
252,"(SARS-CoV-2 test positive, Cough)",(COVID-19),12.448339
257,(COVID-19),"(SARS-CoV-2 test positive, Cough)",12.448339
30,(SARS-CoV-2 test positive),(COVID-19),12.393863
31,(COVID-19),(SARS-CoV-2 test positive),12.393863
94,(Skin warm),(Erythema),9.245123
95,(Erythema),(Skin warm),9.245123


In [None]:
# Make sure the output folder exists: to_csv does not create missing
# directories and would otherwise fail on a fresh Drive layout.
os.makedirs(results_dir, exist_ok=True)

# Persist each ranked table as CSV without the DataFrame index.
for file_name, table in (
    ('support_score.csv', sorted_support_df),
    ('confidence_score.csv', sorted_conf_df),
    ('lift_score.csv', sorted_lift_df),
):
    table.to_csv(os.path.join(results_dir, file_name), index=False)

  and should_run_async(code)
