In [98]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
from collections import Counter

# APRIORI FOR LLM TOKENIZED SYMPTOMS


In [150]:
df = pd.read_csv('llm_tokenized_symptoms.csv')

In [151]:
df.drop(['Unnamed: 0'], axis = 1, inplace=True)

In [152]:
df

Unnamed: 0,VAERS_ID,extracted_symptoms
0,2728994,"['muscle pain', 'arm soreness', 'pain', 'activ..."
1,2729004,"['micro-seizures', 'seizures', 'arrest in spee..."
2,2729006,"['endocarditis', 'fever', 'chills', 'lung infe..."
3,2729006,"['endocarditis', 'fever', 'chills', 'lung infe..."
4,2729006,"['endocarditis', 'fever', 'chills', 'lung infe..."
...,...,...
16823,2733817,"['weakness', 'malaise', 'arm pain', 'chills', ..."
16824,2733820,"['COVID-19', 'pneumonia']"
16825,2733821,['type 1 diabetes']
16826,2737168,[]


In [153]:
import ast

# Convert strings to lists
df['extracted_symptoms'] = df['extracted_symptoms'].apply(ast.literal_eval)

# Verify the type of the first element
print(type(df['extracted_symptoms'][0]))  # S

<class 'list'>


In [154]:
df['extracted_symptoms']

0        [muscle pain, arm soreness, pain, activities o...
1        [micro-seizures, seizures, arrest in speech, e...
2            [endocarditis, fever, chills, lung infection]
3            [endocarditis, fever, chills, lung infection]
4            [endocarditis, fever, chills, lung infection]
                               ...                        
16823    [weakness, malaise, arm pain, chills, fever, i...
16824                                [COVID-19, pneumonia]
16825                                    [type 1 diabetes]
16826                                                   []
16827                               [fever, rash, fatigue]
Name: extracted_symptoms, Length: 16828, dtype: object

In [155]:
# Step 1: Create a Grand Dictionary of Symptoms
# Assuming df1['SYMPTOM_LIST'] contains lists of symptoms
symptom_list = df['extracted_symptoms'].explode()  # Flatten the list of lists
symptom_counts = Counter(symptom_list)         # Count each symptom occurrence
grand_dict_of_symptoms = dict(symptom_counts) 

In [156]:
grand_dict_of_symptoms

{'muscle pain': 414,
 'arm soreness': 60,
 'pain': 787,
 'activities of daily living impaired': 1,
 'micro-seizures': 1,
 'seizures': 33,
 'arrest in speech': 1,
 'eye rolling': 3,
 'arm stiffness': 5,
 'seizure': 97,
 'encephalitis': 8,
 'endocarditis': 4,
 'fever': 1319,
 'chills': 620,
 'lung infection': 3,
 'aches and pains': 4,
 'fatigue': 1284,
 'night sweats': 47,
 'diarrhea': 259,
 'loss of appetite': 53,
 'shoulder pain': 118,
 'hives': 195,
 'lumps': 2,
 'brain fog': 164,
 'slowed thinking': 2,
 'shortness of breath': 410,
 'respiratory failure': 47,
 'sepsis': 93,
 'pneumonia': 127,
 'COVID-19': 2281,
 'tachycardia': 187,
 'hypotension': 107,
 'elevated AST/ALT': 2,
 'urinary incontinence': 12,
 'dysphagia': 51,
 'hypomagnesemia': 11,
 'left shoulder pain': 4,
 'rotator cuff tear': 4,
 'anxiety': 116,
 'GERD': 29,
 'floaters': 10,
 'retinal tear': 2,
 'impaired vision': 1,
 'chest pain': 344,
 'epigastric pain': 6,
 'weakness': 458,
 'hypoxemia': 28,
 'hazy airspace opacific

In [157]:
print("""
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
""".format(*(grand_dict_of_symptoms), *(grand_dict_of_symptoms.values())))


- muscle pain
- arm soreness
- pain
- activities of daily living impaired
- micro-seizures
- seizures
- arrest in speech
- eye rolling
- arm stiffness
- seizure
- encephalitis
- endocarditis
- fever
- chills
- lung infection
- aches and pains
- fatigue
- night sweats
- diarrhea
- loss of appetite



In [None]:

symptom_dummies = df['extracted_symptoms'].apply(lambda x: pd.Series(1, index=set(x))).fillna(0)

frequent_itemsets = apriori(symptom_dummies, min_support=0.01, use_colnames=True)

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

rules = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]


# Display Results
print("Grand Dictionary of Symptoms:", grand_dict_of_symptoms)
print("\nFrequent Itemsets (Support ≥ 0.01):\n", frequent_itemsets)
print("\nAssociation Rules (Confidence ≥ 0.5):\n", rules)

high_lift_rules = rules[rules['lift'] > 1.0]
print("\nHigh-Lift Association Rules (Lift > 1.0):\n", high_lift_rules)





Grand Dictionary of Symptoms: {'muscle pain': 414, 'arm soreness': 60, 'pain': 787, 'activities of daily living impaired': 1, 'micro-seizures': 1, 'seizures': 33, 'arrest in speech': 1, 'eye rolling': 3, 'arm stiffness': 5, 'seizure': 97, 'encephalitis': 8, 'endocarditis': 4, 'fever': 1319, 'chills': 620, 'lung infection': 3, 'aches and pains': 4, 'fatigue': 1284, 'night sweats': 47, 'diarrhea': 259, 'loss of appetite': 53, 'shoulder pain': 118, 'hives': 195, 'lumps': 2, 'brain fog': 164, 'slowed thinking': 2, 'shortness of breath': 410, 'respiratory failure': 47, 'sepsis': 93, 'pneumonia': 127, 'COVID-19': 2281, 'tachycardia': 187, 'hypotension': 107, 'elevated AST/ALT': 2, 'urinary incontinence': 12, 'dysphagia': 51, 'hypomagnesemia': 11, 'left shoulder pain': 4, 'rotator cuff tear': 4, 'anxiety': 116, 'GERD': 29, 'floaters': 10, 'retinal tear': 2, 'impaired vision': 1, 'chest pain': 344, 'epigastric pain': 6, 'weakness': 458, 'hypoxemia': 28, 'hazy airspace opacification': 1, 'lung 

In [None]:
sorted_itemsets = frequent_itemsets.sort_values(by='support', ascending=False)
sorted_itemsets

Unnamed: 0,support,itemsets
8,0.135548,(COVID-19)
2,0.078381,(fever)
4,0.076242,(fatigue)
12,0.063882,(headache)
0,0.045460,(pain)
...,...,...
93,0.010221,"(malaise, headache, fatigue)"
62,0.010102,"(chills, fatigue)"
92,0.010043,"(injection site pain, fever, fatigue)"
87,0.010043,"(headache, muscle pain, fatigue)"


In [161]:
sorted_itemsets[0:20]

Unnamed: 0,support,itemsets
8,0.135548,(COVID-19)
2,0.078381,(fever)
4,0.076242,(fatigue)
12,0.063882,(headache)
0,0.04546,(pain)
13,0.041478,(nausea)
17,0.040944,(dizziness)
3,0.036784,(chills)
35,0.036665,(rash)
29,0.03405,(injection site pain)


In [162]:
rules[rules['lift'] > 1.0]

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(muscle pain),(fatigue),0.016461,0.669082,8.775771
1,(muscle pain),(headache),0.012360,0.502415,7.864788
2,(joint pain),(muscle pain),0.011944,0.505025,20.527930
3,(muscle pain),(injection site pain),0.014856,0.603865,17.734443
4,(injection site tenderness),(muscle pain),0.010696,0.664207,26.998235
...,...,...,...,...,...
57,"(injection site redness, injection site swelling)",(injection site pain),0.011469,0.893519,26.241064
58,"(injection site redness, injection site pain)",(injection site swelling),0.011469,0.835498,47.660195
59,"(injection site pain, injection site swelling)",(injection site redness),0.011469,0.768924,46.712845
60,(injection site redness),"(injection site pain, injection site swelling)",0.011469,0.696751,46.712845


In [167]:
rules.sort_values(by='lift', ascending=False)

Unnamed: 0,antecedents,consequents,support,confidence,lift
61,(injection site swelling),"(injection site redness, injection site pain)",0.011469,0.654237,47.660195
58,"(injection site redness, injection site pain)",(injection site swelling),0.011469,0.835498,47.660195
60,(injection site redness),"(injection site pain, injection site swelling)",0.011469,0.696751,46.712845
59,"(injection site pain, injection site swelling)",(injection site redness),0.011469,0.768924,46.712845
20,(injection site redness),(injection site swelling),0.012836,0.779783,44.482017
...,...,...,...,...,...
34,"(fever, headache)",(fatigue),0.012539,0.513382,6.733587
37,"(injection site pain, fatigue)",(fever),0.010043,0.518405,6.613888
13,(vaccination failure),(COVID-19),0.012479,0.744681,5.493858
14,(vaccine failure),(COVID-19),0.010280,0.742489,5.477689


# APRIORI FOR NER + MEDICAL Abbreviation Replacement TOKENIZED SYMPTOMS


In [168]:
df_ner_labels = pd.read_csv('symptoms_with_labels.csv')

In [None]:

df_ner_labels['SYMPTOM_LIST'] = df_ner_labels['SYMPTOM_LIST'].apply(ast.literal_eval)

print(type(df_ner_labels['SYMPTOM_LIST'][0]))  # S

<class 'list'>


In [170]:
symptom_list = df_ner_labels['SYMPTOM_LIST'].explode()  # Flatten the list of lists
symptom_counts = Counter(symptom_list)         # Count each symptom occurrence
grand_dict_of_symptoms = dict(symptom_counts) 

In [171]:
print("""
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
""".format(*(grand_dict_of_symptoms), *(grand_dict_of_symptoms.values())))


- sore
- muscle pain
- arexvy
- coronavirus disease 2019
- pain
- seizures
- arrest
- seizure
- auto-immune disease
- encephalitis
- endocarditis
- fever
- chills
- infection
- pains fatigue chills
- diarrhea loss
- fatigue diarhea loss
- prolonged shoulder pain
- nan
- respiratory failure s/p trach placement 8/17/2023 gerd hypertension



In [None]:

symptom_dummies = df_ner_labels['SYMPTOM_LIST'].apply(lambda x: pd.Series(1, index=set(x))).fillna(0)

frequent_itemsets = apriori(symptom_dummies, min_support=0.01, use_colnames=True)

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

rules = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

# Display Results
print("Grand Dictionary of Symptoms:", grand_dict_of_symptoms)
print("\nFrequent Itemsets (Support ≥ 0.01):\n", frequent_itemsets)
print("\nAssociation Rules (Confidence ≥ 0.5):\n", rules)

# Optional: Filter rules based on lift for stronger associations
high_lift_rules = rules[rules['lift'] > 1.0]
print("\nHigh-Lift Association Rules (Lift > 1.0):\n", high_lift_rules)




Grand Dictionary of Symptoms: {'sore': 890, 'muscle pain': 1100, 'arexvy': 43, 'coronavirus disease 2019': 2364, 'pain': 7469, 'seizures': 75, 'arrest': 3, 'seizure': 167, 'auto-immune disease': 2, 'encephalitis': 28, 'endocarditis': 2, 'fever': 2008, 'chills': 570, 'infection': 506, 'pains fatigue chills': 1, 'diarrhea loss': 1, 'fatigue diarhea loss': 1, 'prolonged shoulder pain': 1, nan: 4298, 'respiratory failure s/p trach placement 8/17/2023 gerd hypertension': 1, 'shortness of breath': 383, 'fevers': 18, 'muscular dystrophy': 273, 'respiratory distress': 25, 'sepsis': 130, 'pneumonia': 410, 'infections': 37, 'eating disorder': 359, 'tachycardia': 117, 'hypotensive': 11, 'culturesurine antigens methicillin-resistant staphylococcus aureus': 1, 'neginitial lactic acid 08 covid swab positive isolation precautions started': 1, 'infectious diseases': 80, 'coronavirus disease': 9652, 'urinary incontinence': 10, 'urinary retention': 13, 'overflow incontinence': 1, 'hypomagnesemia': 7, 'l

In [173]:
frequent_itemsets.sort_values(by='support', ascending=False)[0:20]

Unnamed: 0,support,itemsets
8,0.263581,(coronavirus disease)
1,0.145687,(pain)
2,0.078094,(coronavirus disease 2019)
64,0.063443,"(coronavirus disease, coronavirus disease 2019)"
32,0.060991,(swelling)
5,0.058224,(fever)
15,0.056527,(fatigue)
21,0.047598,(headache)
54,0.046403,"(pain, coronavirus disease)"
26,0.039424,(allergies)


In [174]:
rules.sort_values(by='lift', ascending=False)[0:20]

Unnamed: 0,antecedents,consequents,support,confidence,lift
470,"(pain, pyrexia)","(fever, erythema)",0.010312,0.689076,61.567746
473,"(fever, erythema)","(pain, pyrexia)",0.010312,0.921348,61.567746
281,"(pain, pyrexia)","(fever, muscle pain)",0.010123,0.676471,55.172247
283,"(fever, muscle pain)","(pain, pyrexia)",0.010123,0.825641,55.172247
635,"(pain, myalgia, fatigue)","(erythema, muscle pain)",0.010501,0.625468,51.809613
639,"(erythema, muscle pain)","(pain, myalgia, fatigue)",0.010501,0.869792,51.809613
622,"(swelling, myalgia)","(pain, muscle pain, fatigue)",0.011004,0.911458,51.770833
605,"(pain, muscle pain, fatigue)","(swelling, myalgia)",0.011004,0.625,51.770833
628,"(pain, muscle pain, fatigue)","(erythema, myalgia)",0.010501,0.596429,51.552174
645,"(erythema, myalgia)","(pain, muscle pain, fatigue)",0.010501,0.907609,51.552174


In [176]:
rules.sort_values(by='lift', ascending=False)[20:40]

Unnamed: 0,antecedents,consequents,support,confidence,lift
642,"(myalgia, fatigue)","(pain, erythema, muscle pain)",0.010501,0.596429,49.924211
631,"(pain, erythema, muscle pain)","(myalgia, fatigue)",0.010501,0.878947,49.924211
589,"(pain, myalgia, fatigue)","(muscle pain, headache)",0.011004,0.655431,49.875455
594,"(muscle pain, headache)","(pain, myalgia, fatigue)",0.011004,0.837321,49.875455
544,"(pain, myalgia, fatigue)","(fever, muscle pain)",0.010249,0.610487,49.790685
548,"(fever, muscle pain)","(pain, myalgia, fatigue)",0.010249,0.835897,49.790685
430,"(fatigue, myalgia)","(erythema, muscle pain)",0.010563,0.6,49.7
431,"(erythema, muscle pain)","(fatigue, myalgia)",0.010563,0.875,49.7
608,"(pain, swelling, muscle pain)","(myalgia, fatigue)",0.011004,0.870647,49.452736
620,"(myalgia, fatigue)","(pain, swelling, muscle pain)",0.011004,0.625,49.452736


In [134]:
rules[rules['lift'] > 1.0]

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(muscle pain),(pain),0.021441,0.904509,6.208595
1,(muscle pain),(fever),0.012261,0.517241,8.883593
2,(muscle pain),(coronavirus disease),0.014336,0.604775,2.294450
3,(muscle pain),(fatigue),0.018172,0.766578,13.561358
4,(muscle pain),(headache),0.013141,0.554377,11.647036
...,...,...,...,...,...
642,"(myalgia, fatigue)","(pain, erythema, muscle pain)",0.010501,0.596429,49.924211
643,"(pain, erythema)","(myalgia, muscle pain, fatigue)",0.010501,0.607273,36.583581
644,"(pain, myalgia)","(erythema, muscle pain, fatigue)",0.010501,0.513846,46.698338
645,"(erythema, myalgia)","(pain, muscle pain, fatigue)",0.010501,0.907609,51.552174


# APRIORI FOR Ground Truth (VAERS SYMPTOMS)


In [177]:
df_ground_truth = pd.read_csv('symptoms_t_15904.csv')

In [None]:
import numpy as np
columns = ['SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']
for i, row in df_ground_truth.iterrows():
    list_of_symptoms = []
    for col in columns:
        if row[col] is not np.nan:
            list_of_symptoms.append(row[col])
    df_ground_truth.at[i, 'SYMPTOMS_LIST'] = str(list_of_symptoms)


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


In [179]:
# Convert strings to lists
df_ground_truth['SYMPTOMS_LIST'] = df_ground_truth['SYMPTOMS_LIST'].apply(ast.literal_eval)

# Verify the type of the first element
print(type(df_ground_truth['SYMPTOMS_LIST'][0]))  # S

<class 'list'>


In [180]:
symptom_list = df_ground_truth['SYMPTOMS_LIST'].explode()  # Flatten the list of lists
symptom_counts = Counter(symptom_list)         # Count each symptom occurrence
grand_dict_of_symptoms = dict(symptom_counts) 

In [181]:
print("""
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
- {}
""".format(*(grand_dict_of_symptoms), *(grand_dict_of_symptoms.values())))



- Injection site pain
- Loss of personal independence in daily activities
- Pain in extremity
- X-ray
- Angiogram
- Angiogram cerebral
- Aphasia
- Blood glucose
- CSF cell count
- Alpha haemolytic streptococcal infection
- Blood culture positive
- Chills
- Computerised tomogram thorax abnormal
- Endocarditis
- Decreased appetite
- Diarrhoea
- Fatigue
- Night sweats
- Arthralgia
- Urticaria



In [None]:

symptom_dummies = df_ground_truth['SYMPTOMS_LIST'].apply(lambda x: pd.Series(1, index=set(x))).fillna(0)

frequent_itemsets = apriori(symptom_dummies, min_support=0.01, use_colnames=True)

rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

rules = rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

# Display Results
print("Grand Dictionary of Symptoms:", grand_dict_of_symptoms)
print("\nFrequent Itemsets (Support ≥ 0.01):\n", frequent_itemsets)
print("\nAssociation Rules (Confidence ≥ 0.5):\n", rules)

high_lift_rules = rules[rules['lift'] > 1.0]
print("\nHigh-Lift Association Rules (Lift > 1.0):\n", high_lift_rules)



Grand Dictionary of Symptoms: {'Injection site pain': 569, 'Loss of personal independence in daily activities': 69, 'Pain in extremity': 538, 'X-ray': 22, 'Angiogram': 28, 'Angiogram cerebral': 11, 'Aphasia': 42, 'Blood glucose': 23, 'CSF cell count': 2, 'Alpha haemolytic streptococcal infection': 1, 'Blood culture positive': 7, 'Chills': 581, 'Computerised tomogram thorax abnormal': 26, 'Endocarditis': 2, 'Decreased appetite': 120, 'Diarrhoea': 196, 'Fatigue': 1124, 'Night sweats': 19, 'Arthralgia': 725, 'Urticaria': 188, 'Limb mass': 8, 'Bradyphrenia': 1, 'Brain fog': 149, 'Acute respiratory failure': 92, 'Alanine aminotransferase increased': 17, 'Aspartate aminotransferase increased': 12, 'Bladder catheterisation': 7, 'Eye laser surgery': 2, 'Retinal tear': 1, 'Vitreous floaters': 10, 'Abdominal pain upper': 75, 'Asthenia': 558, 'Blood pressure increased': 61, 'COVID-19': 2603, 'Angiogram pulmonary abnormal': 19, 'Atelectasis': 15, 'COVID-19 pneumonia': 75, 'Blood test': 246, 'Chest

In [184]:
frequent_itemsets.sort_values(by='support', ascending=False)[0:20]

Unnamed: 0,support,itemsets
7,0.16367,(COVID-19)
25,0.101547,(Expired product administered)
16,0.089097,(No adverse event)
35,0.071366,(Drug ineffective)
4,0.070674,(Fatigue)
60,0.067027,"(Drug ineffective, COVID-19)"
38,0.055835,(Headache)
32,0.051496,(SARS-CoV-2 test)
33,0.049862,(Vaccination failure)
59,0.049296,"(COVID-19, Vaccination failure)"


In [185]:
rules.sort_values(by='lift', ascending=False )

Unnamed: 0,antecedents,consequents,support,confidence,lift
7,(Injection site erythema),(Injection site swelling),0.014462,0.516854,23.286246
6,(Injection site swelling),(Injection site erythema),0.014462,0.651558,23.286246
8,(Underdose),(Product administered to patient of inappropri...,0.010501,0.609489,19.271002
9,"(Headache, Malaise)",(Fatigue),0.010123,0.79703,11.277545
10,"(Fatigue, Malaise)",(Headache),0.010123,0.624031,11.176339
2,(Body temperature),(Fatigue),0.011255,0.617241,8.733636
1,(Myalgia),(Fatigue),0.015279,0.532895,7.540176
0,(Malaise),(Fatigue),0.016222,0.522267,7.389802
4,(Vaccination failure),(COVID-19),0.049296,0.988651,6.04053
12,"(Drug ineffective, SARS-CoV-2 test)",(COVID-19),0.019995,0.98452,6.015293


In [120]:
frequent_itemsets[0:20]

Unnamed: 0,support,itemsets
0,0.035777,(Injection site pain)
1,0.033828,(Pain in extremity)
2,0.036532,(Chills)
3,0.012324,(Diarrhoea)
4,0.070674,(Fatigue)
5,0.045586,(Arthralgia)
6,0.011821,(Urticaria)
7,0.16367,(COVID-19)
8,0.035086,(Asthenia)
9,0.011318,(Electrocardiogram)


In [97]:
rules[rules['lift'] > 1.0]

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(Malaise),(Fatigue),0.016222,0.522267,7.389802
1,(Myalgia),(Fatigue),0.015279,0.532895,7.540176
2,(Body temperature),(Fatigue),0.011255,0.617241,8.733636
3,(SARS-CoV-2 test),(COVID-19),0.045775,0.888889,5.430998
4,(Vaccination failure),(COVID-19),0.049296,0.988651,6.04053
5,(Drug ineffective),(COVID-19),0.067027,0.939207,5.738436
6,(Injection site swelling),(Injection site erythema),0.014462,0.651558,23.286246
7,(Injection site erythema),(Injection site swelling),0.014462,0.516854,23.286246
8,(Underdose),(Product administered to patient of inappropri...,0.010501,0.609489,19.271002
9,"(Headache, Malaise)",(Fatigue),0.010123,0.79703,11.277545
