In [None]:
# Ordered mapping of category name -> symptom terms (all lower-case).
# Order matters: a term listed under several categories resolves to the FIRST
# category that contains it.
# NOTE(review): because of that first-match rule, every term under
# 'musculoskeletal symptoms' is already claimed by 'pain', so that category can
# never be returned; likewise 'chest discomfort' (cardiovascular),
# 'hyperhidrosis' (skin reactions), 'abdominal pain' / 'abdominal pain upper'
# (digestive) and 'covid-19' / 'influenza like illness' (systemic infections)
# are shadowed by earlier categories. Confirm whether this is intended.
# NOTE(review): 'neck stif' looks like a truncated term - confirm the spelling.
_SYMPTOM_CATEGORIES = {
    'pain': (
        'headache', 'pain', 'pain in extremity', 'chest discomfort', 'chest pain',
        'abdominal pain upper', 'pain of skin', 'abdominal pain', 'bone pain',
        'back pain', 'neck pain', 'musculoskeletal pain', 'arthralgia', 'myalgia',
        'joint swelling', 'muscle spasms', 'muscle tightness', 'muscular weakness',
        'joint range of motion decreased', 'musculoskeletal stiffness',
        'joint stiffness', 'neck stif',
    ),
    'fever/chills': (
        'pyrexia', 'chills', 'body temperature increased', 'feeling hot',
        'feeling cold', 'cold sweat', 'night sweats', 'hyperhidrosis', 'fever',
        'influenza', 'influenza like illness', 'covid-19', 'febrile neutropenia',
        'feeling abnormal', 'feeling hot and cold', 'feeling cold and hot',
    ),
    'fatigue/general discomfort': (
        'fatigue', 'asthenia', 'malaise', 'lethargy', 'condition aggravated',
        'discomfort',
    ),
    'skin reactions': (
        'injection site erythema', 'pruritus', 'injection site pruritus', 'rash',
        'erythema', 'rash erythematous', 'rash pruritic', 'hyperhidrosis',
        'skin warm', 'skin swelling', 'urticaria', 'skin discolouration',
        'skin burning sensation', 'skin lesion', 'skin exfoliation',
        'skin induration', 'skin tightness', 'skin irritation', 'skin reaction',
        'skin ulcer', 'skin hypertrophy', 'skin atrophy', 'skin nodule',
        'skin papilloma', 'skin hyperpigmentation', 'skin hypopigmentation',
        'skin haemorrhage', 'skin necrosis', 'skin striae', 'skin wrinkling',
        'skin fragility', 'skin depigmentation',
    ),
    'digestive issues': (
        'nausea', 'vomiting', 'diarrhoea', 'abdominal discomfort', 'dysphagia',
        'retching', 'abdominal distension', 'abdominal pain',
        'abdominal pain upper', 'abdominal pain lower', 'abdominal tenderness',
        'abdominal rigidity', 'abdominal pain right', 'abdominal pain left',
    ),
    'respiratory symptoms': (
        'dyspnoea', 'cough', 'rhinorrhoea', 'nasal congestion',
        'respiratory tract congestion', 'wheezing', 'dysphonia',
        'respiratory failure', 'respiratory distress', 'respiratory arrest',
        'respiratory depression', 'respiratory disorder',
        'respiratory rate increased', 'respiratory tract infection',
    ),
    'neurological symptoms': (
        'dizziness', 'paraesthesia', 'hypoaesthesia', 'tremor', 'syncope',
        'vertigo', 'seizure', 'loss of consciousness', 'confusional state',
        'disorientation', 'presyncope',
    ),
    'cardiovascular symptoms': (
        'palpitations', 'heart rate increased', 'hypertension', 'hypotension',
        'tachycardia', 'chest discomfort',
    ),
    'musculoskeletal symptoms': (
        'arthralgia', 'myalgia', 'muscle spasms', 'muscular weakness',
        'joint swelling', 'musculoskeletal stiffness',
        'joint range of motion decreased', 'muscle tightness',
    ),
    'psychiatric symptoms': (
        'anxiety', 'depression', 'nervousness', 'insomnia', 'somnolence',
        'hallucination', 'psychosis',
    ),
    'visual and auditory symptoms': (
        'tinnitus', 'vision blurred', 'eye pain', 'eye swelling', 'photophobia',
        'hypoacusis', 'hearing impaired', 'visual impairment',
        'visual acuity reduced', 'visual disturbance', 'visual field defect',
        'visual brightness',
    ),
    'hematological symptoms': (
        'lymphadenopathy', 'full blood count', 'pallor', 'haemoglobin decreased',
        'haematocrit decreased', 'red blood cell count decreased',
        'white blood cell count decreased', 'platelet count decreased',
        'blood iron decreased', 'blood iron increased',
        'blood lactate dehydrogenase increased', 'blood bilirubin increased',
        'blood creatinine increased', 'blood creatine phosphokinase increased',
        'blood urea increased', 'blood uric acid increased',
        'blood potassium increased', 'blood sodium decreased',
        'blood sodium increased', 'blood chloride decreased',
        'blood chloride increased', 'blood calcium decreased',
        'blood calcium increased', 'blood albumin decreased',
        'blood albumin increased', 'blood alkaline phosphatase increased',
        'blood amylase increased', 'blood glucose increased',
        'blood glucose decreased', 'blood triglycerides increased',
        'blood cholesterol increased', 'blood cholesterol decreased',
        'blood triglycerides decreased',
    ),
    'urinary symptoms': (
        'urine analysis', 'urinary issues', 'urinary retention',
        'urinary tract infection', 'urinary incontinence',
    ),
    'swelling-related symptoms': (
        'swelling', 'swollen tongue', 'swelling face', 'peripheral swelling',
        'peripheral coldness', 'peripheral circulatory failure',
        'peripheral ischaemia', 'peripheral sensory neuropathy',
        'peripheral motor neuropathy', 'peripheral vascular disorder',
    ),
    'systemic infections': (
        'covid-19', 'influenza like illness', 'herpes zoster', 'cellulitis',
    ),
    'allergic reactions': (
        'hypersensitivity', 'anaphylactic reaction', 'angioedema',
        'anaphylactic shock', 'anaphylactoid reaction',
        'anaphylactic transfusion reaction', 'anaphylactic response',
        'anaphylactic symptom',
    ),
}


def _build_symptom_lookup(categories):
    """Flatten ``categories`` into a lower-cased term -> category dict (first match wins)."""
    lookup = {}
    for category, terms in categories.items():
        for term in terms:
            # setdefault keeps the earliest category for terms listed more than once.
            lookup.setdefault(term.lower(), category)
    return lookup


# Built once so each call is a single dict lookup instead of re-creating and
# re-lowercasing every list.
_SYMPTOM_TO_CATEGORY = _build_symptom_lookup(_SYMPTOM_CATEGORIES)


def categorize_symptoms(symptom):
    """Return the category name for a single symptom term.

    Matching is case-insensitive and ignores leading/trailing whitespace.

    Args:
        symptom: The symptom term to classify.

    Returns:
        The name of the first category that lists the term, or ``'Other'``
        when the term is not listed or ``symptom`` is not a string
        (for example ``None`` or a NaN cell).
    """
    if not isinstance(symptom, str):
        return 'Other'
    return _SYMPTOM_TO_CATEGORY.get(symptom.strip().lower(), 'Other')

# Function to categorize symptoms in a list and return groups
def categorize_symptom_list(symptom_list):
    """Map a ', '-separated symptom string to a ', '-separated category string.

    Each symptom is classified with ``categorize_symptoms``; every category
    appears once in the result, in the order it is first encountered.

    Args:
        symptom_list: Symptom terms joined by ``', '``.

    Returns:
        The unique category names joined by ``', '``.
    """
    categories = (categorize_symptoms(symptom) for symptom in symptom_list.split(', '))
    # dict.fromkeys de-duplicates while keeping first-seen order. A set's
    # iteration order for strings can differ between interpreter runs, which
    # made the generated column values non-reproducible.
    return ', '.join(dict.fromkeys(categories))

# Derive the category string for every row from its raw 'Symptoms' text.
# NOTE(review): moderna_df is created outside this view; each 'Symptoms' value
# must be a string because categorize_symptom_list calls .split on it.
moderna_df['Symptoms_category'] = moderna_df['Symptoms'].apply(categorize_symptom_list)

# Function to process the values in Symptoms_category column
def process_category(symptoms):
    """Drop one 'Other' entry from a category string that has other entries.

    Args:
        symptoms: Category names joined by ``', '``.

    Returns:
        ``symptoms`` unchanged when it has a single entry or no ``'Other'``
        entry; otherwise the entries re-joined by ``', '`` with the first
        ``'Other'`` removed.
    """
    entries = symptoms.split(', ')
    if len(entries) <= 1 or 'Other' not in entries:
        return symptoms
    # Only the first occurrence is dropped, mirroring list.remove.
    position = entries.index('Other')
    return ', '.join(entries[:position] + entries[position + 1:])

# Apply the function to the Symptoms_category column:
# rewrites the column in place so 'Other' is dropped from rows that also
# carry at least one other category.
moderna_df['Symptoms_category'] = moderna_df['Symptoms_category'].apply(process_category)


In [None]:
from itertools import combinations

def extract_moderna_rules(apriori_result):
    """Format the rules whose only consequent is 'Moderna vaccine'.

    Args:
        apriori_result: Iterable of records exposing ``support`` and
            ``ordered_statistics``; each statistic exposes ``items_base``,
            ``items_add`` and ``lift``.

    Returns:
        A list of strings shaped like
        ``"<antecedent> -> <consequent> (Support: <s>, Lift: <l>)"``.
    """
    formatted_rules = []
    for record in apriori_result:
        for statistic in record.ordered_statistics:
            lhs_items = list(statistic.items_base)
            rhs_items = list(statistic.items_add)

            # Keep only rules with a non-empty antecedent whose consequent is
            # exactly the single item 'Moderna vaccine'.
            targets_moderna = len(rhs_items) == 1 and 'Moderna vaccine' in rhs_items
            if not (targets_moderna and lhs_items):
                continue

            antecedent_str = ', '.join(lhs_items)
            consequent_str = ', '.join(rhs_items)
            support = record.support
            lift = statistic.lift
            formatted_rules.append(
                f"{antecedent_str} -> {consequent_str} (Support: {support}, Lift: {lift})"
            )
    return formatted_rules

# Format the Moderna-targeting rules found in `results`.
# NOTE(review): `results` is defined outside this view - presumably the apriori
# output; confirm it is a re-iterable sequence rather than a one-shot generator.
association_rules = extract_moderna_rules(results)

# Print one formatted rule per line.
for rule in association_rules:
    print(rule)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

def extract_moderna_rules_with_metrics(apriori_result):
    """Tabulate the rules whose only consequent is 'Moderna vaccine'.

    Args:
        apriori_result: Iterable of records exposing ``support`` and
            ``ordered_statistics``; each statistic exposes ``items_base``,
            ``items_add`` and ``lift``.

    Returns:
        A DataFrame with columns ``'Rules'`` (``"<antecedent> -> <consequent>"``),
        ``'Support'`` and ``'Lift'``, one row per retained rule.
    """
    rule_labels, support_values, lift_values = [], [], []

    for record in apriori_result:
        for statistic in record.ordered_statistics:
            lhs_items = list(statistic.items_base)
            rhs_items = list(statistic.items_add)

            # Skip rules with an empty antecedent or whose consequent is not
            # exactly the single item 'Moderna vaccine'.
            targets_moderna = len(rhs_items) == 1 and 'Moderna vaccine' in rhs_items
            if not (targets_moderna and lhs_items):
                continue

            support_values.append(record.support)
            lift_values.append(statistic.lift)
            lhs_text = ', '.join(lhs_items)
            rhs_text = ', '.join(rhs_items)
            rule_labels.append(f"{lhs_text} -> {rhs_text}")

    # Create a DataFrame
    return pd.DataFrame({'Rules': rule_labels, 'Support': support_values, 'Lift': lift_values})


# Tabulate the Moderna-targeting rules with their support and lift.
rules_data = extract_moderna_rules_with_metrics(results)

# One row per rule, one column per distinct support value.
# NOTE(review): 'Support' is used both as the column key and as the cell value,
# and 'Lift' is not plotted at all - confirm this is the intended layout.
heatmap_data = rules_data.pivot_table(index='Rules', columns='Support', values='Support')

# Create heatmap
plt.figure(figsize=(18, 18))
heatmap_axes = sns.heatmap(heatmap_data, annot=True, cmap='coolwarm', fmt='.2f')
# sns.heatmap draws on, and returns, the current Axes, so label it directly.
heatmap_axes.set_title('Association Rules Heatmap')
heatmap_axes.set_xlabel('Support')
heatmap_axes.set_ylabel('Association Rules')
plt.show()


In [None]:
## Title: Analyzing COVID-19 Vaccine Adverse Reactions: A Comprehensive Study
## Team Members: Rutu Barvaliya, Dharmit Anghan, Breanna Brown, Raghav Mangat
## Group: 8
## Course: Comp4710
## Section: A01
## Professor: Carson
import pandas as pd
import matplotlib.pyplot as plt
# Load the three raw VAERS tables.
vers_data = pd.read_csv("2020VAERSDATA.csv", encoding='latin-1')
vers_vax = pd.read_csv("2020VAERSVAX.csv", encoding='latin-1')
vers_symptoms = pd.read_csv("2020PREPROCESSED_SYMPTOMS.csv", encoding='latin-1')

# For each table: report the row count, drop repeated VAERS_ID rows in place,
# then report the row count again.
# NOTE(review): drop_duplicates keeps only the first row per VAERS_ID; if a
# report can legitimately span several rows in the VAX or SYMPTOMS tables, the
# later rows are discarded - confirm this is intended.
for frame in (vers_data, vers_vax, vers_symptoms):
    total_count = len(frame['VAERS_ID'])
    print("Total count of VAERS_ID:", total_count)

    frame.drop_duplicates(subset='VAERS_ID', inplace=True)

    count_after_duplicates = len(frame['VAERS_ID'])
    print("Count after removing duplicates:", count_after_duplicates)
### Cleaning up the VAERSDATA table (vers_data).
# Replacement value for missing entries, per column.
fill_values = {
    'OTHER_MEDS': 'No medication',
    'CUR_ILL': 'Not applicable',
    'HISTORY': 'No concerns',
    'PRIOR_VAX': 'Not applicable',
    'DIED': 'N',
    'DATEDIED': 'Not applicable',
    'L_THREAT': 'N',
    'ER_VISIT': 'N',
    'HOSPITAL': 'N',
    'HOSPDAYS': 0,
    'X_STAY': 'N',
    'DISABLE': 'N',
    'BIRTH_DEFECT': 'N',
    'OFC_VISIT': 'N',
    'ER_ED_VISIT': 'N',
    'ALLERGIES': 'N'
}

# Fill null values with the predefined value for each column.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form is deprecated and does not modify the
# frame when pandas Copy-on-Write is active.
for column, value in fill_values.items():
    vers_data[column] = vers_data[column].fillna(value)

# Calculate the percentage of null values in each column
null_percentages = (vers_data.isnull().sum() / len(vers_data)) * 100
print(null_percentages)

# Columns with more than 50% null values
columns_to_drop = [
    column for column, percentage in null_percentages.items() if percentage > 50
]

# Drop columns with more than 50% null values
if columns_to_drop:
    vers_data.drop(columns=columns_to_drop, inplace=True)
    print(f"Dropped columns: {columns_to_drop}")
else:
    print("No columns have more than 50% null values.")


### Cleaning up the VAERSVAX table (vers_vax).
# Share of missing values per column, in percent.
null_percentages = vers_vax.isnull().sum() / len(vers_vax) * 100
print(null_percentages)

# Columns whose missing share exceeds 50%.
columns_to_drop = [name for name, share in null_percentages.items() if share > 50]

# Remove those columns in place and report what happened.
if not columns_to_drop:
    print("No columns have more than 50% null values.")
else:
    vers_vax.drop(columns=columns_to_drop, inplace=True)
    print(f"Dropped columns: {columns_to_drop}")
### Cleaning up the VAERSSYMPTOMS table (vers_symptoms).
# Replacement value for missing entries, per column.
fill_values = {
    'SYMPTOM2': 'No symptom',
    'SYMPTOMVERSION2': 0,
    'SYMPTOM3': 'No symptom',
    'SYMPTOMVERSION3': 0,
    'SYMPTOM4': 'No symptom',
    'SYMPTOMVERSION4': 0,
}

# Fill null values with the predefined value for each column.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the chained in-place form is deprecated and does not modify the
# frame when pandas Copy-on-Write is active.
for column, value in fill_values.items():
    vers_symptoms[column] = vers_symptoms[column].fillna(value)

# Calculate the percentage of null values in each column
null_percentages = (vers_symptoms.isnull().sum() / len(vers_symptoms)) * 100
print(null_percentages)

# Columns with more than 50% null values
columns_to_drop = [
    column for column, percentage in null_percentages.items() if percentage > 50
]

# Drop columns with more than 50% null values (the message previously said 70%,
# which did not match the threshold actually applied).
if columns_to_drop:
    vers_symptoms.drop(columns=columns_to_drop, inplace=True)
    print(f"Dropped columns: {columns_to_drop}")
else:
    print("No columns have more than 50% null values.")
### Showing updated data, with less null values.
# Report the shape and column names of each table after the column clean-up.
for label, frame in (
    ('vers data', vers_data),
    ('vers vax', vers_vax),
    ('vers symptoms', vers_symptoms),
):
    num_rows, num_columns = frame.shape
    print(f'Total number of rows in {label}: {num_rows}')
    print(f'Total number of columns in {label}: {num_columns}')
    print(f'Name of the columns: {frame.columns.tolist()}')
### Data with 0 null values.
# Keep only the rows that have no missing value in any remaining column.
cleaned_vers_data = vers_data.dropna()
cleaned_vers_vax = vers_vax.dropna()
cleaned_vers_symptoms = vers_symptoms.dropna()

# Report the resulting shape of each table (data, vax, symptoms - in that order).
for cleaned_frame in (cleaned_vers_data, cleaned_vers_vax, cleaned_vers_symptoms):
    num_rows, num_columns = cleaned_frame.shape

    print(f'Total number of rows: {num_rows}')
    print(f'Total number of columns: {num_columns}')
### Merge all the cleaned data files into one and write it out in cleaned_vers_data.csv file.
# Inner-join the three cleaned tables on 'VAERS_ID', one table at a time.
merged_df = cleaned_vers_data
for right_frame in (cleaned_vers_vax, cleaned_vers_symptoms):
    merged_df = pd.merge(merged_df, right_frame, on='VAERS_ID', how='inner')

num_rows, num_columns = merged_df.shape
print(f'Total number of  rows in merged data: {num_rows}')
print(f'Total number of columns in merged data: {num_columns}')

# Writing it out in csv (left disabled, as in the original notebook).
# output_file_name = "cleaned_vers_data.csv"
# merged_df.to_csv(output_file_name, index=False)
# Load the per-year cleaned files.
# NOTE(review): these are decoded as latin-1; if they were produced by
# DataFrame.to_csv with its default encoding, non-ASCII text will be garbled -
# confirm how the files were written.
df1 = pd.read_csv("cleaned_vers_data_2020.csv", encoding='latin-1')
df2 = pd.read_csv("cleaned_vers_data_2021.csv", encoding='latin-1')
df3 = pd.read_csv("cleaned_vers_data_2022.csv", encoding='latin-1')

# Keep only the rows whose VAX_TYPE is one of the COVID-19 vaccine codes.
covid_vax_types = ['COVID19', 'COVID19-2']
filtered_data1 = df1[df1['VAX_TYPE'].isin(covid_vax_types)]
filtered_data2 = df2[df2['VAX_TYPE'].isin(covid_vax_types)]
filtered_data3 = df3[df3['VAX_TYPE'].isin(covid_vax_types)]

# Write one COVID-only file per year.
output_file_name1 = "cleaned_vers_data_covid_2020.csv"
output_file_name2 = "cleaned_vers_data_covid_2021.csv"
output_file_name3 = "cleaned_vers_data_covid_2022.csv"
filtered_data1.to_csv(output_file_name1, index=False)
filtered_data2.to_csv(output_file_name2, index=False)
filtered_data3.to_csv(output_file_name3, index=False)

# Sanity check: shape of the 2020 COVID-only file as read back from disk.
df1 = pd.read_csv(output_file_name1, encoding='latin-1')
print(df1.shape)

# Read the three CSV files
file_paths = [output_file_name1, output_file_name2, output_file_name3]
dfs = list(map(pd.read_csv, file_paths))

# Concatenate the dataframes
df = pd.concat(dfs)
print(df.shape)

# Write the combined 2020-2022 COVID-19 data set.
output_file_name = "cleaned_vers_data_covid19_vaccine_2020-2022.csv"
df.to_csv(output_file_name, index=False)

Total number of rows in vers data: 10381
Total number of columns in vers data: 46
Total number of rows: 7666
Total number of columns: 46

Total number of rows in vers data: 710783
Total number of columns in vers data: 46
Total number of rows: 445062
Total number of columns: 46

Total number of rows in vers data: 210736
Total number of columns in vers data: 46
Total number of rows: 104130
Total number of columns: 46

Total data 556,858