In [None]:
# Import all necessary Libraries

import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
import torch
import numpy as np
from collections import Counter
import re
import ast

In [None]:
# Data is organised into a single csv for the chosen time frame 2020 december - 2024 may

years = [2020, 2021, 2022, 2023, 2024]

# Path templates for the three raw VAERS exports; `{year}` is filled in per loop iteration.
file_pattern_symptoms = '/Users/sharanya/Documents/Personalized-Risk-Stratification/Data/Rawdata/{year}VAERSSYMPTOMS.csv'
file_pattern_vax = '/Users/sharanya/Documents/Personalized-Risk-Stratification/Data/Rawdata/{year}VAERSVAX.csv'
file_pattern_demographics = '/Users/sharanya/Documents/Personalized-Risk-Stratification/Data/Rawdata/{year}VAERSDATA.csv'

# One merged DataFrame per year is collected here and concatenated after the loop.
dfs = []

for year in years:
    # Construct file paths for the year
    file_symptoms = file_pattern_symptoms.format(year=year)
    file_vax = file_pattern_vax.format(year=year)
    file_demographics = file_pattern_demographics.format(year=year)

    # Read the files into DataFrames.
    # VAX_DOSE_SERIES is forced to text: without an explicit dtype pandas infers the type
    # chunk by chunk and may return a mix of ints and strings, which would silently defeat
    # the later `isin(['1', '2'])` dose filter.
    df_symptoms = pd.read_csv(file_symptoms, encoding='ISO-8859-1')
    df_vax = pd.read_csv(file_vax, encoding='ISO-8859-1', dtype={'VAX_DOSE_SERIES': str})
    df_demographics = pd.read_csv(file_demographics, encoding='ISO-8859-1')

    # Keep only the COVID-19 vaccine rows ('COVID19' and 'COVID19-2' in VAX_TYPE)
    df_vax_filtered = df_vax[df_vax['VAX_TYPE'].isin(['COVID19', 'COVID19-2'])]

    # Inner-join symptoms with the filtered vaccine rows, then with the demographics file,
    # all on VAERS_ID.
    # NOTE(review): if a VAERS_ID occurs on several rows in both the symptoms and the vaccine
    # file, this join multiplies them; a later cell keeps only the first row per VAERS_ID.
    merged_df = pd.merge(df_symptoms, df_vax_filtered, on='VAERS_ID')
    merged_df = pd.merge(merged_df, df_demographics, on='VAERS_ID')

    # Append the merged DataFrame to the list
    dfs.append(merged_df)

# Concatenate all yearly DataFrames into a single DataFrame with a fresh 0..n-1 index
combined_df = pd.concat(dfs, ignore_index=True)




  df_adrdata = pd.read_csv(file_adrdata, encoding='ISO-8859-1')
  df_adrdata = pd.read_csv(file_adrdata, encoding='ISO-8859-1')
  df_adrdata = pd.read_csv(file_adrdata, encoding='ISO-8859-1')


In [None]:
### optional - save the .csv file before proceeding to the next step (uncomment and run below commented line)
# combined_df.to_csv('/Data/Combined_VAERS_data.csv', index=False)

In [None]:
# selecting only relevant features
# Columns carried forward into the analysis; everything else in combined_df is dropped.
relevant_features = [
    'VAERS_ID',
    'SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5',
    'VAX_DOSE_SERIES', 'VAX_NAME', 'VAX_ROUTE',
    'STATE', 'AGE_YRS', 'SEX',
    'OTHER_MEDS', 'CUR_ILL', 'HISTORY', 'PRIOR_VAX', 'ALLERGIES',
]
data = combined_df[relevant_features]

In [None]:
# considering only patient records with dose series 1 or 2 for better analysis on ADR
# Compare on a text view of the column so that dose values parsed as integers (1, 2) are
# matched as well as the strings '1' / '2'; surrounding whitespace is ignored.
dose_series = data['VAX_DOSE_SERIES'].astype(str).str.strip()
# .copy() makes `data` an independent frame: the following cells assign columns on it, which
# would otherwise act on a filtered slice and raise SettingWithCopyWarning.
data = data[dose_series.isin(['1', '2'])].copy()

In [None]:
# age column preprocessing, converting to categorical for better analysis
def convertAge(string_age):
    """
    Map an age in years to a categorical age-group label.

    Buckets are contiguous half-open intervals, so fractional ages (e.g. 2.5 or 12.5)
    land in the bucket of their whole-year part instead of falling through:
        [0, 3)   -> "Infant"
        [3, 13)  -> "Kid"
        [13, 20) -> "Teenager"
        [20, 31) -> "Young Adult"
        [31, 60) -> "Adult"
        >= 60    -> "Senior Citizen"

    Args:
        string_age: Age as a number or numeric string. A value that is already one of the
            labels returned by this function is passed through unchanged, so re-running
            the conversion on an already converted column is harmless.
    Returns:
        str: The age-group label, or "UNK" when the age is missing (None/NaN) or negative.
    Raises:
        ValueError: If a string is neither numeric nor one of the known labels.
    """
    known_labels = ("Infant", "Kid", "Teenager", "Young Adult", "Adult", "Senior Citizen", "UNK")
    if isinstance(string_age, str) and string_age in known_labels:
        return string_age
    if string_age is None:
        return "UNK"

    age = float(string_age)
    # `age != age` is True only for NaN; missing and negative ages are not a real age group.
    if age != age or age < 0:
        return "UNK"
    if age < 3:
        return "Infant"
    if age < 13:
        return "Kid"
    if age < 20:
        return "Teenager"
    if age < 31:
        return "Young Adult"
    if age < 60:
        return "Adult"
    return "Senior Citizen"

# Replace each numeric age with its age-group label.
age_groups = data['AGE_YRS'].apply(convertAge)
data['AGE_YRS'] = age_groups

In [None]:
#column preprocessing - handling missing values and data imputation

# Missing administration routes are imputed with the placeholder "UNK".
vax_route_filled = data['VAX_ROUTE'].fillna("UNK")
data["VAX_ROUTE"] = vax_route_filled

# Two-letter abbreviations accepted as valid: the 50 states, DC, and the territories GU, PR, VI.
# Defined before clean_state so the function never runs against an undefined name.
official_states = {
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 'HI', 'ID',
    'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS',
    'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK',
    'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV',
    'WI', 'WY', 'DC', 'GU', 'PR', 'VI'
}


# Function to clean state abbreviations
def clean_state(state):
    """
    Normalise a state entry to an upper-case two-letter abbreviation.

    The value is stripped of surrounding whitespace and upper-cased before the lookup, so
    every casing variant ("Tx", "ca", "ny", ...) is recognised rather than only the two
    that were special-cased before.

    Args:
        state: Raw STATE value; may be a string, None or NaN.
    Returns:
        str: The upper-case abbreviation if it is in `official_states`, otherwise "UNK".
    """
    if not isinstance(state, str):
        return "UNK"  # missing values (None / NaN) are not recognisable states
    normalized = state.strip().upper()
    return normalized if normalized in official_states else "UNK"
# Run every STATE entry through clean_state and store the normalised value back.
state_cleaned = data['STATE'].apply(clean_state)
data['STATE'] = state_cleaned



In [None]:
#consider splitting the data by vaccine brands - pfizer, moderna

# VAX_NAME values (monovalent and bivalent) that identify each manufacturer.
pfizer_vax_names = [
    'COVID19 (COVID19 (PFIZER-BIONTECH))',
    'COVID19 (COVID19 (PFIZER-BIONTECH BIVALENT))',
]
moderna_vax_names = [
    'COVID19 (COVID19 (MODERNA))',
    'COVID19 (COVID19 (MODERNA BIVALENT))',
]

is_pfizer = data['VAX_NAME'].isin(pfizer_vax_names)
data_pfizer = data[is_pfizer]
is_moderna = data['VAX_NAME'].isin(moderna_vax_names)
data_moderna = data[is_moderna]

In [None]:
# Function to keep only the rows with at least one labelled ADR present, so that only records
# with relevant adverse reactions are retained.
def filter_by_symptoms(dataframe, reaction_list, symptom_columns):
    """
    Filters rows in the DataFrame based on the presence of labelled adr in the specified symptom columns.

    Matching is exact per cell (not substring) after stripping whitespace and lower-casing
    both the cell value and the entries of `reaction_list`.

    Args:
        dataframe: The DataFrame to filter.
        reaction_list: A list of adr to filter by.
        symptom_columns: The list of column names to check for reactions.
    Returns:
        pd.DataFrame: An independent copy containing only the rows where at least one of the
        reactions is found in the specified columns. Returning a copy lets callers add
        columns to the result without SettingWithCopyWarning.
    """
    def _normalise(value):
        # Only text can match a reaction; NaN and other non-string cells are left as they are.
        return value.strip().lower() if isinstance(value, str) else value

    wanted = [_normalise(str(reaction)) for reaction in reaction_list]

    # Accumulate the row mask column by column. Element-wise mapping (instead of the `.str`
    # accessor) keeps this working when a symptom column is entirely NaN and therefore not
    # of string dtype.
    mask = np.zeros(len(dataframe), dtype=bool)
    for column in symptom_columns:
        mask |= dataframe[column].map(_normalise).isin(wanted).to_numpy()
    return dataframe[mask].copy()

# Labelled adverse reactions per manufacturer, used as the exact-match filter lists below.
# NOTE(review): 'tierdness', 'auxillary swelling', 'auxillary tenderness' and 'erythemea' look
# misspelled (tiredness / axillary / erythema) and so can only match identically misspelled
# symptom text. They are left unchanged here because correcting them changes which records
# pass the filter - confirm before fixing.
pfizer_adr = [
    'chest pain', 'shortness of breath', 'breath shortness', 'difficulty breathing',
    'pounding heartbeat', 'fast heartbeat',
    'rash', 'itch', 'hives', 'face swelling',
    'myocarditis', 'pericarditis',
    'injection site pain', 'injection site reaction', 'injection site swelling', 'injection site redness',
    'tierdness', 'headache', 'muscle pain', 'chills', 'joint pain', 'fever', 'nausea',
    'feeling unwell', 'uneasy feeling', 'swollen lymph nodes', 'decreased appetite',
    'diarrhea', 'vomiting', 'arm pain', 'fainting', 'dizziness',
]
moderna_adr = [
    'myocarditis', 'pericarditis', 'cardiac', 'anaphylaxis', 'urticaria', 'syncope', 'pain',
    'auxillary swelling', 'auxillary tenderness', 'swelling', 'redness',
    'fatigue', 'headache', 'myalgia', 'arthralgia', 'chills', 'fever', 'nausea',
    'lymphadenopathy', 'erythemea',
    'injection site reaction', 'injection site redness', 'injection site swelling',
    'injection site itch', 'injection site rash',
]
symptom_columns = [
    "SYMPTOM1",
    "SYMPTOM2",
    "SYMPTOM3",
    "SYMPTOM4",
    "SYMPTOM5",
]

# Keep, per manufacturer, only the reports that mention at least one labelled reaction.
pfizer_filtered_by_symptoms = filter_by_symptoms(
    data_pfizer, pfizer_adr, symptom_columns
)
moderna_filtered_by_symptoms = filter_by_symptoms(
    data_moderna, moderna_adr, symptom_columns
)


In [None]:
#Function for extracting all the unique symptoms from the 5 symptom columns 
def get_unique_values(df, column_prefix, start, end):
    """
    Collect the distinct values found across a numbered family of columns.

    Columns are named `<column_prefix><i>` for every i from `start` to `end` inclusive;
    names that do not exist in `df` are ignored.

    Args:
        df (pd.DataFrame): The DataFrame to process.
        column_prefix (str): The prefix of the column names (e.g., 'SYMPTOM').
        start (int): First column number (inclusive).
        end (int): Last column number (inclusive).
    Returns:
        list: The union of the unique values of all matching columns, in no particular order.
    """
    candidate_names = (f"{column_prefix}{number}" for number in range(start, end + 1))
    present_names = [name for name in candidate_names if name in df.columns]

    collected = set()
    for name in present_names:
        collected |= set(df[name].unique())
    return list(collected)

# Distinct values of SYMPTOM1..SYMPTOM5 among the retained reports of each manufacturer.
pfizer_symptom_list = get_unique_values(pfizer_filtered_by_symptoms, "SYMPTOM", 1, 5)
moderna_symptom_list = get_unique_values(moderna_filtered_by_symptoms, "SYMPTOM", 1, 5)





In [None]:
# Disease extraction using BioBERT to retain only the relevant health-condition terms from the HISTORY (past conditions) and CUR_ILL (current illness) columns
# This is a pretrained model for Named Entity Recognition (NER) in biomedical texts, specifically for disease-related terms.
# This is mainly used to get rid of the irrelevant terms in the columns.

# Load the disease NER model and its tokenizer from the Hugging Face hub.
# `tokenizer` and `ner_pipeline` are module-level names that chunk_text and extract_diseases
# look up at call time. The later cell that loads "judithrosell/BC5CDR_ClinicalBERT_NER"
# rebinds the same names, so this cell must be re-run before extracting diseases again.
model_name = "alvaroalon2/biobert_diseases_ner"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Create a NER pipeline; aggregation_strategy="simple" asks the pipeline to group tokens
# into entities that carry an 'entity_group' key, which extract_diseases filters on.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Function to clean and reconstruct entities
def reconstruct_entities(entities):
    """
    Rebuild whole terms from NER results whose words may be WordPiece continuations.

    A word starting with "##" is a continuation: its remainder is glued onto the term
    collected just before it (or starts a new term if nothing was collected yet). Any other
    word starts a new term.

    Args:
        entities (list): Dictionaries of NER results; only the 'word' key is read, and a
            missing 'word' is treated as the empty string.

    Return:
        list: The reconstructed term strings, in input order.
    """
    merged = []
    for item in entities:
        token = item.get('word', '')
        if not token.startswith("##"):
            merged.append(token)
            continue

        suffix = token[2:]  # drop the "##" continuation marker
        if merged:
            merged[-1] = merged[-1] + suffix
        else:
            merged.append(suffix)
    return merged

# Function to chunk long text
def chunk_text(text, max_length=128):
    """
    Splits input text into smaller chunks based on token length.

    The text is encoded with the module-level `tokenizer` without special tokens, cut into
    consecutive runs of token ids, and each run is decoded back to text. Two positions of
    `max_length` are reserved per chunk for the special tokens that a BERT-style tokenizer
    adds again when the chunk is tokenized for the model ([CLS] and [SEP]), so a chunk
    requested with max_length=512 does not exceed 512 tokens once those are re-added.

    Args:
        text (str): The input text that's to be chunked.
        max_length (int): The maximum number of tokens per chunk, special tokens included
            (default is 128).

    Return:
        list: A list of decoded, non-empty text chunks, each within the token limit. Empty
        when the text yields no tokens.
    """
    # NOTE(review): a cut can fall inside a word; the decoded fragment may then re-tokenize
    # slightly differently from the original text.
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    content_budget = max(1, max_length - 2)  # never let the slicing step drop to zero

    chunks = []
    for start in range(0, len(token_ids), content_budget):
        piece = tokenizer.decode(token_ids[start:start + content_budget], skip_special_tokens=True)
        if piece.strip():
            chunks.append(piece)
    return chunks

# Function to extract disease-related terms
def extract_diseases(text, verbose=False):
    """
    Extracts disease related terms from input biomedical text using a BioBERT-based NER pipeline.

    The module-level `ner_pipeline` is looked up at call time, so it must be bound to the
    disease model when this function runs (a later cell rebinds that name to the medication
    model).

    Args:
        text (str): The biomedical input text to extract disease entities from.
        verbose (bool): When True, print the raw pipeline output of every chunk. Off by
            default because this function is applied to every row of a DataFrame.

    Return:
        list: The unique reconstructed disease terms in order of first appearance (a stable
        order keeps saved outputs reproducible between runs). Empty for non-string or blank
        input and when processing fails.
    """
    try:
        # Handle non-string or empty inputs
        if not isinstance(text, str) or not text.strip():
            return []

        # Split long text into chunks and run the NER pipeline on each of them
        all_entities = []
        for chunk in chunk_text(text, max_length=512):
            entities = ner_pipeline(chunk)
            if verbose:
                print(f"Raw entities for text: {chunk[:50]}... {entities}")
            # Skip if entities are not a list
            if not isinstance(entities, list):
                continue
            all_entities.extend(entities)

        # Keep only entities tagged as diseases, then merge sub-word pieces
        disease_terms = [entity for entity in all_entities if 'entity_group' in entity and entity['entity_group'] in ['DISEASE']]
        reconstructed_terms = reconstruct_entities(disease_terms)
        # dict.fromkeys de-duplicates while preserving first-seen order
        return list(dict.fromkeys(reconstructed_terms))
    except Exception as e:
        # Best effort: one bad row must not abort the whole column; report it and move on.
        print(f"Error processing text: {text}. Error: {e}")
        return []



Device set to use mps:0


In [None]:
# Moderna data disease term extraction: past conditions first, then current illness.
for source_column, target_column in (('HISTORY', 'ExtractedPastHC'), ('CUR_ILL', 'ExtractedCurrentHC')):
    moderna_filtered_by_symptoms.loc[:, target_column] = moderna_filtered_by_symptoms[source_column].apply(extract_diseases)

In [None]:
# Pfizer data disease term extraction: past conditions first, then current illness.
for source_column, target_column in (('HISTORY', 'ExtractedPastHC'), ('CUR_ILL', 'ExtractedCurrentHC')):
    pfizer_filtered_by_symptoms.loc[:, target_column] = pfizer_filtered_by_symptoms[source_column].apply(extract_diseases)

In [None]:
# Load the medication (chemical) NER model and its tokenizer from the Hugging Face hub.
# NOTE(review): this rebinds the module-level names `model_name`, `tokenizer`, `model` and
# `ner_pipeline` that were first bound to "alvaroalon2/biobert_diseases_ner". From here on,
# chunk_text and extract_diseases pick up these objects, so the disease-extraction cells
# must not be re-run after this cell without reloading the disease model first.
model_name = "judithrosell/BC5CDR_ClinicalBERT_NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Create a NER pipeline; aggregation_strategy="simple" asks the pipeline to group tokens
# into entities that carry an 'entity_group' key, which extract_medications filters on.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")


# Function to extract medication-related terms
def extract_medications(text, verbose=False):
    """
    Extracts medication related terms from clinical text using a ClinicalBERT-based NER pipeline.

    The module-level `ner_pipeline` is looked up at call time, so it must be bound to the
    medication model when this function runs.

    Args:
        text (str): Clinical input text to analyze for chemical or medication entities.
        verbose (bool): When True, print the raw pipeline output of every chunk. Off by
            default because this function is applied to every row of a DataFrame.

    Return:
        list: The unique medication terms in order of first appearance (a stable order keeps
        saved outputs reproducible between runs). Empty for non-string or blank input and
        when processing fails.
    """
    try:
        if not isinstance(text, str) or not text.strip():
            return []  # Handle non-string or empty inputs

        # Split long text into chunks and run the NER pipeline on each of them
        all_entities = []
        for chunk in chunk_text(text, max_length=512):
            entities = ner_pipeline(chunk)
            if verbose:
                print(f"Raw entities for text: {chunk[:50]}... {entities}")

            if not isinstance(entities, list):
                continue  # Skip if entities are not a list

            all_entities.extend(entities)

        # Keep only entities tagged as chemicals, then merge sub-word pieces
        medicine_terms = [entity for entity in all_entities if 'entity_group' in entity and entity['entity_group'] in ['Chemical']]
        reconstructed_terms = reconstruct_entities(medicine_terms)
        # dict.fromkeys de-duplicates while preserving first-seen order
        return list(dict.fromkeys(reconstructed_terms))
    except Exception as e:
        # Best effort: one bad row must not abort the whole column; report it and move on.
        print(f"Error processing text: {text}. Error: {e}")
        return []



Device set to use mps:0


In [None]:
# Medication term extraction from the Pfizer and then the Moderna DataFrame.
for brand_frame in (pfizer_filtered_by_symptoms, moderna_filtered_by_symptoms):
    brand_frame.loc[:, 'ExtractedMedications'] = brand_frame['OTHER_MEDS'].apply(extract_medications)


In [None]:
# Keep one row per report: the first occurrence of every VAERS_ID is retained.
# NOTE(review): symptoms stored on the dropped rows of the same VAERS_ID are discarded here.
pfizer_sample = pfizer_filtered_by_symptoms.drop_duplicates(subset="VAERS_ID", keep="first")
moderna_sample = moderna_filtered_by_symptoms.drop_duplicates(subset="VAERS_ID", keep="first")

In [None]:
# Builds the table of all retained symptoms with their frequencies (exported to Excel below).
# The tables are used later for manual labelling and validation of the symptoms.

def get_symptom_frequencies(dataframe, columns):
    """
    Count how often each symptom term occurs across several symptom columns.

    Every non-missing cell is one term: it is stripped, lower-cased for counting and
    title-cased in the result, so multi-word symptoms stay intact. (Joining all cells into
    one string and splitting with a regex that also matches whitespace fuses neighbouring
    symptoms into a single pseudo-term.)

    Args:
        dataframe (pd.DataFrame): The DataFrame containing symptom columns.
        columns (list): List of column names to extract and process.

    Return:
        pd.DataFrame: A DataFrame with 'Symptom' and 'Count' columns, sorted by count in
        descending order; terms with equal counts keep their first-seen order.
    """
    term_counts = Counter()
    for col in columns:
        terms = dataframe[col].dropna().astype(str).str.strip().str.lower()
        term_counts.update(term for term in terms if term)  # ignore blank cells

    symptom_freq_df = pd.DataFrame(list(term_counts.items()), columns=['Symptom', 'Count'])
    symptom_freq_df['Symptom'] = symptom_freq_df['Symptom'].str.title()
    # A stable sort makes the row order deterministic when counts tie
    symptom_freq_df = symptom_freq_df.sort_values(by='Count', ascending=False, kind='stable').reset_index(drop=True)

    return symptom_freq_df


# Symptom frequency tables for manual review, one Excel workbook per manufacturer.
# NOTE(review): these paths use the folder "Target Label", while the cells that read the
# edited files use "Target Label " (with a trailing space) - confirm which folder is intended.
moderna_symptom_xlsx = '/Users/sharanya/Documents/Personalized-Risk-Stratification/Data/Target Label/moderna_symptom_data.xlsx'
pfizer_symptom_xlsx = '/Users/sharanya/Documents/Personalized-Risk-Stratification/Data/Target Label/pfizer_symptom_data.xlsx'

moderna_symptom_freq = get_symptom_frequencies(moderna_sample, symptom_columns)
moderna_symptom_freq.to_excel(moderna_symptom_xlsx, index=False)

pfizer_symptom_freq = get_symptom_frequencies(pfizer_sample, symptom_columns)
pfizer_symptom_freq.to_excel(pfizer_symptom_xlsx, index=False)


### These symptom data Excel files were used for manual review. We carefully examined all listed symptoms and selected only the relevant adverse drug reaction terms, categorizing each under its appropriate organ-system classification.

### While this process may seem repetitive, there is an important distinction:

### We reviewed all symptoms reported by patients who had at least one relevant ADR, not only the ADRs named in the manufacturers' labelling. This allowed us to identify additional relevant ADRs that the labelling-based filter alone would have missed. The result is a refined, accurately labelled dataset that was verified through manual review and cross-checked by a medical professional.

### The manually edited Excel files are included in this repository so that the following steps can be run.

In [None]:
# Load the manually edited excel files for further steps.
file_path = "/Users/sharanya/Documents/Personalized-Risk-Stratification/Data/Target Label /moderna_symptom_data_edit_2.xls"
moderna_target = pd.read_excel(file_path)

# .copy() yields an independent frame; assigning to a column of a plain column selection
# raises SettingWithCopyWarning.
moderna_target_df = moderna_target[['Selected_ADR', 'Frequency', 'TargetLabel']].copy()

# Split multiple target labels into a list
moderna_target_df['TargetLabel'] = moderna_target_df['TargetLabel'].str.split(',')

# Explode so that each row has only one target label
moderna_df_exploded = moderna_target_df.explode('TargetLabel')

# Remove leading/trailing spaces from TargetLabel names
moderna_df_exploded['TargetLabel'] = moderna_df_exploded['TargetLabel'].str.strip()

# Group by TargetLabel but preserve individual ADRs and frequencies
target_label_dict = {}
for label in moderna_df_exploded['TargetLabel'].unique():
    filtered_df = moderna_df_exploded[moderna_df_exploded['TargetLabel'] == label]

    # Store the [ADR, frequency] pairs for each target label
    target_label_dict[label] = {
        "ADR": filtered_df[['Selected_ADR', 'Frequency']].values.tolist()
    }

# Convert dictionary to DataFrame (one row per target label)
moderna_result_df = pd.DataFrame.from_dict(target_label_dict, orient='index')

# Reset index to move Target Label into a column
moderna_result_df.reset_index(inplace=True)

moderna_result_df.columns = ["Target Label", "ADR_Frequency_List"]

# ADRs with frequency < 10 are removed, to avoid extremely rare cases.
moderna_result_df['ADR_Frequency_List'] = moderna_result_df['ADR_Frequency_List'].apply(
    lambda items: [entry for entry in items if entry[1] >= 10]
)

# Save results to an Excel file
output_path = "/Users/sharanya/Documents/Personalized-Risk-Stratification/Data/Target Label /moderna_target_analysis.xlsx"
moderna_result_df.to_excel(output_path, index=False)

print("Processing complete. Output saved to:", output_path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  moderna_target_df['TargetLabel'] = moderna_target_df['TargetLabel'].str.split(',')


Processing complete. Output saved to: /Users/sharanya/Documents/Personalized-Risk-Stratification/Data/Target Label /moderna_target_analysis.xlsx


In [None]:
# Load the manually edited excel files for further steps.
file_path = "/Users/sharanya/Documents/Personalized-Risk-Stratification/Data/Target Label /pfizer_symptom_data_edit_2.xls"
pfizer_target = pd.read_excel(file_path)

# .copy() yields an independent frame; assigning to a column of a plain column selection
# raises SettingWithCopyWarning.
pfizer_target_df = pfizer_target[['Selected_ADR', 'Frequency', 'TargetLabel']].copy()

# Split multiple target labels into a list
pfizer_target_df['TargetLabel'] = pfizer_target_df['TargetLabel'].str.split(',')

# Explode so that each row has only one target label
pfizer_df_exploded = pfizer_target_df.explode('TargetLabel')

# Remove leading/trailing spaces from TargetLabel names
pfizer_df_exploded['TargetLabel'] = pfizer_df_exploded['TargetLabel'].str.strip()

# Group by TargetLabel but preserve individual ADRs and frequencies
target_label_dict = {}
for label in pfizer_df_exploded['TargetLabel'].unique():
    filtered_df = pfizer_df_exploded[pfizer_df_exploded['TargetLabel'] == label]

    # Store the [ADR, frequency] pairs for each target label
    target_label_dict[label] = {
        "ADR": filtered_df[['Selected_ADR', 'Frequency']].values.tolist()
    }

# Convert dictionary to DataFrame (one row per target label)
pfizer_result_df = pd.DataFrame.from_dict(target_label_dict, orient='index')

# Reset index to move Target Label into a column
pfizer_result_df.reset_index(inplace=True)

pfizer_result_df.columns = ["Target Label", "ADR_Frequency_List"]

# ADRs with frequency < 10 are removed, to avoid extremely rare cases.
pfizer_result_df['ADR_Frequency_List'] = pfizer_result_df['ADR_Frequency_List'].apply(
    lambda items: [entry for entry in items if entry[1] >= 10]
)

# Save results to an Excel file
output_path = "/Users/sharanya/Documents/Personalized-Risk-Stratification/Data/Target Label /pfizer_target_analysis.xlsx"
pfizer_result_df.to_excel(output_path, index=False)

print("Processing complete. Output saved to:", output_path)

Processing complete. Output saved to: /Users/sharanya/Documents/Personalized-Risk-Stratification/Data/Target Label /pfizer_target_analysis.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pfizer_target_df['TargetLabel'] = pfizer_target_df['TargetLabel'].str.split(',')


In [None]:
# Load the target label data for the final step
df_labels = pd.read_excel("/Users/sharanya/Documents/Personalized-Risk-Stratification/Data/Target Label /moderna_target_analysis_2.xls")

# Extract list of all target labels, removing any NaN values
MODERNA_TargetLabel = df_labels['Target Label'].dropna().tolist()

# Dictionary to map each target label to its associated ADR terms
label_to_adrs = {}

# Iterate through each row of the label DataFrame
for idx, row in df_labels.iterrows():
    moderna_target_label = row["Target Label"]
    adr_freq_list_str = row["ADR_Frequency_List"]

    # If ADR list is missing, map label to empty set
    if pd.isna(adr_freq_list_str):
        label_to_adrs[moderna_target_label] = set()
        continue

    # The list was stored in Excel as its text representation; parse it back into a list
    adr_freq_list = ast.literal_eval(adr_freq_list_str)
    # Extract and normalize ADR terms to lowercase
    adr_terms = {str(t[0]).strip().lower() for t in adr_freq_list}

    # Assign processed ADR terms to the corresponding label
    label_to_adrs[moderna_target_label] = adr_terms

# Initialize one column per target label with empty strings
for t_label in label_to_adrs.keys():
    moderna_sample[t_label] = ""

# Row-wise iteration
for idx, row in moderna_sample.iterrows():
    row_symptoms = {
        str(row[col]).strip().lower()
        for col in symptom_columns
        if not pd.isna(row[col])
    }

    # Compare symptoms to the ADRs under each target label; matches are written, comma
    # separated, to that label's column. Sorting makes the text deterministic - joining a
    # set directly yields a different term order from run to run.
    for t_label, adr_set in label_to_adrs.items():
        matched_adrs = row_symptoms.intersection(adr_set)
        if matched_adrs:
            moderna_sample.at[idx, t_label] = ", ".join(sorted(matched_adrs))

# Drop any column whose name is NaN (created when a label row has no 'Target Label')
MODERNA_df = moderna_sample.loc[:, moderna_sample.columns.notna()]

In [None]:
# Final Moderna layout: patient/vaccine features followed by the target-label columns.
# NOTE(review): 'ExtractedCurrentHC' is computed earlier but not retained here - confirm intended.
moderna_feature_columns = [
    'SEX', 'AGE_YRS', 'VAX_ROUTE', 'VAX_DOSE_SERIES',
    'ExtractedPastHC', 'ExtractedMedications',
]
moderna_label_columns = [
    'Gastrointestinal Issues',
    'Pain Syndromes',
    'Psychological Disorders',
    'Musculoskeletal Disorders',
    'Fever',
    'Dermatological Conditions',
    'Neurological Disorders',
    'Swelling',
    'Injection Site Reaction',
]
needed_col = moderna_feature_columns + moderna_label_columns

# Restrict the DataFrame to exactly these columns, in this order
MODERNA_df = MODERNA_df[needed_col]

In [None]:
# Write the labelled Moderna dataset (dose series 1 and 2) to the working directory.
moderna_output_csv = "moderna_with_labels_dose_1+2.csv"
MODERNA_df.to_csv(moderna_output_csv, index=False)

In [None]:
# Load the target label data for the final step
# NOTE(review): this is a relative path, while the Moderna label file is read from an absolute
# path under "Data/Target Label /" - confirm the intended location.
df_labels = pd.read_excel("Target label formation/pfizer_target_analysis_2.xlsx")

# Extract list of all target labels, removing any NaN values
PFIZER_TargetLabel = df_labels['Target Label'].dropna().tolist()

# Dictionary to map each target label to its associated ADR terms
label_to_adrs = {}

# Iterate through each row of the label DataFrame
for idx, row in df_labels.iterrows():
    pfizer_target_label = row["Target Label"]
    adr_freq_list_str = row["ADR_Frequency_List"]

    # If ADR list is missing, map label to empty set
    if pd.isna(adr_freq_list_str):
        label_to_adrs[pfizer_target_label] = set()
        continue

    # The list was stored in Excel as its text representation; parse it back into a list
    adr_freq_list = ast.literal_eval(adr_freq_list_str)
    # Extract and normalize ADR terms to lowercase
    adr_terms = {str(t[0]).strip().lower() for t in adr_freq_list}

    # Assign processed ADR terms to the corresponding label
    label_to_adrs[pfizer_target_label] = adr_terms

# Initialize one column per target label with empty strings
for t_label in label_to_adrs.keys():
    pfizer_sample[t_label] = ""

# Row-wise iteration
for idx, row in pfizer_sample.iterrows():
    row_symptoms = {
        str(row[col]).strip().lower()
        for col in symptom_columns
        if not pd.isna(row[col])
    }

    # Compare symptoms to the ADRs under each target label; matches are written, comma
    # separated, to that label's column. Sorting makes the text deterministic - joining a
    # set directly yields a different term order from run to run.
    for t_label, adr_set in label_to_adrs.items():
        matched_adrs = row_symptoms.intersection(adr_set)
        if matched_adrs:
            pfizer_sample.at[idx, t_label] = ", ".join(sorted(matched_adrs))

# Drop any column whose name is NaN (created when a label row has no 'Target Label')
PFIZER_df = pfizer_sample.loc[:, pfizer_sample.columns.notna()]


In [None]:
# Final Pfizer layout: patient/vaccine features followed by the target-label columns.
# NOTE(review): 'ExtractedCurrentHC' is computed earlier but not retained here - confirm intended.
pfizer_feature_columns = [
    'SEX', 'AGE_YRS', 'VAX_ROUTE', 'VAX_DOSE_SERIES',
    'ExtractedPastHC', 'ExtractedMedications',
]
pfizer_label_columns = [
    'Gastrointestinal Issues',
    'Pain Syndromes',
    'Psychological Disorders',
    'Musculoskeletal Disorders',
    'Fever',
    'Dermatological Conditions',
    'Neurological Disorders',
    'Postural Disorders',
    'Cardiovascular Conditions',
    'Respiratory Symptoms',
    'Injection Site Reaction',
]
needed_col = pfizer_feature_columns + pfizer_label_columns

# Restrict the DataFrame to exactly these columns, in this order
PFIZER_df = PFIZER_df[needed_col]

In [None]:
# Write the labelled Pfizer dataset (dose series 1 and 2) to the working directory.
pfizer_output_csv = "pfizer_with_labels_dose_1+2.csv"
PFIZER_df.to_csv(pfizer_output_csv, index=False)