In [33]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm



In [23]:
ped_data = pd.read_csv('../../data/pediatric_patients_report_drug_reaction.csv.gz',compression='gzip', index_col=0)

In [24]:
print(f'Ped Data: {len(ped_data):,} rows')

Ped Data: 881,396 rows


In [50]:
ped_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 881396 entries, 1 to 1754669
Data columns (total 28 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   patient_custom_master_age       881396 non-null  float64
 1   patient_custom_master_age_unit  881396 non-null  object 
 2   patient_onsetage                881396 non-null  float64
 3   patient_onsetageunit            881396 non-null  object 
 4   patient_sex                     881396 non-null  object 
 5   safetyreportid                  881396 non-null  object 
 6   nichd                           881396 non-null  object 
 7   ich_ema                         639944 non-null  object 
 8   fda                             529859 non-null  object 
 9   lastupdate_date                 881396 non-null  object 
 10  mostrecent_receive_date         881396 non-null  object 
 11  receive_date                    881396 non-null  object 
 12  congenital_anomali  

In [25]:
print(ped_data['drug_characterization'].unique()) 

['Suspect (the drug was considered by the reporter to be the cause)'
 'Concomitant (the drug was reported as being taken along with the suspect drug)'
 'Interacting (the drug was considered by the reporter to have interacted with the suspect drug)']


In [51]:
df_filtered = ped_data[ped_data['drug_characterization'].isin([ped_data['drug_characterization'].unique()[0], 'Primary Suspect Drug'])].copy()
print(f"After filtering 'Primary Suspect Drug'  {len(df_filtered):,} rows")


After filtering 'Primary Suspect Drug'  704,273 rows


In [52]:
df_filtered = df_filtered.rename(columns={
    'medicinal_product': 'drug_name',
    'reaction_meddrapt': 'adr_name'
})

In [53]:
df_filtered['serious'] = df_filtered['serious'].apply(lambda x: 1 if str(x) == '1' else 0)

In [54]:
seriousness_columns = [
    'death',
    'life_threatening',
    'hospitalization',
    'disabling',
    'congenital_anomali',
    'other',
]

In [55]:
    # 1:'other',
    # 2:'congenital_anomali',
    # 3:'disabling',
    # 4:'hospitalization',
    # 5:'life_threatening',
    # 6:'death'

## ADR Filtering

In [56]:
print("\n--- step 1: Filtering ADR (> 50 Associated Drugs) ---")

# Count the number of unique drugs for each ADR
adr_drug_counts = df_filtered.groupby('adr_name')['drug_name'].nunique()

# Filter ADR with more than 50 associated drugs
qualified_adrs = adr_drug_counts[adr_drug_counts > 50].index.tolist()

# Filter main DataFrame to keep only ADRs that meet the criteria
df_final = df_filtered[df_filtered['adr_name'].isin(qualified_adrs)].copy()

print(f"Total ADR before filtering: {len(adr_drug_counts)}")
print(f"Total ADR with more than 50 associated drugs: {len(qualified_adrs)}")
print(f"Total rows after filtering ADR: {len(df_final)}")


--- step 1: Filtering ADR (> 50 Associated Drugs) ---
Total ADR before filtering: 12208
Total ADR with more than 50 associated drugs: 1228
Total rows after filtering ADR: 556872


## Seriousness PRR

In [57]:
def calculate_seriousness_prr(df, drug, adr, seriousness_outcome):

    N_di_sj_plus = len(df[(df['drug_name'] == drug) & (df['adr_name'] == adr)])
    N_di_sj_k = len(df[(df['drug_name'] == drug) & (df['adr_name'] == adr) & (df[seriousness_outcome] == 1)])
    N_plus_k = len(df[df[seriousness_outcome] == 1])
    N_plus_plus = len(df)

    a = N_di_sj_k
    b = N_di_sj_plus - N_di_sj_k
    c = N_plus_k - N_di_sj_k
    d = N_plus_plus - N_di_sj_plus - c
    
    # Prevent division by zero
    if a == 0 or b < 0 or c < 0 or d < 0 or (a + b) == 0 or (c + d) == 0:
        return {'prr': 0, 'ci_lower': 0, 'significant': 0}

    numerator = a / (a + b)
    denominator = c / (c + d)
    
    if denominator == 0:
        return {'prr': 0, 'ci_lower': 0, 'significant': 0}
        
    prr = numerator / denominator

    # Calculate 95% CI according to the formula SE specified in the article
    # SE = sqrt(1/a - 1/(a+b) + 1/c - 1/(c+d))
    # Note: This SE formula is specific and may not be the standard formula
    try:
        se_term1 = 1/a
        se_term2 = 1/(a+b)
        se_term3 = 1/c
        se_term4 = 1/(c+d)
        
        # Check if the values under sqrt are non-negative
        se_value = se_term1 - se_term2 + se_term3 - se_term4
        if se_value < 0:
            return {'prr': prr, 'ci_lower': 0, 'significant': 0}

        se = np.sqrt(se_value)
        ci_lower = np.exp(np.log(prr) - 1.96 * se)
    except (ValueError, ZeroDivisionError):
        ci_lower = 0

    significant = 1 if ci_lower > 1 else 0

    return {'prr': prr, 'ci_lower': ci_lower, 'significant': significant}

print("Function 'calculate_seriousness_prr' is ready to use")


--- step 2: Seriousness PRR ---
Function 'calculate_seriousness_prr' is ready to use


In [58]:
seriousness_columns

['death',
 'life_threatening',
 'hospitalization',
 'disabling',
 'congenital_anomali',
 'other']

## Main Processing Loop

In [None]:
print("\n--- step 3: Start creating labels for all drug-ADR pairs ---")

# Get all unique drug-ADR pairs from the filtered data
unique_drug_adr_pairs = df_final[['drug_name', 'adr_name']].drop_duplicates().to_records(index=False)

# Prepare list to store final results
final_labels_data = []

# Use tqdm to show progress bar
for drug, adr in tqdm(unique_drug_adr_pairs, desc="Processing Drug-ADR Pairs"):
    
    # 1. Calculate Label Vector (Task 2)
    seriousness_vector = []
    for outcome in seriousness_columns:
        # print(outcome)
        result = calculate_seriousness_prr(df_final, drug, adr, outcome)
        seriousness_vector.append(result['significant'])
        
    # 2. Create Label Serious (Task 1) from Vector
    # If there is 1 in the vector, even if it is only one, set label_serious to 1
    label_serious = 1 if sum(seriousness_vector) > 0 else 0
    
    # 3.  save results
    final_labels_data.append({
        'drug_name': drug,
        'adr_name': adr,
        'label_serious': label_serious,
        'label_vector': seriousness_vector
    })

print(f"\nProcess of creating Label is complete! Total {len(final_labels_data)} pairs")


--- step 3: Start creating labels for all drug-ADR pairs ---


Processing Drug-ADR Pairs:  12%|█▏        | 26841/227012 [4:42:34<38:43:51,  1.44it/s]