In [37]:
import pandas as pd
import os

# === CONFIG ===
FAERS_FOLDER = "datasets/FAERS"      # <-- your FAERS TXT folder
OUTPUT_FILE = "output/faers_drug_summary.csv"

# Create output folder if missing
os.makedirs("output", exist_ok=True)

def load_faers_file(keyword):
    """Find and load a FAERS TXT file that contains the given keyword."""
    print(f"Searching for '{keyword}' in {FAERS_FOLDER}...")
    for f in os.listdir(FAERS_FOLDER):
        if keyword.lower() in f.lower() and f.lower().endswith(".txt"):
            path = os.path.join(FAERS_FOLDER, f)
            print(f"Loading {f} ...")
            df = pd.read_csv(
                path, 
                sep='$', 
                engine='python', 
                encoding='utf-8-sig',  # Handle BOM (Byte Order Mark)
                dtype=str,
                quoting=3,  # QUOTE_NONE - treat all characters literally
                on_bad_lines='skip'  # Skip malformed lines instead of crashing
            )
            # Clean column names (remove BOM and whitespace)
            df.columns = df.columns.str.strip().str.replace('ï»¿', '')
            return df
    print(f"⚠️ {keyword} file not found in {FAERS_FOLDER}")
    return pd.DataFrame()


In [38]:
demo = load_faers_file("demo")
drug = load_faers_file("drug")
reac = load_faers_file("reac")
indi = load_faers_file("indi")
outc = load_faers_file("outc")
ther = load_faers_file("ther")
rpsr = load_faers_file("rpsr")


Searching for 'demo' in datasets/FAERS...
Loading demo12q4.txt ...
Searching for 'drug' in datasets/FAERS...
Loading drug12q4.txt ...
Searching for 'reac' in datasets/FAERS...
Loading reac12q4.txt ...
Searching for 'indi' in datasets/FAERS...
Loading indi12q4.txt ...
Searching for 'outc' in datasets/FAERS...
Loading outc12q4.txt ...
Searching for 'ther' in datasets/FAERS...
Loading ther12q4.txt ...
Searching for 'rpsr' in datasets/FAERS...
Loading rpsr12q4.txt ...


In [34]:
# Debug: Check actual column names
print("Drug columns:", drug.columns.tolist())
print("Reac columns:", reac.columns.tolist())
print("\nFirst few rows of drug:")
print(drug.head())


Drug columns: ['primaryid', 'caseid', 'drug_seq', 'role_cod', 'drugname', 'val_vbm', 'route', 'dose_vbm', 'cum_dose_chr', 'cum_dose_unit', 'dechal', 'rechal', 'lot_nbr', 'exp_dt', 'nda_num', 'dose_amt', 'dose_unit', 'dose_form', 'dose_freq']
Reac columns: ['primaryid', 'caseid', 'pt']

First few rows of drug:
  primaryid   caseid drug_seq role_cod    drugname val_vbm           route  \
0  34483284  3448328        1       PS     SUSTIVA       1  TRANSPLACENTAL   
1  34483284  3448328        2       SS  NEVIRAPINE       1  TRANSPLACENTAL   
2  34483284  3448328        3       SS    VIRACEPT       1  TRANSPLACENTAL   
3  34483284  3448328        4       SS    COMBIVIR       1  TRANSPLACENTAL   
4  34483284  3448328        5       SS    RETROVIR       1  TRANSPLACENTAL   

  dose_vbm cum_dose_chr cum_dose_unit dechal rechal lot_nbr exp_dt nda_num  \
0  UNK UNK          NaN           NaN    NaN      U     NaN    NaN  020972   
1      NaN          NaN           NaN    NaN      U     NaN    N

In [41]:
# Debug: Check all columns
print("Outc columns:", outc.columns.tolist())
print("Indi columns:", indi.columns.tolist())
print("Demo columns:", demo.columns.tolist())
print("Ther columns:", ther.columns.tolist())


Outc columns: ['primaryid', 'caseid', 'outc_code']
Indi columns: ['primaryid', 'caseid', 'indi_drug_seq', 'indi_pt']
Demo columns: ['primaryid', 'caseid', 'caseversion', 'i_f_code', 'event_dt', 'mfr_dt', 'init_fda_dt', 'fda_dt', 'rept_cod', 'mfr_num', 'mfr_sndr', 'age', 'age_cod', 'gndr_cod', 'e_sub', 'wt', 'wt_cod', 'rept_dt', 'to_mfr', 'occp_cod', 'reporter_country', 'occr_country']
Ther columns: ['primaryid', 'caseid', 'dsg_drug_seq', 'start_dt', 'end_dt', 'dur', 'dur_cod']


In [42]:
# Keep only relevant columns and filter primary suspect drugs
if not drug.empty:
    # Select columns (caseid is automatically excluded)
    drug = drug[['primaryid','drugname','role_cod']].dropna(subset=['drugname'])
    drug = drug[drug['role_cod'].str.upper() == 'PS']
    print(f"Drug data: {drug.shape[0]} rows")

if not reac.empty:
    reac = reac[['primaryid','pt']].dropna(subset=['pt'])
    print(f"Reac data: {reac.shape[0]} rows")

if not indi.empty:
    indi = indi[['primaryid','indi_pt']]
    # Drop duplicates to avoid creating duplicate rows in merge
    indi = indi.drop_duplicates()
    print(f"Indi data: {indi.shape[0]} rows")

if not outc.empty:
    outc = outc[['primaryid','outc_code']]  # Fixed: outc_code not outc_cod
    # Drop duplicates to avoid creating duplicate rows in merge
    outc = outc.drop_duplicates()
    print(f"Outc data: {outc.shape[0]} rows")

if not demo.empty:
    demo = demo[['primaryid','age','age_cod','gndr_cod','reporter_country']]
    # Drop duplicates on primaryid (keep first occurrence)
    demo = demo.drop_duplicates(subset=['primaryid'])
    print(f"Demo data: {demo.shape[0]} rows")

if not ther.empty:
    ther = ther[['primaryid','start_dt','end_dt']]
    # Drop duplicates to avoid creating duplicate rows in merge
    ther = ther.drop_duplicates()
    print(f"Ther data: {ther.shape[0]} rows")


Drug data: 240634 rows
Reac data: 738098 rows
Indi data: 294851 rows
Outc data: 191699 rows
Demo data: 238388 rows
Ther data: 262408 rows


In [43]:
# Merge all FAERS tables
# Start fresh by merging drug and reac
faers = drug.merge(reac, on='primaryid', how='inner')
print(f"After drug+reac merge: {faers.shape[0]} rows")

# Add indication data (left join - not all drugs have indication data)
if not indi.empty: 
    faers = faers.merge(indi, on='primaryid', how='left')
    print(f"After indi merge: {faers.shape[0]} rows")

# Add outcome data (left join)
if not outc.empty: 
    faers = faers.merge(outc, on='primaryid', how='left')
    print(f"After outc merge: {faers.shape[0]} rows")

# Add demographic data (left join)
if not demo.empty:
    faers = faers.merge(demo, on='primaryid', how='left')
    print(f"After demo merge: {faers.shape[0]} rows")

# Add therapy dates (left join)
if not ther.empty: 
    faers = faers.merge(ther, on='primaryid', how='left')
    print(f"After ther merge: {faers.shape[0]} rows")

print(f"\n✅ Final merged FAERS data: {faers.shape[0]} rows, {faers.shape[1]} columns")
print(f"Columns: {faers.columns.tolist()}")


After drug+reac merge: 750539 rows
After indi merge: 1208332 rows
After outc merge: 1701070 rows
After demo merge: 1701070 rows
After ther merge: 5415863 rows

✅ Final merged FAERS data: 5415863 rows, 12 columns
Columns: ['primaryid', 'drugname', 'role_cod', 'pt', 'indi_pt', 'outc_code', 'age', 'age_cod', 'gndr_cod', 'reporter_country', 'start_dt', 'end_dt']


In [44]:
# Clean text columns - standardize to uppercase and remove whitespace
faers['drugname'] = faers['drugname'].str.upper().str.strip()
faers['pt'] = faers['pt'].str.upper().str.strip()

# Clean other text columns if they exist
if 'indi_pt' in faers.columns:
    faers['indi_pt'] = faers['indi_pt'].str.upper().str.strip()

if 'reporter_country' in faers.columns:
    faers['reporter_country'] = faers['reporter_country'].str.upper().str.strip()

print("✅ Text columns cleaned and standardized")
print(f"\nSample data:")
print(faers[['drugname', 'pt']].head())


✅ Text columns cleaned and standardized

Sample data:
  drugname              pt
0  SUSTIVA  LARYNGOMALACIA
1  SUSTIVA  LARYNGOMALACIA
2   EXELON  ABDOMINAL PAIN
3   EXELON       DYSPEPSIA
4   EXELON       PNEUMONIA


In [45]:
# Compute drug-level aggregates
summary = (
    faers.groupby('drugname')
         .agg({
             'pt': 'count',  # Count adverse events
             'indi_pt': lambda x: ', '.join(pd.Series(x).dropna().unique()[:3]) if 'indi_pt' in faers else '',
             'outc_code': lambda x: (pd.Series(x).eq('DE').mean() if 'outc_code' in faers else None)
         })
         .rename(columns={'pt': 'ADR_Count', 'outc_code': 'Severe_Outcome_Rate'})
         .reset_index()
)

# Calculate ADR rate (proportion of total adverse events)
summary['ADR_Rate'] = summary['ADR_Count'] / summary['ADR_Count'].sum()

# Sort by ADR count descending
summary = summary.sort_values('ADR_Count', ascending=False)

print(f"✅ Summary computed: {summary.shape[0]} unique drugs")
print(f"\nTop 10 drugs by adverse event count:")
summary.head(10)


✅ Summary computed: 7265 unique drugs

Top 10 drugs by adverse event count:


Unnamed: 0,drugname,ADR_Count,indi_pt,Severe_Outcome_Rate,ADR_Rate
2909,FOSAMAX,631847,"OSTEOPOROSIS, ARTHRITIS, ANKYLOSING SPONDYLITIS",0.00807,0.116666
6488,THYMOGLOBULIN,231441,"BONE MARROW CONDITIONING REGIMEN, APLASTIC ANA...",0.245717,0.042734
4667,NEXIUM,107603,"BARRETT'S OESOPHAGUS, THROMBOSIS PROPHYLAXIS, ...",0.032973,0.019868
5732,REVLIMID,105905,"MANTLE CELL LYMPHOMA, MULTIPLE MYELOMA, DRUG U...",0.185893,0.019555
7093,YASMIN,102651,"MENORRHAGIA, DEPRESSION, CONTRACEPTION",0.002163,0.018954
2434,ENBREL,100748,"PSORIATIC ARTHROPATHY, PSORIASIS, RHEUMATOID A...",0.011762,0.018602
571,AREDIA,94944,"MULTIPLE MYELOMA, HYPERTENSION, FEBRILE NEUTRO...",0.18018,0.017531
3193,HUMIRA,81666,"PSORIATIC ARTHROPATHY, TYPE 2 DIABETES MELLITU...",0.086572,0.015079
5696,REMICADE,76592,"PSORIATIC ARTHROPATHY, IRIDOCYCLITIS, JUVENILE...",0.041858,0.014142
7095,YAZ,74359,"FIBROIDS, MENOPAUSE, PNEUMONIA",0.010221,0.01373


In [46]:
# Save summary to CSV
summary.to_csv(OUTPUT_FILE, index=False)
print(f"✅ Saved FAERS summary to {OUTPUT_FILE}")
print(f"   {summary.shape[0]} drugs, {summary.shape[1]} columns")


✅ Saved FAERS summary to output/faers_drug_summary.csv
   7265 drugs, 5 columns
