In [1]:
import pandas as pd
import os
import glob
import time

# Define paths
# The defaults are the original author's local folders. Set the environment
# variables FAERS_RAW_DIR / FAERS_PROCESSED_DIR to run the notebook on another
# machine without editing this cell.
RAW_DIR = os.environ.get(
    "FAERS_RAW_DIR",
    r"C:\Users\findo\OneDrive\Desktop\faers-dashboard\data\raw",
)
PROCESSED_DIR = os.environ.get(
    "FAERS_PROCESSED_DIR",
    r"C:\Users\findo\OneDrive\Desktop\faers-dashboard\data\processed",
)

os.makedirs(PROCESSED_DIR, exist_ok=True)  # Ensures the output directory exists

print("Environment Ready.")
print(f"Reading from: {os.path.abspath(RAW_DIR)}")
print(f"Saving to:    {os.path.abspath(PROCESSED_DIR)}")

Environment Ready.
Reading from: C:\Users\findo\OneDrive\Desktop\faers-dashboard\data\raw
Saving to:    C:\Users\findo\OneDrive\Desktop\faers-dashboard\data\processed


In [2]:
def load_faers_file(filepath):
    """
    Load one FAERS ASCII extract ('$'-separated .txt) into a DataFrame.

    The header is lower-cased before the frame is returned. If reading or
    renaming raises, the error is printed and None is returned instead.
    """
    read_options = {
        'sep': '$',
        'encoding': 'latin1',    # avoids crashes on unusual characters
        'low_memory': False,     # avoids mixed-type warnings
        'on_bad_lines': 'skip',  # malformed rows are dropped, not fatal
    }
    try:
        frame = pd.read_csv(filepath, **read_options)
        # Lower-case the header right away so every file uses the same names
        frame.columns = frame.columns.str.lower()
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return None
    return frame

In [None]:
# CELL 3: The Main Processing Engine
#
# For each FAERS table type: find its extracts in RAW_DIR, load each one with
# load_faers_file, tag the rows with the file they came from, concatenate,
# convert the identifier columns to text and write one Parquet file into
# PROCESSED_DIR.

# The 6 key file types
file_types = ['DEMO', 'DRUG', 'REAC', 'OUTC', 'INDI', 'RPSR']

# Identifier columns that are stored as text: they are IDs, not quantities,
# and columns mixing numbers and text make the Parquet write fail.
ID_COLUMNS = ['nda_num', 'auth_num', 'primaryid', 'caseid', 'drug_seq']


def _find_raw_files(f_type):
    """Return the sorted .txt/.TXT paths in RAW_DIR whose names start with f_type."""
    found = {}
    for extension in ('txt', 'TXT'):
        pattern = os.path.join(RAW_DIR, f"{f_type}*.{extension}")
        for path in glob.glob(pattern):
            # Where matching is case-insensitive both patterns return the same
            # file; keying on the normalised path keeps each file only once.
            found.setdefault(os.path.normcase(path), path)
    # Sorted so the row order of the combined table is reproducible.
    return sorted(found.values())


def _id_to_text(value):
    """Render one identifier as text, writing whole-number floats without '.0'."""
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def _stringify_id_column(series):
    """Return `series` as strings, with every missing value replaced by ''.

    A plain astype(str) turns 20702.0 into '20702.0'. Integer IDs become
    floats whenever the column has gaps, or when files with different
    inferred dtypes are concatenated, so the same ID could end up written
    in two different ways.
    """
    return series.map(_id_to_text).mask(series.isna(), '')


start_time = time.time()

for f_type in file_types:
    print("\n=========================================")
    print(f"PROCESSING: {f_type}")
    print("=========================================")

    # 1. Find all files (both extension spellings)
    files = _find_raw_files(f_type)

    if not files:
        print(f"WARNING: No files found for {f_type}. Check your data folder.")
        continue

    # 2. Loop through files
    dataframes = []
    print(f"Found {len(files)} files.")

    for file in files:
        filename = os.path.basename(file)
        df = load_faers_file(file)

        # load_faers_file returns None when a file could not be read
        if df is not None:
            df['source_file'] = filename
            dataframes.append(df)

    # 3. Combine and Save
    if dataframes:
        combined_df = pd.concat(dataframes, ignore_index=True)
        row_count = len(combined_df)
        print(f"-> Combined Rows: {row_count:,}")

        # --- TYPE ENFORCEMENT ---
        # Force ID columns to be strings to prevent Parquet errors
        for col in ID_COLUMNS:
            if col in combined_df.columns:
                combined_df[col] = _stringify_id_column(combined_df[col])

        # Save as Parquet
        output_file = os.path.join(PROCESSED_DIR, f"all_{f_type.lower()}.parquet")
        combined_df.to_parquet(output_file, index=False)
        print(f"-> SAVED: {output_file}")

        # Drop the references before the next table type is loaded
        del combined_df, dataframes

    else:
        print(f"No valid data loaded for {f_type}")

print(f"\nTotal Ingestion Time: {(time.time() - start_time)/60:.2f} minutes")


PROCESSING: DEMO
Found 5 files.
-> Combined Rows: 2,048,518
-> SAVED: C:\Users\findo\OneDrive\Desktop\faers-dashboard\data\processed\all_demo.parquet

PROCESSING: DRUG
Found 5 files.
-> Combined Rows: 9,923,900
-> SAVED: C:\Users\findo\OneDrive\Desktop\faers-dashboard\data\processed\all_drug.parquet

PROCESSING: REAC
Found 5 files.
-> Combined Rows: 7,213,193
-> SAVED: C:\Users\findo\OneDrive\Desktop\faers-dashboard\data\processed\all_reac.parquet

PROCESSING: OUTC
Found 5 files.
-> Combined Rows: 1,540,096
-> SAVED: C:\Users\findo\OneDrive\Desktop\faers-dashboard\data\processed\all_outc.parquet

PROCESSING: INDI
Found 5 files.
-> Combined Rows: 6,049,692
-> SAVED: C:\Users\findo\OneDrive\Desktop\faers-dashboard\data\processed\all_indi.parquet

PROCESSING: RPSR
Found 5 files.
-> Combined Rows: 54,959
-> SAVED: C:\Users\findo\OneDrive\Desktop\faers-dashboard\data\processed\all_rpsr.parquet

Total Ingestion Time: 2.76 minutes


In [3]:
# List every Parquet file in PROCESSED_DIR, in name order, with its size in MB.
print("\n--- PROCESSED FILES SUMMARY ---")
parquet_pattern = os.path.join(PROCESSED_DIR, "*.parquet")
processed_files = sorted(glob.glob(parquet_pattern))

for parquet_path in processed_files:
    size_in_bytes = os.path.getsize(parquet_path)
    size_in_mb = size_in_bytes / (1024 * 1024)
    label = os.path.basename(parquet_path)
    print(f"{label:<20} | {size_in_mb:.2f} MB")


--- PROCESSED FILES SUMMARY ---
all_demo.parquet     | 79.88 MB
all_drug.parquet     | 166.08 MB
all_indi.parquet     | 33.96 MB
all_outc.parquet     | 12.50 MB
all_reac.parquet     | 38.61 MB
all_rpsr.parquet     | 0.70 MB
