<a href="https://colab.research.google.com/github/Raunaq14/Data_Sci_Research/blob/main/02_device_data_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; the paths used in the rest of
# the notebook live under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import zipfile
import os
import pandas as pd

In [3]:
# Drive locations: the folder holding the FDA zip archives and the folder
# that receives their unpacked contents.
zip_base = "/content/drive/MyDrive/FDA_Data"
extract_base = "/content/drive/MyDrive/Extracted_FDA_Data"

# Year -> path of that year's device archive (device<year>.zip under zip_base).
device_zips = {
    year: os.path.join(zip_base, f"device{year}.zip")
    for year in ("2023", "2024")
}


In [14]:
# Unpack each yearly device archive into <extract_base>/device_<year>/ and
# publish its text file as <extract_base>/device_<year>.txt.
for year, zip_path in device_zips.items():
    extract_path = os.path.join(extract_base, f"device_{year}")
    os.makedirs(extract_path, exist_ok=True)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_path)

    # Look for .txt file inside extracted folder.
    # The extension is matched case-insensitively so a ".TXT" member is not
    # missed, and the names are sorted because os.listdir() returns entries
    # in arbitrary order (the chosen file must not vary between runs).
    txt_files = sorted(
        f for f in os.listdir(extract_path) if f.lower().endswith(".txt")
    )

    if not txt_files:
        print(f"⚠️ No .txt file found in {extract_path}")
        continue

    if len(txt_files) > 1:
        # Only the first candidate is published; make that choice visible.
        print(f"⚠️ Multiple .txt files found in {extract_path}; using {txt_files[0]}")

    extracted_file = os.path.join(extract_path, txt_files[0])
    renamed_file = os.path.join(extract_base, f"device_{year}.txt")
    # os.replace overwrites an existing destination on every platform, so
    # re-running this cell succeeds; os.rename raises FileExistsError on
    # Windows when the target already exists.
    os.replace(extracted_file, renamed_file)
    print(f"✅ Extracted and renamed: {txt_files[0]} → device_{year}.txt")

✅ Extracted and renamed: DEVICE2023.txt → device_2023.txt
✅ Extracted and renamed: DEVICE2024.txt → device_2024.txt


In [4]:
# Drive folders: extracted device text files, per-year MDR CSVs, and the
# destination for the merged output (created if it does not exist yet).
device_dir = "/content/drive/MyDrive/Extracted_FDA_Data"
mdr_dir = f"{device_dir}/yearly_mdr"
output_dir = f"{device_dir}/final_merged_data"
os.makedirs(output_dir, exist_ok=True)

# Reader options shared by both inputs: every column is read as a string,
# decoded as ISO-8859-1.
read_opts = dict(dtype=str, encoding='ISO-8859-1', low_memory=False)

# Build one merged CSV per year.
for year in ("2023", "2024"):
    device_path, mdr_path, output_path = (
        os.path.join(device_dir, f"device_{year}.txt"),
        os.path.join(mdr_dir, f"mdr_{year}.csv"),
        os.path.join(output_dir, f"final_data_{year}.csv"),
    )

    print(f"\n🔄 Processing year {year}...")

    # The device file is pipe-delimited; rows the parser rejects are skipped.
    df_device = pd.read_csv(device_path, sep='|', on_bad_lines='skip', **read_opts)
    df_mdr = pd.read_csv(mdr_path, **read_opts)

    # Retain only device rows whose MDR_REPORT_KEY is one of the non-null
    # keys present in the MDR file.
    keys = df_mdr['MDR_REPORT_KEY'].dropna().unique()
    in_mdr = df_device['MDR_REPORT_KEY'].isin(keys)
    df_device_filtered = df_device.loc[in_mdr]

    print(f"🔗 Merging {len(df_device_filtered)} device rows with {len(df_mdr)} MDR rows...")

    # Left join: each retained device row picks up the MDR columns for its key.
    df_final = pd.merge(df_device_filtered, df_mdr, how='left', on='MDR_REPORT_KEY')

    # Write the merged frame without the index column.
    df_final.to_csv(output_path, index=False)
    print(f"✅ Saved: {output_path}")


🔄 Processing year 2023...
🔗 Merging 892974 device rows with 891940 MDR rows...
✅ Saved: /content/drive/MyDrive/Extracted_FDA_Data/final_merged_data/final_data_2023.csv

🔄 Processing year 2024...
🔗 Merging 2625952 device rows with 2627139 MDR rows...
✅ Saved: /content/drive/MyDrive/Extracted_FDA_Data/final_merged_data/final_data_2024.csv
