In [1]:
import pandas as pd

# Input locations (raw strings so the Windows backslashes are taken literally)
prescriptions_csv = r"C:\Users\tendo\OneDrive\Documents\Srivatsav\sem 5\Big Data\Project\MIMIC\prescriptions.csv\prescriptions.csv"
interactions_csv = r"C:\Users\tendo\OneDrive\Documents\Srivatsav\sem 5\Big Data\Project\db_drug_interactions.csv\db_drug_interactions.csv"

# Read both tables into memory
mimic_df = pd.read_csv(prescriptions_csv)  # contains subject_id, drug, dosage info
risk_df = pd.read_csv(interactions_csv)    # contains Drug 1, Drug 2, Interaction Description

# Show the column names of each table
for banner, frame in (
    ("=== MIMIC-IV Dataset Features ===", mimic_df),
    ("\n=== Drug Risk Dataset Features ===", risk_df),
):
    print(banner)
    print(frame.columns.tolist())

# Show the first three rows of each table for a quick visual check
for banner, frame in (
    ("\n=== MIMIC-IV Sample Rows ===", mimic_df),
    ("\n=== Drug Risk Sample Rows ===", risk_df),
):
    print(banner)
    print(frame.head(3))


  mimic_df = pd.read_csv(r"C:\Users\tendo\OneDrive\Documents\Srivatsav\sem 5\Big Data\Project\MIMIC\prescriptions.csv\prescriptions.csv")   # contains subject_id, drug, dosage info


=== MIMIC-IV Dataset Features ===
['subject_id', 'hadm_id', 'pharmacy_id', 'poe_id', 'poe_seq', 'order_provider_id', 'starttime', 'stoptime', 'drug_type', 'drug', 'formulary_drug_cd', 'gsn', 'ndc', 'prod_strength', 'form_rx', 'dose_val_rx', 'dose_unit_rx', 'form_val_disp', 'form_unit_disp', 'doses_per_24_hrs', 'route']

=== Drug Risk Dataset Features ===
['Drug 1', 'Drug 2', 'Interaction Description']

=== MIMIC-IV Sample Rows ===
   subject_id   hadm_id  pharmacy_id       poe_id  poe_seq order_provider_id  \
0    10000032  22595853     12775705  10000032-55     55.0            P85UQ1   
1    10000032  22595853     18415984  10000032-42     42.0            P23SJA   
2    10000032  22595853     23637373  10000032-35     35.0            P23SJA   

             starttime             stoptime drug_type  \
0  2180-05-08 08:00:00  2180-05-07 22:00:00      MAIN   
1  2180-05-07 02:00:00  2180-05-07 22:00:00      MAIN   
2  2180-05-07 01:00:00  2180-05-07 09:00:00      MAIN   

               

In [None]:
import pandas as pd
from itertools import combinations

# Input locations (raw strings so the Windows backslashes are taken literally).
# NOTE(review): these paths sit under a different user profile than the ones used by
# the first cell of this notebook - confirm which location is the intended one.
prescriptions_csv = r"C:\Users\Sarang\OneDrive\Documents\Sarang\sem 5\Big Data\Project\MIMIC\prescriptions.csv\prescriptions.csv"
interactions_csv = r"C:\Users\Sarang\OneDrive\Documents\Sarang\sem 5\Big Data\Project\db_drug_interactions.csv\db_drug_interactions.csv"

# Read both tables into memory
mimic_df = pd.read_csv(prescriptions_csv)  # contains subject_id, drug, dosage info
risk_df = pd.read_csv(interactions_csv)    # contains Drug 1, Drug 2, Interaction Description


def clean_drug_name(x):
    """Normalise a raw drug name so names from different tables can be compared.

    Parameters
    ----------
    x : scalar
        Raw cell value; may be a string, a number, ``None`` or NaN.

    Returns
    -------
    str or None
        The value converted to ``str``, stripped of surrounding whitespace and
        lower-cased. ``None`` is returned when the value is missing or when
        nothing is left after stripping.
    """
    if pd.isna(x):
        return None
    cleaned = str(x).strip().lower()
    # A blank name carries no drug information. Returning None (instead of "")
    # lets the later dropna() calls remove such rows rather than treating the
    # empty string as a real drug when pairs are generated.
    return cleaned or None

# ===============================
# Normalise drug names
# ===============================
# Both tables are cleaned with the same function so that names compare equal
# regardless of case or surrounding whitespace.
mimic_df["drug"] = mimic_df["drug"].apply(clean_drug_name)
risk_df["Drug 1"] = risk_df["Drug 1"].apply(clean_drug_name)
risk_df["Drug 2"] = risk_df["Drug 2"].apply(clean_drug_name)

# Drop rows with no drug name
mimic_df = mimic_df.dropna(subset=["drug"])
risk_df = risk_df.dropna(subset=["Drug 1", "Drug 2"])

# ===============================
# Generate unique drug pairs from MIMIC-IV
# ===============================
# Only (subject_id, drug) matters here; de-duplicating first keeps the grouped
# frames small.
patient_drugs = mimic_df[["subject_id", "drug"]].drop_duplicates()

pairs = []
for pid, drug_names in patient_drugs.groupby("subject_id")["drug"]:
    ordered = sorted(drug_names)
    # combinations() over a sorted list always yields d1 < d2, so every pair is
    # already in canonical (alphabetical) order.
    pairs.extend((pid, d1, d2) for d1, d2 in combinations(ordered, 2))

pairs_df = pd.DataFrame(pairs, columns=["subject_id", "Drug 1", "Drug 2"])

# ===============================
# Build the interaction lookup
# ===============================
# The interaction table inspected earlier in this notebook exposes only
# 'Drug 1', 'Drug 2' and 'Interaction Description'; selecting a 'Risk' column
# from it raises KeyError. Use 'Risk' when it exists, otherwise fall back to
# the description, and expose it under the name 'Risk' in the output.
if "Risk" in risk_df.columns:
    risk_source_col = "Risk"
else:
    risk_source_col = "Interaction Description"

# Put each interaction into the same alphabetical order as the patient pairs.
# This is a vectorised swap rather than a row-wise apply(), which is very slow
# on large tables.
needs_swap = risk_df["Drug 1"] > risk_df["Drug 2"]
risk_lookup = pd.DataFrame(
    {
        "Drug 1": risk_df["Drug 1"].mask(needs_swap, risk_df["Drug 2"]),
        "Drug 2": risk_df["Drug 2"].mask(needs_swap, risk_df["Drug 1"]),
        "Risk": risk_df[risk_source_col],
    }
)
# An interaction listed in both orders (or repeated) would otherwise multiply
# the patient rows during the merge; keep the first description per pair.
risk_lookup = risk_lookup.drop_duplicates(subset=["Drug 1", "Drug 2"])

# ===============================
# Merge with risk dataset
# ===============================
# Left join: every patient pair is kept, with NaN in 'Risk' when no interaction
# is known. validate guards against a non-unique lookup.
merged_df = pairs_df.merge(
    risk_lookup,
    on=["Drug 1", "Drug 2"],
    how="left",
    validate="m:1",
)

# ===============================
# Save result
# ===============================
merged_df.to_csv("mimic_drug_pairs_with_risk.csv", index=False)

print("✅ Done! Combined dataset saved as mimic_drug_pairs_with_risk.csv")
print("Total pairs:", len(merged_df))
print("Pairs with risk info:", merged_df["Risk"].notna().sum())

