In [1]:
import numpy as np
import pandas as pd
from math import ceil

N_WINDOWS = 5.1e9

In [3]:
cols = ["drug_name_A", "drug_name_B", "drug_name_C", "drug_name_D", "drug_name_E"]
df_out = pd.DataFrame([], columns=cols + ["exact_exposure_count"]) # Output dataframe
unique_drugs = pd.DataFrame([], columns=["drug_name"]) # All unique drug names, used for columns in projected dataset

for i in range(1, 6):
    fn = f"base_data/db_drugs_{i}s.tsv"
    df = pd.read_csv(fn, delimiter="\t", float_precision="high")
    tmp_df = df[cols[:i] + ["exact_exposure_count"]]
    stacked_drugs = df[cols[:i]].T.stack().reset_index(name="drug_name")
    unique_drugs = pd.concat([unique_drugs, stacked_drugs])
    tmp_df[cols[i:]] = float("nan")

    df_out = pd.concat([df_out, tmp_df])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = value
  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
unique_drugs = unique_drugs["drug_name"].unique()


In [5]:
df_out = df_out.replace(r"^\s*$", np.nan, regex=True)  # Replace empty strings by np.nan

df_out = df_out.dropna(
    subset=["exact_exposure_count"]
)  # Drop rows which have no information on fraction
df_out["exact_exposure_count"] = pd.to_numeric(
    df_out["exact_exposure_count"], errors="coerce"
)
df_out = df_out.dropna(
    subset=["exact_exposure_count"]
)  # Drop rows which have no information on fraction


df_out["ratio_exact"] = (
    df_out["exact_exposure_count"] / N_WINDOWS
)  # Find the ratio of exact exposures across all the windows of the source dataset

df_out["ratio_exact"] = df_out["ratio_exact"] * (
    1 / df_out["ratio_exact"].sum()
)  # Rescale so that current combinations are basically all that ever existed (rescale sum of ratios to 1)
df_out = df_out[df_out["ratio_exact"] != 0]

df_out = df_out.drop(
    labels=["exact_exposure_count"], axis=1
)  # Drop the exact count, so we can rescale the dataset to an arbitrary amount of observations


In [5]:
print(f"{len(unique_drugs)=}")


len(unique_drugs)=1180


In [6]:
sample_dataset = pd.DataFrame([], columns=unique_drugs)
num_rows_dataset = 100000

Empty DataFrame
Columns: [hydrochlorothiazide, thyroxine, lisinopril, simvastatin, ethinyl estradiol, acetaminophen, amlodipine, atorvastatin, metoprolol, metformin, hydrocodone, fluticasone, omeprazole, amoxicillin, atenolol, azithromycin, sertraline, zolpidem, albuterol, losartan, montelukast, valsartan, alprazolam, esomeprazole, norethindrone, bupropion, escitalopram, rosuvastatin, citalopram, furosemide, ezetimibe, pravastatin, estradiol, fluoxetine, clopidogrel, benazepril, fenofibrate, pantoprazole, oxycodone, prednisone, gabapentin, triamterene, salmeterol, ibuprofen, warfarin, venlafaxine, tramadol, meloxicam, fexofenadine, clonazepam, dextroamphetamine, olmesartan, mometasone, cyclobenzaprine, carvedilol, amphetamine, tamsulosin, norgestimate, duloxetine, clavulanate, lorazepam, estrogens, conjugated (usp), ciprofloxacin, lansoprazole, drospirenone, trazodone, allopurinol, levonorgestrel, naproxen, diltiazem, codeine, doxycycline, paroxetine, trimethoprim, alendronate, glipizi

In [7]:
for index, row in df_out.iterrows():
    combination = tuple(row[:-1].dropna())
    fraction = float(row[-1])
    num_rows_combi = round(fraction * num_rows_dataset)
    dic = {key: [1] for key in combination}
    dummy_df = pd.DataFrame(dic, columns=unique_drugs).fillna(0)
    dummy_df = pd.DataFrame(np.repeat(dummy_df.values, num_rows_combi, axis=0), columns=dummy_df.columns)

    sample_dataset = pd.concat([sample_dataset, dummy_df])


In [None]:
sample_dataset.to_csv('new_data/sample_dataset.csv')