In [1]:
import glob
import numpy as np
import pandas as pd
from dask import delayed, compute
import dask.dataframe as dd
import pickle
import os

data_dir = "../../data/openFDA_drug_event/"
# er_dir = data_dir+'er_tables/'
er_dir = data_dir+'er_tables_memory_efficient/'

try:
    os.mkdir(er_dir)
except:
    print(er_dir+" exists")

../../data/openFDA_drug_event/er_tables_memory_efficient/ exists


## functions

In [2]:
primarykey = 'safetyreportid'

def read_file(file):
    return pd.read_csv(file,compression='gzip',index_col=0,dtype={primarykey : 'str'})

## ER tables

### report

#### report_df

In [3]:
dir_ = data_dir+'report/'
files = glob.glob(dir_+'*.csv.gzip')
results = []
for file in files:
    df = delayed(read_file)(file)
    results.append(df)
report_df = (pd.concat(compute(*results),sort=True))
report_df[primarykey] = (report_df[primarykey].astype(str))
print(report_df.columns.values)
report_df.head()

['authoritynumb' 'companynumb' 'duplicate' 'fulfillexpeditecriteria'
 'occurcountry' 'primarysource' 'primarysource.literaturereference'
 'primarysource.qualification' 'primarysource.reportercountry'
 'primarysourcecountry' 'receiptdate' 'receiptdateformat' 'receivedate'
 'receivedateformat' 'receiver' 'receiver.receiverorganization'
 'receiver.receivertype' 'reportduplicate' 'reportduplicate.duplicatenumb'
 'reportduplicate.duplicatesource' 'reporttype' 'safetyreportid'
 'safetyreportversion' 'sender.senderorganization' 'sender.sendertype'
 'serious' 'seriousnesscongenitalanomali' 'seriousnessdeath'
 'seriousnessdisabling' 'seriousnesshospitalization'
 'seriousnesslifethreatening' 'seriousnessother' 'transmissiondate'
 'transmissiondateformat']


Unnamed: 0,authoritynumb,companynumb,duplicate,fulfillexpeditecriteria,occurcountry,primarysource,primarysource.literaturereference,primarysource.qualification,primarysource.reportercountry,primarysourcecountry,...,sender.sendertype,serious,seriousnesscongenitalanomali,seriousnessdeath,seriousnessdisabling,seriousnesshospitalization,seriousnesslifethreatening,seriousnessother,transmissiondate,transmissiondateformat
0,,AU-ROCHE-3108574,,True,AU,,,Consumer or non-health professional,AU,AU,...,1.0,The adverse event did not result in any of the...,,,,,,,20250115,102
1,,PH-SUN PHARMACEUTICAL INDUSTRIES LTD-2024R1-47...,,True,PH,,Mesina FZ. Severe Relapsed Autoimmune Hemolyti...,Other health professional,PH,PH,...,1.0,"The adverse event resulted in death, a life th...",2.0,2.0,2.0,1.0,2.0,2.0,20250115,102
2,,US-NOVARTISTESTPH-NVSC2022US139252,,False,US,,,Consumer or non-health professional,US,US,...,1.0,The adverse event did not result in any of the...,2.0,2.0,2.0,2.0,2.0,2.0,20250115,102
3,,CA-CELLTRION INC.-2023CA025283,1.0,True,CA,,,Other health professional,CA,CA,...,1.0,"The adverse event resulted in death, a life th...",2.0,2.0,2.0,2.0,2.0,1.0,20250115,102
4,,US-OPELLA-2024OHG040973,,False,US,,,Consumer or non-health professional,US,US,...,1.0,The adverse event did not result in any of the...,,,,,,,20250115,102


#### report_er_df

In [4]:
columns = [primarykey,'receiptdate',
           'receivedate',
           'transmissiondate']
rename_columns = {'receiptdate' : 'mostrecent_receive_date',
                  'receivedate' : 'receive_date',
                  'transmissiondate' : 'lastupdate_date'}

report_er_df = (report_df[columns].
                rename(columns=rename_columns).
                set_index(primarykey).
                sort_index().
                reset_index().
                dropna(subset=[primarykey]).
                drop_duplicates()
               )
report_er_df = report_er_df.reindex(np.sort(report_er_df.columns),axis=1)
report_er_df[primarykey] = report_er_df[primarykey].astype(str)       
report_er_df = report_er_df.reindex(np.sort(report_er_df.columns),axis=1)
print(report_er_df.info())
report_er_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19028302 entries, 0 to 19028301
Data columns (total 4 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   lastupdate_date          int64 
 1   mostrecent_receive_date  int64 
 2   receive_date             int64 
 3   safetyreportid           object
dtypes: int64(3), object(1)
memory usage: 580.7+ MB
None


Unnamed: 0,lastupdate_date,mostrecent_receive_date,receive_date,safetyreportid
0,20141002,20140306,20140306,10003300
1,20141002,20140228,20140228,10003301
2,20141002,20140312,20140312,10003302
3,20141212,20140424,20140312,10003304
4,20141002,20140312,20140312,10003305


In [5]:
(report_er_df.
 groupby(primarykey).
 agg(max).
 reset_index().
 dropna(subset=[primarykey])
).to_csv(er_dir+'report.csv.gz',compression='gzip',index=False)

  agg(max).


In [6]:
del report_er_df

### report_serious

In [7]:
columns = [
    primarykey,
    'serious',
    'seriousnesscongenitalanomali',
    'seriousnessdeath',
    'seriousnessdisabling',
    'seriousnesshospitalization',
    'seriousnesslifethreatening',
    'seriousnessother'
]

rename_columns = {
    'seriousnesscongenitalanomali'   : 'congenital_anomali',
    'seriousnessdeath'               : 'death',
    'seriousnessdisabling'           : 'disabling',
    'seriousnesshospitalization'     : 'hospitalization',
    'seriousnesslifethreatening'     : 'life_threatening',
    'seriousnessother'               : 'other'
}

report_serious_er_df = (report_df[columns].
                        rename(columns=rename_columns).
                        set_index(primarykey).
                        sort_index().
                        reset_index().
                        dropna(subset=[primarykey]).
                        drop_duplicates().
                        groupby(primarykey).
                        first().
                        reset_index().
                        dropna(subset=[primarykey])
                       )
report_serious_er_df[primarykey] = report_serious_er_df[primarykey].astype(str)       
report_serious_er_df = report_serious_er_df.reindex(np.sort(report_serious_er_df.columns),axis=1)
print(report_serious_er_df.info())
report_serious_er_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19026493 entries, 0 to 19026492
Data columns (total 8 columns):
 #   Column              Dtype  
---  ------              -----  
 0   congenital_anomali  float64
 1   death               float64
 2   disabling           float64
 3   hospitalization     float64
 4   life_threatening    float64
 5   other               float64
 6   safetyreportid      object 
 7   serious             object 
dtypes: float64(6), object(2)
memory usage: 1.1+ GB
None


Unnamed: 0,congenital_anomali,death,disabling,hospitalization,life_threatening,other,safetyreportid,serious
0,,,1.0,,,,10003300,"The adverse event resulted in death, a life th..."
1,,,,,,1.0,10003301,"The adverse event resulted in death, a life th..."
2,,,,,,,10003302,The adverse event did not result in any of the...
3,,,,,,,10003304,The adverse event did not result in any of the...
4,,,,,,,10003305,The adverse event did not result in any of the...


In [8]:
(report_serious_er_df).to_csv(er_dir+'report_serious.csv.gz',compression='gzip',index=False)

### reporter

In [9]:
columns = [primarykey,'companynumb',
           'primarysource.qualification',
           'primarysource.reportercountry']
rename_columns = {'companynumb' : 'reporter_company',
                  'primarysource.qualification' : 'reporter_qualification',
                  'primarysource.reportercountry' : 'reporter_country'}

reporter_er_df = (report_df[columns].
                  rename(columns=rename_columns).
                  set_index(primarykey).
                  sort_index().
                  reset_index().
                  dropna(subset=[primarykey]).
                  drop_duplicates().
                  groupby(primarykey).
                  first().
                  reset_index().
                  dropna(subset=[primarykey])
                 )
reporter_er_df[primarykey] = reporter_er_df[primarykey].astype(str)  
reporter_er_df = reporter_er_df.reindex(np.sort(reporter_er_df.columns),axis=1)
print(reporter_er_df.info())
reporter_er_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19026493 entries, 0 to 19026492
Data columns (total 4 columns):
 #   Column                  Dtype 
---  ------                  ----- 
 0   reporter_company        object
 1   reporter_country        object
 2   reporter_qualification  object
 3   safetyreportid          object
dtypes: object(4)
memory usage: 580.6+ MB
None


Unnamed: 0,reporter_company,reporter_country,reporter_qualification,safetyreportid
0,1289378,US,Consumer or non-health professional,10003300
1,US-JNJFOC-20130719067,US,Consumer or non-health professional,10003301
2,US-PFIZER INC-2014068976,US,Consumer or non-health professional,10003302
3,US-PFIZER INC-2014063856,US,Physician,10003304
4,US-PFIZER INC-2014069067,US,Physician,10003305


In [10]:
(reporter_er_df).to_csv(er_dir+'reporter.csv.gz',compression='gzip',index=False)

In [11]:
try:
    del df
except:
    pass
try:
    del report_df
except:
    pass
try:
    del report_serious_er_df
except:
    pass
try:
    del report_er_df
except:
    pass
try:
    del reporter_er_df
except:
    pass

### patient

#### patient_df

In [12]:
dir_ = data_dir+'patient/'
files = glob.glob(dir_+'*.csv.gzip')
results = []
for file in files:
    df = delayed(read_file)(file)
    results.append(df)
patient_df = (pd.concat(compute(*results),sort=True))
patient_df[primarykey] = (patient_df[primarykey].astype(str))
print(patient_df.columns.values)
patient_df.head()

['master_age' 'patient.patientagegroup'
 'patient.patientdeath.patientdeathdate'
 'patient.patientdeath.patientdeathdateformat' 'patient.patientonsetage'
 'patient.patientonsetageunit' 'patient.patientsex'
 'patient.patientweight' 'patient.summary.narrativeincludeclinical'
 'safetyreportid']


Unnamed: 0,master_age,patient.patientagegroup,patient.patientdeath.patientdeathdate,patient.patientdeath.patientdeathdateformat,patient.patientonsetage,patient.patientonsetageunit,patient.patientsex,patient.patientweight,patient.summary.narrativeincludeclinical,safetyreportid
0,,,,,,,Female,,,24392017
1,45.0,,,,45.0,Year,Female,47.0,CASE EVENT DATE: 20240115,24392408
2,72.0,,,,72.0,Year,Female,70.0,CASE EVENT DATE: 20230609,22605263
3,,,,,,,Female,,CASE EVENT DATE: 20220101,22653754
4,62.0,Adult,,,62.0,Year,Female,,,22709563


#### patient_er_df

In [13]:
columns = [primarykey,
              'patient.patientonsetage',
              'patient.patientonsetageunit',
              'master_age',
              'patient.patientsex',
              'patient.patientweight'
             ]
rename_columns = {
              'patient.patientonsetage' : 'patient_onsetage',
              'patient.patientonsetageunit' : 'patient_onsetageunit',
              'master_age': 'patient_custom_master_age',
              'patient.patientsex' : 'patient_sex',
              'patient.patientweight' : 'patient_weight'
}

patient_er_df = (patient_df[columns].
                 rename(columns=rename_columns).
                 set_index(primarykey).
                 sort_index().
                 reset_index().
                 dropna(subset=[primarykey]).
                 drop_duplicates().
                 groupby(primarykey).
                 first().
                 reset_index().
                 dropna(subset=[primarykey])
                )
patient_er_df = patient_er_df.reindex(np.sort(patient_er_df.columns),axis=1)
print(patient_er_df.info())
patient_er_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19026493 entries, 0 to 19026492
Data columns (total 6 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   patient_custom_master_age  float64
 1   patient_onsetage           float64
 2   patient_onsetageunit       object 
 3   patient_sex                object 
 4   patient_weight             float64
 5   safetyreportid             object 
dtypes: float64(3), object(3)
memory usage: 871.0+ MB
None


Unnamed: 0,patient_custom_master_age,patient_onsetage,patient_onsetageunit,patient_sex,patient_weight,safetyreportid
0,77.0,77.0,Year,Female,,10003300
1,,,,Female,,10003301
2,,,,Male,,10003302
3,,,,Female,,10003304
4,48.0,48.0,Year,Female,,10003305


In [14]:
(patient_er_df).to_csv(er_dir+'patient.csv.gz',compression='gzip',index=False)

In [15]:
del df 
del patient_df

### drug_characteristics

#### patient.drug

In [3]:
dir_ = data_dir+'patient_drug/'
files = glob.glob(dir_+'*.csv.gzip')
results = []
for file in files:
    df = delayed(read_file)(file)
    results.append(df)
patient_drug_df = (pd.concat(compute(*results),sort=True))
patient_drug_df[primarykey] = (patient_drug_df[primarykey].astype(str))
print(patient_drug_df.columns.values)
patient_drug_df.head()

  return pd.read_csv(file,compression='gzip',index_col=0,dtype={primarykey : 'str'})
  return pd.read_csv(file,compression='gzip',index_col=0,dtype={primarykey : 'str'})
  return pd.read_csv(file,compression='gzip',index_col=0,dtype={primarykey : 'str'})
  return pd.read_csv(file,compression='gzip',index_col=0,dtype={primarykey : 'str'})
  return pd.read_csv(file,compression='gzip',index_col=0,dtype={primarykey : 'str'})
  return pd.read_csv(file,compression='gzip',index_col=0,dtype={primarykey : 'str'})
  return pd.read_csv(file,compression='gzip',index_col=0,dtype={primarykey : 'str'})
  return pd.read_csv(file,compression='gzip',index_col=0,dtype={primarykey : 'str'})
  return pd.read_csv(file,compression='gzip',index_col=0,dtype={primarykey : 'str'})
  return pd.read_csv(file,compression='gzip',index_col=0,dtype={primarykey : 'str'})
  return pd.read_csv(file,compression='gzip',index_col=0,dtype={primarykey : 'str'})
  return pd.read_csv(file,compression='gzip',index_col=0,dtype={p

['actiondrug' 'activesubstance' 'drugadditional' 'drugadministrationroute'
 'drugauthorizationnumb' 'drugbatchnumb' 'drugcharacterization'
 'drugcumulativedosagenumb' 'drugcumulativedosageunit' 'drugdosageform'
 'drugdosagetext' 'drugenddate' 'drugenddateformat' 'drugindication'
 'drugintervaldosagedefinition' 'drugintervaldosageunitnumb'
 'drugrecurreadministration' 'drugrecurrence' 'drugseparatedosagenumb'
 'drugstartdate' 'drugstartdateformat' 'drugstructuredosagenumb'
 'drugstructuredosageunit' 'drugtreatmentduration'
 'drugtreatmentdurationunit' 'entry' 'medicinalproduct' 'safetyreportid']


Unnamed: 0,actiondrug,activesubstance,drugadditional,drugadministrationroute,drugauthorizationnumb,drugbatchnumb,drugcharacterization,drugcumulativedosagenumb,drugcumulativedosageunit,drugdosageform,...,drugseparatedosagenumb,drugstartdate,drugstartdateformat,drugstructuredosagenumb,drugstructuredosageunit,drugtreatmentduration,drugtreatmentdurationunit,entry,medicinalproduct,safetyreportid
0,,{'activesubstancename': 'BIOTIN'},,,,,Concomitant (the drug was reported as being ta...,,,,...,,,,,,,,0,BIOTIN,11428226
1,,{'activesubstancename': 'ZINC\\ZINC CHLORIDE'},,,,,Concomitant (the drug was reported as being ta...,,,,...,,,,,,,,1,ZINC,11428226
2,Unknown,{'activesubstancename': 'ETANERCEPT'},,Subcutaneous,103795.0,1057047.0,Suspect (the drug was considered by the report...,,,SOLUTION FOR INJECTION IN PRE-FILLED SYRINGE,...,1.0,,,50.0,,,,2,ENBREL,11428226
3,,,,,,,Concomitant (the drug was reported as being ta...,,,,...,,,,,,,,3,IRON AND VITAMIN C,11428226
4,,{'activesubstancename': 'ASCORBIC ACID'},,,,,Concomitant (the drug was reported as being ta...,,,,...,,,,,,,,4,VITAMIN C,11428226


#### drugcharacteristics_er_df

In [4]:
# columns = [primarykey,
#            'medicinalproduct',
#            'drugcharacterization',
#            'drugadministrationroute',
#            'drugindication'
#           ]
# rename_columns = {
#               'medicinalproduct' : 'medicinal_product',
#               'drugcharacterization' : 'drug_characterization',
#               'drugadministrationroute': 'drug_administration',
#     'drugindication' : 'drug_indication'
# }

# drugcharacteristics_er_df = (patient_drug_df[columns].
#                              rename(columns=rename_columns).
#                              set_index(primarykey).
#                              sort_index().
#                              reset_index().
#                              drop_duplicates().
#                              dropna(subset=[primarykey])
#                             )
# drugcharacteristics_er_df = (drugcharacteristics_er_df.
#                              reindex(np.sort(drugcharacteristics_er_df.columns),axis=1))
# print(drugcharacteristics_er_df.info())
# drugcharacteristics_er_df.head()

# Memory-efficient drug characteristics processing
import gc
import warnings
warnings.filterwarnings('ignore')

print("Starting memory-efficient drug characteristics processing...")

# ตรวจสอบขนาดข้อมูลก่อน
if 'patient_drug_df' in locals():
    print(f"patient_drug_df shape: {patient_drug_df.shape}")
    print(f"Memory usage: {patient_drug_df.memory_usage(deep=True).sum() / 1e6:.2f} MB")
else:
    print("Error: patient_drug_df not found")

# Define columns and rename mapping
columns = [primarykey,
           'medicinalproduct',
           'drugcharacterization',
           'drugadministrationroute',
           'drugindication'
          ]

rename_columns = {
    'medicinalproduct' : 'medicinal_product',
    'drugcharacterization' : 'drug_characterization',
    'drugadministrationroute': 'drug_administration',
    'drugindication' : 'drug_indication'
}

try:
    # Check if all required columns exist
    missing_cols = [col for col in columns if col not in patient_drug_df.columns]
    if missing_cols:
        print(f"Missing columns: {missing_cols}")
        print(f"Available columns: {list(patient_drug_df.columns)}")
    else:
        print("Processing drug characteristics...")
        
        # Process in more memory-efficient way
        # 1. Select columns first to reduce memory
        print("Selecting required columns...")
        selected_data = patient_drug_df[columns].copy()
        
        print(f"Selected data shape: {selected_data.shape}")
        
        # 2. Drop rows with null primary key early
        print("Dropping null primary keys...")
        selected_data = selected_data.dropna(subset=[primarykey])
        print(f"After dropping null primary keys: {selected_data.shape}")
        
        # 3. Remove duplicates early to reduce size
        print("Removing duplicates...")
        selected_data = selected_data.drop_duplicates()
        print(f"After removing duplicates: {selected_data.shape}")
        
        # 4. Rename columns
        print("Renaming columns...")
        selected_data = selected_data.rename(columns=rename_columns)
        
        # 5. Convert to appropriate dtypes to save memory
        print("Optimizing data types...")
        if primarykey in selected_data.columns:
            selected_data[primarykey] = selected_data[primarykey].astype('str')
        
        # Convert other columns to category if they have limited unique values
        for col in selected_data.columns:
            if col != primarykey and selected_data[col].dtype == 'object':
                unique_ratio = selected_data[col].nunique() / len(selected_data)
                if unique_ratio < 0.5:  # If less than 50% unique values, use category
                    selected_data[col] = selected_data[col].astype('category')
                    print(f"Converted {col} to category (unique ratio: {unique_ratio:.3f})")
        
        # 6. Set index and sort (more efficiently)
        print("Setting index and sorting...")
        drugcharacteristics_er_df = (selected_data.
                                     set_index(primarykey).
                                     sort_index())
        
        # Clean up intermediate data
        del selected_data
        gc.collect()
        
        # 7. Reset index and final cleanup
        print("Final processing...")
        drugcharacteristics_er_df = drugcharacteristics_er_df.reset_index()
        
        # 8. Sort columns
        drugcharacteristics_er_df = (drugcharacteristics_er_df.
                                    reindex(sorted(drugcharacteristics_er_df.columns), axis=1))
        
        print("Processing completed successfully!")
        print(f"Final shape: {drugcharacteristics_er_df.shape}")
        print(f"Memory usage: {drugcharacteristics_er_df.memory_usage(deep=True).sum() / 1e6:.2f} MB")
        print("\nDataFrame info:")
        print(drugcharacteristics_er_df.info())
        print("\nFirst few rows:")
        print(drugcharacteristics_er_df.head())

except Exception as e:
    print(f"Error during processing: {e}")
    import traceback
    traceback.print_exc()

# Final cleanup
gc.collect()
print("Memory cleanup completed.")

Starting memory-efficient drug characteristics processing...
patient_drug_df shape: (70179931, 28)
Memory usage: 67469.37 MB
Processing drug characteristics...
Selecting required columns...
Selected data shape: (70179931, 5)
Dropping null primary keys...
After dropping null primary keys: (70179931, 5)
Removing duplicates...
After removing duplicates: (64177920, 5)
Renaming columns...
Optimizing data types...
Converted medicinal_product to category (unique ratio: 0.013)
Converted drug_characterization to category (unique ratio: 0.000)
Converted drug_administration to category (unique ratio: 0.000)
Converted drug_indication to category (unique ratio: 0.001)
Setting index and sorting...
Final processing...
Processing completed successfully!
Final shape: (64177920, 5)
Memory usage: 4276.95 MB

DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64177920 entries, 0 to 64177919
Data columns (total 5 columns):
 #   Column                 Dtype   
---  ------                 ----

In [5]:
(drugcharacteristics_er_df
).to_csv(er_dir+'drugcharacteristics.csv.gz',compression='gzip',index=False)

In [6]:
del drugcharacteristics_er_df
del patient_drug_df
del df

### drugs

#### patient.drug.openfda.rxcui_df

In [4]:
# dir_ = data_dir+'patient_drug_openfda_rxcui/'
# files = glob.glob(dir_+'*.csv.gzip')
# results = []
# for file in files:
#     df = delayed(read_file)(file)
#     results.append(df)
# patient_drug_openfda_rxcui_df = (pd.concat(compute(*results),sort=True))
# print(patient_drug_openfda_rxcui_df.columns.values)
# patient_drug_openfda_rxcui_df[primarykey] = (patient_drug_openfda_rxcui_df[primarykey].
#                                        astype(str))
# patient_drug_openfda_rxcui_df.value = (patient_drug_openfda_rxcui_df.
#                                  value.astype(int))
# patient_drug_openfda_rxcui_df.head()

# Improved memory-efficient loading - process cleaning in each chunk
import gc

dir_ = data_dir+'patient_drug_openfda_rxcui/'
files = glob.glob(dir_+'*.csv.gzip')

print(f"Found {len(files)} files to process")

# Process files in smaller chunks and clean each chunk before combining
chunk_size = 10  # Reduced chunk size for better memory management
all_chunks = []

for i in range(0, len(files), chunk_size):
    chunk_files = files[i:i+chunk_size]
    print(f"Processing chunk {i//chunk_size + 1}/{(len(files)-1)//chunk_size + 1} ({len(chunk_files)} files)")
    
    # Load current chunk
    results = []
    for file in chunk_files:
        df = delayed(read_file)(file)
        results.append(df)
    
    # Combine current chunk
    chunk_df = pd.concat(compute(*results), sort=True)
    
    # Clean and process current chunk BEFORE adding to all_chunks
    print(f"  - Cleaning chunk {i//chunk_size + 1}...")
    
    # Handle primary key
    chunk_df[primarykey] = chunk_df[primarykey].astype(str)
    
    # Handle missing values and type conversion in this chunk
    if 'value' in chunk_df.columns:
        chunk_df = chunk_df.dropna(subset=['value'])  # Clean nulls in smaller chunk
        chunk_df['value'] = chunk_df['value'].astype(int)
    
    # Store cleaned chunk
    all_chunks.append(chunk_df)
    
    print(f"  - Chunk {i//chunk_size + 1} shape: {chunk_df.shape}")
    
    # Clean up memory for this chunk
    del results, chunk_df
    gc.collect()

# Combine all cleaned chunks (should be safer now)
print("Combining all cleaned chunks...")
patient_drug_openfda_rxcui_df = pd.concat(all_chunks, sort=True)

# Clean up chunk list
del all_chunks
gc.collect()

print(f"Final dataframe shape: {patient_drug_openfda_rxcui_df.shape}")
print(patient_drug_openfda_rxcui_df.columns.values)
patient_drug_openfda_rxcui_df.head()

Found 1626 files to process
Processing chunk 1/163 (10 files)
  - Cleaning chunk 1...
  - Chunk 1 shape: (4415694, 4)
Processing chunk 2/163 (10 files)
  - Cleaning chunk 2...
  - Chunk 2 shape: (3431682, 4)
Processing chunk 3/163 (10 files)
  - Cleaning chunk 3...
  - Chunk 3 shape: (2818902, 4)
Processing chunk 4/163 (10 files)
  - Cleaning chunk 4...
  - Chunk 4 shape: (3815900, 4)
Processing chunk 5/163 (10 files)
  - Cleaning chunk 5...
  - Chunk 5 shape: (2431579, 4)
Processing chunk 6/163 (10 files)
  - Cleaning chunk 6...
  - Chunk 6 shape: (3266609, 4)
Processing chunk 7/163 (10 files)
  - Cleaning chunk 7...
  - Chunk 7 shape: (3010601, 4)
Processing chunk 8/163 (10 files)
  - Cleaning chunk 8...
  - Chunk 8 shape: (3788597, 4)
Processing chunk 9/163 (10 files)
  - Cleaning chunk 9...
  - Chunk 9 shape: (4307168, 4)
Processing chunk 10/163 (10 files)
  - Cleaning chunk 10...
  - Chunk 10 shape: (4804794, 4)
Processing chunk 11/163 (10 files)
  - Cleaning chunk 11...
  - Chunk

Unnamed: 0,entry,key,safetyreportid,value
18,0,rxcui,20115377,403878
19,0,rxcui,20115377,403879
59,1,rxcui,20115377,403878
60,1,rxcui,20115377,403879
160,2,rxcui,20115377,643105


#### drugs_er_df

In [5]:
columns = [primarykey,
              'value'
             ]
rename_columns = {
              'value' : 'rxcui'
}

drugs_er_df = (patient_drug_openfda_rxcui_df[columns].
               rename(columns=rename_columns).
               set_index(primarykey).
               sort_index().
               reset_index().
               drop_duplicates().
               dropna(subset=[primarykey])
              )
drugs_er_df = drugs_er_df.reindex(np.sort(drugs_er_df.columns),axis=1)
print(drugs_er_df.info())
drugs_er_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 433861426 entries, 0 to 552739240
Data columns (total 2 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   rxcui           int64 
 1   safetyreportid  object
dtypes: int64(1), object(1)
memory usage: 9.7+ GB
None


Unnamed: 0,rxcui,safetyreportid
0,197807,10003301
1,854183,10003301
2,731533,10003301
3,544393,10003301
4,206878,10003301


In [6]:
drugs_er_df['rxcui'] = drugs_er_df['rxcui'].astype(int)

In [7]:
drugs_er_df[primarykey] = drugs_er_df[primarykey].astype(str)

In [8]:
(drugs_er_df).to_csv(er_dir+'drugs.csv.gz',compression='gzip',index=False)

In [9]:
del patient_drug_openfda_rxcui_df
del drugs_er_df
del df

### reactions

#### patient.reaction_df

In [3]:
dir_ = data_dir+'patient_reaction/'
files = glob.glob(dir_+'*.csv.gzip')
results = []
for file in files:
    df = delayed(read_file)(file)
    results.append(df)
patient_reaction_df = (pd.concat(compute(*results),sort=True))
patient_reaction_df[primarykey] = (patient_reaction_df[primarykey].astype(str))
print(patient_reaction_df.columns.values)
patient_reaction_df.head()

['entry' 'reactionmeddrapt' 'reactionmeddraversionpt' 'reactionoutcome'
 'safetyreportid']


Unnamed: 0,entry,reactionmeddrapt,reactionmeddraversionpt,reactionoutcome,safetyreportid
0,0,Hypogammaglobulinaemia,27.1,Recovered/resolved,23694428
1,1,Graft versus host disease in gastrointestinal ...,27.1,Unknown,23694428
2,2,Graft versus host disease in skin,27.1,Unknown,23694428
3,3,Graft versus host disease in liver,27.1,Unknown,23694428
4,4,Acute graft versus host disease in skin,27.1,Recovered/resolved,23694428


#### patient_reaction_er_df

In [4]:
columns = [primarykey,
              'reactionmeddrapt',
           'reactionoutcome'
             ]
rename_columns = {
              'reactionmeddrapt' : 'reaction_meddrapt',
    'reactionoutcome' : 'reaction_outcome'
}

reactions_er_df = (patient_reaction_df[columns].
                   rename(columns=rename_columns).
                   set_index(primarykey).
                   sort_index().
                   reset_index().
                   dropna(subset=[primarykey]).
                   drop_duplicates()
                  )
reactions_er_df[primarykey] = reactions_er_df[primarykey].astype(str)
reactions_er_df = reactions_er_df.reindex(np.sort(reactions_er_df.columns),axis=1)
print(reactions_er_df.info())
reactions_er_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 56821724 entries, 0 to 57109545
Data columns (total 3 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   reaction_meddrapt  object
 1   reaction_outcome   object
 2   safetyreportid     object
dtypes: object(3)
memory usage: 1.7+ GB
None


Unnamed: 0,reaction_meddrapt,reaction_outcome,safetyreportid
0,Diarrhoea,,10003300
1,Arthralgia,,10003300
2,Headache,,10003300
3,Vomiting,,10003300
4,Dyspepsia,,10003301


In [5]:
(reactions_er_df).to_csv(er_dir+'reactions.csv.gz',compression='gzip',index=False)

In [6]:
del patient_reaction_df
del reactions_er_df
del df

### omop tables for joining

In [7]:
concept = (pd.read_csv('../../data/openFDA_drug_event/vocabulary_download_v5/CONCEPT.csv',sep='\t',
                      dtype={
                          'concept_id' : 'int'
                      }))
concept.head()

  concept = (pd.read_csv('../../data/openFDA_drug_event/vocabulary_download_v5/CONCEPT.csv',sep='\t',


Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
0,45756805,Pediatric Cardiology,Provider,ABMS,Physician Specialty,S,OMOP4821938,19700101,20991231,
1,45756804,Pediatric Anesthesiology,Provider,ABMS,Physician Specialty,S,OMOP4821939,19700101,20991231,
2,45756803,Pathology-Anatomic / Pathology-Clinical,Provider,ABMS,Physician Specialty,S,OMOP4821940,19700101,20991231,
3,45756802,Pathology - Pediatric,Provider,ABMS,Physician Specialty,S,OMOP4821941,19700101,20991231,
4,45756801,Pathology - Molecular Genetic,Provider,ABMS,Physician Specialty,S,OMOP4821942,19700101,20991231,


In [8]:
concept_relationship = (pd.
                        read_csv('../../data/openFDA_drug_event/vocabulary_download_v5/'+
                                 'CONCEPT_RELATIONSHIP.csv',sep='\t',
                                dtype={
                                    'concept_id_1' : 'int',
                                    'concept_id_2' : 'int'
                                }))
concept_relationship.head()

Unnamed: 0,concept_id_1,concept_id_2,relationship_id,valid_start_date,valid_end_date,invalid_reason
0,21014172,21034689,RxNorm dose form of,20150817,20991231,
1,21014172,21044546,RxNorm dose form of,20150817,20991231,
2,21014172,21054337,RxNorm dose form of,20150817,20991231,
3,21014172,21064199,RxNorm dose form of,20150817,20991231,
4,21014172,21095576,RxNorm dose form of,20150817,20991231,


### standard_drugs

In [9]:
drugs = (pd.read_csv(
    er_dir+'drugs.csv.gz',
    compression='gzip',
    dtype={
        'safetyreportid' : 'str'
    }
)
        )

In [10]:
drugs['rxcui'] = drugs['rxcui'].astype(int)

In [11]:
urxcuis = drugs['rxcui'].unique()

In [12]:
print(len(urxcuis))
urxcuis[:5]

15665


array([197807, 854183, 731533, 544393, 206878])

In [13]:
rxnorm_concept = concept.query('vocabulary_id=="RxNorm"')

In [14]:
concept_codes = rxnorm_concept['concept_code'].astype(int).unique()
print(len(concept_codes))
print(len(urxcuis))

intersect = np.intersect1d(concept_codes,urxcuis)

print(len(intersect))
print(len(intersect)/len(urxcuis))

311332
15665
15330
0.978614746249601


In [15]:
del urxcuis
del concept_codes

In [16]:
# rxnorm_concept = concept.query('vocabulary_id=="RxNorm"')

# rxnorm_concept_ids = (rxnorm_concept.
#                       query('concept_code in @intersect')['concept_id'].
#                       astype(int).
#                       unique()
#                      )
# all_rxnorm_concept_ids = (rxnorm_concept['concept_id'].
#                           unique()
#                          )

# r = (concept_relationship.
#      copy().
#      loc[:,['concept_id_1','concept_id_2','relationship_id']].
#      drop_duplicates()
#     )
# c = rxnorm_concept.copy()
# c['concept_id'] = c['concept_id'].astype(int)
# c['concept_code'] = c['concept_code'].astype(int)

# joined = (drugs.
#           set_index('rxcui').
#           join(
#               c. 
#               query('vocabulary_id=="RxNorm"').
#               loc[:,['concept_id','concept_code','concept_name','concept_class_id']].
#               drop_duplicates().
#               set_index('concept_code')
#           ).
#           dropna().
#           rename_axis('RxNorm_concept_code').
#           reset_index().
#           rename(
#               columns={
#                   'concept_class_id' : 'RxNorm_concept_class_id',
#                   'concept_name' : 'RxNorm_concept_name',
#                   'concept_id' : 'RxNorm_concept_id'
#               }
#           ).
#           dropna(subset=['RxNorm_concept_id']).
#           drop_duplicates()
#          )
# joined = (joined.
#           reindex(np.sort(joined.columns),axis=1)
#          )
# print(joined.shape)
# print(joined.head())

In [17]:
# Memory-efficient replacement - Fixed version with proper column handling
import gc
import warnings
warnings.filterwarnings('ignore')

print("Starting memory-efficient RxNorm processing (v3)...")

# Step 1: Filter RxNorm concepts early and optimize memory
print("Filtering RxNorm concepts...")
rxnorm_concept = concept.query('vocabulary_id=="RxNorm"').copy()

# Convert concept_code to int32 to save memory
print("Converting concept codes...")
try:
    rxnorm_concept['concept_code'] = pd.to_numeric(rxnorm_concept['concept_code'], errors='coerce')
    rxnorm_concept = rxnorm_concept.dropna(subset=['concept_code'])
    rxnorm_concept['concept_code'] = rxnorm_concept['concept_code'].astype('int32')
    rxnorm_concept['concept_id'] = rxnorm_concept['concept_id'].astype('int32')
except Exception as e:
    print(f"Error in data conversion: {e}")

print(f"RxNorm concept shape: {rxnorm_concept.shape}")

# Step 2: Create filtered concept IDs using the intersect
print("Creating filtered concept IDs...")
if 'intersect' in locals():
    rxnorm_concept_ids = (rxnorm_concept.
                          query('concept_code in @intersect')['concept_id'].
                          unique())
else:
    print("Warning: 'intersect' variable not found, using all RxNorm concepts")
    rxnorm_concept_ids = rxnorm_concept['concept_id'].unique()

all_rxnorm_concept_ids = rxnorm_concept['concept_id'].unique()

print(f"Filtered concept IDs: {len(rxnorm_concept_ids)}")
print(f"All RxNorm concept IDs: {len(all_rxnorm_concept_ids)}")

# Step 3: Prepare concept relationship data efficiently
print("Preparing concept relationship data...")
if 'concept_relationship' in locals():
    r = (concept_relationship.
         loc[:,['concept_id_1','concept_id_2','relationship_id']].
         drop_duplicates())
    r['concept_id_1'] = r['concept_id_1'].astype('int32')
    r['concept_id_2'] = r['concept_id_2'].astype('int32')
else:
    print("Warning: 'concept_relationship' not loaded")
    r = None

# Step 4: Check for duplicates and prepare data more carefully
print("Checking for duplicates in drugs data...")
if 'drugs' in locals():
    drugs_copy = drugs.copy()
    
    # Convert rxcui and check for issues
    try:
        drugs_copy['rxcui'] = pd.to_numeric(drugs_copy['rxcui'], errors='coerce')
        drugs_copy = drugs_copy.dropna(subset=['rxcui'])
        drugs_copy['rxcui'] = drugs_copy['rxcui'].astype('int32')
        
        print(f"Drugs shape before duplicate check: {drugs_copy.shape}")
        
        # Check for duplicate rxcui values
        duplicate_rxcui = drugs_copy['rxcui'].duplicated().sum()
        print(f"Number of duplicate rxcui values: {duplicate_rxcui}")
        
        # If there are too many duplicates, this might cause the explosion
        if duplicate_rxcui > 100000:  # Arbitrary threshold
            print("Too many duplicates detected - using first occurrence only")
            drugs_copy = drugs_copy.drop_duplicates(subset=['rxcui'], keep='first')
            print(f"Drugs shape after removing duplicates: {drugs_copy.shape}")
            
    except Exception as e:
        print(f"Error converting drugs rxcui: {e}")
        
else:
    print("Error: 'drugs' variable not found")

# Step 5: Prepare concept data and check for duplicates
print("Preparing concept data for join...")
if 'intersect' in locals():
    # Use only the intersecting concept codes to reduce size
    c_filtered = rxnorm_concept.query('concept_code in @intersect').copy()
else:
    c_filtered = rxnorm_concept.copy()

print(f"Filtered concept data shape: {c_filtered.shape}")

# Remove duplicates in concept_code 
concept_code_duplicates = c_filtered['concept_code'].duplicated().sum()
print(f"Number of duplicate concept_code values: {concept_code_duplicates}")

if concept_code_duplicates > 0:
    print("Removing duplicate concept codes...")
    c_filtered = c_filtered.drop_duplicates(subset=['concept_code'], keep='first')
    print(f"Concept data shape after removing duplicates: {c_filtered.shape}")

# Prepare final concept data for join
concept_for_join = (c_filtered[['concept_id', 'concept_code', 'concept_name', 'concept_class_id']]
                   .set_index('concept_code'))

# Step 6: Perform join with proper column handling
print("Performing join operation...")
try:
    # Check the expected size of the join
    drugs_rxcui_unique = drugs_copy['rxcui'].nunique()
    concept_codes_unique = concept_for_join.index.nunique()
    
    print(f"Unique drugs rxcui: {drugs_rxcui_unique}")
    print(f"Unique concept codes: {concept_codes_unique}")
    
    # Find intersection to estimate result size
    common_codes = set(drugs_copy['rxcui'].unique()) & set(concept_for_join.index.unique())
    print(f"Common codes between drugs and concepts: {len(common_codes)}")
    
    if len(common_codes) == 0:
        print("Warning: No common codes found between drugs and concepts!")
        joined = pd.DataFrame()
    else:
        # Perform the join with explicit column creation
        drugs_filtered = drugs_copy[drugs_copy['rxcui'].isin(common_codes)]
        concept_filtered = concept_for_join[concept_for_join.index.isin(common_codes)]
        
        print(f"Filtered drugs shape: {drugs_filtered.shape}")
        print(f"Filtered concept shape: {concept_filtered.shape}")
        
        # Use merge instead of join for better control
        joined = drugs_filtered.merge(
            concept_filtered.reset_index(),  # Reset index to make concept_code a column
            left_on='rxcui',
            right_on='concept_code',
            how='inner'
        )
        
        # Properly rename columns and ensure RxNorm_concept_code exists
        joined = joined.rename(columns={
            'concept_code': 'RxNorm_concept_code',  # This should work now
            'concept_class_id': 'RxNorm_concept_class_id',
            'concept_name': 'RxNorm_concept_name',
            'concept_id': 'RxNorm_concept_id'
        })
        
        # Remove the duplicate rxcui column since we have RxNorm_concept_code
        joined = joined.drop('rxcui', axis=1)
        
        # Clean up and sort
        joined = joined.dropna().drop_duplicates()
        
        print(f"Columns after join and rename:")
        print(list(joined.columns))
    
    # Clean up intermediate variables
    del drugs_copy, concept_for_join, c_filtered
    if 'drugs_filtered' in locals():
        del drugs_filtered
    if 'concept_filtered' in locals():
        del concept_filtered
    gc.collect()
    
    # Sort columns
    if not joined.empty:
        joined = joined.reindex(sorted(joined.columns), axis=1)
    
    print(f"Join completed successfully!")
    print(f"Joined shape: {joined.shape}")
    if not joined.empty:
        print("First few rows:")
        print(joined.head())
        
        # Test the attribute that was causing the error
        if 'RxNorm_concept_code' in joined.columns:
            print(f"RxNorm_concept_code column exists with {joined['RxNorm_concept_code'].nunique()} unique values")
        else:
            print("Warning: RxNorm_concept_code column still missing!")
            print("Available columns:", list(joined.columns))
    else:
        print("Warning: Joined DataFrame is empty!")

except Exception as e:
    print(f"Error during join: {e}")
    import traceback
    traceback.print_exc()

# Final cleanup
gc.collect()
print("Memory cleanup completed.")

Starting memory-efficient RxNorm processing (v3)...
Filtering RxNorm concepts...
Converting concept codes...
RxNorm concept shape: (311332, 10)
Creating filtered concept IDs...
Filtered concept IDs: 15330
All RxNorm concept IDs: 311332
Preparing concept relationship data...
Checking for duplicates in drugs data...
Drugs shape before duplicate check: (433861426, 2)
Number of duplicate rxcui values: 433845761
Too many duplicates detected - using first occurrence only
Drugs shape after removing duplicates: (15665, 2)
Preparing concept data for join...
Filtered concept data shape: (15330, 10)
Number of duplicate concept_code values: 0
Performing join operation...
Unique drugs rxcui: 15665
Unique concept codes: 15330
Common codes between drugs and concepts: 15330
Filtered drugs shape: (15330, 2)
Filtered concept shape: (15330, 3)
Columns after join and rename:
['safetyreportid', 'RxNorm_concept_code', 'RxNorm_concept_id', 'RxNorm_concept_name', 'RxNorm_concept_class_id']
Join completed succ

In [18]:
print("Available columns in joined DataFrame:")
print(joined.columns.tolist())
print()


Available columns in joined DataFrame:
['RxNorm_concept_class_id', 'RxNorm_concept_code', 'RxNorm_concept_id', 'RxNorm_concept_name', 'safetyreportid']



In [19]:
len(np.intersect1d(joined.RxNorm_concept_code.unique(),intersect))/len(intersect)

1.0

In [20]:
ids = joined.RxNorm_concept_id.dropna().astype(int).unique()

In [21]:
pickle.dump(
    ids,
    open('../../data/all_openFDA_rxnorm_concept_ids.pkl','wb')
)

In [22]:
(joined.to_csv(er_dir+'standard_drugs.csv.gz',compression='gzip',index=False))

In [23]:
del joined

### standard_reactions

In [24]:
patient_reaction_df = (pd.read_csv(
    er_dir+'reactions.csv.gz',
    compression='gzip',
                               dtype={
                                   'safetyreportid' : 'str'
                               }
                              ))
all_reports = patient_reaction_df.safetyreportid.unique()
print(patient_reaction_df.columns)
print(patient_reaction_df.safetyreportid.nunique())
print(patient_reaction_df.reaction_meddrapt.nunique())

Index(['reaction_meddrapt', 'reaction_outcome', 'safetyreportid'], dtype='object')
19026493
38900


In [25]:
patient_reaction_df.head()

Unnamed: 0,reaction_meddrapt,reaction_outcome,safetyreportid
0,Diarrhoea,,10003300
1,Arthralgia,,10003300
2,Headache,,10003300
3,Vomiting,,10003300
4,Dyspepsia,,10003301


In [26]:
meddra_concept = concept.query('vocabulary_id=="MedDRA"')
meddra_concept.head()

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
418253,36775929,Unrest,Condition,MedDRA,LLT,C,10045556,19700101,20991231,
418254,37587530,Urethroplasty,Procedure,MedDRA,LLT,,10046493,19700101,20991231,
418255,37182453,Vaginal pain,Condition,MedDRA,LLT,C,10046937,19700101,20991231,
418256,35506483,Adrenal cyst,Condition,MedDRA,PT,C,10049647,19700101,20991231,
418257,37383375,Acquired ichthyosis,Condition,MedDRA,LLT,,10000561,19700101,20991231,


In [27]:
reactions = patient_reaction_df.reaction_meddrapt.copy().astype(str).str.title().unique()
print(len(reactions))
concept_names = meddra_concept.concept_name.astype(str).str.title().unique()
print(len(concept_names))

intersect_title = np.intersect1d(reactions,concept_names)
print(len(intersect_title))

print(len(intersect_title)/len(reactions))

24589
91161
24201
0.9842205864410916


In [28]:
patient_reaction_df['reaction_meddrapt'] = (patient_reaction_df['reaction_meddrapt'].
                                            astype(str).
                                            str.
                                            title())
meddra_concept['concept_name'] = (meddra_concept['concept_name'].
                                  astype(str).
                                  str.
                                  title())
print(patient_reaction_df.shape[0])

joined = ((patient_reaction_df.
  set_index('reaction_meddrapt').
  join(
      meddra_concept.
      query('concept_class_id=="PT"').
      loc[:,['concept_id','concept_name','concept_code','concept_class_id']].
      drop_duplicates().
      set_index('concept_name')
  ).
           rename(columns={'concept_id' : 'MedDRA_concept_id',
                          'concept_code' : 'MedDRA_concept_code',
                          'concept_class_id' : 'MedDRA_concept_class_id'}).
           drop_duplicates()
 )
).rename_axis('MedDRA_concept_name').reset_index()
joined = joined.reindex(np.sort(joined.columns),axis=1)
print(joined.shape[0])
print(joined.head())

56821724
56666050
  MedDRA_concept_class_id MedDRA_concept_code  MedDRA_concept_id  \
0                      PT            10012735         35708093.0   
1                      PT            10003239         36516812.0   
2                      PT            10019211         36718132.0   
3                      PT            10047700         35708208.0   
4                      PT            10013946         35708139.0   

  MedDRA_concept_name reaction_outcome safetyreportid  
0           Diarrhoea              NaN       10003300  
1          Arthralgia              NaN       10003300  
2            Headache              NaN       10003300  
3            Vomiting              NaN       10003300  
4           Dyspepsia              NaN       10003301  


In [29]:
del meddra_concept
del patient_reaction_df

In [30]:
joined_notnull = joined[joined.MedDRA_concept_id.notnull()]
print(joined_notnull.shape[0])
joined_notnull['MedDRA_concept_id'] = joined_notnull['MedDRA_concept_id'].astype(int)
print(joined_notnull.head())

55164660
  MedDRA_concept_class_id MedDRA_concept_code  MedDRA_concept_id  \
0                      PT            10012735           35708093   
1                      PT            10003239           36516812   
2                      PT            10019211           36718132   
3                      PT            10047700           35708208   
4                      PT            10013946           35708139   

  MedDRA_concept_name reaction_outcome safetyreportid  
0           Diarrhoea              NaN       10003300  
1          Arthralgia              NaN       10003300  
2            Headache              NaN       10003300  
3            Vomiting              NaN       10003300  
4           Dyspepsia              NaN       10003301  


In [31]:
print(
    len(
        np.intersect1d(
            all_reports,
            joined_notnull.safetyreportid.astype(str).unique()
        )
    )/len(all_reports)
)

0.9850631958291


In [32]:
print(joined_notnull.MedDRA_concept_class_id.value_counts())
print(joined_notnull.safetyreportid.nunique())
print(joined_notnull.MedDRA_concept_id.nunique())

MedDRA_concept_class_id
PT    55164660
Name: count, dtype: int64
18742298
22226


In [33]:
pickle.dump(
    joined_notnull.MedDRA_concept_id.astype(int).unique,
    open('../../data/all_openFDA_meddra_concept_ids.pkl','wb')
)

In [34]:
(joined_notnull.to_csv(er_dir+'standard_reactions.csv.gz',compression='gzip',index=False))

In [35]:
del joined_notnull

In [36]:
del joined

### standard_drugs_atc

In [37]:
standard_drugs = (pd.read_csv(
    er_dir+'standard_drugs.csv.gz',
    compression='gzip',
    dtype={
        'safetyreportid' : 'str'
    }
))

In [38]:
all_reports = standard_drugs.safetyreportid.unique()
len(all_reports)

2815

In [None]:
standard_drugs.RxNorm_concept_id = standard_drugs.RxNorm_concept_id.astype(int)

In [40]:
standard_drugs.head()

Unnamed: 0,RxNorm_concept_class_id,RxNorm_concept_code,RxNorm_concept_id,RxNorm_concept_name,safetyreportid
0,Clinical Drug,197807,19019074,ibuprofen 800 MG Oral Tablet,10003301
1,Quant Clinical Drug,854183,40161247,8 ML ibuprofen 100 MG/ML Injection,10003301
2,Branded Drug,731533,1177843,ibuprofen 200 MG Oral Capsule [Advil],10003301
3,Branded Drug,544393,1177822,ibuprofen 20 MG/ML Oral Suspension [Motrin],10003301
4,Branded Drug,206878,19033719,ibuprofen 20 MG/ML Oral Suspension [Advil],10003301


In [41]:
rxnorm_concept = concept.query('vocabulary_id=="RxNorm"')
rxnorm_concept_ids = rxnorm_concept['concept_id'].unique()

In [42]:
openfda_concept_ids = standard_drugs.RxNorm_concept_id.dropna().astype(int).unique()

In [43]:
atc_concept = concept.query('vocabulary_id=="ATC" & concept_class_id=="ATC 5th"')

r = (concept_relationship.
     copy().
     loc[:,['concept_id_1','concept_id_2','relationship_id']].
     drop_duplicates()
    )
                            
r['concept_id_1'] = r['concept_id_1'].astype(int)
r['concept_id_2'] = r['concept_id_2'].astype(int)
ac = atc_concept.copy()
ac['concept_id'] = ac['concept_id'].astype(int)
atc_concept_ids = ac['concept_id'].unique()
rc = rxnorm_concept.copy()
rc['concept_id'] = rc['concept_id'].astype(int)
rxnorm_concept_ids = rc['concept_id'].unique()

In [44]:
rxnorm_to_atc_relationships = (r.
                         query('concept_id_1 in @openfda_concept_ids & '\
                               'concept_id_2 in @atc_concept_ids').
                         set_index('concept_id_1').
                         join(
                             rc. # standard concepts for 1
                             loc[:,['concept_id','concept_code',
                                    'concept_name','concept_class_id']].
                             drop_duplicates().
                             set_index('concept_id')
                         ).
                         rename_axis('RxNorm_concept_id').
                         reset_index().
                         dropna().
                         rename(
                             columns={
                                 'concept_code' : 'RxNorm_concept_code',
                                 'concept_class_id' : 'RxNorm_concept_class_id',
                                 'concept_name' : 'RxNorm_concept_name',
                                 'concept_id_2' : 'ATC_concept_id',
                             }
                         ).
                         set_index('ATC_concept_id').
                         join(
                             ac. # standard concepts for 2
                             loc[:,['concept_id','concept_code',
                                    'concept_name','concept_class_id']].
                             drop_duplicates().
                             set_index('concept_id')
                         ).
                         dropna().
                         rename_axis('ATC_concept_id').
                         reset_index().
                         rename(
                             columns={
                                 'concept_code' : 'ATC_concept_code',
                                 'concept_class_id' : 'ATC_concept_class_id',
                                 'concept_name' : 'ATC_concept_name'
                             }
                         )
                        )
rxnorm_to_atc_relationships.RxNorm_concept_id = \
(rxnorm_to_atc_relationships.RxNorm_concept_id.
astype(int))
rxnorm_to_atc_relationships.ATC_concept_id = \
(rxnorm_to_atc_relationships.ATC_concept_id.
astype(int))

rxnorm_to_atc_relationships = (rxnorm_to_atc_relationships.
                            reindex(np.sort(rxnorm_to_atc_relationships.columns),axis=1)
                           )
print(rxnorm_to_atc_relationships.shape)
print(rxnorm_to_atc_relationships.head())

(225, 9)
  ATC_concept_class_id ATC_concept_code  ATC_concept_id  \
0              ATC 5th          R07AX32          947932   
1              ATC 5th          R05FB02        21603442   
2              ATC 5th          G03FA12        21602579   
3              ATC 5th          D10AE51        21602303   
4              ATC 5th          G03AA06        21602479   

                                ATC_concept_name RxNorm_concept_class_id  \
0    ivacaftor, tezacaftor and elexacaftor; oral           Clinical Pack   
1  cough suppressants and expectorants; systemic           Clinical Pack   
2     medroxyprogesterone and estrogen; systemic           Clinical Pack   
3        benzoyl peroxide, combinations; topical           Clinical Pack   
4      norgestrel and ethinylestradiol; systemic           Clinical Pack   

  RxNorm_concept_code  RxNorm_concept_id  \
0             2257012           37499629   
1             2056073           35201891   
2             1000496           40225693   
3  

In [45]:
rxnorm_to_atc_relationships.ATC_concept_class_id.value_counts()

ATC_concept_class_id
ATC 5th    225
Name: count, dtype: int64

In [46]:
del r
del ac
del rc

In [47]:
standard_drugs_atc = (standard_drugs.
                      loc[:,['RxNorm_concept_id','safetyreportid']].
                      drop_duplicates().
                      set_index('RxNorm_concept_id').
                      join(rxnorm_to_atc_relationships.
                           set_index('RxNorm_concept_id')
                          ).
                      drop_duplicates().
                      reset_index(drop=True).
                      drop(['RxNorm_concept_code','RxNorm_concept_name',
                            'RxNorm_concept_class_id','relationship_id'],axis=1).
                      dropna(subset=['ATC_concept_id']).
                      drop_duplicates()
                     )

standard_drugs_atc = \
standard_drugs_atc.reindex(np.sort(standard_drugs_atc.columns),axis=1)
standard_drugs_atc.ATC_concept_id = standard_drugs_atc.ATC_concept_id.astype(int)
print(len(
    np.intersect1d(all_reports,
                   standard_drugs_atc.safetyreportid.unique()
                  )
)/len(all_reports))
print(standard_drugs_atc.shape)
print(standard_drugs_atc.info())
print(standard_drugs_atc.head())

0.017761989342806393
(149, 5)
<class 'pandas.core.frame.DataFrame'>
Index: 149 entries, 138 to 2855
Data columns (total 5 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   ATC_concept_class_id  149 non-null    object
 1   ATC_concept_code      149 non-null    object
 2   ATC_concept_id        149 non-null    int64 
 3   ATC_concept_name      149 non-null    object
 4   safetyreportid        149 non-null    object
dtypes: int64(1), object(4)
memory usage: 7.0+ KB
None
    ATC_concept_class_id ATC_concept_code  ATC_concept_id  \
138              ATC 5th          N05BA01        21604566   
155              ATC 5th          G03AA12        21602485   
156              ATC 5th          G03FA17        21602584   
200              ATC 5th          G03AA12        21602485   
201              ATC 5th          G03FA17        21602584   

                                ATC_concept_name safetyreportid  
138                   diazepam

In [48]:
del standard_drugs
del rxnorm_to_atc_relationships

In [49]:
standard_drugs_atc.to_csv(er_dir+'standard_drugs_atc.csv.gz',compression='gzip',index=False)

In [50]:
del standard_drugs_atc

### standard_drugs_rxnorm_ingredients

https://www.nlm.nih.gov/research/umls/rxnorm/overview.html

In [51]:
all_openFDA_rxnorm_concept_ids = pickle.load(
    open('../../data/all_openFDA_rxnorm_concept_ids.pkl','rb')
)

In [52]:
all_openFDA_rxnorm_concept_ids

array([19019074, 40161247,  1177843, ..., 40162916, 40229767, 36249133],
      shape=(15330,))

In [53]:
all_rxnorm_concept_ids = (concept.
                          query('vocabulary_id=="RxNorm"').
                          concept_id.
                          astype(int).
                          unique()
                         )

In [54]:
r = (concept_relationship.
     loc[:,['concept_id_1','concept_id_2','relationship_id']].
    drop_duplicates().
    dropna().
     copy()
    )
r.concept_id_1 = r.concept_id_1.astype(int)
r.concept_id_2 = r.concept_id_2.astype(int)

In [55]:
c = (concept.
    query('vocabulary_id=="RxNorm" & standard_concept=="S"').
    loc[:,['concept_id','concept_code',
          'concept_class_id','concept_name']].
    drop_duplicates().
    dropna().
     copy()
    )
c.concept_id = c.concept_id.astype(int).copy()

In [56]:
all_rxnorm_concept_ids = concept.query('vocabulary_id=="RxNorm"').concept_id.astype(int).unique()
rxnorm_relationships = (r.
 query('concept_id_1 in @all_rxnorm_concept_ids & '+
       'concept_id_2 in @all_rxnorm_concept_ids').
 relationship_id.
 value_counts()
)
rxnorm_relationships

relationship_id
RxNorm inverse is a     228372
RxNorm is a             228372
Maps to                 208755
Mapped from             208755
Brand name of           119926
Has brand name          119926
RxNorm dose form of     111117
RxNorm has dose form    111117
Has tradename           110295
Tradename of            110295
RxNorm has ing           89816
RxNorm ing of            89816
Constitutes              88135
Consists of              88135
Dose form group of       40541
Has dose form group      40541
Concept replaced by      40291
Concept replaces         40291
Precise ing of           14752
Has precise ing          14752
Has quantified form       7347
Quantified form of        7347
Has form                  3542
Form of                   3542
Contains                  3450
Contained in              3450
Has component                1
Component of                 1
Name: count, dtype: int64

In [57]:
first_second_relations = (r.
                          query('concept_id_1 in @all_openFDA_rxnorm_concept_ids').
                          set_index('concept_id_1').
                          join(c.
                               set_index('concept_id')).
                          rename(
                              columns={
                                  'concept_id_1' : 'RxNorm_concept_id_1',
                                  'concept_code' : 'RxNorm_concept_code_1',
                                  'concept_class_id' : 'RxNorm_concept_class_id_1',
                                  'concept_name' : 'RxNorm_concept_name_1'
                              }
                          ).
                          rename_axis('RxNorm_concept_id_1').
                          reset_index().
                          set_index('concept_id_2').
                          join(c.
                               set_index('concept_id')
                              ).
                          rename(
                              columns={'concept_id_2' : 'RxNorm_concept_id_2',
                                       'concept_code' : 'RxNorm_concept_code_2',
                                       'concept_class_id' : 'RxNorm_concept_class_id_2',
                                       'concept_name' : 'RxNorm_concept_name_2',
                                       'relationship_id' :'relationship_id_12'
                                      }
                          ).
                          rename_axis('RxNorm_concept_id_2').
                          reset_index().
                          dropna().
                          drop_duplicates()
                         )
first_second_relations = first_second_relations[
    first_second_relations.RxNorm_concept_id_1!=first_second_relations.RxNorm_concept_id_2
]
print(first_second_relations.shape)
first_second_relations = (first_second_relations.
                          reindex(np.sort(first_second_relations.columns),
                                  axis=1)
                         )
print(first_second_relations.head())

(49854, 9)
   RxNorm_concept_class_id_1 RxNorm_concept_class_id_2 RxNorm_concept_code_1  \
0               Branded Drug         Branded Drug Comp               2052609   
7              Clinical Drug        Clinical Drug Form               2053517   
17             Clinical Drug        Clinical Drug Comp               2054096   
25              Branded Drug         Branded Drug Comp               2055014   
37              Branded Drug         Branded Drug Form               2055286   

   RxNorm_concept_code_2  RxNorm_concept_id_1  RxNorm_concept_id_2  \
0                2052606             35200101             35200037   
7                2053508             35200183             35200176   
17               2054092             35200225             35200221   
25               2054982             35200332             35200300   
37               2055284             35200380             35200378   

                                RxNorm_concept_name_1  \
0   tacrolimus 0.2 MG Granules

In [58]:
(first_second_relations.loc[:,['RxNorm_concept_class_id_1','RxNorm_concept_class_id_2']].
groupby(['RxNorm_concept_class_id_1','RxNorm_concept_class_id_2']).
 count()
)

RxNorm_concept_class_id_1,RxNorm_concept_class_id_2
Branded Drug,Branded Drug Comp
Branded Drug,Branded Drug Form
Branded Drug,Branded Pack
Branded Drug,Clinical Drug
Branded Drug,Quant Branded Drug
Branded Pack,Branded Drug
Branded Pack,Clinical Drug
Branded Pack,Clinical Pack
Branded Pack,Quant Branded Drug
Branded Pack,Quant Clinical Drug


In [59]:
ids = first_second_relations.RxNorm_concept_id_2.astype(int).unique()

second_third_relations = (r.
                          query('concept_id_1 in @ids').
                          set_index('concept_id_1').
                          join(c.
                               set_index('concept_id')).
                          rename(
                              columns={
                                  'concept_id_1' : 'RxNorm_concept_id_2',
                                  'concept_code' : 'RxNorm_concept_code_2',
                                  'concept_class_id' : 'RxNorm_concept_class_id_2',
                                  'concept_name' : 'RxNorm_concept_name_2'
                              }
                          ).
                          rename_axis('RxNorm_concept_id_2').
                          reset_index().
                          set_index('concept_id_2').
                          join(c.
                               set_index('concept_id')
                              ).
                          rename(
                              columns={'concept_id_2' : 'RxNorm_concept_id_3',
                                       'concept_code' : 'RxNorm_concept_code_3',
                                       'concept_class_id' : 'RxNorm_concept_class_id_3',
                                       'concept_name' : 'RxNorm_concept_name_3',
                                       'relationship_id' :'relationship_id_23'
                                      }
                          ).
                          rename_axis('RxNorm_concept_id_3').
                          reset_index().
                          dropna().
                          drop_duplicates()
                         )
second_third_relations = second_third_relations[
    second_third_relations.RxNorm_concept_id_2!=second_third_relations.RxNorm_concept_id_3
]
print(second_third_relations.shape)
second_third_relations = (second_third_relations.
                          reindex(np.sort(second_third_relations.columns),
                                  axis=1)
                         )
print(second_third_relations.head())

(153597, 9)
   RxNorm_concept_class_id_2 RxNorm_concept_class_id_3 RxNorm_concept_code_2  \
4          Branded Drug Comp              Branded Drug               2052606   
5               Branded Drug         Branded Drug Comp               2052609   
15        Clinical Drug Form             Clinical Drug               2053508   
18             Clinical Drug        Clinical Drug Form               2053517   
29        Clinical Drug Comp             Clinical Drug               2054092   

   RxNorm_concept_code_3  RxNorm_concept_id_2  RxNorm_concept_id_3  \
4                2052609             35200037             35200101   
5                2052606             35200101             35200037   
15               2053517             35200176             35200183   
18               2053508             35200183             35200176   
29               2054096             35200221             35200225   

                                RxNorm_concept_name_2  \
4                         tac

In [60]:
(second_third_relations.loc[:,['RxNorm_concept_class_id_2','RxNorm_concept_class_id_3']].
groupby(['RxNorm_concept_class_id_2','RxNorm_concept_class_id_3']).
 count()
)

RxNorm_concept_class_id_2,RxNorm_concept_class_id_3
Branded Drug,Branded Drug Comp
Branded Drug,Branded Drug Form
Branded Drug,Branded Pack
Branded Drug,Clinical Drug
Branded Drug,Quant Branded Drug
Branded Drug Comp,Branded Drug
Branded Drug Comp,Clinical Drug Comp
Branded Drug Form,Branded Drug
Branded Drug Form,Clinical Drug Form
Branded Pack,Branded Drug


In [61]:
ids = second_third_relations.RxNorm_concept_id_3.astype(int).unique()

third_fourth_relations = (r.
                          query('concept_id_1 in @ids').
                          set_index('concept_id_1').
                          join(c.
                               set_index('concept_id')).
                          rename(
                              columns={
                                  'concept_id_1' : 'RxNorm_concept_id_3',
                                  'concept_code' : 'RxNorm_concept_code_3',
                                  'concept_class_id' : 'RxNorm_concept_class_id_3',
                                  'concept_name' : 'RxNorm_concept_name_3'
                              }
                          ).
                          rename_axis('RxNorm_concept_id_3').
                          reset_index().
                          set_index('concept_id_2').
                          join(c.
                               set_index('concept_id')
                              ).
                          rename(
                              columns={'concept_id_2' : 'RxNorm_concept_id_4',
                                       'concept_code' : 'RxNorm_concept_code_4',
                                       'concept_class_id' : 'RxNorm_concept_class_id_4',
                                       'concept_name' : 'RxNorm_concept_name_4',
                                       'relationship_id' :'relationship_id_34'
                                      }
                          ).
                          rename_axis('RxNorm_concept_id_4').
                          reset_index().
                          dropna().
                          drop_duplicates()
                         )
third_fourth_relations = third_fourth_relations[
    third_fourth_relations.RxNorm_concept_id_3!=third_fourth_relations.RxNorm_concept_id_4
]
print(third_fourth_relations.shape)
third_fourth_relations = (third_fourth_relations.
                          reindex(np.sort(third_fourth_relations.columns),
                                  axis=1)
                         )
print(third_fourth_relations.head())

(251166, 9)
   RxNorm_concept_class_id_3 RxNorm_concept_class_id_4 RxNorm_concept_code_3  \
2          Branded Drug Comp              Branded Drug               2052606   
9               Branded Drug         Branded Drug Comp               2052609   
24        Clinical Drug Form             Clinical Drug               2053508   
27             Clinical Drug        Clinical Drug Form               2053517   
41        Clinical Drug Comp             Clinical Drug               2054092   

   RxNorm_concept_code_4  RxNorm_concept_id_3  RxNorm_concept_id_4  \
2                2052609             35200037             35200101   
9                2052606             35200101             35200037   
24               2053517             35200176             35200183   
27               2053508             35200183             35200176   
41               2054096             35200221             35200225   

                                RxNorm_concept_name_3  \
2                         tac

In [62]:
(third_fourth_relations.loc[:,['RxNorm_concept_class_id_3','RxNorm_concept_class_id_4']].
groupby(['RxNorm_concept_class_id_3','RxNorm_concept_class_id_4']).
 count()
)

RxNorm_concept_class_id_3,RxNorm_concept_class_id_4
Branded Drug,Branded Drug Comp
Branded Drug,Branded Drug Form
Branded Drug,Branded Pack
Branded Drug,Clinical Drug
Branded Drug,Quant Branded Drug
Branded Drug Comp,Branded Drug
Branded Drug Comp,Clinical Drug Comp
Branded Drug Form,Branded Drug
Branded Drug Form,Clinical Drug Form
Branded Pack,Branded Drug


In [63]:
ids = third_fourth_relations.RxNorm_concept_id_4.astype(int).unique()

fourth_fifth_relations = (r.
                          query('concept_id_1 in @ids').
                          set_index('concept_id_1').
                          join(c.
                               set_index('concept_id')).
                          rename(
                              columns={
                                  'concept_id_1' : 'RxNorm_concept_id_4',
                                  'concept_code' : 'RxNorm_concept_code_4',
                                  'concept_class_id' : 'RxNorm_concept_class_id_4',
                                  'concept_name' : 'RxNorm_concept_name_4'
                              }
                          ).
                          rename_axis('RxNorm_concept_id_4').
                          reset_index().
                          set_index('concept_id_2').
                          join(c.
                               set_index('concept_id')
                              ).
                          rename(
                              columns={'concept_id_2' : 'RxNorm_concept_id_5',
                                       'concept_code' : 'RxNorm_concept_code_5',
                                       'concept_class_id' : 'RxNorm_concept_class_id_5',
                                       'concept_name' : 'RxNorm_concept_name_5',
                                       'relationship_id' :'relationship_id_45'
                                      }
                          ).
                          rename_axis('RxNorm_concept_id_5').
                          reset_index().
                          dropna().
                          drop_duplicates()
                         )
fourth_fifth_relations = fourth_fifth_relations[
    fourth_fifth_relations.RxNorm_concept_id_4!=fourth_fifth_relations.RxNorm_concept_id_5
]
print(fourth_fifth_relations.shape)
fourth_fifth_relations = (fourth_fifth_relations.
                          reindex(np.sort(fourth_fifth_relations.columns),
                                  axis=1)
                         )
print(fourth_fifth_relations.head())

(334584, 9)
   RxNorm_concept_class_id_4 RxNorm_concept_class_id_5 RxNorm_concept_code_4  \
4          Branded Drug Comp              Branded Drug               2052606   
13        Clinical Drug Comp             Clinical Drug               2052398   
16              Branded Drug         Branded Drug Comp               2052609   
35        Clinical Drug Form             Clinical Drug               2053508   
38             Clinical Drug        Clinical Drug Form               2053517   

   RxNorm_concept_code_5  RxNorm_concept_id_4  RxNorm_concept_id_5  \
4                2052609             35200037             35200101   
13               2052399             35200091             35200092   
16               2052606             35200101             35200037   
35               2053517             35200176             35200183   
38               2053508             35200183             35200176   

                                RxNorm_concept_name_4  \
4                         tac

In [64]:
(fourth_fifth_relations.loc[:,['RxNorm_concept_class_id_4','RxNorm_concept_class_id_5']].
groupby(['RxNorm_concept_class_id_4','RxNorm_concept_class_id_5']).
 count()
)

RxNorm_concept_class_id_4,RxNorm_concept_class_id_5
Branded Drug,Branded Drug Comp
Branded Drug,Branded Drug Form
Branded Drug,Branded Pack
Branded Drug,Clinical Drug
Branded Drug,Quant Branded Drug
Branded Drug Comp,Branded Drug
Branded Drug Comp,Clinical Drug Comp
Branded Drug Form,Branded Drug
Branded Drug Form,Clinical Drug Form
Branded Pack,Branded Drug


In [65]:
ids = fourth_fifth_relations.RxNorm_concept_id_4.astype(int).unique()

fifth_sixth_relations = (r.
                          query('concept_id_1 in @ids').
                          set_index('concept_id_1').
                          join(c.
                               set_index('concept_id')).
                          rename(
                              columns={
                                  'concept_id_1' : 'RxNorm_concept_id_5',
                                  'concept_code' : 'RxNorm_concept_code_5',
                                  'concept_class_id' : 'RxNorm_concept_class_id_5',
                                  'concept_name' : 'RxNorm_concept_name_5'
                              }
                          ).
                          rename_axis('RxNorm_concept_id_5').
                          reset_index().
                          set_index('concept_id_2').
                          join(c.
                               set_index('concept_id')
                              ).
                          rename(
                              columns={'concept_id_2' : 'RxNorm_concept_id_6',
                                       'concept_code' : 'RxNorm_concept_code_6',
                                       'concept_class_id' : 'RxNorm_concept_class_id_6',
                                       'concept_name' : 'RxNorm_concept_name_6',
                                       'relationship_id' :'relationship_id_56'
                                      }
                          ).
                          rename_axis('RxNorm_concept_id_6').
                          reset_index().
                          dropna().
                          drop_duplicates()
                         )
fifth_sixth_relations = fifth_sixth_relations[
    fifth_sixth_relations.RxNorm_concept_id_5!= fifth_sixth_relations.RxNorm_concept_id_6
]
print(fifth_sixth_relations.shape)
fifth_sixth_relations = (fifth_sixth_relations.
                          reindex(np.sort(fifth_sixth_relations.columns),
                                  axis=1)
                         )
print(fifth_sixth_relations.head())

(334584, 9)
   RxNorm_concept_class_id_5 RxNorm_concept_class_id_6 RxNorm_concept_code_5  \
4          Branded Drug Comp              Branded Drug               2052606   
13        Clinical Drug Comp             Clinical Drug               2052398   
16              Branded Drug         Branded Drug Comp               2052609   
35        Clinical Drug Form             Clinical Drug               2053508   
38             Clinical Drug        Clinical Drug Form               2053517   

   RxNorm_concept_code_6  RxNorm_concept_id_5  RxNorm_concept_id_6  \
4                2052609             35200037             35200101   
13               2052399             35200091             35200092   
16               2052606             35200101             35200037   
35               2053517             35200176             35200183   
38               2053508             35200183             35200176   

                                RxNorm_concept_name_5  \
4                         tac

In [66]:
(fifth_sixth_relations.loc[:,['RxNorm_concept_class_id_5','RxNorm_concept_class_id_6']].
groupby(['RxNorm_concept_class_id_5','RxNorm_concept_class_id_6']).
 count()
)

RxNorm_concept_class_id_5,RxNorm_concept_class_id_6
Branded Drug,Branded Drug Comp
Branded Drug,Branded Drug Form
Branded Drug,Branded Pack
Branded Drug,Clinical Drug
Branded Drug,Quant Branded Drug
Branded Drug Comp,Branded Drug
Branded Drug Comp,Clinical Drug Comp
Branded Drug Form,Branded Drug
Branded Drug Form,Clinical Drug Form
Branded Pack,Branded Drug


In [67]:
rxnorm_to_ings123 = (first_second_relations.
 set_index(['RxNorm_concept_id_2','RxNorm_concept_code_2',
            'RxNorm_concept_name_2','RxNorm_concept_class_id_2']).
 join(second_third_relations.
      set_index(['RxNorm_concept_id_2','RxNorm_concept_code_2',
                 'RxNorm_concept_name_2','RxNorm_concept_class_id_2'])
     ).
 query('RxNorm_concept_class_id_3=="Ingredient" & '+
       '(RxNorm_concept_class_id_1!=RxNorm_concept_class_id_3)').
                  reset_index()
)
print(rxnorm_to_ings123.shape)
print(rxnorm_to_ings123.head())

(17209, 14)
   RxNorm_concept_id_2 RxNorm_concept_code_2  \
0             35200176               2053508   
1             35200176               2053508   
2             35200221               2054092   
3             40224671               1001569   
4             35200540               2056487   

                  RxNorm_concept_name_2 RxNorm_concept_class_id_2  \
0  ivacaftor / lumacaftor Oral Granules        Clinical Drug Form   
1  ivacaftor / lumacaftor Oral Granules        Clinical Drug Form   
2                    tafenoquine 100 MG        Clinical Drug Comp   
3           levothyroxine Oral Solution        Clinical Drug Form   
4      levothyroxine sodium 0.125 MG/ML        Clinical Drug Comp   

  RxNorm_concept_class_id_1 RxNorm_concept_code_1  RxNorm_concept_id_1  \
0             Clinical Drug               2053517             35200183   
1             Clinical Drug               2053517             35200183   
2             Clinical Drug               2054096             

In [68]:
len(np.intersect1d(
    rxnorm_to_ings123.RxNorm_concept_id_1.dropna().astype(int).unique(),
    all_openFDA_rxnorm_concept_ids
))/len(all_openFDA_rxnorm_concept_ids)

0.42393998695368557

In [69]:
(rxnorm_to_ings123.
loc[:,['RxNorm_concept_name_1','RxNorm_concept_name_3']].
drop_duplicates()
).head()

Unnamed: 0,RxNorm_concept_name_1,RxNorm_concept_name_3
0,ivacaftor 188 MG / lumacaftor 150 MG Oral Gran...,ivacaftor
1,ivacaftor 188 MG / lumacaftor 150 MG Oral Gran...,lumacaftor
2,tafenoquine 100 MG Oral Tablet,tafenoquine
3,levothyroxine sodium 0.088 MG/ML Oral Solution,levothyroxine
4,levothyroxine sodium 0.125 MG/ML Oral Solution,levothyroxine


In [70]:
(rxnorm_to_ings123.
loc[:,['RxNorm_concept_class_id_1','RxNorm_concept_class_id_2',
       'RxNorm_concept_class_id_3']].
 drop_duplicates()
)

Unnamed: 0,RxNorm_concept_class_id_1,RxNorm_concept_class_id_2,RxNorm_concept_class_id_3
0,Clinical Drug,Clinical Drug Form,Ingredient
2,Clinical Drug,Clinical Drug Comp,Ingredient


In [71]:
rxnorm_to_ings123_to_add = (rxnorm_to_ings123.
loc[:,['RxNorm_concept_id_1','RxNorm_concept_code_1',
       'RxNorm_concept_name_1','RxNorm_concept_class_id_1',
       'RxNorm_concept_id_3','RxNorm_concept_code_3',
       'RxNorm_concept_name_3','RxNorm_concept_class_id_3']].
 drop_duplicates().
 rename(
     columns={
         'RxNorm_concept_id_3' : 'RxNorm_concept_id_2',
         'RxNorm_concept_code_3' : 'RxNorm_concept_code_2',
         'RxNorm_concept_name_3' : 'RxNorm_concept_name_2',
         'RxNorm_concept_class_id_3' : 'RxNorm_concept_class_id_2'
     })
                            .drop_duplicates()
)
print(rxnorm_to_ings123_to_add.shape)
rxnorm_to_ings123_to_add.head()

(8591, 8)


Unnamed: 0,RxNorm_concept_id_1,RxNorm_concept_code_1,RxNorm_concept_name_1,RxNorm_concept_class_id_1,RxNorm_concept_id_2,RxNorm_concept_code_2,RxNorm_concept_name_2,RxNorm_concept_class_id_2
0,35200183,2053517,ivacaftor 188 MG / lumacaftor 150 MG Oral Gran...,Clinical Drug,42709323,1243041,ivacaftor,Ingredient
1,35200183,2053517,ivacaftor 188 MG / lumacaftor 150 MG Oral Gran...,Clinical Drug,46275580,1655922,lumacaftor,Ingredient
2,35200225,2054096,tafenoquine 100 MG Oral Tablet,Clinical Drug,35200201,2054023,tafenoquine,Ingredient
3,35200529,2056476,levothyroxine sodium 0.088 MG/ML Oral Solution,Clinical Drug,1501700,10582,levothyroxine,Ingredient
4,35200541,2056488,levothyroxine sodium 0.125 MG/ML Oral Solution,Clinical Drug,1501700,10582,levothyroxine,Ingredient


In [72]:
rxnorm_to_ings1234 = (first_second_relations.
 set_index(['RxNorm_concept_id_2','RxNorm_concept_code_2',
            'RxNorm_concept_name_2','RxNorm_concept_class_id_2']).
 join(second_third_relations.
      set_index(['RxNorm_concept_id_2','RxNorm_concept_code_2',
                 'RxNorm_concept_name_2','RxNorm_concept_class_id_2'])
     ).
 query('RxNorm_concept_class_id_3!="Ingredient" & '+
       '(RxNorm_concept_class_id_1!=RxNorm_concept_class_id_3)').
                  reset_index().
                      set_index(
                          ['RxNorm_concept_id_3','RxNorm_concept_code_3',
                           'RxNorm_concept_name_3','RxNorm_concept_class_id_3']
                      ).
                      join(third_fourth_relations.
                          set_index(
                          ['RxNorm_concept_id_3','RxNorm_concept_code_3',
                           'RxNorm_concept_name_3','RxNorm_concept_class_id_3']
                          )
                          ).
 query('RxNorm_concept_class_id_4=="Ingredient"').
                      reset_index()
)
rxnorm_to_ings1234 = rxnorm_to_ings1234.reindex(np.sort(rxnorm_to_ings1234.columns),axis=1)
print(rxnorm_to_ings1234.shape)
rxnorm_to_ings1234.head()

(28538, 19)


Unnamed: 0,RxNorm_concept_class_id_1,RxNorm_concept_class_id_2,RxNorm_concept_class_id_3,RxNorm_concept_class_id_4,RxNorm_concept_code_1,RxNorm_concept_code_2,RxNorm_concept_code_3,RxNorm_concept_code_4,RxNorm_concept_id_1,RxNorm_concept_id_2,RxNorm_concept_id_3,RxNorm_concept_id_4,RxNorm_concept_name_1,RxNorm_concept_name_2,RxNorm_concept_name_3,RxNorm_concept_name_4,relationship_id_12,relationship_id_23,relationship_id_34
0,Branded Drug,Branded Drug Comp,Clinical Drug Comp,Ingredient,2052609,2052606,2052503,42316,35200101,35200037,35200095,950637,tacrolimus 0.2 MG Granules for Oral Suspension...,tacrolimus 0.2 MG [Prograf],tacrolimus 0.2 MG,tacrolimus,Consists of,Tradename of,RxNorm has ing
1,Branded Drug,Branded Drug Comp,Clinical Drug Comp,Ingredient,2055014,2054982,2054980,2054968,35200332,35200300,35200298,35200286,stiripentol 500 MG Powder for Oral Suspension ...,stiripentol 500 MG [Diacomit],stiripentol 500 MG,stiripentol,Consists of,Tradename of,RxNorm has ing
2,Branded Drug,Branded Drug Form,Clinical Drug Form,Ingredient,2055286,2055284,375144,52177,35200380,35200378,40059397,967562,loteprednol etabonate 10 MG/ML Ophthalmic Susp...,loteprednol etabonate Ophthalmic Suspension [I...,loteprednol etabonate Ophthalmic Suspension,loteprednol etabonate,RxNorm is a,Tradename of,RxNorm has ing
3,Branded Drug,Clinical Drug,Clinical Drug Comp,Ingredient,2056502,2056500,2056499,10582,35200555,35200553,35200552,1501700,levothyroxine sodium 0.15 MG/ML Oral Solution ...,levothyroxine sodium 0.15 MG/ML Oral Solution,levothyroxine sodium 0.15 MG/ML,levothyroxine,Tradename of,Consists of,RxNorm has ing
4,Branded Drug,Clinical Drug,Clinical Drug Form,Ingredient,2056502,2056500,1001569,10582,35200555,35200553,40224671,1501700,levothyroxine sodium 0.15 MG/ML Oral Solution ...,levothyroxine sodium 0.15 MG/ML Oral Solution,levothyroxine Oral Solution,levothyroxine,Tradename of,RxNorm is a,RxNorm has ing


In [73]:
(rxnorm_to_ings1234.
loc[:,['RxNorm_concept_name_1','RxNorm_concept_name_4']].
drop_duplicates()
).head()
len(np.intersect1d(rxnorm_to_ings1234.RxNorm_concept_id_1.dropna().astype(int).unique(),
                  all_openFDA_rxnorm_concept_ids
                  ))/len(all_openFDA_rxnorm_concept_ids)

0.43561643835616437

In [74]:
(rxnorm_to_ings1234.
loc[:,['RxNorm_concept_class_id_1','RxNorm_concept_class_id_2',
       'RxNorm_concept_class_id_3','RxNorm_concept_class_id_4']].
 drop_duplicates()
)

Unnamed: 0,RxNorm_concept_class_id_1,RxNorm_concept_class_id_2,RxNorm_concept_class_id_3,RxNorm_concept_class_id_4
0,Branded Drug,Branded Drug Comp,Clinical Drug Comp,Ingredient
2,Branded Drug,Branded Drug Form,Clinical Drug Form,Ingredient
3,Branded Drug,Clinical Drug,Clinical Drug Comp,Ingredient
4,Branded Drug,Clinical Drug,Clinical Drug Form,Ingredient
15,Clinical Pack,Clinical Drug,Clinical Drug Comp,Ingredient
17,Clinical Pack,Clinical Drug,Clinical Drug Form,Ingredient
19,Quant Clinical Drug,Clinical Drug,Clinical Drug Form,Ingredient
20,Quant Clinical Drug,Clinical Drug,Clinical Drug Comp,Ingredient
48,Branded Pack,Clinical Drug,Clinical Drug Form,Ingredient
50,Branded Pack,Clinical Drug,Clinical Drug Comp,Ingredient


In [75]:
rxnorm_to_ings1234_to_add = (rxnorm_to_ings1234.
loc[:,['RxNorm_concept_id_1','RxNorm_concept_code_1',
       'RxNorm_concept_name_1','RxNorm_concept_class_id_1',
       'RxNorm_concept_id_4','RxNorm_concept_code_4',
       'RxNorm_concept_name_4','RxNorm_concept_class_id_4']].
 drop_duplicates().
 rename(
     columns={
         'RxNorm_concept_id_4' : 'RxNorm_concept_id_2',
         'RxNorm_concept_code_4' : 'RxNorm_concept_code_2',
         'RxNorm_concept_name_4' : 'RxNorm_concept_name_2',
         'RxNorm_concept_class_id_4' : 'RxNorm_concept_class_id_2'
     })
                            .drop_duplicates()
)
print(rxnorm_to_ings1234_to_add.shape)
rxnorm_to_ings1234_to_add.head()

(8647, 8)


Unnamed: 0,RxNorm_concept_id_1,RxNorm_concept_code_1,RxNorm_concept_name_1,RxNorm_concept_class_id_1,RxNorm_concept_id_2,RxNorm_concept_code_2,RxNorm_concept_name_2,RxNorm_concept_class_id_2
0,35200101,2052609,tacrolimus 0.2 MG Granules for Oral Suspension...,Branded Drug,950637,42316,tacrolimus,Ingredient
1,35200332,2055014,stiripentol 500 MG Powder for Oral Suspension ...,Branded Drug,35200286,2054968,stiripentol,Ingredient
2,35200380,2055286,loteprednol etabonate 10 MG/ML Ophthalmic Susp...,Branded Drug,967562,52177,loteprednol etabonate,Ingredient
3,35200555,2056502,levothyroxine sodium 0.15 MG/ML Oral Solution ...,Branded Drug,1501700,10582,levothyroxine,Ingredient
6,35200559,2056506,levothyroxine sodium 0.175 MG/ML Oral Solution...,Branded Drug,1501700,10582,levothyroxine,Ingredient


In [76]:
len(
    np.intersect1d(
        np.union1d(
            rxnorm_to_ings123.RxNorm_concept_id_1.dropna().astype(int).unique(),
            rxnorm_to_ings1234.RxNorm_concept_id_1.dropna().astype(int).unique()
        ),
        all_openFDA_rxnorm_concept_ids
    )
                  )/len(all_openFDA_rxnorm_concept_ids)

0.85955642530985

In [77]:
rxnorm_to_ings12345 = (first_second_relations.
 set_index(['RxNorm_concept_id_2','RxNorm_concept_code_2',
            'RxNorm_concept_name_2','RxNorm_concept_class_id_2']).
 join(second_third_relations.
      set_index(['RxNorm_concept_id_2','RxNorm_concept_code_2',
                 'RxNorm_concept_name_2','RxNorm_concept_class_id_2'])
     ).
 query('RxNorm_concept_class_id_3!="Ingredient" & '+
       '(RxNorm_concept_class_id_1!=RxNorm_concept_class_id_3)').
                  reset_index().
                      set_index(
                          ['RxNorm_concept_id_3','RxNorm_concept_code_3',
                           'RxNorm_concept_name_3','RxNorm_concept_class_id_3']
                      ).
                      join(third_fourth_relations.
                          set_index(
                          ['RxNorm_concept_id_3','RxNorm_concept_code_3',
                           'RxNorm_concept_name_3','RxNorm_concept_class_id_3']
                          )
                          ).
 query('RxNorm_concept_class_id_4!="Ingredient" & '+
       '(RxNorm_concept_class_id_2!=RxNorm_concept_class_id_4)').
                      reset_index().
                      set_index(
                          ['RxNorm_concept_id_4','RxNorm_concept_code_4',
                           'RxNorm_concept_name_4','RxNorm_concept_class_id_4']
                      ).
                      join(fourth_fifth_relations.
                          set_index(
                          ['RxNorm_concept_id_4','RxNorm_concept_code_4',
                           'RxNorm_concept_name_4','RxNorm_concept_class_id_4']
                          )
                          ).
 query('RxNorm_concept_class_id_5=="Ingredient"').
                      reset_index()
)
rxnorm_to_ings12345 = rxnorm_to_ings12345.reindex(np.sort(rxnorm_to_ings12345.columns),axis=1)
print(rxnorm_to_ings12345.shape)
rxnorm_to_ings12345.head()

(40419, 24)


Unnamed: 0,RxNorm_concept_class_id_1,RxNorm_concept_class_id_2,RxNorm_concept_class_id_3,RxNorm_concept_class_id_4,RxNorm_concept_class_id_5,RxNorm_concept_code_1,RxNorm_concept_code_2,RxNorm_concept_code_3,RxNorm_concept_code_4,RxNorm_concept_code_5,...,RxNorm_concept_id_5,RxNorm_concept_name_1,RxNorm_concept_name_2,RxNorm_concept_name_3,RxNorm_concept_name_4,RxNorm_concept_name_5,relationship_id_12,relationship_id_23,relationship_id_34,relationship_id_45
0,Clinical Drug,Branded Drug,Branded Drug Comp,Clinical Drug Comp,Ingredient,2056500,2056502,2056501,2056499,10582,...,1501700,levothyroxine sodium 0.15 MG/ML Oral Solution,levothyroxine sodium 0.15 MG/ML Oral Solution ...,levothyroxine sodium 0.15 MG/ML [Tirosint],levothyroxine sodium 0.15 MG/ML,levothyroxine,Has tradename,Consists of,Tradename of,RxNorm has ing
1,Clinical Drug,Branded Drug,Branded Drug Form,Clinical Drug Form,Ingredient,2056500,2056502,2056464,1001569,10582,...,1501700,levothyroxine sodium 0.15 MG/ML Oral Solution,levothyroxine sodium 0.15 MG/ML Oral Solution ...,levothyroxine Oral Solution [Tirosint],levothyroxine Oral Solution,levothyroxine,Has tradename,RxNorm is a,Tradename of,RxNorm has ing
2,Clinical Drug,Branded Drug,Branded Drug Form,Clinical Drug Form,Ingredient,2056504,2056506,2056464,1001569,10582,...,1501700,levothyroxine sodium 0.175 MG/ML Oral Solution,levothyroxine sodium 0.175 MG/ML Oral Solution...,levothyroxine Oral Solution [Tirosint],levothyroxine Oral Solution,levothyroxine,Has tradename,RxNorm is a,Tradename of,RxNorm has ing
3,Clinical Drug,Branded Drug,Branded Drug Comp,Clinical Drug Comp,Ingredient,2056504,2056506,2056505,2056503,10582,...,1501700,levothyroxine sodium 0.175 MG/ML Oral Solution,levothyroxine sodium 0.175 MG/ML Oral Solution...,levothyroxine sodium 0.175 MG/ML [Tirosint],levothyroxine sodium 0.175 MG/ML,levothyroxine,Has tradename,Consists of,Tradename of,RxNorm has ing
4,Clinical Drug,Branded Drug,Branded Drug Form,Clinical Drug Form,Ingredient,2056895,2056897,977430,845506,141704,...,19011440,everolimus 1 MG Oral Tablet,everolimus 1 MG Oral Tablet [Zortress],everolimus Oral Tablet [Zortress],everolimus Oral Tablet,everolimus,Has tradename,RxNorm is a,Tradename of,RxNorm has ing


In [78]:
(rxnorm_to_ings12345.
loc[:,['RxNorm_concept_name_1','RxNorm_concept_name_5']].
drop_duplicates()
).head()
len(np.intersect1d(rxnorm_to_ings12345.RxNorm_concept_id_1.dropna().astype(int).unique(),
                  all_openFDA_rxnorm_concept_ids
                  ))/len(all_openFDA_rxnorm_concept_ids)

0.4806262230919765

In [79]:
(rxnorm_to_ings12345.
loc[:,['RxNorm_concept_class_id_1','RxNorm_concept_class_id_2',
       'RxNorm_concept_class_id_3','RxNorm_concept_class_id_4',
       'RxNorm_concept_class_id_5']].
 drop_duplicates()
)

Unnamed: 0,RxNorm_concept_class_id_1,RxNorm_concept_class_id_2,RxNorm_concept_class_id_3,RxNorm_concept_class_id_4,RxNorm_concept_class_id_5
0,Clinical Drug,Branded Drug,Branded Drug Comp,Clinical Drug Comp,Ingredient
1,Clinical Drug,Branded Drug,Branded Drug Form,Clinical Drug Form,Ingredient
8,Branded Pack,Clinical Pack,Clinical Drug,Clinical Drug Comp,Ingredient
9,Branded Pack,Clinical Pack,Clinical Drug,Clinical Drug Form,Ingredient
18,Quant Branded Drug,Branded Drug,Branded Drug Comp,Clinical Drug Comp,Ingredient
20,Quant Branded Drug,Branded Drug,Branded Drug Form,Clinical Drug Form,Ingredient
22,Quant Branded Drug,Branded Drug,Clinical Drug,Clinical Drug Form,Ingredient
24,Quant Branded Drug,Branded Drug,Clinical Drug,Clinical Drug Comp,Ingredient
26,Quant Branded Drug,Quant Clinical Drug,Clinical Drug,Clinical Drug Comp,Ingredient
27,Quant Branded Drug,Quant Clinical Drug,Clinical Drug,Clinical Drug Form,Ingredient


In [80]:
rxnorm_to_ings12345_to_add = (rxnorm_to_ings12345.
loc[:,['RxNorm_concept_id_1','RxNorm_concept_code_1',
       'RxNorm_concept_name_1','RxNorm_concept_class_id_1',
       'RxNorm_concept_id_5','RxNorm_concept_code_5',
       'RxNorm_concept_name_5','RxNorm_concept_class_id_5']].
 drop_duplicates().
 rename(
     columns={
         'RxNorm_concept_id_5' : 'RxNorm_concept_id_2',
         'RxNorm_concept_code_5' : 'RxNorm_concept_code_2',
         'RxNorm_concept_name_5' : 'RxNorm_concept_name_2',
         'RxNorm_concept_class_id_5' : 'RxNorm_concept_class_id_2'
     })
                            .drop_duplicates()
)
print(rxnorm_to_ings12345_to_add.shape)
rxnorm_to_ings12345_to_add.head()

(9350, 8)


Unnamed: 0,RxNorm_concept_id_1,RxNorm_concept_code_1,RxNorm_concept_name_1,RxNorm_concept_class_id_1,RxNorm_concept_id_2,RxNorm_concept_code_2,RxNorm_concept_name_2,RxNorm_concept_class_id_2
0,35200553,2056500,levothyroxine sodium 0.15 MG/ML Oral Solution,Clinical Drug,1501700,10582,levothyroxine,Ingredient
2,35200557,2056504,levothyroxine sodium 0.175 MG/ML Oral Solution,Clinical Drug,1501700,10582,levothyroxine,Ingredient
4,35200596,2056895,everolimus 1 MG Oral Tablet,Clinical Drug,19011440,141704,everolimus,Ingredient
6,35200958,2059274,omadacycline 150 MG Oral Tablet,Clinical Drug,35200953,2059269,omadacycline,Ingredient
8,35201880,2054269,{7 (valbenazine 40 MG Oral Capsule [Ingrezza])...,Branded Pack,1593849,1918219,valbenazine,Ingredient


In [81]:
len(
    np.intersect1d(
        np.union1d(
            np.union1d(
                rxnorm_to_ings123.RxNorm_concept_id_1.dropna().astype(int).unique(),
                rxnorm_to_ings1234.RxNorm_concept_id_1.dropna().astype(int).unique()
            ),
            rxnorm_to_ings12345.RxNorm_concept_id_1.dropna().astype(int).unique()
        ),
        all_openFDA_rxnorm_concept_ids
    )
                  )/len(all_openFDA_rxnorm_concept_ids)

0.9869536855838226

In [82]:
np.setdiff1d(
        all_openFDA_rxnorm_concept_ids,
    np.union1d(
            np.union1d(
                rxnorm_to_ings123.RxNorm_concept_id_1.dropna().astype(int).unique(),
                rxnorm_to_ings1234.RxNorm_concept_id_1.dropna().astype(int).unique()
            ),
            rxnorm_to_ings12345.RxNorm_concept_id_1.dropna().astype(int).unique()
        )
    )

array([  740228,   747527,   747535,   747537,   747541,   747543,
         747545,   747547,   747549,   747551,   747553,   747555,
         747557,   747559,   747561,   747563,   747565,   747567,
         747569,   747571,   747573,   747575,   747577,   747579,
         747581,   747583,   747585,   747587,   747589,   747591,
         747593,   747595,   747597,   747599,   747601,   747603,
         747605,   747607,   747609,   747611,   747613,   747615,
         747617,   747619,   747621,   747623,   747625,   747627,
         747629,   747631,   747633,   747635,   747637,   747639,
         747641,   747643,   747645,   747647,   747649,   747651,
         747653,   747655,   747657,   747659,   747661,   747663,
         747665,   747667,   747690,   747692,   747694,   747696,
        1146716,  1201524,  1201538,  1201540,  1302184,  1302186,
        1302188,  1302190,  1302192,  1302194,  1302196,  1302198,
        1302200,  1302202,  1302204,  1302206,  1302208,  1302

In [83]:
rxnorm_to_ings123456 = (first_second_relations.
 set_index(['RxNorm_concept_id_2','RxNorm_concept_code_2',
            'RxNorm_concept_name_2','RxNorm_concept_class_id_2']).
 join(second_third_relations.
      set_index(['RxNorm_concept_id_2','RxNorm_concept_code_2',
                 'RxNorm_concept_name_2','RxNorm_concept_class_id_2'])
     ).
 query('RxNorm_concept_class_id_3!="Ingredient" & '+
       '(RxNorm_concept_class_id_1!=RxNorm_concept_class_id_3)').
                  reset_index().
                      set_index(
                          ['RxNorm_concept_id_3','RxNorm_concept_code_3',
                           'RxNorm_concept_name_3','RxNorm_concept_class_id_3']
                      ).
                      join(third_fourth_relations.
                          set_index(
                          ['RxNorm_concept_id_3','RxNorm_concept_code_3',
                           'RxNorm_concept_name_3','RxNorm_concept_class_id_3']
                          )
                          ).
 query('RxNorm_concept_class_id_4!="Ingredient" & '+
       '(RxNorm_concept_class_id_2!=RxNorm_concept_class_id_4)').
                      reset_index().
                      set_index(
                          ['RxNorm_concept_id_4','RxNorm_concept_code_4',
                           'RxNorm_concept_name_4','RxNorm_concept_class_id_4']
                      ).
                      join(fourth_fifth_relations.
                          set_index(
                          ['RxNorm_concept_id_4','RxNorm_concept_code_4',
                           'RxNorm_concept_name_4','RxNorm_concept_class_id_4']
                          )
                          ).
 query('RxNorm_concept_class_id_5!="Ingredient" & '+
       '(RxNorm_concept_class_id_3!=RxNorm_concept_class_id_5)').
                      reset_index().
                      set_index(
                          ['RxNorm_concept_id_5','RxNorm_concept_code_5',
                           'RxNorm_concept_name_5','RxNorm_concept_class_id_5']
                      ).
                      join(fifth_sixth_relations.
                          set_index(
                          ['RxNorm_concept_id_5','RxNorm_concept_code_5',
                           'RxNorm_concept_name_5','RxNorm_concept_class_id_5']
                          )
                          ).
 query('RxNorm_concept_class_id_6=="Ingredient"').
                      reset_index()
)
rxnorm_to_ings123456 = rxnorm_to_ings123456.reindex(np.sort(rxnorm_to_ings123456.columns),axis=1)
print(rxnorm_to_ings123456.shape)
rxnorm_to_ings123456.head()

(180769, 29)


Unnamed: 0,RxNorm_concept_class_id_1,RxNorm_concept_class_id_2,RxNorm_concept_class_id_3,RxNorm_concept_class_id_4,RxNorm_concept_class_id_5,RxNorm_concept_class_id_6,RxNorm_concept_code_1,RxNorm_concept_code_2,RxNorm_concept_code_3,RxNorm_concept_code_4,...,RxNorm_concept_name_2,RxNorm_concept_name_3,RxNorm_concept_name_4,RxNorm_concept_name_5,RxNorm_concept_name_6,relationship_id_12,relationship_id_23,relationship_id_34,relationship_id_45,relationship_id_56
0,Branded Drug,Branded Drug Comp,Clinical Drug Comp,Clinical Drug,Clinical Drug Form,Ingredient,2052609,2052606,2052503,2052605,...,tacrolimus 0.2 MG [Prograf],tacrolimus 0.2 MG,tacrolimus 0.2 MG Granules for Oral Suspension,tacrolimus Granules for Oral Suspension,tacrolimus,Consists of,Tradename of,Constitutes,RxNorm is a,RxNorm has ing
1,Branded Drug,Branded Drug Comp,Clinical Drug Comp,Clinical Drug,Clinical Drug Form,Ingredient,2055014,2054982,2054980,2054981,...,stiripentol 500 MG [Diacomit],stiripentol 500 MG,stiripentol 500 MG Oral Capsule,stiripentol Oral Capsule,stiripentol,Consists of,Tradename of,Constitutes,RxNorm is a,RxNorm has ing
2,Branded Drug,Branded Drug Comp,Clinical Drug Comp,Clinical Drug,Clinical Drug Form,Ingredient,2055014,2054982,2054980,2055011,...,stiripentol 500 MG [Diacomit],stiripentol 500 MG,stiripentol 500 MG Powder for Oral Suspension,stiripentol Powder for Oral Suspension,stiripentol,Consists of,Tradename of,Constitutes,RxNorm is a,RxNorm has ing
3,Branded Drug,Branded Drug Form,Clinical Drug Form,Clinical Drug,Clinical Drug Comp,Ingredient,2055286,2055284,375144,311382,...,loteprednol etabonate Ophthalmic Suspension [I...,loteprednol etabonate Ophthalmic Suspension,loteprednol etabonate 5 MG/ML Ophthalmic Suspe...,loteprednol etabonate 5 MG/ML,loteprednol etabonate,RxNorm is a,Tradename of,RxNorm inverse is a,Consists of,RxNorm has ing
4,Branded Drug,Branded Drug Form,Clinical Drug Form,Clinical Drug,Clinical Drug Comp,Ingredient,2055286,2055284,375144,2464838,...,loteprednol etabonate Ophthalmic Suspension [I...,loteprednol etabonate Ophthalmic Suspension,loteprednol etabonate 2.5 MG/ML Ophthalmic Sus...,loteprednol etabonate 2.5 MG/ML,loteprednol etabonate,RxNorm is a,Tradename of,RxNorm inverse is a,Consists of,RxNorm has ing


In [84]:
(rxnorm_to_ings123456.
loc[:,['RxNorm_concept_name_1','RxNorm_concept_name_6']].
drop_duplicates()
).head()
len(np.intersect1d(rxnorm_to_ings123456.RxNorm_concept_id_1.dropna().astype(int).unique(),
                  all_openFDA_rxnorm_concept_ids
                  ))/len(all_openFDA_rxnorm_concept_ids)

0.41448140900195696

In [85]:
(rxnorm_to_ings123456.
loc[:,['RxNorm_concept_class_id_1','RxNorm_concept_class_id_2',
       'RxNorm_concept_class_id_3','RxNorm_concept_class_id_4',
       'RxNorm_concept_class_id_5','RxNorm_concept_class_id_6']].
 drop_duplicates()
)

Unnamed: 0,RxNorm_concept_class_id_1,RxNorm_concept_class_id_2,RxNorm_concept_class_id_3,RxNorm_concept_class_id_4,RxNorm_concept_class_id_5,RxNorm_concept_class_id_6
0,Branded Drug,Branded Drug Comp,Clinical Drug Comp,Clinical Drug,Clinical Drug Form,Ingredient
3,Branded Drug,Branded Drug Form,Clinical Drug Form,Clinical Drug,Clinical Drug Comp,Ingredient
12,Clinical Pack,Branded Pack,Branded Drug,Branded Drug Comp,Clinical Drug Comp,Ingredient
13,Clinical Pack,Branded Pack,Branded Drug,Clinical Drug,Clinical Drug Comp,Ingredient
14,Clinical Pack,Branded Pack,Branded Drug,Clinical Drug,Clinical Drug Form,Ingredient
15,Clinical Pack,Branded Pack,Branded Drug,Branded Drug Form,Clinical Drug Form,Ingredient
20,Quant Clinical Drug,Clinical Drug,Branded Drug,Branded Drug Form,Clinical Drug Form,Ingredient
21,Quant Clinical Drug,Clinical Drug,Branded Drug,Branded Drug Comp,Clinical Drug Comp,Ingredient
24,Quant Clinical Drug,Quant Branded Drug,Branded Drug,Clinical Drug,Clinical Drug Comp,Ingredient
25,Quant Clinical Drug,Quant Branded Drug,Branded Drug,Clinical Drug,Clinical Drug Form,Ingredient


In [86]:
rxnorm_to_ings123456_to_add = (rxnorm_to_ings123456.
loc[:,['RxNorm_concept_id_1','RxNorm_concept_code_1',
       'RxNorm_concept_name_1','RxNorm_concept_class_id_1',
       'RxNorm_concept_id_6','RxNorm_concept_code_6',
       'RxNorm_concept_name_6','RxNorm_concept_class_id_6']].
 drop_duplicates().
 rename(
     columns={
         'RxNorm_concept_id_6' : 'RxNorm_concept_id_2',
         'RxNorm_concept_code_6' : 'RxNorm_concept_code_2',
         'RxNorm_concept_name_6' : 'RxNorm_concept_name_2',
         'RxNorm_concept_class_id_6' : 'RxNorm_concept_class_id_2'
     }).
                            drop_duplicates()
)
print(rxnorm_to_ings123456_to_add.shape)
rxnorm_to_ings123456_to_add.head()

(18428, 8)


Unnamed: 0,RxNorm_concept_id_1,RxNorm_concept_code_1,RxNorm_concept_name_1,RxNorm_concept_class_id_1,RxNorm_concept_id_2,RxNorm_concept_code_2,RxNorm_concept_name_2,RxNorm_concept_class_id_2
0,35200101,2052609,tacrolimus 0.2 MG Granules for Oral Suspension...,Branded Drug,950637.0,42316,tacrolimus,Ingredient
1,35200332,2055014,stiripentol 500 MG Powder for Oral Suspension ...,Branded Drug,35200286.0,2054968,stiripentol,Ingredient
3,35200380,2055286,loteprednol etabonate 10 MG/ML Ophthalmic Susp...,Branded Drug,967562.0,52177,loteprednol etabonate,Ingredient
7,35200555,2056502,levothyroxine sodium 0.15 MG/ML Oral Solution ...,Branded Drug,1501700.0,10582,levothyroxine,Ingredient
8,35200892,2059029,sarecycline 100 MG Oral Tablet [Seysara],Branded Drug,35200881.0,2059018,sarecycline,Ingredient


In [87]:
len(
    np.intersect1d(
        np.union1d(
            np.union1d(
                np.union1d(
                    rxnorm_to_ings123.RxNorm_concept_id_1.dropna().astype(int).unique(),
                    rxnorm_to_ings1234.RxNorm_concept_id_1.dropna().astype(int).unique()
                ),
                rxnorm_to_ings12345.RxNorm_concept_id_1.dropna().astype(int).unique()
            ),
            rxnorm_to_ings123456.RxNorm_concept_id_1.dropna().astype(int).unique()
        ),
        all_openFDA_rxnorm_concept_ids
    )
                  )/len(all_openFDA_rxnorm_concept_ids)

0.9992172211350293

In [88]:
np.setdiff1d(
        all_openFDA_rxnorm_concept_ids,
        np.union1d(
            np.union1d(
                np.union1d(
                    rxnorm_to_ings123.RxNorm_concept_id_1.dropna().astype(int).unique(),
                    rxnorm_to_ings1234.RxNorm_concept_id_1.dropna().astype(int).unique()
                ),
                rxnorm_to_ings12345.RxNorm_concept_id_1.dropna().astype(int).unique()
            ),
            rxnorm_to_ings123456.RxNorm_concept_id_1.dropna().astype(int).unique()
        )
)

array([1594443, 1594444, 1594468, 1594469, 1594470, 1594471, 1594472,
       1594473, 1594484, 1594485, 1594486, 1594487])

In [89]:
rxnorm_to_ings_all = pd.concat(
    [
        rxnorm_to_ings123_to_add,
        rxnorm_to_ings1234_to_add,
        rxnorm_to_ings12345_to_add,
        rxnorm_to_ings123456_to_add
    ]
).dropna().drop_duplicates()
rxnorm_to_ings_all.RxNorm_concept_id_2 = rxnorm_to_ings_all.RxNorm_concept_id_2.astype(int)
print(rxnorm_to_ings_all.shape)
rxnorm_to_ings_all.head()

(30149, 8)


Unnamed: 0,RxNorm_concept_id_1,RxNorm_concept_code_1,RxNorm_concept_name_1,RxNorm_concept_class_id_1,RxNorm_concept_id_2,RxNorm_concept_code_2,RxNorm_concept_name_2,RxNorm_concept_class_id_2
0,35200183,2053517,ivacaftor 188 MG / lumacaftor 150 MG Oral Gran...,Clinical Drug,42709323,1243041,ivacaftor,Ingredient
1,35200183,2053517,ivacaftor 188 MG / lumacaftor 150 MG Oral Gran...,Clinical Drug,46275580,1655922,lumacaftor,Ingredient
2,35200225,2054096,tafenoquine 100 MG Oral Tablet,Clinical Drug,35200201,2054023,tafenoquine,Ingredient
3,35200529,2056476,levothyroxine sodium 0.088 MG/ML Oral Solution,Clinical Drug,1501700,10582,levothyroxine,Ingredient
4,35200541,2056488,levothyroxine sodium 0.125 MG/ML Oral Solution,Clinical Drug,1501700,10582,levothyroxine,Ingredient


In [90]:
len(
    np.intersect1d(
        rxnorm_to_ings_all.RxNorm_concept_id_1,
        all_openFDA_rxnorm_concept_ids
    )
)/len(all_openFDA_rxnorm_concept_ids)

0.9992172211350293

In [91]:
standard_drug = (pd.
                 read_csv(er_dir+'standard_drugs.csv.gz',
                          compression='gzip',
                          dtype={
                              'safetyreportid' : 'str'
                          })
                )
standard_drug.RxNorm_concept_id = standard_drug.RxNorm_concept_id.astype(int)
all_reports = standard_drug.safetyreportid.astype(str).unique()
print(standard_drug.shape)
standard_drug.head()

(15330, 5)


Unnamed: 0,RxNorm_concept_class_id,RxNorm_concept_code,RxNorm_concept_id,RxNorm_concept_name,safetyreportid
0,Clinical Drug,197807,19019074,ibuprofen 800 MG Oral Tablet,10003301
1,Quant Clinical Drug,854183,40161247,8 ML ibuprofen 100 MG/ML Injection,10003301
2,Branded Drug,731533,1177843,ibuprofen 200 MG Oral Capsule [Advil],10003301
3,Branded Drug,544393,1177822,ibuprofen 20 MG/ML Oral Suspension [Motrin],10003301
4,Branded Drug,206878,19033719,ibuprofen 20 MG/ML Oral Suspension [Advil],10003301


In [92]:
standard_drug_ingredients = ((standard_drug.
  loc[:,['RxNorm_concept_id','safetyreportid']].
  drop_duplicates().
set_index(
    [
        'RxNorm_concept_id'
    ]
)
).join(rxnorm_to_ings_all.
       loc[:,['RxNorm_concept_id_1','RxNorm_concept_id_2',
             'RxNorm_concept_code_2','RxNorm_concept_name_2',
             'RxNorm_concept_class_id_2']].
       drop_duplicates().
set_index(
    [
        'RxNorm_concept_id_1'
    ]
)
).drop_duplicates().
 rename(
     columns={
         'RxNorm_concept_id_2':'RxNorm_concept_id',
         'RxNorm_concept_code_2':'RxNorm_concept_code',
         'RxNorm_concept_name_2':'RxNorm_concept_name',
         'RxNorm_concept_class_id_2':'RxNorm_concept_class_id'
     }).
                             reset_index(drop=True).
                             dropna().
                             drop_duplicates()
       )
standard_drug_ingredients = (standard_drug_ingredients.
                             reindex(np.sort(standard_drug_ingredients.columns),axis=1)
                            )
print(standard_drug_ingredients.shape)
standard_drug_ingredients.head()

(10133, 5)


Unnamed: 0,RxNorm_concept_class_id,RxNorm_concept_code,RxNorm_concept_id,RxNorm_concept_name,safetyreportid
0,Ingredient,5640,1177480.0,ibuprofen,10003301
1,Ingredient,8163,1135766.0,phenylephrine,10003301
2,Ingredient,34930,19029550.0,pseudoisocytidine,10003301
3,Ingredient,3498,1129625.0,diphenhydramine,10003301
4,Ingredient,8896,1154332.0,pseudoephedrine,10003301


In [93]:
print(len(
    np.intersect1d(
        standard_drug_ingredients.safetyreportid.astype(str).unique(),
        all_reports
    )
)/len(all_reports))

0.9996447602131439


In [94]:
(standard_drug_ingredients.
 to_csv(er_dir+'standard_drugs_rxnorm_ingredients.csv.gz',compression='gzip',index=False))

### standard_reactions_meddra_relationships

In [95]:
standard_reactions = (pd.
                      read_csv(er_dir+'standard_reactions.csv.gz',
                               compression="gzip",
                               dtype={
                                   'safetyreportid' : 'str'
                               }
                              )
                     )
all_reports = (standard_reactions.safetyreportid.unique())
print(standard_reactions.shape)
print(standard_reactions.head())

(55164660, 6)
  MedDRA_concept_class_id  MedDRA_concept_code  MedDRA_concept_id  \
0                      PT             10012735           35708093   
1                      PT             10003239           36516812   
2                      PT             10019211           36718132   
3                      PT             10047700           35708208   
4                      PT             10013946           35708139   

  MedDRA_concept_name reaction_outcome safetyreportid  
0           Diarrhoea              NaN       10003300  
1          Arthralgia              NaN       10003300  
2            Headache              NaN       10003300  
3            Vomiting              NaN       10003300  
4           Dyspepsia              NaN       10003301  


In [96]:
reactions = standard_reactions.MedDRA_concept_id.astype(int).unique()
print(len(reactions))
meddra_concept_ids = concept.query('vocabulary_id=="MedDRA"').concept_id.astype(int).unique()
len(meddra_concept_ids)

intersect = np.intersect1d(reactions,meddra_concept_ids)
print(len(intersect))
print(len(intersect)/len(reactions))

22226
22226
1.0


In [97]:
meddra_concept = concept.query('vocabulary_id=="MedDRA"')
meddra_concept.concept_id = meddra_concept.concept_id.astype(int)
all_meddra_concept_ids = meddra_concept.concept_id.unique()

r = (concept_relationship.
     copy().
     loc[:,['concept_id_1','concept_id_2','relationship_id']].
     drop_duplicates()
    )
r.concept_id_1 = r.concept_id_1.astype(int)
r.concept_id_2 = r.concept_id_2.astype(int)


In [98]:
(r.
query('concept_id_1 in @all_meddra_concept_ids & '+
     'concept_id_2 in @all_meddra_concept_ids').
relationship_id.value_counts()
)

relationship_id
Subsumes               104517
Is a                   104517
Concept replaces        21009
Concept replaced by     21009
Name: count, dtype: int64

In [99]:
c = meddra_concept.copy()

all_meddra_relationships = (r.
 query('concept_id_1 in @meddra_concept_ids & '+\
       'concept_id_2 in @meddra_concept_ids').
 set_index('concept_id_1').
 join(
     c. # standard concepts for 1
     query('vocabulary_id=="MedDRA"').
     loc[:,['concept_id','concept_code','concept_name','concept_class_id']].
     drop_duplicates().
     set_index('concept_id')
    ).
 rename_axis('MedDRA_concept_id_1').
 reset_index().
 rename(
     columns={
         'concept_code' : 'MedDRA_concept_code_1',
         'concept_class_id' : 'MedDRA_concept_class_id_1',
         'concept_name' : 'MedDRA_concept_name_1',
         'concept_id_2' : 'MedDRA_concept_id_2',
         'relationship_id' : 'relationship_id_12'
     }
 ).
set_index('MedDRA_concept_id_2').
 join(
     c. # standard concepts for 2
     query('vocabulary_id=="MedDRA"').
     loc[:,['concept_id','concept_code','concept_name','concept_class_id']].
     drop_duplicates().
     set_index('concept_id')
    ).
 rename_axis('MedDRA_concept_id_2').
 reset_index().
 rename(
     columns={
         'concept_code' : 'MedDRA_concept_code_2',
         'concept_class_id' : 'MedDRA_concept_class_id_2',
         'concept_name' : 'MedDRA_concept_name_2'
     }
 ))
all_meddra_relationships = (all_meddra_relationships.
                            reindex(np.sort(all_meddra_relationships.columns),axis=1)
                           )
print(all_meddra_relationships.shape)
print(all_meddra_relationships.head())

(251052, 9)
  MedDRA_concept_class_id_1 MedDRA_concept_class_id_2 MedDRA_concept_code_1  \
0                      HLGT                       HLT              10002086   
1                      HLGT                       HLT              10018911   
2                      HLGT                       HLT              10018911   
3                      HLGT                       HLT              10035227   
4                      HLGT                       HLT              10041641   

  MedDRA_concept_code_2  MedDRA_concept_id_1  MedDRA_concept_id_2  \
0              10026847             35102033             35102369   
1              10002052             35102038             35102384   
2              10038185             35102038             35102388   
3              10028229             35102045             35102436   
4              10041635             35102048             35102446   

                               MedDRA_concept_name_1  \
0       Anaemias nonhaemolytic and marrow 

In [100]:
print(all_meddra_relationships.MedDRA_concept_class_id_1.value_counts())
print(all_meddra_relationships.MedDRA_concept_class_id_2.value_counts())

MedDRA_concept_class_id_1
PT      121362
LLT      85282
HLT      41844
HLGT      2200
SOC        364
Name: count, dtype: int64
MedDRA_concept_class_id_2
PT      121362
LLT      85282
HLT      41844
HLGT      2200
SOC        364
Name: count, dtype: int64


In [101]:
all_meddra_relationships.MedDRA_concept_id_1 = (all_meddra_relationships.
                                                  MedDRA_concept_id_1.
                                                  astype(int)
                                                 )
all_meddra_relationships.MedDRA_concept_code_1 = (all_meddra_relationships.
                                                  MedDRA_concept_code_1.
                                                  astype(int)
                                                 )
all_meddra_relationships.MedDRA_concept_id_2 = (all_meddra_relationships.
                                                  MedDRA_concept_id_2.
                                                  astype(int)
                                                 )
all_meddra_relationships.MedDRA_concept_code_2 = (all_meddra_relationships.
                                                  MedDRA_concept_code_2.
                                                  astype(int)
                                                 )

In [102]:
first_rxs = reactions
first_relations = (all_meddra_relationships.
                   query('MedDRA_concept_id_1 in @first_rxs & '+
                         'MedDRA_concept_class_id_2=="HLT"')
                  ).reset_index(drop=True)
first_relations = (first_relations[
    first_relations.MedDRA_concept_id_1!=first_relations.MedDRA_concept_id_2
])
print(first_relations.shape)
print(first_relations.head())
print(first_relations.MedDRA_concept_class_id_2.value_counts())

(34004, 9)
  MedDRA_concept_class_id_1 MedDRA_concept_class_id_2  MedDRA_concept_code_1  \
0                        PT                       HLT               10061101   
1                        PT                       HLT               10002065   
2                        PT                       HLT               10044697   
3                        PT                       HLT               10058116   
4                        PT                       HLT               10002967   

   MedDRA_concept_code_2  MedDRA_concept_id_1  MedDRA_concept_id_2  \
0               10018067             35104069             36403210   
1               10047627             35104076             36403246   
2               10047627             35104095             36403246   
3               10010180             35104098             37003670   
4               10026847             35104101             35102369   

   MedDRA_concept_name_1                       MedDRA_concept_name_2  \
0     Deficienc

In [103]:
second_rxs = first_relations.MedDRA_concept_id_2.unique()
second_relations = (all_meddra_relationships.
                    query('MedDRA_concept_id_1 in @second_rxs & '+
                         'MedDRA_concept_class_id_2=="HLGT"').
                    rename(columns={
                        'MedDRA_concept_id_2' : 'MedDRA_concept_id_3',
                        'MedDRA_concept_code_2' : 'MedDRA_concept_code_3',
                        'MedDRA_concept_name_2' : 'MedDRA_concept_name_3',
                        'MedDRA_concept_class_id_2' : 'MedDRA_concept_class_id_3',
                        'MedDRA_concept_id_1' : 'MedDRA_concept_id_2',
                        'MedDRA_concept_code_1' : 'MedDRA_concept_code_2',
                        'MedDRA_concept_name_1' : 'MedDRA_concept_name_2',
                        'MedDRA_concept_class_id_1' : 'MedDRA_concept_class_id_2',
                        'relationship_id_12' : 'relationship_id_23'
                    }
                          )
                  ).reset_index(drop=True)
second_relations = (second_relations[
    second_relations.MedDRA_concept_id_2!=second_relations.MedDRA_concept_id_3
])
print(second_relations.shape)
print(second_relations.head())
print(second_relations.MedDRA_concept_class_id_2.value_counts())
print(second_relations.MedDRA_concept_class_id_3.value_counts())

(1828, 9)
  MedDRA_concept_class_id_2 MedDRA_concept_class_id_3  MedDRA_concept_code_2  \
0                       HLT                      HLGT               10026847   
1                       HLT                      HLGT               10002052   
2                       HLT                      HLGT               10038185   
3                       HLT                      HLGT               10028229   
4                       HLT                      HLGT               10041635   

   MedDRA_concept_code_3  MedDRA_concept_id_2  MedDRA_concept_id_3  \
0               10002086             35102369             35102033   
1               10018911             35102384             35102038   
2               10018911             35102388             35102038   
3               10035227             35102436             35102045   
4               10041641             35102446             35102048   

                        MedDRA_concept_name_2  \
0  Marrow depression and hypoplastic an

In [104]:
third_rxs = second_relations.MedDRA_concept_id_3.unique()
third_relations = (all_meddra_relationships.
                    query('MedDRA_concept_id_1 in @third_rxs & '+
                         'MedDRA_concept_class_id_2=="SOC"').
                    rename(columns={
                        'MedDRA_concept_id_2' : 'MedDRA_concept_id_4',
                        'MedDRA_concept_code_2' : 'MedDRA_concept_code_4',
                        'MedDRA_concept_name_2' : 'MedDRA_concept_name_4',
                        'MedDRA_concept_class_id_2' : 'MedDRA_concept_class_id_4',
                        'MedDRA_concept_id_1' : 'MedDRA_concept_id_3',
                        'MedDRA_concept_code_1' : 'MedDRA_concept_code_3',
                        'MedDRA_concept_name_1' : 'MedDRA_concept_name_3',
                        'MedDRA_concept_class_id_1' : 'MedDRA_concept_class_id_3',
                        'relationship_id_12' : 'relationship_id_34'
                    }
                          )
                  ).reset_index(drop=True)
third_relations = (third_relations[
    third_relations.MedDRA_concept_id_3!=third_relations.MedDRA_concept_id_4
])
print(third_relations.shape)
print(third_relations.head())
print(third_relations.MedDRA_concept_class_id_3.value_counts())
print(third_relations.MedDRA_concept_class_id_4.value_counts())

(364, 9)
  MedDRA_concept_class_id_3 MedDRA_concept_class_id_4  MedDRA_concept_code_3  \
0                      HLGT                       SOC               10027664   
1                      HLGT                       SOC               10019806   
2                      HLGT                       SOC               10021429   
3                      HLGT                       SOC               10038686   
4                      HLGT                       SOC               10018424   

   MedDRA_concept_code_4  MedDRA_concept_id_3  MedDRA_concept_id_4  \
0               10010331             35302063             35300000   
1               10010331             35302069             35300000   
2               10010331             35302070             35300000   
3               10010331             35302077             35300000   
4               10014698             35502089             35500000   

                               MedDRA_concept_name_3  \
0            Congenital and hered

In [106]:
first_second_third_relations = \
(first_relations.
 set_index('MedDRA_concept_id_2').
 join(second_relations.
      loc[:,['MedDRA_concept_id_2','MedDRA_concept_id_3',
             'MedDRA_concept_name_3','MedDRA_concept_class_id_3',
             'MedDRA_concept_code_3','relationship_id_23']].
      set_index('MedDRA_concept_id_2')
     ).
 reset_index()
)
first_second_third_relations = \
(first_second_third_relations.
 reindex(np.sort(first_second_third_relations.columns),
         axis=1)
)
first_second_third_relations['MedDRA_concept_id_3'] = \
first_second_third_relations['MedDRA_concept_id_3'].astype(int)
print(first_second_third_relations.shape)
print(first_second_third_relations.head())
print(first_second_third_relations.MedDRA_concept_class_id_1.value_counts())
print(first_second_third_relations.MedDRA_concept_class_id_2.value_counts())
print(first_second_third_relations.MedDRA_concept_class_id_3.value_counts())

(34954, 14)
  MedDRA_concept_class_id_1 MedDRA_concept_class_id_2  \
0                        PT                       HLT   
1                        PT                       HLT   
2                        PT                       HLT   
3                        PT                       HLT   
4                        PT                       HLT   

  MedDRA_concept_class_id_3  MedDRA_concept_code_1  MedDRA_concept_code_2  \
0                      HLGT               10061101               10018067   
1                      HLGT               10002065               10047627   
2                      HLGT               10044697               10047627   
3                      HLGT               10058116               10010180   
4                      HLGT               10002967               10026847   

   MedDRA_concept_code_3  MedDRA_concept_id_1  MedDRA_concept_id_2  \
0               10003018             35104069             36403210   
1               10047635             35104

In [107]:
first_second_third_fourth_relations = \
(first_relations.
 set_index('MedDRA_concept_id_2').
 join(second_relations.
      loc[:,['MedDRA_concept_id_2','MedDRA_concept_id_3',
             'MedDRA_concept_name_3','MedDRA_concept_class_id_3',
             'MedDRA_concept_code_3','relationship_id_23']].
      drop_duplicates().
      set_index('MedDRA_concept_id_2')
     ).
 reset_index().
 set_index('MedDRA_concept_id_3').
 join(third_relations.
      loc[:,['MedDRA_concept_id_3','MedDRA_concept_id_4',
             'MedDRA_concept_name_4','MedDRA_concept_class_id_4',
             'MedDRA_concept_code_4','relationship_id_34']].
      drop_duplicates().
      set_index('MedDRA_concept_id_3')
     ).
 reset_index()
)
first_second_third_fourth_relations = \
(first_second_third_fourth_relations.
 reindex(np.sort(first_second_third_fourth_relations.columns),
         axis=1)
)
first_second_third_fourth_relations['MedDRA_concept_id_4'] = \
first_second_third_fourth_relations['MedDRA_concept_id_4'].astype(int)
print(first_second_third_fourth_relations.shape)
print(first_second_third_fourth_relations.head())
print(first_second_third_fourth_relations.MedDRA_concept_class_id_1.value_counts())
print(first_second_third_fourth_relations.MedDRA_concept_class_id_2.value_counts())
print(first_second_third_fourth_relations.MedDRA_concept_class_id_3.value_counts())
print(first_second_third_fourth_relations.MedDRA_concept_class_id_4.value_counts())

(36540, 19)
  MedDRA_concept_class_id_1 MedDRA_concept_class_id_2  \
0                        PT                       HLT   
1                        PT                       HLT   
2                        PT                       HLT   
3                        PT                       HLT   
4                        PT                       HLT   

  MedDRA_concept_class_id_3 MedDRA_concept_class_id_4  MedDRA_concept_code_1  \
0                      HLGT                       SOC               10061101   
1                      HLGT                       SOC               10002065   
2                      HLGT                       SOC               10044697   
3                      HLGT                       SOC               10058116   
4                      HLGT                       SOC               10002967   

   MedDRA_concept_code_2  MedDRA_concept_code_3  MedDRA_concept_code_4  \
0               10018067               10003018               10027433   
1               

In [108]:
len(np.setdiff1d(reactions,first_second_third_fourth_relations.MedDRA_concept_id_1.unique()))

8

In [109]:
left_over = np.setdiff1d(reactions,first_second_third_fourth_relations.MedDRA_concept_id_1.unique())

all_meddra_relationships.query('MedDRA_concept_id_1 in @left_over')

Unnamed: 0,MedDRA_concept_class_id_1,MedDRA_concept_class_id_2,MedDRA_concept_code_1,MedDRA_concept_code_2,MedDRA_concept_id_1,MedDRA_concept_id_2,MedDRA_concept_name_1,MedDRA_concept_name_2,relationship_id_12


In [110]:
df1 = (standard_reactions.
       loc[:,['MedDRA_concept_id']].
       drop_duplicates().
       dropna().
       set_index('MedDRA_concept_id')
      )
print(df1.shape)

(22226, 0)


In [111]:
df2 = (first_second_third_fourth_relations.
       set_index('MedDRA_concept_id_1')
      )
print(df2.shape)

(36540, 18)


In [112]:
joined = df1.join(df2).rename_axis('MedDRA_concept_id_1').reset_index().dropna()
joined = joined.reindex(np.sort(joined.columns),axis=1)
joined.MedDRA_concept_id_1 = joined.MedDRA_concept_id_1.astype(int).copy()
joined.MedDRA_concept_id_2 = joined.MedDRA_concept_id_2.astype(int).copy()
joined.MedDRA_concept_id_3 = joined.MedDRA_concept_id_3.astype(int).copy()
joined.MedDRA_concept_id_4 = joined.MedDRA_concept_id_4.astype(int).copy()
joined.MedDRA_concept_code_1 = joined.MedDRA_concept_code_1.astype(int).copy()
joined.MedDRA_concept_code_2 = joined.MedDRA_concept_code_2.astype(int).copy()
joined.MedDRA_concept_code_3 = joined.MedDRA_concept_code_3.astype(int).copy()
joined.MedDRA_concept_code_4 = joined.MedDRA_concept_code_4.astype(int).copy()
print(joined.shape)
print(joined.head())

(36540, 19)
  MedDRA_concept_class_id_1 MedDRA_concept_class_id_2  \
0                        PT                       HLT   
1                        PT                       HLT   
2                        PT                       HLT   
3                        PT                       HLT   
4                        PT                       HLT   

  MedDRA_concept_class_id_3 MedDRA_concept_class_id_4  MedDRA_concept_code_1  \
0                      HLGT                       SOC               10012735   
1                      HLGT                       SOC               10003239   
2                      HLGT                       SOC               10019211   
3                      HLGT                       SOC               10047700   
4                      HLGT                       SOC               10013946   

   MedDRA_concept_code_2  MedDRA_concept_code_3  MedDRA_concept_code_4  \
0               10012736               10017977               10017947   
1               

In [113]:
print(joined.MedDRA_concept_class_id_1.value_counts())
print(joined.MedDRA_concept_class_id_2.value_counts())
print(joined.MedDRA_concept_class_id_3.value_counts())
print(joined.MedDRA_concept_class_id_4.value_counts())

MedDRA_concept_class_id_1
PT    36540
Name: count, dtype: int64
MedDRA_concept_class_id_2
HLT    36540
Name: count, dtype: int64
MedDRA_concept_class_id_3
HLGT    36540
Name: count, dtype: int64
MedDRA_concept_class_id_4
SOC    36540
Name: count, dtype: int64


In [114]:
(joined.
 to_csv(er_dir+'standard_reactions_meddra_relationships.csv.gz',
        compression='gzip',index=False)
)

In [115]:
pt_to_soc = (joined.
loc[:,['MedDRA_concept_id_1','MedDRA_concept_code_1',
       'MedDRA_concept_name_1','MedDRA_concept_class_id_1',
       'MedDRA_concept_id_4','MedDRA_concept_code_4',
       'MedDRA_concept_name_4','MedDRA_concept_class_id_4']].
query('MedDRA_concept_class_id_4=="SOC"').
              drop_duplicates()
)
print(pt_to_soc.shape)
print(pt_to_soc.head())

(35284, 8)
   MedDRA_concept_id_1  MedDRA_concept_code_1 MedDRA_concept_name_1  \
0             35708093               10012735             Diarrhoea   
1             36516812               10003239            Arthralgia   
2             36718132               10019211              Headache   
3             35708208               10047700              Vomiting   
4             35708139               10013946             Dyspepsia   

  MedDRA_concept_class_id_1  MedDRA_concept_id_4  MedDRA_concept_code_4  \
0                        PT             35700000               10017947   
1                        PT             36500000               10028395   
2                        PT             36700000               10029205   
3                        PT             35700000               10017947   
4                        PT             35700000               10017947   

                             MedDRA_concept_name_4 MedDRA_concept_class_id_4  
0                       Gastroin

In [116]:
pt_to_hlgt = (joined.
loc[:,['MedDRA_concept_id_1','MedDRA_concept_code_1',
       'MedDRA_concept_name_1','MedDRA_concept_class_id_1',
       'MedDRA_concept_id_3','MedDRA_concept_code_3',
       'MedDRA_concept_name_3','MedDRA_concept_class_id_3']].
query('MedDRA_concept_class_id_3=="HLGT"').
              drop_duplicates()
)
print(pt_to_hlgt.shape)
print(pt_to_hlgt.head())

(34360, 8)
   MedDRA_concept_id_1  MedDRA_concept_code_1 MedDRA_concept_name_1  \
0             35708093               10012735             Diarrhoea   
1             36516812               10003239            Arthralgia   
2             36718132               10019211              Headache   
3             35708208               10047700              Vomiting   
4             35708139               10013946             Dyspepsia   

  MedDRA_concept_class_id_1  MedDRA_concept_id_3  MedDRA_concept_code_3  \
0                        PT             35702117               10017977   
1                        PT             36502199               10023213   
2                        PT             36702243               10019231   
3                        PT             35702118               10018012   
4                        PT             35702118               10018012   

                               MedDRA_concept_name_3 MedDRA_concept_class_id_3  
0  Gastrointestinal motility a

In [117]:
pt_to_hlt = (joined.
loc[:,['MedDRA_concept_id_1','MedDRA_concept_code_1',
       'MedDRA_concept_name_1','MedDRA_concept_class_id_1',
       'MedDRA_concept_id_2','MedDRA_concept_code_2',
       'MedDRA_concept_name_2','MedDRA_concept_class_id_2']].
query('MedDRA_concept_class_id_2=="HLT"').
             drop_duplicates()
)
print(pt_to_hlt.shape)
print(pt_to_hlt.head())

(34004, 8)
   MedDRA_concept_id_1  MedDRA_concept_code_1 MedDRA_concept_name_1  \
0             35708093               10012735             Diarrhoea   
1             36516812               10003239            Arthralgia   
2             36718132               10019211              Headache   
3             35708208               10047700              Vomiting   
4             35708139               10013946             Dyspepsia   

  MedDRA_concept_class_id_1  MedDRA_concept_id_2  MedDRA_concept_code_2  \
0                        PT             35702756               10012736   
1                        PT             36503268               10023226   
2                        PT             36703479               10019233   
3                        PT             35702767               10028817   
4                        PT             35702762               10013949   

              MedDRA_concept_name_2 MedDRA_concept_class_id_2  
0        Diarrhoea (excl infective)            

In [118]:
standard_reactions_pt_to_hlt = \
(standard_reactions.
 loc[:,['safetyreportid','MedDRA_concept_id']].
 drop_duplicates().
 set_index(['MedDRA_concept_id']).
 join(pt_to_hlt.
      loc[:,['MedDRA_concept_id_1','MedDRA_concept_id_2',
           'MedDRA_concept_code_2','MedDRA_concept_name_2',
           'MedDRA_concept_class_id_2']].
      set_index('MedDRA_concept_id_1')
     ).
 reset_index(drop=True).
 rename(
     columns={
         'MedDRA_concept_id_2' : 'MedDRA_concept_id',
         'MedDRA_concept_code_2' : 'MedDRA_concept_code',
         'MedDRA_concept_name_2' : 'MedDRA_concept_name',
         'MedDRA_concept_class_id_2' : 'MedDRA_concept_class_id'
     }
 ).
 dropna().
 drop_duplicates()
)
standard_reactions_pt_to_hlt = (standard_reactions_pt_to_hlt.
                               reindex(np.sort(standard_reactions_pt_to_hlt.columns),axis=1)
                               )
print(standard_reactions_pt_to_hlt.shape)
print(standard_reactions_pt_to_hlt.head())

(72845796, 5)
  MedDRA_concept_class_id  MedDRA_concept_code  MedDRA_concept_id  \
0                     HLT           10012736.0         35702756.0   
1                     HLT           10023226.0         36503268.0   
2                     HLT           10019233.0         36703479.0   
3                     HLT           10028817.0         35702767.0   
4                     HLT           10013949.0         35702762.0   

                MedDRA_concept_name safetyreportid  
0        Diarrhoea (excl infective)       10003300  
1  Joint related signs and symptoms       10003300  
2                     Headaches NEC       10003300  
3      Nausea and vomiting symptoms       10003300  
4      Dyspeptic signs and symptoms       10003301  


In [119]:
print(
    len(
        np.intersect1d(
            all_reports,
            standard_reactions_pt_to_hlt.safetyreportid.astype(str).unique()
                        )
    )/len(all_reports)
)

0.9999439769872404


In [120]:
(standard_reactions_pt_to_hlt.
 to_csv(er_dir+'standard_reactions_meddra_hlt.csv.gz',
        compression='gzip',index=False)
)

In [121]:
standard_reactions_pt_to_hlgt = \
(standard_reactions.
 loc[:,['safetyreportid','MedDRA_concept_id']].
 drop_duplicates().
 set_index(['MedDRA_concept_id']).
 join(pt_to_hlgt.
      loc[:,['MedDRA_concept_id_1','MedDRA_concept_id_3',
           'MedDRA_concept_code_3','MedDRA_concept_name_3',
           'MedDRA_concept_class_id_3']].
      set_index('MedDRA_concept_id_1')
     ).
 reset_index(drop=True).
 rename(
     columns={
         'MedDRA_concept_id_3' : 'MedDRA_concept_id',
         'MedDRA_concept_code_3' : 'MedDRA_concept_code',
         'MedDRA_concept_name_3' : 'MedDRA_concept_name',
         'MedDRA_concept_class_id_3' : 'MedDRA_concept_class_id'
     }
 ).
 dropna().
 drop_duplicates()
)
standard_reactions_pt_to_hlgt = (standard_reactions_pt_to_hlgt.
                               reindex(np.sort(standard_reactions_pt_to_hlgt.columns),axis=1)
                               )
print(standard_reactions_pt_to_hlgt.shape)
print(standard_reactions_pt_to_hlgt.head())

(68132818, 5)
  MedDRA_concept_class_id  MedDRA_concept_code  MedDRA_concept_id  \
0                    HLGT           10017977.0         35702117.0   
1                    HLGT           10023213.0         36502199.0   
2                    HLGT           10019231.0         36702243.0   
3                    HLGT           10018012.0         35702118.0   
4                    HLGT           10018012.0         35702118.0   

                                 MedDRA_concept_name safetyreportid  
0  Gastrointestinal motility and defaecation cond...       10003300  
1                                    Joint disorders       10003300  
2                                          Headaches       10003300  
3                Gastrointestinal signs and symptoms       10003300  
4                Gastrointestinal signs and symptoms       10003301  


In [122]:
print(
    len(
        np.intersect1d(
            all_reports,
            standard_reactions_pt_to_hlgt.safetyreportid.astype(str).unique()
                        )
    )/len(all_reports)
)

0.9999439769872404


In [123]:
(standard_reactions_pt_to_hlgt.
 to_csv(er_dir+'standard_reactions_meddra_hlgt.csv.gz',
        compression='gzip',index=False) 
)

In [124]:
standard_reactions_pt_to_soc = \
(standard_reactions.
 loc[:,['safetyreportid','MedDRA_concept_id']].
 drop_duplicates().
 set_index(['MedDRA_concept_id']).
 join(pt_to_soc.
      loc[:,['MedDRA_concept_id_1','MedDRA_concept_id_4',
           'MedDRA_concept_code_4','MedDRA_concept_name_4',
           'MedDRA_concept_class_id_4']].
      set_index('MedDRA_concept_id_1')
     ).
 reset_index(drop=True).
 rename(
     columns={
         'MedDRA_concept_id_4' : 'MedDRA_concept_id',
         'MedDRA_concept_code_4' : 'MedDRA_concept_code',
         'MedDRA_concept_name_4' : 'MedDRA_concept_name',
         'MedDRA_concept_class_id_4' : 'MedDRA_concept_class_id'
     }
 ).
 dropna().
 drop_duplicates()
)
standard_reactions_pt_to_soc = (standard_reactions_pt_to_soc.
                               reindex(np.sort(standard_reactions_pt_to_soc.columns),axis=1)
                               )
print(standard_reactions_pt_to_soc.shape)
print(standard_reactions_pt_to_soc.head())

(53157000, 5)
  MedDRA_concept_class_id  MedDRA_concept_code  MedDRA_concept_id  \
0                     SOC           10017947.0         35700000.0   
1                     SOC           10028395.0         36500000.0   
2                     SOC           10029205.0         36700000.0   
4                     SOC           10017947.0         35700000.0   
5                     SOC           10038359.0         37000000.0   

                               MedDRA_concept_name safetyreportid  
0                       Gastrointestinal disorders       10003300  
1  Musculoskeletal and connective tissue disorders       10003300  
2                         Nervous system disorders       10003300  
4                       Gastrointestinal disorders       10003301  
5                      Renal and urinary disorders       10003301  


In [125]:
print(
    len(
        np.intersect1d(
            all_reports,
            standard_reactions_pt_to_soc.safetyreportid.astype(str).unique()
                        )
    )/len(all_reports)
)

0.9999439769872404


In [126]:
(standard_reactions_pt_to_soc.
 to_csv(er_dir+'standard_reactions_meddra_soc.csv.gz',
        compression='gzip',index=False)
)

In [127]:
del c
del r
del first_relations
del second_relations
del first_second_third_relations
del all_meddra_relationships
del meddra_concept
del df1
del df2
del joined
del standard_reactions_pt_to_soc
del standard_reactions_pt_to_hlgt
del standard_reactions_pt_to_hlt

### standard_reactions_snomed

In [134]:
standard_reactions_meddra_relationships = (pd.read_csv(
    er_dir+'standard_reactions_meddra_relationships.csv.gz',
    compression='gzip',
    dtype={
    'safetyreportid' : 'str'
    })
    )

print(standard_reactions_meddra_relationships.MedDRA_concept_id_1.nunique())
print(standard_reactions_meddra_relationships.MedDRA_concept_id_2.nunique())
print(standard_reactions_meddra_relationships.MedDRA_concept_id_3.nunique())
print(standard_reactions_meddra_relationships.MedDRA_concept_id_4.nunique())

standard_reactions_meddra_relationships.MedDRA_concept_id_1 = \
standard_reactions_meddra_relationships.MedDRA_concept_id_1.astype(int)

standard_reactions_meddra_relationships.MedDRA_concept_id_2 = \
standard_reactions_meddra_relationships.MedDRA_concept_id_2.astype(int)

standard_reactions_meddra_relationships.MedDRA_concept_id_3 = \
standard_reactions_meddra_relationships.MedDRA_concept_id_3.astype(int)

standard_reactions_meddra_relationships.MedDRA_concept_id_4 = \
standard_reactions_meddra_relationships.MedDRA_concept_id_4.astype(int)

print(standard_reactions_meddra_relationships.shape)
print(standard_reactions_meddra_relationships.head())

22218
1778
347
27
(36540, 19)
  MedDRA_concept_class_id_1 MedDRA_concept_class_id_2  \
0                        PT                       HLT   
1                        PT                       HLT   
2                        PT                       HLT   
3                        PT                       HLT   
4                        PT                       HLT   

  MedDRA_concept_class_id_3 MedDRA_concept_class_id_4  MedDRA_concept_code_1  \
0                      HLGT                       SOC               10012735   
1                      HLGT                       SOC               10003239   
2                      HLGT                       SOC               10019211   
3                      HLGT                       SOC               10047700   
4                      HLGT                       SOC               10013946   

   MedDRA_concept_code_2  MedDRA_concept_code_3  MedDRA_concept_code_4  \
0               10012736               10017977               10017947  

In [135]:
reactions = standard_reactions_meddra_relationships.MedDRA_concept_id_1.unique()
print(len(reactions))
meddra_concept_ids = concept.query('vocabulary_id=="MedDRA"').concept_id.astype(int).unique()
len(meddra_concept_ids)

intersect = np.intersect1d(reactions,meddra_concept_ids)
print(len(intersect))
print(len(intersect)/len(reactions))

22218
22218
1.0


In [136]:
m_to_s_r = (concept_relationship.
            query('relationship_id=="MedDRA - SNOMED eq"').
            loc[:,['concept_id_1','concept_id_2']].
            drop_duplicates().
            set_index('concept_id_2').
            join(concept.
                 query('vocabulary_id=="SNOMED"').
                 loc[:,['concept_id','concept_code','concept_class_id','concept_name']].
                 drop_duplicates().
                 set_index('concept_id')
                ).
            rename_axis('SNOMED_concept_id').
            reset_index().
            rename(columns={
                'concept_id_1' : 'MedDRA_concept_id',
                'concept_name' : 'SNOMED_concept_name',
                'concept_code' : 'SNOMED_concept_code',
                'concept_class_id' : 'SNOMED_concept_class_id'
            })
)
m_to_s_r.MedDRA_concept_id = m_to_s_r.MedDRA_concept_id.astype(int)
m_to_s_r = m_to_s_r.reindex(np.sort(m_to_s_r.columns),axis=1)
print(m_to_s_r.shape)
print(m_to_s_r.SNOMED_concept_class_id.value_counts())
print(m_to_s_r.head())

(5, 5)
SNOMED_concept_class_id
Disorder     4
Procedure    1
Name: count, dtype: int64
   MedDRA_concept_id SNOMED_concept_class_id SNOMED_concept_code  \
0           36919215                Disorder            89415002   
1           36918906                Disorder           191722009   
2           36919216                Disorder            24121004   
3           36316034               Procedure           129112001   
4           36918906                Disorder            35607004   

   SNOMED_concept_id                                SNOMED_concept_name  
0             436669  Hypersomnia disorder related to another mental...  
1             439786                     Agoraphobia with panic attacks  
2             439013  Insomnia disorder related to another mental di...  
3            4043669                              Aspiration of trachea  
4            4147466                    Panic disorder with agoraphobia  


In [137]:
r2s = m_to_s_r.MedDRA_concept_id.unique()

In [138]:
pts = (standard_reactions_meddra_relationships.
       query('MedDRA_concept_class_id_1=="PT"').
       MedDRA_concept_id_1.
       unique())
print(len(np.intersect1d(pts,r2s))/len(pts))
print(len(np.intersect1d(pts,r2s))/len(r2s))

df = (standard_reactions_meddra_relationships.
      query('MedDRA_concept_id_1 in @r2s'))

print(df.shape)

joinedpt = (df.
           set_index('MedDRA_concept_id_1').
           join(m_to_s_r.
                query('MedDRA_concept_id in @pts').
                set_index('MedDRA_concept_id')
               ).
           rename_axis('MedDRA_concept_id_1').
           reset_index().
           rename(columns={
               'SNOMED_concept_id' : 'SNOMED_concept_id_1',
               'SNOMED_concept_code' : 'SNOMED_concept_code_1',
               'SNOMED_concept_name' : 'SNOMED_concept_name_1',
               'SNOMED_concept_class_id' : 'SNOMED_concept_class_id_1',
           }).
           dropna()
          )
joinedpt = joinedpt.reindex(np.sort(joinedpt.columns),axis=1)
print(joinedpt.shape)
print(joinedpt.head())

0.00013502565487442613
0.75
(5, 19)
(6, 23)
  MedDRA_concept_class_id_1 MedDRA_concept_class_id_2  \
0                        PT                       HLT   
1                        PT                       HLT   
2                        PT                       HLT   
3                        PT                       HLT   
4                        PT                       HLT   

  MedDRA_concept_class_id_3 MedDRA_concept_class_id_4  MedDRA_concept_code_1  \
0                      HLGT                       SOC               10052794   
1                      HLGT                       SOC               10052794   
2                      HLGT                       SOC               10022443   
3                      HLGT                       SOC               10022443   
4                      HLGT                       SOC               10020767   

   MedDRA_concept_code_2  MedDRA_concept_code_3  MedDRA_concept_code_4  \
0               10068300               10002861           

In [139]:
hlts = (joinedpt.
       query('MedDRA_concept_class_id_2=="HLT"').
       MedDRA_concept_id_2.
       unique())
print(len(np.intersect1d(hlts,r2s))/len(hlts))
print(len(np.intersect1d(hlts,r2s))/len(r2s))

df = (joinedpt.copy())

print(df.shape)
print(df.head())


joinedhlt = (df.
           set_index('MedDRA_concept_id_2').
           join(m_to_s_r.
                query('MedDRA_concept_id in @hlts').
                set_index('MedDRA_concept_id')
               ).
           rename_axis('MedDRA_concept_id_2').
           reset_index().
           rename(columns={
               'SNOMED_concept_id' : 'SNOMED_concept_id_2',
               'SNOMED_concept_code' : 'SNOMED_concept_code_2',
               'SNOMED_concept_name' : 'SNOMED_concept_name_2',
               'SNOMED_concept_class_id' : 'SNOMED_concept_class_id_2',
           })
          )
joinedhlt = joinedhlt.reindex(np.sort(joinedhlt.columns),axis=1)
print(joinedhlt.shape)
print(joinedhlt.head())

0.0
0.0
(6, 23)
  MedDRA_concept_class_id_1 MedDRA_concept_class_id_2  \
0                        PT                       HLT   
1                        PT                       HLT   
2                        PT                       HLT   
3                        PT                       HLT   
4                        PT                       HLT   

  MedDRA_concept_class_id_3 MedDRA_concept_class_id_4  MedDRA_concept_code_1  \
0                      HLGT                       SOC               10052794   
1                      HLGT                       SOC               10052794   
2                      HLGT                       SOC               10022443   
3                      HLGT                       SOC               10022443   
4                      HLGT                       SOC               10020767   

   MedDRA_concept_code_2  MedDRA_concept_code_3  MedDRA_concept_code_4  \
0               10068300               10002861               10037175   
1           

In [140]:
hlgts = (joinedhlt.
       query('MedDRA_concept_class_id_3=="HLGT"').
       MedDRA_concept_id_3.
       unique())
print(len(np.intersect1d(hlgts,r2s))/len(hlgts))
print(len(np.intersect1d(hlgts,r2s))/len(r2s))

df = (joinedhlt.copy())

print(df.shape)

joinedhlgt = (df.
           set_index('MedDRA_concept_id_3').
           join(m_to_s_r.
                query('MedDRA_concept_id in @hlgts').
                set_index('MedDRA_concept_id')
               ).
           rename_axis('MedDRA_concept_id_3').
           reset_index().
           drop_duplicates().
           rename(columns={
               'SNOMED_concept_id' : 'SNOMED_concept_id_3',
               'SNOMED_concept_code' : 'SNOMED_concept_code_3',
               'SNOMED_concept_name' : 'SNOMED_concept_name_3',
               'SNOMED_concept_class_id' : 'SNOMED_concept_class_id_3',
           })
          )
joinedhlgt = joinedhlgt.reindex(np.sort(joinedhlgt.columns),axis=1)
print(joinedhlgt.shape)
print(joinedhlgt.head())

0.0
0.0
(6, 27)
(6, 31)
  MedDRA_concept_class_id_1 MedDRA_concept_class_id_2  \
0                        PT                       HLT   
1                        PT                       HLT   
2                        PT                       HLT   
3                        PT                       HLT   
4                        PT                       HLT   

  MedDRA_concept_class_id_3 MedDRA_concept_class_id_4  MedDRA_concept_code_1  \
0                      HLGT                       SOC               10052794   
1                      HLGT                       SOC               10052794   
2                      HLGT                       SOC               10022443   
3                      HLGT                       SOC               10022443   
4                      HLGT                       SOC               10020767   

   MedDRA_concept_code_2  MedDRA_concept_code_3  MedDRA_concept_code_4  \
0               10068300               10002861               10037175   
1   

In [141]:
socs = (joinedhlgt.
       query('MedDRA_concept_class_id_4=="SOC"').
       MedDRA_concept_id_4.
       unique())
print(len(np.intersect1d(socs,r2s))/len(socs))
print(len(np.intersect1d(socs,r2s))/len(r2s))

df = (joinedhlgt.copy())

print(df.shape)
print(df.head())
print(m_to_s_r.shape)
print(m_to_s_r.head())

joinedsoc = (df.
           set_index('MedDRA_concept_id_4').
           join(m_to_s_r.
                query('MedDRA_concept_id in @socs').
                set_index('MedDRA_concept_id')
               ).
           rename_axis('MedDRA_concept_id_4').
           reset_index().
           drop_duplicates().
           rename(columns={
               'SNOMED_concept_id' : 'SNOMED_concept_id_4',
               'SNOMED_concept_code' : 'SNOMED_concept_code_4',
               'SNOMED_concept_name' : 'SNOMED_concept_name_4',
               'SNOMED_concept_class_id' : 'SNOMED_concept_class_id_4',
           })
          )
joinedsoc = joinedsoc.reindex(np.sort(joinedsoc.columns),axis=1)
print(joinedsoc.shape)
print(joinedsoc.head())

0.0
0.0
(6, 31)
  MedDRA_concept_class_id_1 MedDRA_concept_class_id_2  \
0                        PT                       HLT   
1                        PT                       HLT   
2                        PT                       HLT   
3                        PT                       HLT   
4                        PT                       HLT   

  MedDRA_concept_class_id_3 MedDRA_concept_class_id_4  MedDRA_concept_code_1  \
0                      HLGT                       SOC               10052794   
1                      HLGT                       SOC               10052794   
2                      HLGT                       SOC               10022443   
3                      HLGT                       SOC               10022443   
4                      HLGT                       SOC               10020767   

   MedDRA_concept_code_2  MedDRA_concept_code_3  MedDRA_concept_code_4  \
0               10068300               10002861               10037175   
1           

In [142]:
smeddraconcepts = joinedpt.MedDRA_concept_id_1.unique()
print(len(smeddraconcepts))
allmeddraconcepts = (standard_reactions_meddra_relationships.
                     query('MedDRA_concept_class_id_1=="PT"').
                     MedDRA_concept_id_1.
                     unique())
print(len(allmeddraconcepts))

print(len(np.intersect1d(smeddraconcepts,allmeddraconcepts))/len(smeddraconcepts))
print(len(np.intersect1d(smeddraconcepts,allmeddraconcepts))/len(allmeddraconcepts))

3
22218
1.0
0.00013502565487442613


In [143]:
smeddraconcepts = joinedhlt.MedDRA_concept_id_2.unique()
print(len(smeddraconcepts))
allmeddraconcepts = (standard_reactions_meddra_relationships.
                     query('MedDRA_concept_class_id_2=="HLT"').
                     MedDRA_concept_id_2.
                     unique())
print(len(allmeddraconcepts))

print(len(np.intersect1d(smeddraconcepts,allmeddraconcepts))/len(smeddraconcepts))
print(len(np.intersect1d(smeddraconcepts,allmeddraconcepts))/len(allmeddraconcepts))

3
1778
1.0
0.001687289088863892


In [144]:
smeddraconcepts = joinedhlgt.MedDRA_concept_id_3.unique()
print(len(smeddraconcepts))
allmeddraconcepts = (standard_reactions_meddra_relationships.
                     query('MedDRA_concept_class_id_3=="HLGT"').
                     MedDRA_concept_id_3.
                     unique())
print(len(allmeddraconcepts))

print(len(np.intersect1d(smeddraconcepts,allmeddraconcepts))/len(smeddraconcepts))
print(len(np.intersect1d(smeddraconcepts,allmeddraconcepts))/len(allmeddraconcepts))

2
347
1.0
0.005763688760806916


In [145]:
smeddraconcepts = joinedsoc.MedDRA_concept_id_4.unique()
print(len(smeddraconcepts))
allmeddraconcepts = (standard_reactions_meddra_relationships.
                     query('MedDRA_concept_class_id_4=="SOC"').
                     MedDRA_concept_id_4.
                     unique())
print(len(allmeddraconcepts))

print(len(np.intersect1d(smeddraconcepts,allmeddraconcepts))/len(smeddraconcepts))
print(len(np.intersect1d(smeddraconcepts,allmeddraconcepts))/len(allmeddraconcepts))

1
27
1.0
0.037037037037037035


In [146]:
print(joinedsoc.head())
print(joinedsoc.shape)
print(joinedsoc[joinedsoc.SNOMED_concept_id_1.notnull()].shape)
print(joinedsoc.SNOMED_concept_id_1.nunique())
print(joinedsoc[joinedsoc.SNOMED_concept_id_2.notnull()].shape)
print(joinedsoc.SNOMED_concept_id_2.nunique())
print(joinedsoc[joinedsoc.SNOMED_concept_id_3.notnull()].shape)
print(joinedsoc.SNOMED_concept_id_3.nunique())
print(joinedsoc[joinedsoc.SNOMED_concept_id_4.notnull()].shape)
print(joinedsoc.SNOMED_concept_id_4.nunique())

  MedDRA_concept_class_id_1 MedDRA_concept_class_id_2  \
0                        PT                       HLT   
1                        PT                       HLT   
2                        PT                       HLT   
3                        PT                       HLT   
4                        PT                       HLT   

  MedDRA_concept_class_id_3 MedDRA_concept_class_id_4  MedDRA_concept_code_1  \
0                      HLGT                       SOC               10052794   
1                      HLGT                       SOC               10052794   
2                      HLGT                       SOC               10022443   
3                      HLGT                       SOC               10022443   
4                      HLGT                       SOC               10020767   

   MedDRA_concept_code_2  MedDRA_concept_code_3  MedDRA_concept_code_4  \
0               10068300               10002861               10037175   
1               10068300    

In [147]:
joinedsoc.SNOMED_concept_code_1 = joinedsoc.SNOMED_concept_code_1.astype(int)
joinedsoc.SNOMED_concept_code_2 = joinedsoc.SNOMED_concept_code_2.astype(float)
joinedsoc.SNOMED_concept_code_3 = joinedsoc.SNOMED_concept_code_3.astype(float)
joinedsoc.SNOMED_concept_code_4 = joinedsoc.SNOMED_concept_code_4.astype(float)

In [148]:
standard_reactions = (pd.
                      read_csv(er_dir+'standard_reactions.csv.gz',
                               compression="gzip",
                              dtype={
                                  'safetyreportid' : 'str'
                              })
                     )
all_reports = (standard_reactions.safetyreportid.unique())
print(standard_reactions.shape)
print(standard_reactions.head())

(55164660, 6)
  MedDRA_concept_class_id  MedDRA_concept_code  MedDRA_concept_id  \
0                      PT             10012735           35708093   
1                      PT             10003239           36516812   
2                      PT             10019211           36718132   
3                      PT             10047700           35708208   
4                      PT             10013946           35708139   

  MedDRA_concept_name reaction_outcome safetyreportid  
0           Diarrhoea              NaN       10003300  
1          Arthralgia              NaN       10003300  
2            Headache              NaN       10003300  
3            Vomiting              NaN       10003300  
4           Dyspepsia              NaN       10003301  


In [150]:
standard_reactions_meddrapt_to_snomed = \
(joinedsoc.
 loc[:,['MedDRA_concept_id_1','SNOMED_concept_id_1',
   'SNOMED_concept_code_1','SNOMED_concept_name_1',
   'SNOMED_concept_class_id_1']].
 drop_duplicates().
 rename(
     columns={
         'SNOMED_concept_id_1' : 'SNOMED_concept_id',
         'SNOMED_concept_code_1' : 'SNOMED_concept_code',
         'SNOMED_concept_name_1' : 'SNOMED_concept_name',
         'SNOMED_concept_class_id_1' : 'SNOMED_concept_class_id'
     }
 ).
 set_index('MedDRA_concept_id_1').
 join(standard_reactions.
      drop_duplicates().
      set_index('MedDRA_concept_id')
     ).
 reset_index(drop=True).
 drop(['MedDRA_concept_code','MedDRA_concept_name',
      'MedDRA_concept_class_id'],axis=1).
 dropna()
)
standard_reactions_meddrapt_to_snomed = \
(standard_reactions_meddrapt_to_snomed.
 reindex(np.sort(standard_reactions_meddrapt_to_snomed.columns),
         axis=1))
print(standard_reactions_meddrapt_to_snomed.shape)
print(standard_reactions_meddrapt_to_snomed.head())

(71, 6)
  SNOMED_concept_class_id  SNOMED_concept_code  SNOMED_concept_id  \
0                Disorder            191722009             439786   
1                Disorder            191722009             439786   
2                Disorder            191722009             439786   
3                Disorder            191722009             439786   
4                Disorder            191722009             439786   

              SNOMED_concept_name      reaction_outcome safetyreportid  
0  Agoraphobia with panic attacks               Unknown       10184359  
1  Agoraphobia with panic attacks               Unknown       10192659  
2  Agoraphobia with panic attacks  Recovering/resolving       10564783  
3  Agoraphobia with panic attacks  Recovering/resolving       10569927  
4  Agoraphobia with panic attacks  Recovering/resolving       10575917  


In [151]:
print(
    len(
        np.intersect1d(
            all_reports,
            standard_reactions_meddrapt_to_snomed.safetyreportid.astype(str).unique()
        )
    )/len(all_reports)
)

2.2942757606351154e-06


In [152]:
(standard_reactions_meddrapt_to_snomed.
 to_csv(er_dir+'standard_reactions_snomed.csv.gz',
        compression='gzip',index=False)
)

In [153]:
standard_reactions_meddrahlt_to_snomed = \
(joinedsoc.
 query('MedDRA_concept_class_id_2=="HLT"').
 loc[:,['MedDRA_concept_id_1','SNOMED_concept_id_2',
   'SNOMED_concept_code_2','SNOMED_concept_name_2',
   'SNOMED_concept_class_id_2']].
 drop_duplicates().
 rename(
     columns={
         'SNOMED_concept_id_2' : 'SNOMED_concept_id',
         'SNOMED_concept_code_2' : 'SNOMED_concept_code',
         'SNOMED_concept_name_2' : 'SNOMED_concept_name',
         'SNOMED_concept_class_id_2' : 'SNOMED_concept_class_id'
     }
 ).
 set_index('MedDRA_concept_id_1').
 join(standard_reactions.
      drop_duplicates().
      set_index('MedDRA_concept_id')
     ).
 rename_axis('MedDRA_concept_id').
 reset_index().
 dropna(subset=['MedDRA_concept_id','SNOMED_concept_id','safetyreportid'])
)
standard_reactions_meddrahlt_to_snomed = \
(standard_reactions_meddrahlt_to_snomed.
 reindex(np.sort(standard_reactions_meddrahlt_to_snomed.columns),
         axis=1))
print(standard_reactions_meddrahlt_to_snomed.shape)
print(standard_reactions_meddrahlt_to_snomed.head())

(0, 10)
Empty DataFrame
Columns: [MedDRA_concept_class_id, MedDRA_concept_code, MedDRA_concept_id, MedDRA_concept_name, SNOMED_concept_class_id, SNOMED_concept_code, SNOMED_concept_id, SNOMED_concept_name, reaction_outcome, safetyreportid]
Index: []


In [154]:
print(
    len(
        np.intersect1d(
            all_reports,
            standard_reactions_meddrahlt_to_snomed.safetyreportid.astype(str).unique()
        )
    )/len(all_reports)
)

0.0


In [155]:
joinedsoc

Unnamed: 0,MedDRA_concept_class_id_1,MedDRA_concept_class_id_2,MedDRA_concept_class_id_3,MedDRA_concept_class_id_4,MedDRA_concept_code_1,MedDRA_concept_code_2,MedDRA_concept_code_3,MedDRA_concept_code_4,MedDRA_concept_id_1,MedDRA_concept_id_2,...,SNOMED_concept_id_2,SNOMED_concept_id_3,SNOMED_concept_id_4,SNOMED_concept_name_1,SNOMED_concept_name_2,SNOMED_concept_name_3,SNOMED_concept_name_4,relationship_id_12,relationship_id_23,relationship_id_34
0,PT,HLT,HLGT,SOC,10052794,10068300,10002861,10037175,36918906,36903587,...,,,,Agoraphobia with panic attacks,,,,Is a,Is a,Is a
1,PT,HLT,HLGT,SOC,10052794,10068300,10002861,10037175,36918906,36903587,...,,,,Panic disorder with agoraphobia,,,,Is a,Is a,Is a
2,PT,HLT,HLGT,SOC,10022443,10040993,10040991,10037175,36919216,36903650,...,,,,Insomnia disorder related to another mental di...,,,,Is a,Is a,Is a
3,PT,HLT,HLGT,SOC,10022443,10040994,10040991,10037175,36919216,36903652,...,,,,Insomnia disorder related to another mental di...,,,,Is a,Is a,Is a
4,PT,HLT,HLGT,SOC,10020767,10040993,10040991,10037175,36919215,36903650,...,,,,Hypersomnia disorder related to another mental...,,,,Is a,Is a,Is a
5,PT,HLT,HLGT,SOC,10020767,10040994,10040991,10037175,36919215,36903652,...,,,,Hypersomnia disorder related to another mental...,,,,Is a,Is a,Is a


In [156]:
standard_reactions_meddrahlgt_to_snomed = \
(joinedsoc.
 query('MedDRA_concept_class_id_2=="HLGT"').
 loc[:,['MedDRA_concept_id_1','SNOMED_concept_id_3',
   'SNOMED_concept_code_3','SNOMED_concept_name_3',
   'SNOMED_concept_class_id_2']].
 drop_duplicates().
 rename(
     columns={
         'SNOMED_concept_id_3' : 'SNOMED_concept_id',
         'SNOMED_concept_code_3' : 'SNOMED_concept_code',
         'SNOMED_concept_name_3' : 'SNOMED_concept_name',
         'SNOMED_concept_class_id_3' : 'SNOMED_concept_class_id'
     }
 ).
 set_index('MedDRA_concept_id_1').
 join(standard_reactions.
      drop_duplicates().
      set_index('MedDRA_concept_id')
     ).
 rename_axis('MedDRA_concept_id').
 reset_index().
 dropna(subset=['MedDRA_concept_id','SNOMED_concept_id','safetyreportid'])
)
standard_reactions_meddrahlgt_to_snomed = \
(standard_reactions_meddrahlgt_to_snomed.
 reindex(np.sort(standard_reactions_meddrahlgt_to_snomed.columns),
         axis=1))
print(standard_reactions_meddrahlgt_to_snomed.shape)
print(standard_reactions_meddrahlgt_to_snomed.head())

(0, 10)
Empty DataFrame
Columns: [MedDRA_concept_class_id, MedDRA_concept_code, MedDRA_concept_id, MedDRA_concept_name, SNOMED_concept_class_id_2, SNOMED_concept_code, SNOMED_concept_id, SNOMED_concept_name, reaction_outcome, safetyreportid]
Index: []


In [157]:
print(
    len(
        np.intersect1d(
            all_reports,
            standard_reactions_meddrahlgt_to_snomed.safetyreportid.astype(str).unique()
        )
    )/len(all_reports)
)

0.0


In [158]:
standard_reactions_meddrasoc_to_snomed = \
(joinedsoc.
 query('MedDRA_concept_class_id_4=="SOC"').
 loc[:,['MedDRA_concept_id_1','SNOMED_concept_id_4',
   'SNOMED_concept_code_4','SNOMED_concept_name_4',
   'SNOMED_concept_class_id_4']].
 drop_duplicates().
 rename(
     columns={
         'SNOMED_concept_id_4' : 'SNOMED_concept_id',
         'SNOMED_concept_code_4' : 'SNOMED_concept_code',
         'SNOMED_concept_name_4' : 'SNOMED_concept_name',
         'SNOMED_concept_class_id_4' : 'SNOMED_concept_class_id'
     }
 ).
 set_index('MedDRA_concept_id_1').
 join(standard_reactions.
      drop_duplicates().
      set_index('MedDRA_concept_id')
     ).
 rename_axis('MedDRA_concept_id').
 reset_index().
 dropna(subset=['MedDRA_concept_id','SNOMED_concept_id','safetyreportid'])
)
standard_reactions_meddrasoc_to_snomed = \
(standard_reactions_meddrasoc_to_snomed.
 reindex(np.sort(standard_reactions_meddrasoc_to_snomed.columns),
         axis=1))
print(standard_reactions_meddrasoc_to_snomed.shape)
print(standard_reactions_meddrasoc_to_snomed.head())

(0, 10)
Empty DataFrame
Columns: [MedDRA_concept_class_id, MedDRA_concept_code, MedDRA_concept_id, MedDRA_concept_name, SNOMED_concept_class_id, SNOMED_concept_code, SNOMED_concept_id, SNOMED_concept_name, reaction_outcome, safetyreportid]
Index: []


In [159]:
print(
    len(
        np.intersect1d(
            all_reports,
            standard_reactions_meddrasoc_to_snomed.safetyreportid.astype(str).unique()
        )
    )/len(all_reports)
)

0.0


In [160]:
del m_to_s_r
del df
del joinedpt
del joinedhlt
del joinedhlgt
del joinedsoc
del all_reports
del standard_reactions
del standard_reactions_meddrapt_to_snomed
del standard_reactions_meddrahlt_to_snomed
del standard_reactions_meddrahlgt_to_snomed
del standard_reactions_meddra_relationships