In [1]:
import glob
import numpy as np
import pandas as pd
import pickle

data_dir = "../../data/openFDA_drug_event/"
er_dir = data_dir+'er_tables_memory_efficient/'

In [2]:
primarykey='safetyreportid'

In [3]:
patients = pd.read_csv(er_dir+'patient.csv.gz',
                       compression='gzip',
                       index_col=0,dtype={
                           'safetyreportid' : 'str',
                           'patient_custom_master_age' : 'float'
                       })

In [4]:
age_col='patient_onsetage'
aged = patients[patients[age_col].notnull()].reset_index(drop=True).copy()

In [5]:
col = 'nichd'

neonate = aged[age_col].apply(lambda x : float(x)>0 and float(x)<=(1/12))
infant = aged[age_col].apply(lambda x : float(x)>(1/12) and float(x)<=1)
toddler = aged[age_col].apply(lambda x : float(x)>1 and float(x)<=2)
echildhood = aged[age_col].apply(lambda x : float(x)>2 and float(x)<=5)
mchildhood = aged[age_col].apply(lambda x : float(x)>5 and float(x)<=11)
eadolescence = aged[age_col].apply(lambda x : float(x)>11 and float(x)<=18)
ladolescence = aged[age_col].apply(lambda x : float(x)>18 and float(x)<=21)

aged[col] = np.nan

aged.loc[neonate,col] = 'term_neonatal'
aged.loc[infant,col] = 'infancy'
aged.loc[toddler,col] = 'toddler'
aged.loc[echildhood,col] = 'early_childhood'
aged.loc[mchildhood,col] = 'middle_childhood'
aged.loc[eadolescence,col] = 'early_adolescence'
aged.loc[ladolescence,col] = 'late_adolescence'

  aged.loc[neonate,col] = 'term_neonatal'


In [6]:
col = 'ich_ema'

term_newborn_infants = (aged[age_col].
                        apply(lambda x : float(x)>0 and float(x)<=(1/12)))
infants_and_toddlers = (aged[age_col].
                       apply(lambda x : float(x)>(1/12) and float(x)<=2))
children = aged[age_col].apply(lambda x : float(x)>2 and float(x)<=11)
adolescents = aged[age_col].apply(lambda x : float(x)>11 and float(x)<=17)

aged[col] = np.nan

aged.loc[term_newborn_infants,col] = 'term_newborn_infants'
aged.loc[infants_and_toddlers,col] = 'infants_and_toddlers'
aged.loc[children,col] = 'children'
aged.loc[adolescents,col] = 'adolescents'

  aged.loc[term_newborn_infants,col] = 'term_newborn_infants'


In [7]:
col = 'fda'

neonates = (aged[age_col].
                        apply(lambda x : float(x)>0 and float(x)<(1/12)))
infants = (aged[age_col].
                       apply(lambda x : float(x)>=(1/12) and float(x)<2))
children = aged[age_col].apply(lambda x : float(x)>=2 and float(x)<11)
adolescents = aged[age_col].apply(lambda x : float(x)>=11 and float(x)<16)

aged[col] = np.nan

aged.loc[neonates,col] = 'neonates'
aged.loc[infants,col] = 'infants'
aged.loc[children,col] = 'children'
aged.loc[adolescents,col] = 'adolescents'

  aged.loc[neonates,col] = 'neonates'


In [8]:
pediatric_patients = (aged.
                      dropna(subset=['nichd']).
                      reset_index(drop=True))
print(pediatric_patients.shape)
print(pediatric_patients.head())

(923409, 8)
   patient_onsetage patient_onsetageunit patient_sex  patient_weight  \
0              10.0                 Year        Male            28.0   
1              19.0                 Year      Female             NaN   
2              18.0                 Year      Female             NaN   
3              10.0                 Year        Male             NaN   
4               4.0                 Year        Male             NaN   

  safetyreportid              nichd   ich_ema       fda  
0       10003357   middle_childhood  children  children  
1       10003388   late_adolescence       NaN       NaN  
2       10003401  early_adolescence       NaN       NaN  
3       10003430   middle_childhood  children  children  
4       10003517    early_childhood  children  children  


In [9]:
del patients
del aged

In [10]:
pediatric_patients.head()

Unnamed: 0,patient_onsetage,patient_onsetageunit,patient_sex,patient_weight,safetyreportid,nichd,ich_ema,fda
0,10.0,Year,Male,28.0,10003357,middle_childhood,children,children
1,19.0,Year,Female,,10003388,late_adolescence,,
2,18.0,Year,Female,,10003401,early_adolescence,,
3,10.0,Year,Male,,10003430,middle_childhood,children,children
4,4.0,Year,Male,,10003517,early_childhood,children,children


In [11]:
report = (pd.read_csv(er_dir+'report.csv.gz',
                      compression='gzip',
                     dtype={
                         'safetyreportid' : 'str'
                     }))
report.head()

Unnamed: 0,safetyreportid,lastupdate_date,mostrecent_receive_date,receive_date
0,10003300,20141002,20140306,20140306
1,10003301,20141002,20140228,20140228
2,10003302,20141002,20140312,20140312
3,10003304,20141212,20140424,20140312
4,10003305,20141002,20140312,20140312


In [12]:
df1 = pediatric_patients.copy()
ped_reports = df1.safetyreportid.unique()
df2 = report.copy()
print(df1.shape)
print(df2.shape)
df1[primarykey] = df1[primarykey].astype(str)
df2[primarykey] = df2[primarykey].astype(str)
pediatric_patients_report = \
pd.merge(df1,
         df2,
         on=primarykey,
         how='inner').query('safetyreportid in @ped_reports')
print(pediatric_patients_report.shape)

(923409, 8)
(19026493, 4)
(923409, 11)


In [13]:
del pediatric_patients
del report

In [14]:
report_serious = pd.read_csv(er_dir+'report_serious.csv.gz',compression='gzip')
report_serious.head()

  report_serious = pd.read_csv(er_dir+'report_serious.csv.gz',compression='gzip')


Unnamed: 0,congenital_anomali,death,disabling,life_threatening,other,safetyreportid,serious
0,,,1.0,,,10003300,"The adverse event resulted in death, a life th..."
1,,,,,1.0,10003301,"The adverse event resulted in death, a life th..."
2,,,,,,10003302,The adverse event did not result in any of the...
3,,,,,,10003304,The adverse event did not result in any of the...
4,,,,,,10003305,The adverse event did not result in any of the...


In [15]:
df1 = pediatric_patients_report.copy()
df2 = report_serious.copy()
print(df1.shape)
print(df2.shape)
df1[primarykey] = df1[primarykey].astype(str)
df2[primarykey] = df2[primarykey].astype(str)
pediatric_patients_report_serious = \
pd.merge(df1,
         df2,
         on=primarykey,
         how='inner')
print(pediatric_patients_report_serious.shape)

(923409, 11)
(19026493, 7)
(923409, 17)


In [16]:
pediatric_patients_report_serious.head()

Unnamed: 0,patient_onsetage,patient_onsetageunit,patient_sex,patient_weight,safetyreportid,nichd,ich_ema,fda,lastupdate_date,mostrecent_receive_date,receive_date,congenital_anomali,death,disabling,life_threatening,other,serious
0,10.0,Year,Male,28.0,10003357,middle_childhood,children,children,20141002,20140312,20140312,,,,,1.0,"The adverse event resulted in death, a life th..."
1,19.0,Year,Female,,10003388,late_adolescence,,,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...
2,18.0,Year,Female,,10003401,early_adolescence,,,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...
3,10.0,Year,Male,,10003430,middle_childhood,children,children,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...
4,4.0,Year,Male,,10003517,early_childhood,children,children,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...


In [17]:
del report_serious
del pediatric_patients_report

In [18]:
reporter = pd.read_csv(er_dir+'reporter.csv.gz',compression='gzip')
reporter.head()

  reporter = pd.read_csv(er_dir+'reporter.csv.gz',compression='gzip')


Unnamed: 0,reporter_company,reporter_country,reporter_qualification,safetyreportid
0,1289378,US,Consumer or non-health professional,10003300
1,US-JNJFOC-20130719067,US,Consumer or non-health professional,10003301
2,US-PFIZER INC-2014068976,US,Consumer or non-health professional,10003302
3,US-PFIZER INC-2014063856,US,Physician,10003304
4,US-PFIZER INC-2014069067,US,Physician,10003305


In [19]:
df1 = pediatric_patients_report_serious.copy()
df2 = reporter.copy()
print(df1.shape)
print(df2.shape)
df1[primarykey] = df1[primarykey].astype(str)
df2[primarykey] = df2[primarykey].astype(str)
pediatric_patients_report_serious_reporter = \
pd.merge(df1,
         df2,
         on=primarykey,
         how='inner')
print(pediatric_patients_report_serious_reporter.shape)

(923409, 17)
(19026493, 4)
(923409, 20)


In [20]:
pediatric_patients_report_serious_reporter.head()

Unnamed: 0,patient_onsetage,patient_onsetageunit,patient_sex,patient_weight,safetyreportid,nichd,ich_ema,fda,lastupdate_date,mostrecent_receive_date,receive_date,congenital_anomali,death,disabling,life_threatening,other,serious,reporter_company,reporter_country,reporter_qualification
0,10.0,Year,Male,28.0,10003357,middle_childhood,children,children,20141002,20140312,20140312,,,,,1.0,"The adverse event resulted in death, a life th...",US-ACTAVIS-2014-04163,US,Other health professional
1,19.0,Year,Female,,10003388,late_adolescence,,,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...,US-GILEAD-2012-0061242,US,Consumer or non-health professional
2,18.0,Year,Female,,10003401,early_adolescence,,,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...,US-GILEAD-2012-0063166,US,Consumer or non-health professional
3,10.0,Year,Male,,10003430,middle_childhood,children,children,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...,US-GILEAD-2012-0061944,US,Consumer or non-health professional
4,4.0,Year,Male,,10003517,early_childhood,children,children,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...,US-GILEAD-2012-0059314,US,Physician


In [21]:
pediatric_patients_report_serious_reporter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 923409 entries, 0 to 923408
Data columns (total 20 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   patient_onsetage         923409 non-null  float64
 1   patient_onsetageunit     922017 non-null  object 
 2   patient_sex              891024 non-null  object 
 3   patient_weight           253303 non-null  float64
 4   safetyreportid           923409 non-null  object 
 5   nichd                    923409 non-null  object 
 6   ich_ema                  690228 non-null  object 
 7   fda                      572253 non-null  object 
 8   lastupdate_date          923409 non-null  int64  
 9   mostrecent_receive_date  923409 non-null  int64  
 10  receive_date             923409 non-null  int64  
 11  congenital_anomali       261342 non-null  float64
 12  death                    301449 non-null  float64
 13  disabling                267255 non-null  float64
 14  life

In [22]:
del reporter

In [23]:
del pediatric_patients_report_serious

In [24]:
(pediatric_patients_report_serious_reporter.
 to_csv('../../data/pediatric_patients_report_serious_reporter.csv.gz',
       compression='gzip')
)

In [25]:
ped_reports = pediatric_patients_report_serious_reporter.safetyreportid.astype(str).unique()
len(ped_reports)

923409

In [44]:
pediatric_patients_report_serious_reporter = \
(pd.
 read_csv('../../data/pediatric_patients_report_serious_reporter.csv.gz',
       compression='gzip',
         index_col=0)
)
pediatric_patients_report_serious_reporter.head()

  read_csv('../../data/pediatric_patients_report_serious_reporter.csv.gz',


Unnamed: 0,patient_onsetage,patient_onsetageunit,patient_sex,patient_weight,safetyreportid,nichd,ich_ema,fda,lastupdate_date,mostrecent_receive_date,receive_date,congenital_anomali,death,disabling,life_threatening,other,serious,reporter_company,reporter_country,reporter_qualification
0,10.0,Year,Male,28.0,10003357,middle_childhood,children,children,20141002,20140312,20140312,,,,,1.0,"The adverse event resulted in death, a life th...",US-ACTAVIS-2014-04163,US,Other health professional
1,19.0,Year,Female,,10003388,late_adolescence,,,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...,US-GILEAD-2012-0061242,US,Consumer or non-health professional
2,18.0,Year,Female,,10003401,early_adolescence,,,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...,US-GILEAD-2012-0063166,US,Consumer or non-health professional
3,10.0,Year,Male,,10003430,middle_childhood,children,children,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...,US-GILEAD-2012-0061944,US,Consumer or non-health professional
4,4.0,Year,Male,,10003517,early_childhood,children,children,20151125,20150812,20140312,,,,,,The adverse event did not result in any of the...,US-GILEAD-2012-0059314,US,Physician


In [34]:
# pediatric_standard_drugs_atc = (pd.
#                             read_csv('../../data/openFDA_drug_event/er_tables/standard_drugs_atc.csv.gz',
#                                      compression='gzip',
#                                     dtype={
#                                         'safetyreportid' : 'str'
#                                     }).
#                             query('safetyreportid in @ped_reports')
#                            )
# pediatric_standard_drugs_atc.safetyreportid = pediatric_standard_drugs_atc.safetyreportid.astype(str) 
# pediatric_standard_drugs_atc.ATC_concept_id = pediatric_standard_drugs_atc.ATC_concept_id.astype(int)
# pediatric_standard_drugs_atc.head()

chunk_size = 1000000
chunks=[]
ped_report = set(ped_reports)

for chunk in pd.read_csv("../../data/openFDA_drug_event/er_tables_memory_efficient/standard_drugs_atc.csv.gz", compression='gzip', dtype={'safetyreportid': 'str'}, chunksize=chunk_size):
    chunk_filtered = chunk[chunk['safetyreportid'].isin(ped_reports)]
    chunks.append(chunk_filtered)
pediatric_standard_drugs_atc = pd.concat(chunks)


In [27]:
print(pediatric_standard_drugs_atc.head())
print(pediatric_standard_drugs_atc.shape)


     safetyreportid    rxcui  entry  RxNorm_concept_id  \
677        20243495  1927285      0            1592762   
678        20243495  1927290      0            1592767   
679        20243495   310994      0           19078524   
680        20243495   213361      0             937369   
1308       20192482   248310      0           19061283   

                               RxNorm_concept_name RxNorm_concept_class_id  \
677               infliximab-abda 100 MG Injection           Clinical Drug   
678   infliximab-abda 100 MG Injection [Renflexis]            Branded Drug   
679                    infliximab 100 MG Injection           Clinical Drug   
680         infliximab 100 MG Injection [Remicade]            Branded Drug   
1308         ethinyl estradiol 0.01 MG Oral Tablet           Clinical Drug   

     ATC_concept_name ATC_concept_code ATC_concept_class_id  
677               NaN              NaN                  NaN  
678               NaN              NaN                  Na

In [35]:
pediatric_standard_reactions = (pd.
                  read_csv(er_dir+'standard_reactions.csv.gz',
                           compression='gzip')
                      ).query('safetyreportid in @ped_reports')
pediatric_standard_reactions.safetyreportid = pediatric_standard_reactions.safetyreportid.astype(str) 
pediatric_standard_reactions.MedDRA_concept_id = pediatric_standard_reactions.MedDRA_concept_id.astype(int)
pediatric_standard_reactions.head()

  read_csv(er_dir+'standard_reactions.csv.gz',


Unnamed: 0,MedDRA_concept_class_id,MedDRA_concept_code,MedDRA_concept_id,MedDRA_concept_name,reaction_outcome,safetyreportid
41943045,PT,10024641,36110162,Listeriosis,Unknown,25137408
41943203,PT,10049119,36919046,Emotional Distress,Recovering/resolving,25137467
41943204,PT,10042464,36919236,Suicide Attempt,Recovering/resolving,25137467
41943228,PT,10039906,36776613,Seizure,Unknown,25137477
41943229,PT,10022523,36211492,Intentional Overdose,Unknown,25137477


In [36]:
print(pediatric_patients_report_serious_reporter.head())
print(pediatric_standard_drugs_atc.head())
print(pediatric_standard_reactions.head())

   patient_onsetage patient_onsetageunit patient_sex  patient_weight  \
0              10.0                 Year        Male            28.0   
1              19.0                 Year      Female             NaN   
2              18.0                 Year      Female             NaN   
3              10.0                 Year        Male             NaN   
4               4.0                 Year        Male             NaN   

  safetyreportid              nichd   ich_ema       fda  lastupdate_date  \
0       10003357   middle_childhood  children  children         20141002   
1       10003388   late_adolescence       NaN       NaN         20151125   
2       10003401  early_adolescence       NaN       NaN         20151125   
3       10003430   middle_childhood  children  children         20151125   
4       10003517    early_childhood  children  children         20151125   

   mostrecent_receive_date  receive_date  congenital_anomali  death  \
0                 20140312      2014031

In [37]:
len(np.intersect1d(
    pediatric_standard_drugs_atc.safetyreportid.astype(str).unique(),
    pediatric_standard_reactions.safetyreportid.astype(str).unique()
))

0

In [39]:
pediatric_standard_drugs_atc

Unnamed: 0,ATC_concept_class_id,ATC_concept_code,ATC_concept_id,ATC_concept_name,safetyreportid
48,ATC 5th,G03FA01,21602568,norethisterone and estrogen; systemic,10023755
49,ATC 5th,G03AA05,21602478,norethisterone and ethinylestradiol; systemic,10023755
50,ATC 5th,G03FB05,21602590,norethisterone and estrogen; systemic,10023755
51,ATC 5th,G03AB04,21602492,norethisterone and ethinylestradiol; systemic,10023755
70,ATC 5th,D10AE51,21602303,"benzoyl peroxide, combinations; topical",10041934
90,ATC 5th,G03AA07,21602480,levonorgestrel and ethinylestradiol; oral,10166086
91,ATC 5th,G03FB09,21602594,levonorgestrel and estrogen; systemic,10166086
92,ATC 5th,G03AB03,21602491,levonorgestrel and ethinylestradiol; systemic,10166086
93,ATC 5th,G03FA11,21602578,levonorgestrel and estrogen; systemic,10166086
101,ATC 5th,G03FA01,21602568,norethisterone and estrogen; systemic,10367372


In [40]:
pediatric_patients_report_serious_reporter_drugs_reactions = \
(pediatric_patients_report_serious_reporter.
 set_index('safetyreportid').
 join(pediatric_standard_drugs_atc.
      set_index('safetyreportid')
     ).
 dropna(subset=['ATC_concept_id']).
 join(pediatric_standard_reactions.
     set_index('safetyreportid')
     ).
 dropna(subset=['MedDRA_concept_id']).
 reset_index()
)
pediatric_patients_report_serious_reporter_drugs_reactions = \
(pediatric_patients_report_serious_reporter_drugs_reactions.
 reindex(np.sort(pediatric_patients_report_serious_reporter_drugs_reactions.columns),axis=1))

pediatric_patients_report_serious_reporter_drugs_reactions.ATC_concept_id = \
pediatric_patients_report_serious_reporter_drugs_reactions.ATC_concept_id.astype(int).copy()

pediatric_patients_report_serious_reporter_drugs_reactions.MedDRA_concept_code = \
pediatric_patients_report_serious_reporter_drugs_reactions.MedDRA_concept_code.astype(int).copy()

pediatric_patients_report_serious_reporter_drugs_reactions.MedDRA_concept_id = \
pediatric_patients_report_serious_reporter_drugs_reactions.MedDRA_concept_id.astype(int).copy()

print(pediatric_patients_report_serious_reporter_drugs_reactions.shape)
print(pediatric_patients_report_serious_reporter_drugs_reactions.head())
print(pediatric_patients_report_serious_reporter_drugs_reactions.safetyreportid.nunique())

(0, 29)
Empty DataFrame
Columns: [ATC_concept_class_id, ATC_concept_code, ATC_concept_id, ATC_concept_name, MedDRA_concept_class_id, MedDRA_concept_code, MedDRA_concept_id, MedDRA_concept_name, congenital_anomali, death, disabling, fda, ich_ema, lastupdate_date, life_threatening, mostrecent_receive_date, nichd, other, patient_onsetage, patient_onsetageunit, patient_sex, patient_weight, reaction_outcome, receive_date, reporter_company, reporter_country, reporter_qualification, safetyreportid, serious]
Index: []

[0 rows x 29 columns]
0


In [41]:
(pediatric_patients_report_serious_reporter_drugs_reactions.
 to_csv('../../data/pediatric_patients_report_serious_reporter_drugs_reactions.csv.gz',
       compression='gzip')
)

In [42]:
del pediatric_patients_report_serious_reporter

In [43]:
pediatric_standard_drugs = (pd.
                            read_csv('../../data/openFDA_drug_event/er_tables_memory_efficient/standard_drugs.csv.gz',
                                     compression='gzip',
                                    dtype={
                                        'safetyreportid' : 'str'
                                    }).
                            query('safetyreportid in @ped_reports')
                           )
pediatric_standard_drugs.safetyreportid = pediatric_standard_drugs.safetyreportid.astype(str) 
pediatric_standard_drugs.RxNorm_concept_id = pediatric_standard_drugs.RxNorm_concept_id.astype(int)
pediatric_standard_drugs.head()

Unnamed: 0,RxNorm_concept_class_id,RxNorm_concept_code,RxNorm_concept_id,RxNorm_concept_name,safetyreportid
1567,Clinical Drug,311373,1107883,loratadine 10 MG Disintegrating Oral Tablet,10003615
1568,Branded Drug,744830,19127569,loratadine 10 MG Disintegrating Oral Tablet [C...,10003615
1569,Clinical Drug,672558,19125354,loratadine 5 MG Disintegrating Oral Tablet,10003615
1570,Branded Drug,904026,40174950,loratadine 5 MG Disintegrating Oral Tablet [Cl...,10003615
1760,Quant Clinical Drug,2467148,37003644,4 ML amisulpride 2.5 MG/ML Injection,10003649


In [44]:
import os
rxfiles = os.listdir('../../RxNorm_relationships_tables/')
rxfile_dict={}
for rxfile in rxfiles:
    key=rxfile.split('.')[0]
    rxfile_dict[key] = pd.read_csv('../../RxNorm_relationships_tables/'+rxfile,engine='c',index_col=0)

In [45]:
tobrand=[]
for rxfile in rxfile_dict.keys():
    tobrand.append(rxfile_dict[rxfile].query('concept_class_id_2=="Brand Name"'))

In [46]:
a = pediatric_standard_drugs.copy()
print(a[primarykey].nunique())
m = \
(pd.merge(
    a,
    pd.concat(tobrand),
    left_on='RxNorm_concept_id',
    right_on='concept_id_1'
)
)
m[primarykey].nunique()

222


157

In [47]:
m_renamed = \
(m.
 loc[:,
     [primarykey,'concept_class_id_2','concept_code_2','concept_name_2','concept_id_2']
    ].
 rename(columns={
     'concept_class_id_2' : 'RxNorm_concept_class_id',
     'concept_code_2' : 'RxNorm_concept_code',
     'concept_name_2' : 'RxNorm_concept_name',
     'concept_id_2' : 'RxNorm_concept_id'})
)

In [48]:
(m_renamed.
 to_csv('../../data/pediatric_patients_report_drug_brands.csv.gz',
       compression='gzip')
)