### Merge Data

#### Merge Pediatrics data with Drug and Reaction
- Pediatrics data from 3_Pediatrics_data_merge.ipynb file
- Raw Drug file from drugcharacteristics.csv.gz (Drug)
- Reaction file from reactions.csv.gz (Meddra reaction PT)

In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Raw Data

In [55]:
ped_report_data = pd.read_csv('../../data/pediatric_patients_report_serious_reporter.csv.gz',compression='gzip', index_col=0)

  ped_report_data = pd.read_csv('../../data/pediatric_patients_report_serious_reporter.csv.gz',compression='gzip', index_col=0)


In [56]:
drug_characteristics_df = pd.read_csv('../../data/openFDA_drug_event/er_tables_memory_efficient/drugcharacteristics.csv.gz',compression='gzip',index_col=0)

  drug_characteristics_df = pd.read_csv('../../data/openFDA_drug_event/er_tables_memory_efficient/drugcharacteristics.csv.gz',compression='gzip',index_col=0)


In [57]:
reaction_df = pd.read_csv("../../data/openFDA_drug_event/er_tables_memory_efficient/reactions.csv.gz", compression='gzip', low_memory=False, index_col=0)

##### Merge Data

In [58]:
ped_data = ped_report_data.copy()

In [59]:
# ped_report_data.isna().sum()
ped_report_data.shape

(859343, 21)

In [60]:
ped_data_merge = ped_report_data\
    .merge(drug_characteristics_df, on='safetyreportid', how='inner')\
    .merge(reaction_df, on='safetyreportid', how='inner')

##### Pediatrics Data Merge

In [61]:
ped_data_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754671 entries, 0 to 1754670
Data columns (total 25 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   patient_custom_master_age  float64
 1   patient_onsetage           float64
 2   patient_onsetageunit       object 
 3   patient_sex                object 
 4   patient_weight             float64
 5   safetyreportid             object 
 6   nichd                      object 
 7   ich_ema                    object 
 8   fda                        object 
 9   lastupdate_date            int64  
 10  mostrecent_receive_date    int64  
 11  receive_date               int64  
 12  congenital_anomali         float64
 13  death                      float64
 14  disabling                  float64
 15  life_threatening           float64
 16  other                      float64
 17  serious                    object 
 18  reporter_company           object 
 19  reporter_country           object 
 20  re

##### Missing Data

In [62]:
missing_data_merge = ped_data_merge.isna().sum().sort_values(ascending=False)
missing_data_merge

congenital_anomali           1718001
disabling                    1654300
reaction_outcome             1653173
life_threatening             1537713
death                        1429485
patient_weight                906664
drug_indication               873247
other                         866181
fda                           699833
ich_ema                       479374
reporter_country              292501
reporter_qualification        288592
reporter_company               99597
patient_sex                    37526
medicinal_product                 34
patient_custom_master_age          0
patient_onsetage                   0
safetyreportid                     0
nichd                              0
patient_onsetageunit               0
receive_date                       0
mostrecent_receive_date            0
lastupdate_date                    0
serious                            0
drug_characterization              0
dtype: int64

#### Age Overview

In [63]:
# check  age unit
ped_data_merge['patient_onsetageunit'].value_counts(ascending=False)

patient_onsetageunit
Year      1498240
Month      157629
Day         78514
Week        13428
Hour         5810
Decade       1050
Name: count, dtype: int64

In [64]:
#Recheck Age after filter
ped_data_merge.loc[(ped_data_merge['patient_onsetageunit'] == 'Month'),['patient_custom_master_age','patient_onsetage','patient_onsetageunit']].sort_values(by='patient_onsetage',ascending=False)

Unnamed: 0,patient_custom_master_age,patient_onsetage,patient_onsetageunit
1224610,20.916667,251.0,Month
1162528,20.916667,251.0,Month
1162529,20.916667,251.0,Month
1162526,20.916667,251.0,Month
1162527,20.916667,251.0,Month
...,...,...,...
723418,0.083333,1.0,Month
723419,0.083333,1.0,Month
723420,0.083333,1.0,Month
723421,0.083333,1.0,Month


In [65]:
# print(type(ped_data_merge))

In [66]:
# print(ped_data_merge)

In [67]:
# Add new column with 'Year' as the value for all rows
ped_data_merge['patient_custom_master_age_unit'] = 'Year'

# Reorder columns to place the new column after 'patient_custom_master_age'
cols = list(ped_data_merge.columns)
cols.insert(cols.index('patient_custom_master_age') + 1, cols.pop(cols.index('patient_custom_master_age_unit')))
ped_data_merge = ped_data_merge[cols]

In [68]:
ped_data_merge.isna().sum().sort_values(ascending=False)

congenital_anomali                1718001
disabling                         1654300
reaction_outcome                  1653173
life_threatening                  1537713
death                             1429485
patient_weight                     906664
drug_indication                    873247
other                              866181
fda                                699833
ich_ema                            479374
reporter_country                   292501
reporter_qualification             288592
reporter_company                    99597
patient_sex                         37526
medicinal_product                      34
patient_custom_master_age_unit          0
patient_custom_master_age               0
patient_onsetage                        0
safetyreportid                          0
nichd                                   0
patient_onsetageunit                    0
receive_date                            0
mostrecent_receive_date                 0
lastupdate_date                   

In [69]:
ped_data_merge.shape

(1754671, 26)

#### Data Cleaning

#### Fill NA and Unknow

In [70]:
#fill NA in columns congenital_anomali,disabling,life_threatening,death,other
cols = ['congenital_anomali','disabling','life_threatening','death','other']
for i in cols:
    print(ped_data_merge[i].unique())

existing_cols = [col for col in cols if col in ped_data_merge.columns]

for col in existing_cols:
    ped_data_merge[col] = ped_data_merge[col].fillna(0).astype(int)
    
# After filling na
print('')
print('After filling NA : \n')
for i in cols:
    print(ped_data_merge[i].unique())

[ 2.  1. nan]
[ 2.  1. nan]
[ 2.  1. nan]
[ 2.  1. nan]
[ 2.  1. nan]

After filling NA : 

[2 1 0]
[2 1 0]
[2 1 0]
[2 1 0]
[2 1 0]


In [71]:
# fill unknown value in columns about patient
categorical_cols_to_fill_unknown = ['patient_sex']
for col in categorical_cols_to_fill_unknown:
    ped_data_merge[col] = ped_data_merge[col].fillna('Unknown')

In [72]:
# fill unknown value in columns about reporter
categorical_cols_to_fill_unknown = ['reporter_company', 'reporter_qualification', 'reporter_country']
for col in categorical_cols_to_fill_unknown:
    ped_data_merge[col] = ped_data_merge[col].fillna('Unknown')

In [73]:
# fill unknown value in columns about reaction
categorical_cols_to_fill_unknown = ['reaction_outcome']
for col in categorical_cols_to_fill_unknown:
    ped_data_merge[col] = ped_data_merge[col].fillna('Unknown')

In [74]:
ped_data_merge.isna().sum().sort_values(ascending=False)

patient_weight                    906664
drug_indication                   873247
fda                               699833
ich_ema                           479374
medicinal_product                     34
patient_custom_master_age              0
patient_sex                            0
patient_onsetageunit                   0
patient_custom_master_age_unit         0
patient_onsetage                       0
safetyreportid                         0
nichd                                  0
lastupdate_date                        0
mostrecent_receive_date                0
death                                  0
disabling                              0
receive_date                           0
congenital_anomali                     0
other                                  0
life_threatening                       0
serious                                0
reporter_company                       0
reporter_qualification                 0
reporter_country                       0
drug_characteriz

In [75]:
ped_data_merge.shape

(1754671, 26)

#### Droup Data

In [76]:
# Droup medicinal_product because have missing 34 rows
ped_data_merge = ped_data_merge.dropna(subset=['medicinal_product'])

In [77]:
# ลบคอลัมน์ 'patient_weight'
ped_data_merge = ped_data_merge.drop(columns=['patient_weight'])

In [78]:
ped_data_merge =ped_data_merge.dropna(subset=['drug_indication'])

In [79]:
ped_data_merge.isna().sum().sort_values(ascending=False)

fda                               351537
ich_ema                           241452
patient_custom_master_age              0
patient_custom_master_age_unit         0
patient_onsetage                       0
patient_sex                            0
patient_onsetageunit                   0
nichd                                  0
safetyreportid                         0
lastupdate_date                        0
mostrecent_receive_date                0
receive_date                           0
congenital_anomali                     0
death                                  0
disabling                              0
life_threatening                       0
other                                  0
serious                                0
reporter_company                       0
reporter_country                       0
reporter_qualification                 0
drug_characterization                  0
drug_indication                        0
medicinal_product                      0
reaction_outcome

In [80]:
ped_data_merge.shape

(881424, 25)

#### Save Files

In [81]:
ped_data_merge.to_csv('../../data/pediatric_patients_report_drug_reaction.csv.gz',compression='gzip')