In [1]:
import pandas as pd
import yaml

In [18]:
# Load the YAML file
with open('config.yaml', 'r') as file:
    config = yaml.safe_load(file)

In [19]:
dti_pd = pd.read_parquet(config["paths"]["DTI_DATASET"])

In [7]:
# Convert
dti_pd['rxcui'] = pd.to_numeric(dti_pd['rxcui'], errors='coerce').astype('int64')

# Verify the conversion
print("\nAfter conversion:")
print(dti_pd['rxcui'].dtype)
print(dti_pd.info())


After conversion:
int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34820 entries, 0 to 34819
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   drug_chembl_id     34820 non-null  object
 1   target_uniprot_id  34820 non-null  object
 2   label              34820 non-null  int64 
 3   smiles             34820 non-null  object
 4   sequence           34820 non-null  object
 5   molfile_3d         34763 non-null  object
 6   rxcui              34820 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 1.9+ MB
None


In [20]:
# Check null counts per column
print("Null counts before dropping:")
print(dti_pd.isnull().sum())

# Drop nulls
dti_pd = dti_pd.dropna()

Null counts before dropping:
drug_chembl_id        0
target_uniprot_id     0
label                 0
smiles                0
sequence              0
molfile_3d           57
rxcui                 0
dtype: int64


In [21]:
print(f"\nShape after dropping nulls: {dti_pd.shape}")
print("\nNull counts after dropping:")
print(dti_pd.info())


Shape after dropping nulls: (34763, 7)

Null counts after dropping:
<class 'pandas.core.frame.DataFrame'>
Index: 34763 entries, 0 to 34819
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   drug_chembl_id     34763 non-null  object
 1   target_uniprot_id  34763 non-null  object
 2   label              34763 non-null  int64 
 3   smiles             34763 non-null  object
 4   sequence           34763 non-null  object
 5   molfile_3d         34763 non-null  object
 6   rxcui              34763 non-null  object
dtypes: int64(1), object(6)
memory usage: 2.1+ MB
None


In [22]:
# load meddra data
adr_pd = pd.read_parquet(config["paths"]["ADR_DATASET"])
adr_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1985510 entries, 0 to 1985509
Data columns (total 3 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   rxnorm_product_id  object
 1   meddra_id          int64 
 2   meddra_name        object
dtypes: int64(1), object(2)
memory usage: 45.4+ MB


In [23]:
common_rxnorm_ids = dti_pd['rxcui'].unique()

# Create subset of adr_pd
adr_subset = adr_pd[adr_pd['rxnorm_product_id'].isin(common_rxnorm_ids)].copy()

print(f"Original adr_pd shape: {adr_pd.shape}")
print(f"Subset adr_pd shape: {adr_subset.shape}")
print(f"Number of unique rxnorm_ids in common: {len(common_rxnorm_ids)}")

Original adr_pd shape: (1985510, 3)
Subset adr_pd shape: (6733, 3)
Number of unique rxnorm_ids in common: 1031


In [24]:
adr_subset.to_parquet("../Data/final_rxnorm_meddra_v2.parquet", index=False)