# Extracting Kidney Failure DDIs from **TWOSIDES**

This code downloads the TWOSIDES database from TDCommons and counts the number of occurrences of each type of drug–drug interaction (DDI). There are 1301 different types of interactions in this dataset, including anemia, dizziness, headache, etc. Kidney failure was among the most frequent interaction types, with 18892 interactions, and is an interesting DDI effect. This code selects the DDIs that lead to kidney failure, represented by index 756.

Since this data only shows interacting drug pairs, we synthetically created a database consisting of all possible drug combinations between the 589 unique drugs. Any drug pairs present in the kidney failure DDI dataset were labeled as interacting ("1"), and all other combinations were assumed to be non-interacting ("0").

In [None]:
# Installs

%%capture
!pip install rdkit==2022.3.4
!pip install deepchem==2.5.0
!pip install PyTDC # TDC dataset

In [None]:
# Imports

import numpy as np
import pandas as pd

In [None]:
# Download the TWOSIDES database from TDCommons
# (the captured output below shows a ~677 MB download on first run)
from tdc.multi_pred import DDI
data = DDI(name = 'TWOSIDES')
# NOTE(review): `split` is not read by any cell visible in this notebook;
# kept in case later work relies on it.
split = data.get_split()
# Full table as a dataframe; per the head() output below its columns are
# Drug1_ID, Drug1, Drug2_ID, Drug2 and Y, where Drug1/Drug2 look like SMILES
# strings and Y is an integer side-effect label.
df = data.get_data() # this gives a dataframe

# Look at shape and head of the dataframe
print("Shape of dataframe : ", df.shape)
# Last expression of the cell, so the notebook renders the first rows
df.head()

Downloading...
100%|██████████| 677M/677M [00:35<00:00, 19.3MiB/s]
Loading...
Done!


Shape of dataframe :  (4649441, 5)


Unnamed: 0,Drug1_ID,Drug1,Drug2_ID,Drug2,Y
0,CID000002173,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CID000003345,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,1024
1,CID000002173,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CID000003345,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,767
2,CID000002173,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CID000003345,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,79
3,CID000002173,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CID000003345,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,25
4,CID000002173,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...,CID000003345,CCC(=O)N(C1CCN(CC1)CCC2=CC=CC=C2)C3=CC=CC=C3,85


In [None]:
# Tally how often each side-effect label occurs in column 'Y'
value_counts = df['Y'].value_counts()

# Turn the tally into a two-column table: the label ('Value') and its
# frequency ('Count'), with a fresh 0..n-1 index
value_counts_df = value_counts.rename_axis('Value').reset_index(name='Count')

# Most frequent labels first
sorted_value_counts = value_counts_df.sort_values('Count', ascending=False)

# Render the 30 most frequent labels
sorted_value_counts.head(30)

Unnamed: 0,Value,Count
0,464,28568
1,37,27006
2,258,26037
3,600,25190
4,692,24430
5,315,24260
6,648,23894
7,238,23848
8,74,23515
9,926,23043


In [None]:
# Lookup table from the integer label in column 'Y' to the side-effect name
from tdc.utils import get_label_map
labels = get_label_map(
    name='TWOSIDES',
    task='DDI',
    name_column='Side Effect Name',
)

In [None]:
# Let's see what sort of side effects we can choose from

# Walk the 30 most frequent label ids by *position* in the sorted table.
# Indexing the column with [i] would look rows up by index label instead,
# which only matches the sorted order if sorting left the index untouched,
# and would raise KeyError if fewer than 30 labels exist.
for effect_id in sorted_value_counts['Value'].head(30):
    print(labels[effect_id])

arterial pressure NOS decreased
anaemia
Difficulty breathing
nausea
neumonia
Fatigue
Pain
diarrhea
asthenia
emesis
edema extremities
body temperature increased
pleural pain
abdominal pain
Hypoventilation
chest pain
dizziness
Back Ache
Head ache
High blood pressure
confusion
dehydration
Anxiety
kidney failure
loss of weight
hyperglycaemia
edema
Anorexia
Aching joints
acute kidney failure


In [None]:
# 756 = Kidney failure
KIDNEY_FAILURE_LABEL = 756

# Keep only the rows whose side-effect label is kidney failure
kf_df = df[df.Y == KIDNEY_FAILURE_LABEL]

# Concatenating Drug IDs and Drug smiles from both Drug1 and Drug2 columns
drug_info_1 = kf_df[['Drug1_ID', 'Drug1']].rename(columns={'Drug1_ID': 'ID', 'Drug1': 'Smiles'})
drug_info_2 = kf_df[['Drug2_ID', 'Drug2']].rename(columns={'Drug2_ID': 'ID', 'Drug2': 'Smiles'})

# Combining drug_info_1 and drug_info_2 DataFrames
combined_drug_info = pd.concat([drug_info_1, drug_info_2])

# One row per drug ID (first SMILES seen wins). De-duplicating on 'ID' rather
# than on the whole (ID, Smiles) row guarantees IDs are unique even if the
# same drug ever appears with two different SMILES strings; the pair
# enumeration that follows treats ID as a unique key.
unique_drug_info = combined_drug_info.drop_duplicates(subset='ID').reset_index(drop=True)
print(len(unique_drug_info))

589


In [None]:
# This is where we add negative data
import itertools

# Build a drug ID -> SMILES lookup once. Duplicated IDs are collapsed to their
# first SMILES, so every ID is a unique key and keeps its first-seen order
# (dicts preserve insertion order).
drug_table = unique_drug_info.drop_duplicates(subset='ID')
smiles_by_id = dict(zip(drug_table['ID'], drug_table['Smiles']))

# Create all possible drug combinations.
# itertools.combinations over unique IDs already yields every unordered pair
# exactly once: no drug is paired with itself and no flipped duplicate can
# occur, so no extra filtering pass is needed.
all_drug_combinations = list(itertools.combinations(smiles_by_id, 2))

# Kept under its original name for any cell that reads it.
filtered_combinations = all_drug_combinations

# Create a DataFrame with the structure of the original DataFrame, with every
# pair initially labelled non-interacting ('Y' = 0). Building all rows in one
# go avoids growing the frame one .loc assignment at a time.
new_df = pd.DataFrame(
    [
        (drug1, smiles_by_id[drug1], drug2, smiles_by_id[drug2], 0)
        for drug1, drug2 in filtered_combinations
    ],
    columns=['Drug1_ID', 'Drug1', 'Drug2_ID', 'Drug2', 'Y'],
)

In [None]:
# Mark every candidate pair that is a known kidney-failure interaction.

# Collect the known interacting pairs once. A frozenset ignores order, so
# (A, B) and its flipped version (B, A) map to the same key.
interacting_pairs = {
    frozenset(pair) for pair in zip(kf_df['Drug1_ID'], kf_df['Drug2_ID'])
}

# One constant-time set lookup per candidate row, instead of scanning all of
# kf_df twice for every row.
is_interacting = pd.Series(
    [
        frozenset(pair) in interacting_pairs
        for pair in zip(new_df['Drug1_ID'], new_df['Drug2_ID'])
    ],
    index=new_df.index,
    dtype=bool,
)

# Only matching rows are touched; all other rows keep their current 'Y'.
new_df.loc[is_interacting, 'Y'] = 1

# Display the updated new_df with 'Y' column values modified
print(new_df)

            Drug1_ID                                              Drug1  \
0       CID000005090   CS(=O)(=O)C1=CC=C(C=C1)C2=C(C(=O)OC2)C3=CC=CC=C3   
1       CID000005090   CS(=O)(=O)C1=CC=C(C=C1)C2=C(C(=O)OC2)C3=CC=CC=C3   
2       CID000005090   CS(=O)(=O)C1=CC=C(C=C1)C2=C(C(=O)OC2)C3=CC=CC=C3   
3       CID000005090   CS(=O)(=O)C1=CC=C(C=C1)C2=C(C(=O)OC2)C3=CC=CC=C3   
4       CID000005090   CS(=O)(=O)C1=CC=C(C=C1)C2=C(C(=O)OC2)C3=CC=CC=C3   
...              ...                                                ...   
173161  CID004479097  CC1=CC2=C(C=C1C)N(C=N2)C3C(C(C(O3)CO)OP(=O)([O...   
173162  CID004479097  CC1=CC2=C(C=C1C)N(C=N2)C3C(C(C(O3)CO)OP(=O)([O...   
173163  CID000002019  CC1C(C(=O)NC(C(=O)N2CCCC2C(=O)N(CC(=O)N(C(C(=O...   
173164  CID000002019  CC1C(C(=O)NC(C(=O)N2CCCC2C(=O)N(CC(=O)N(C(C(=O...   
173165  CID009571074  CC1=C(SC=N1)C=CC2=C(N3C(C(C3=O)NC(=O)C(=NOC)C4...   

            Drug2_ID                                              Drug2  Y  
0       CID000004946  

In [None]:
# Summary counts for the labelled pair table
interacting_mask = new_df['Y'] == 1
non_interacting_mask = new_df['Y'] == 0

print(int(interacting_mask.sum())) # Interacting
print(new_df.shape[0]) # Total
print(int(non_interacting_mask.sum())) # Non-Interacting

18892
173166
154274


In [None]:
# Connect notebook with Google Drive
# (Colab-only: google.colab is not available outside a Colab runtime)
from google.colab import drive
# Mount under the relative path 'drive'; the export cell copies the CSV into
# "drive/My Drive/" beneath this mount point.
drive.mount('drive')

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


In [None]:
# Create and copy .csv to my drive
# Write the labelled pair table to the runtime's working directory.
# NOTE(review): to_csv is called with default arguments, so the row index is
# presumably written as an unnamed first column — confirm downstream readers
# expect that.
new_df.to_csv("kf_ddi_df.csv")
# Shell escape: copy the file into the mounted Google Drive folder
!cp kf_ddi_df.csv "drive/My Drive/"