# Dataset Cleaning for Drug-Aftereffect Prediction

In [3]:
from pathlib import Path

import pandas as pd

## Data Processing

In [4]:
# Load the two raw inputs: the monopharmacy side-effect table (comma-separated)
# and the drug identifier mapping table (tab-separated).
snap_df = pd.read_csv("Dataset/ChSe-Decagon_monopharmacy.csv", sep=",")
mappings_df = pd.read_table("Dataset/drug-mappings.tsv")

In [5]:
# Show the column labels of both raw tables, side-effect table first.
for frame in (snap_df, mappings_df):
    print(frame.columns)

Index(['# STITCH', 'Individual Side Effect', 'Side Effect Name'], dtype='object')
Index(['drugbankId', 'name', 'ttd_id', 'pubchem_cid', 'cas_num', 'chembl_id',
       'zinc_id', 'chebi_id', 'kegg_cid', 'kegg_id', 'bindingDB_id',
       'UMLS_cuis', 'stitch_id'],
      dtype='object')


In [6]:
# Display the distinct values of the "# STITCH" column of the side-effect table.
snap_df["# STITCH"].unique()

array(['CID003062316', 'CID000003117', 'CID000003114', 'CID000003730',
       'CID000003736', 'CID000003734', 'CID000002646', 'CID000028112',
       'CID004183806', 'CID000002462', 'CID000005381', 'CID000125889',
       'CID000005245', 'CID000004034', 'CID000003937', 'CID005362070',
       'CID000003121', 'CID000003382', 'CID000003381', 'CID000003386',
       'CID000003384', 'CID000003385', 'CID000002656', 'CID000002650',
       'CID000003706', 'CID000003702', 'CID000005978', 'CID000002471',
       'CID000002477', 'CID000002476', 'CID000002474', 'CID000002478',
       'CID000005394', 'CID000005391', 'CID000041693', 'CID000057537',
       'CID000004739', 'CID000004736', 'CID000004730', 'CID000004044',
       'CID000004046', 'CID005311297', 'CID000000815', 'CID000003394',
       'CID000003397', 'CID000003393', 'CID000002662', 'CID000002666',
       'CID000003249', 'CID000003715', 'CID000004943', 'CID000004946',
       'CID000005496', 'CID000060852', 'CID000004547', 'CID000004543',
      

In [7]:
# For each unique STITCH ID in snap_df, look up the corresponding drugbankId and
# name in mappings_df. IDs with a match are collected in `results` as
# (stitch, drugbankId, name) tuples; IDs without a match go to `not_found`.

stitches = snap_df["# STITCH"].unique()

# Build the lookup once instead of re-filtering mappings_df for every ID.
# keep="first" mirrors "take the first matching row"; rows with a missing
# stitch_id can never equal a STITCH ID, so they are dropped up front.
first_mapping = (
    mappings_df.dropna(subset=["stitch_id"])
    .drop_duplicates(subset="stitch_id", keep="first")
)
stitch_to_drug = dict(
    zip(
        first_mapping["stitch_id"],
        zip(first_mapping["drugbankId"], first_mapping["name"]),
    )
)

results = []
not_found = []

for stitch in stitches:
    if stitch in stitch_to_drug:
        drugbankId, name = stitch_to_drug[stitch]
        results.append((stitch, drugbankId, name))
    else:
        not_found.append(stitch)

In [8]:
# Report how many STITCH IDs were mapped and how many were not, in that order.
for bucket in (results, not_found):
    print(len(bucket))

321
318


In [24]:
# Inspect the STITCH IDs for which no row of mappings_df matched.
not_found

['CID000003734',
 'CID000002646',
 'CID000028112',
 'CID004183806',
 'CID000002462',
 'CID000125889',
 'CID000003937',
 'CID005362070',
 'CID000003382',
 'CID000003381',
 'CID000003384',
 'CID000002656',
 'CID000002650',
 'CID000003706',
 'CID000002476',
 'CID000057537',
 'CID000004739',
 'CID000004736',
 'CID000004730',
 'CID000004046',
 'CID005311297',
 'CID000000815',
 'CID000002666',
 'CID000003249',
 'CID000005496',
 'CID000004547',
 'CID000004542',
 'CID000005234',
 'CID000004727',
 'CID003468412',
 'CID005281007',
 'CID000004053',
 'CID000004052',
 'CID004659568',
 'CID000005544',
 'CID000002675',
 'CID000002676',
 'CID000003279',
 'CID000005005',
 'CID000005482',
 'CID000005486',
 'CID000060754',
 'CID000004536',
 'CID000170361',
 'CID000003877',
 'CID000000838',
 'CID000000298',
 'CID005282044',
 'CID000060696',
 'CID000002609',
 'CID000012536',
 'CID000005038',
 'CID000005039',
 'CID000003510',
 'CID000024486',
 'CID000001134',
 'CID000000191',
 'CID000000772',
 'CID000003075

In [25]:
# Load the headerless drug-name table, so columns are addressed by position.
# Later cells compare column 0 against STITCH IDs and read column 1 as names.
dn = pd.read_csv("Dataset/drug_names.csv", header=None)

In [26]:
# First column of drug_names.csv as a plain Python list.
dnl = dn[0].tolist()

In [27]:
# Unmapped STITCH IDs that are also absent from the first column of drug_names.csv.
still_nf = [stitch_id for stitch_id in not_found if stitch_id not in dnl]

In [28]:
# Wrap the unmapped STITCH IDs in a one-column frame.
# NOTE(review): mydf is not referenced by any later cell shown here — confirm it is still needed.
mydf = pd.DataFrame(not_found, columns=["stitch"])

In [31]:
# Every STITCH ID from snap_df that does not occur in the first column of drug_names.csv.
test_list = [stitch_id for stitch_id in stitches if stitch_id not in dnl]

In [33]:
# Reverse check: print each ID listed in drug_names.csv that is not among the
# STITCH IDs of snap_df.
for drug_id in dnl:
    if drug_id in stitches:
        continue
    print(drug_id)

CID000002673
CID000003562
CID000003928
CID000004723
CID000005052
CID000123620


**All STITCH IDs in the SNAP dataset are present in `drug_names.csv` and have corresponding names.** The six IDs printed above appear only in `drug_names.csv`.

Next, check whether all of these names also exist in the `drug-mappings.tsv` file.

In [34]:
# Second column of drug_names.csv (the drug names) as a plain Python list.
all_names = dn[1].tolist()

In [35]:
# Distinct values of the "name" column in the mapping table.
mapNames = pd.unique(mappings_df["name"])

In [36]:
# Names from drug_names.csv that do not appear verbatim in the mapping table.
missing = [drug_name for drug_name in all_names if drug_name not in mapNames]

In [38]:
# Inspect the drug names read from column 1 of drug_names.csv.
all_names

['carnitine',
 'GABA',
 'leucovorin',
 'PGE2',
 'prostacyclin',
 'adenosine',
 'galactose',
 'prostaglandin E1',
 'calcium',
 'chloramphenicol',
 'bupropion',
 'estradiol',
 'mannitol',
 'epsilon-aminocaproic acid',
 'N-acetylcysteine',
 'cytosine arabinoside',
 'mesna',
 'heparin',
 'kanamycin',
 'epinephrine',
 'thyroxine',
 'triiodothyronine',
 'nicotinic acid',
 'nicotine',
 'pyrazinamide',
 'quinidine',
 'vitamin A',
 'sulfate',
 'thymidine',
 'methamphetamine',
 'naproxen',
 '2-chlorodeoxyadenosine',
 'doxorubicin',
 'phenytoin',
 'abacavir',
 'amphotericin B',
 'acebutolol',
 'acetaminophen',
 'acetazolamide',
 'actinomycin D',
 'acyclovir',
 'salbutamol',
 'alendronate',
 'alfuzosin',
 'alosetron',
 'alprazolam',
 'amantadine',
 'amifostine',
 'amikacin',
 'theophylline',
 'amiodarone',
 'amitriptyline',
 'amlodipine',
 'amoxapine',
 'amoxicillin',
 'ampicillin',
 'amprenavir',
 'anagrelide',
 'anastrozole',
 'apomorphine',
 'argatroban',
 'aspirin',
 'atenolol',
 'atorvastatin

In [40]:
# Load the headerless, tab-separated raw drug-similarity table.
sim = pd.read_table("Dataset/drug-similarities-raw.txt", header=None)

In [41]:
# Distinct entries of the first similarity column.
sim_names = set(sim[0])

In [44]:
# Tabulate the (stitch, drugbankId, name) tuples collected in `results`.
# NOTE(review): the recorded outputs disagree — len(results) was printed as 321,
# yet found_df.shape is later shown as (290, 3). Presumably `results` changed in
# a cell that is not part of this notebook view; confirm with Restart & Run All.
found_df = pd.DataFrame(results, columns=["STITCH #", "DrugID", "Name"])

In [45]:
# Names of the mapped drugs as a plain Python list (duplicates retained).
found_names = found_df["Name"].tolist()

In [46]:
# Read the headerless, tab-separated raw drug-similarity table and collect the
# distinct entries of its first column.
sim = pd.read_table("Dataset/drug-similarities-raw.txt", header=None)
sim_names = set(sim[0])

In [50]:
# Print the (rows, columns) size of the mapped-drug table, then preview up to
# its first 15 rows.
print(found_df.shape)
found_df.head(15)

(290, 3)


Unnamed: 0,STITCH #,DrugID,Name
0,CID003062316,DB01254,Dasatinib
1,CID000003117,DB00822,Disulfiram
2,CID000003114,DB00280,Disopyramide
3,CID000003730,DB01362,Iohexol
4,CID000005381,DB00799,Tazarotene
5,CID000004034,DB00737,Meclizine
6,CID000003386,DB00472,Fluoxetine
7,CID000003385,DB00544,Fluorouracil
8,CID000003702,DB00808,Indapamide
9,CID000005978,DB00541,Vincristine


In [51]:
# Distinct drug names that were successfully mapped to a STITCH ID.
found_names = found_df["Name"].unique()

# Keep only the similarity rows whose two drug-name columns (0 and 1) are both
# among the mapped names.
mask = sim[0].isin(found_names) & sim[1].isin(found_names)
snap_similarities = sim.loc[mask].reset_index(drop=True)

# Name -> STITCH ID lookup built from found_df (a repeated name keeps the
# STITCH ID of its last row).
name_to_stitch = dict(zip(found_df["Name"], found_df["STITCH #"]))

# Swap the drug names in both columns for their STITCH IDs.
for column in (0, 1):
    snap_similarities[column] = snap_similarities[column].map(name_to_stitch)

In [52]:
# Preview the filtered similarity table after columns 0 and 1 were mapped to STITCH IDs.
snap_similarities

Unnamed: 0,0,1,2
0,CID000071158,CID000001978,0.507569
1,CID000071158,CID000001983,0.517090
2,CID000071158,CID000001986,0.447357
3,CID000071158,CID000060164,0.299784
4,CID000071158,CID000051263,0.409945
...,...,...,...
41900,CID000005719,CID000005734,0.592338
41901,CID000005719,CID000005735,0.729570
41902,CID000005732,CID000005734,0.637753
41903,CID000005732,CID000005735,0.837077


## Final Datasets:

In [54]:
# Drug-Aftereffect Relation Dataset
snap_df.to_csv('Dataset/Cleaned_Datasets/snap_dse.csv', index=False)

# Drug-Drug Similarity Dataset
snap_similarities.to_csv('Dataset/Cleaned_Datasets/snap_similarities.csv', index=False)