In [1]:
import csv
import vetee

In [2]:
# Paths (relative to this notebook) of the raw tab-separated train and test
# files; both are read with csv.DictReader(delimiter='\t') further down.
raw_train = "../rawdata/drugsComTrain_raw.tsv"
raw_test = "../rawdata/drugsComTest_raw.tsv"

In [3]:
# First determine the names of all of the diseases and/or conditions and of
# all drugs, keeping track of how often each one appears across both files.
conditions = dict()   # condition text -> number of rows with that condition
drug_names = dict()   # drug name -> number of rows with that drug

for f in (raw_train, raw_test):
    # newline="" hands line-ending handling to the csv module, which is what
    # the csv documentation asks for so that quoted fields containing
    # embedded newlines are parsed correctly.
    with open(f, "r", newline="") as fin:
        fcsv = csv.DictReader(fin, delimiter='\t')
        for row in fcsv:
            cond = row["condition"]
            # condition values containing a "</span>" tag are not counted
            # (presumably scraping artifacts rather than real conditions)
            if "</span>" not in cond:
                conditions[cond] = conditions.get(cond, 0) + 1
            drug = row["drugName"]
            drug_names[drug] = drug_names.get(drug, 0) + 1

# Write the frequency tables to text files: name left-aligned and padded to
# 70 characters, followed by the count.
# NOTE(review): "conditons.txt" is misspelled; the name is kept as-is in case
# other tooling already reads this file.
with open("conditons.txt", "w") as out:
    for condition, freq in conditions.items():
        out.write(f"{condition:<70} {freq}\n")

with open("drugs.txt", "w") as out:
    for drug, freq in drug_names.items():
        out.write(f"{drug:<70} {freq}\n")

In [4]:
# Tally the drug names that occur exactly once and those that occur more
# than ten times in the combined data.
frequencies = list(drug_names.values())
ones = sum(1 for count in frequencies if count == 1)
gt10 = sum(1 for count in frequencies if count > 10)
print(f"Single-appearance drugs: {ones}")
print(f"Drugs where occurrence > 10: {gt10}")

Single-appearance drugs: 798
Drugs where occurrence > 10: 1499


In [5]:
# Look every drug name up through vetee to determine which drugs are in the
# pubchem database.
#   drugs:        name -> (frequency, vetee Structure) for successful lookups
#   failed_names: name -> frequency for names that raised StructureError
drugs = {}
failed_names = {}
for name, count in drug_names.items():
    print(name)
    try:
        structure = vetee.structure.Structure("name", name)
    except vetee.structure.StructureError:
        failed_names[name] = count
    else:
        drugs[name] = (count, structure)
print(failed_names)

Valsartan
Guanfacine
Lybrel
Ortho Evra
Buprenorphine / naloxone
Cialis
Levonorgestrel
Aripiprazole
Keppra
Ethinyl estradiol / levonorgestrel
Topiramate
L-methylfolate
Pentasa
Dextromethorphan
Nexplanon
Liraglutide
Trimethoprim
Amitriptyline
Lamotrigine
Nilotinib
Atripla
Trazodone
Etonogestrel
Etanercept
Tioconazole
Azithromycin
Eflornithine
Daytrana
Ativan
Imitrex
Sertraline
Toradol
Viberzi
Mobic
Dulcolax
Morphine
MoviPrep
Trilafon
Fluconazole
Contrave
Clonazepam
Metaxalone
Venlafaxine
Ledipasvir / sofosbuvir
Symbyax
Tamsulosin
Doxycycline
Dulaglutide
Intuniv
Buprenorphine
Qvar
Opdivo
Pyridium
Latuda
Bupropion
Implanon
Effexor XR
Drospirenone / ethinyl estradiol
NuvaRing
Prepopik
Tretinoin
Gildess Fe 1 / 20
Ethinyl estradiol / norgestimate
Elbasvir / grazoprevir
Clomiphene
Docusate / senna
Amitiza
Sildenafil
Lo Loestrin Fe
Oxcarbazepine
Wellbutrin
Phillips' Milk of Magnesia
Nature-Throid
Lithium
Oxycodone
Estradiol
Sronyx
Augmentin XR
Monistat 7-Day Combination Pack
Plan B One-Step
Alp

Invega
Cutar
Serzone
Biaxin XL
Coreg
Ortho Cyclen
Lorcet 10 / 650
Letrozole
Cefuroxime
Sectral
Belladonna / opium
Flomax
My Way
Belsomra
Adapalene
Promethazine
Fentanyl Transdermal System
Desoxyn
Tegretol
Latisse
Oseltamivir
Kombiglyze XR
Minoxidil
Enbrel
Adalimumab
Xulane
Elavil
Endocet
Unisom SleepGels
Invokana
Naphazoline
Hydrochlorothiazide / telmisartan
Mycophenolate mofetil
Eucrisa
Biltricide
Bystolic
Ibuprofen / pseudoephedrine
Alesse
Bisoprolol / hydrochlorothiazide
Fexofenadine
Fentora
Guaifenesin
Modafinil
Kadian
Dexamethasone
Atropine / diphenoxylate
Metformin / sitagliptin
Fluorouracil
Clobetasol
Commit
Tri-Lo-Sprintec
Guaifenesin / phenylephrine
Dexbrompheniramine / pseudoephedrine
FreshKote
Racepinephrine
Keflex
Fluticasone
Levemir
Alprostadil
Carbidopa / levodopa
Tranexamic acid
Esomeprazole
Voltaren Gel
Adderall XR
Hydrocortisone
Remeron
Genvoya
Podofilox
Tri-Previfem
Atorvastatin
Carisoprodol
Gildess Fe 1.5 / 30
Viagra
Famotidine / ibuprofen
Selenium sulfide
Aubra
Toci

Diprivan
Exalgo
Chlorzoxazone
Ambien CR
Victrelis
Mebaral
Dilantin
Econazole
Tylenol PM
Levsin SL
Palbociclib
Creon
Nalbuphine
Ranibizumab
Tenex
Transderm-Scop
Gildess 24 Fe
Prometrium
Gabapentin enacarbil
Omacor
Ixekizumab
Zutripro
Naftifine
Epitol
Luvox
Diphenhydramine / naproxen
Clotrimazole
Imatinib
Lessina
Fenofibrate
Ultravate
Capsaicin
Omeprazole / sodium bicarbonate
Lexiscan
Xeljanz
Cyclafem 1 / 35
Factive
Rituxan
Menthol
Scullcap
Tussionex Pennkinetic
Gas-X
Rheumatrex Dose Pack
Zovia
Sutent
Mepivacaine
Pyridostigmine
Amerge
Xopenex
Brompheniramine / dextromethorphan / pseudoephedrine
Levoxyl
Hydralazine
Dextromethorphan / guaifenesin / pseudoephedrine
Evoxac
MetroGel
Dimethyl fumarate
Fluticasone / salmeterol
Eskalith
Fioricet with Codeine
Dofetilide
Tretin-X
Effient
Quillivant XR
Vaniqa
Coal tar
Lacrisert
Nasacort Allergy 24HR
Acetaminophen / caffeine / isometheptene mucate
Dalmane
Terazosin
Atazanavir
Benadryl Allergy
Mestinon
Bevacizumab
Insulin glulisine
WinRho SDF
Sulinda

Tylenol with Codeine
Botulinum toxin type b
Stiolto Respimat
Ashlyna
Senokot
Norgesic
Dronedarone
Avonex Pen
Neostigmine
Darvocet A500
Undecylenic acid
Drospirenone / estradiol
Drixoral Cold / Flu
Gadavist
Vagistat-1
Magnesium oxide
Valdecoxib
Methamphetamine
Lotemax
Naropin Polyamp
Humulin N
Alavert
Sipuleucel-T
Rozex
Collagenase
Savaysa
Humatin
Aspirin / carisoprodol
Griseofulvin
Microgestin 1.5 / 30
Vasodilan
Talwin Nx
Klonopin Wafer
Aliskiren
Zubsolv
Soma Compound
Coreg CR
Aplenzin
Viekira Pak
Fesoterodine
Vivarin
Detrol
Cetuximab
Nasacort AQ
Rynatan
Rectiv
Verelan
Flanax Pain Reliever
Paclitaxel protein-bound
Hyoscyamine / methenamine / methylene blue / sodium biphosphate
Foradil Aerolizer
Raloxifene
Revia
Polocaine
Ludiomil
Echinacea
Larin Fe 1 / 20
Vusion
Silver sulfadiazine
Avandamet
Analpram-HC
Lariam
Methylergonovine
Oxaprozin
Tanzeum
Vibramycin
Rifaximin
Hydrochlorothiazide / irbesartan
Truvada
Azelex
Ibrutinib
Estradiol / norethindrone
Pimozide
Enalapril
Influenza virus vac

Anaprox-DS
Tivicay
Nutraplus
Dramamine Less Drowsy
FML Forte Liquifilm
Motofen
Dextrostat
Oxymetholone
Limbrel 500
Insulin lispro / insulin lispro protamine
Zenpep
Bosulif
Felodipine
Kaletra
Excedrin PM
Fleet Glycerin Suppositories Adult
Molindone
Aveeno
Insulin aspart / insulin aspart protamine
Xigduo XR
Dificid
Cefzil
Tracleer
Podophyllum resin
Procyclidine
Synalar
Ortho-Cept
Blistex Lip Balm
Cobicistat / darunavir
Zinc sulfate
Xopenex HFA
Pepcid
Taxol
Florinef
Monurol
Damiana
Neoral
Ofloxacin
Dapagliflozin / metformin
Pramosone
Synjardy
ProFe
Compro
Ultracet
Perindopril
Aveed
Rebif Rebidose
Hydrocortisone / neomycin / polymyxin b
Twynsta
Junel Fe 24
Primlev
Tandem DHA
Amiloride
Loestrin 21 1.5 / 30
Spectracef
Uroqid-Acid No2
Lonox
Metolazone
Amitriptyline / perphenazine
Plendil
Zoster vaccine live
Glucovance
Acrivastine / pseudoephedrine
Qualaquin
Pilocarpine
Chlorpheniramine / methscopolamine / pseudoephedrine
Droperidol
Interferon alfa-2b
Alsuma
Promacta
Miacalcin
Cefditoren
Tapaz

Cuvitru
Vitamin C
Alphanine SD
Busulfan
Vitafol Ultra
Unasyn
Lagesic
Cheratussin DAC
Ampicillin / sulbactam
Coagulation factor ix
Avar
Clioquinol / hydrocortisone
Lorcet
Lorabid
Fostex Medicated
Estrasorb
Nisoldipine
Ethacrynic acid
Bag Balm
Hexalen
BenzEFoam
Selsun Blue
Camptosar
Sucroferric oxyhydroxide
Fortical
Cerefolin NAC
Hyosyne
Lactic acid
Nexium 24HR
Humalog Mix 75 / 25
Dexchlorpheniramine / dextromethorphan / pseudoephedrine
Fleet Phospho Soda
Pomalidomide
Headache Relief PM
Pyrimethamine
Univasc
Ozurdex
Zinc gluconate
Chlorcyclizine / phenylephrine
Tylenol Arthritis Pain
Thioridazine
Pulmicort Respules
Pima
Necitumumab
PanOxyl 10% Acne Foaming Wash
Novolin N
Reese's Pinworm Medicine
Valerian
Polysporin First Aid Antibiotic Ointment
Diprosone
Chondroitin / glucosamine / methylsulfonylmethane
Apokyn
Trental
Vancocin HCl
Hyzine
Activella
Pyridoxine
Dabrafenib
Benzalkonium chloride / lidocaine
Zetran
Dalteparin
Glyset
Cortef
Leustatin
Ifosfamide
Zeasorb-AF Drying Gel
BP Wash
Fan

In [6]:
import pickle

In [8]:
# Cache the lookup results on disk: the web requests take a long time
# (~5s each), so they should not have to be repeated.
with open("drugs.pickle", "wb") as pickle_file:
    pickle.dump(drugs, pickle_file)

In [9]:
# Record the drug names whose lookup failed, one per line: name left-aligned
# and padded to 70 characters, followed by its frequency.
with open("failed-drugs.txt", "w") as out:
    out.writelines(
        f"{name:<70} {count}\n" for name, count in failed_names.items()
    )

In [10]:
# Determine which drugs are the "same" compound (i.e. same pubchem molecule)
# by collecting the distinct cids of all successfully resolved structures.
cids = {entry[1].cid for entry in drugs.values()}

In [12]:
# Number of distinct cids among the resolved drug names
# (1370 in the recorded run, i.e. 1370 unique compounds).
len(cids)

1370

In [13]:
# Number of drug names whose lookup raised StructureError, i.e. that were
# not found in pubchem (1171 in the recorded run).
len(failed_names)

1171

In [18]:
# Collect the rows of the original dataset whose drug was found in pubchem
# and append that drug's pubchem cid as an extra "cid" column.
# Rows from the test file are written first, then rows from the train file.
outfile = "drug_data_trimmed.csv"
# newline="" on every file handed to the csv module, as its documentation
# requires: it avoids blank lines between rows on platforms that translate
# "\n" on write and keeps embedded newlines in quoted fields intact on read.
with open(outfile, "w", newline="") as o, \
        open(raw_test, "r", newline="") as f1, \
        open(raw_train, "r", newline="") as f2:
    f1csv = csv.DictReader(f1, delimiter='\t')
    f2csv = csv.DictReader(f2, delimiter='\t')
    ocsv = csv.writer(o)
    # first fieldname is empty (and contains seemingly useless numbers as
    # values), so that column is dropped from the output
    fields = f1csv.fieldnames[1:]
    ocsv.writerow(fields + ["cid"])
    for reader in (f1csv, f2csv):
        for row in reader:
            name = row["drugName"]
            if name in drugs:
                # write the values explicitly in header order so every row
                # lines up with the header regardless of the row's own
                # column order
                ocsv.writerow([row[field] for field in fields]
                              + [drugs[name][1].cid])

In [20]:
import pybel
import openbabel

In [25]:
def make_pybel_mol(struct_obj):
    """Build a pybel Molecule from a vetee structure object.

    One OBAtom is added per entry of ``struct_obj.coords``: entry[0] is
    passed to ``vetee.gaussian_options.periodic_table`` to obtain the atomic
    number and entry[1..3] are used as the position vector. The total charge,
    spin multiplicity and title are copied from ``struct_obj``.

    NOTE: doesn't work. don't use this to get fingerprints
    """
    molecule = openbabel.OBMol()
    # one atom (element + coordinates) per coordinate entry
    for entry in struct_obj.coords:
        element = vetee.gaussian_options.periodic_table(entry[0])
        node = openbabel.OBAtom()
        node.SetAtomicNum(element)
        node.SetVector(entry[1], entry[2], entry[3])
        molecule.AddAtom(node)
    # charge, multiplicity, and comments (used as the title)
    molecule.SetTotalCharge(struct_obj.charge)
    molecule.SetTotalSpinMultiplicity(struct_obj.multip)
    molecule.SetTitle(struct_obj.comments)
    # wrap the OBMol in a pybel Molecule
    return pybel.Molecule(molecule)

In [47]:
# test make_pybel_mol
# Take the structure of the first entry in `drugs` as the test case.
for drug in drugs:
    test = drugs[drug][1]
    break
# Fingerprint of the molecule built from coordinates only
# (calcfp() is called without arguments, so pybel's default type is used).
pybelmol = make_pybel_mol(test)
fp = pybelmol.calcfp()


# okay so for some reason you need to give a smiles string otherwise
# the fingerprint will not work (get all zeros - I assume you need
# to explicitly specify the connectivity)
# The bare attribute access below has no visible effect in the middle of a
# cell; the SMILES string is what is actually parsed on the next line.
test.smiles_str
# Fingerprint of the same structure, this time built from its SMILES string.
pbmol2 = pybel.readstring("smi", test.smiles_str)
fp2 = pbmol2.calcfp()
print(fp2)


134226432, 2684633600, 393217, 16644, 2415919168, 268435456, 3223322624, 0, 5259288, 318799872, 402689536, 2147877120, 268435969, 71303168, 33556480, 268566528, 532492, 2684879872, 2097152, 2248181890, 1073852416, 2129920, 2240, 256, 570953728, 4, 134348808, 128, 67072, 1074790656, 25186329, 106504


In [45]:
# Map each pubchem cid to a vetee structure object. When several drug names
# share a cid, the structure of the last such name is the one kept.
cids_dict = {entry[1].cid: entry[1] for entry in drugs.values()}

In [59]:
# Make an FP3 fingerprint matrix for clustering and write it as CSV with the
# cids as both the header row and the first column.
# NOTE: `fp1 | fp2` on pybel fingerprints is the Tanimoto coefficient, i.e. a
# similarity; despite the file name the values are not distances yet.
outfile = "fingerprint_distance_fp3.csv"

# Parse each SMILES string and compute its fingerprint once per compound
# instead of once per pair of compounds.
fingerprints = {
    cid: pybel.readstring("smi", struct.smiles_str).calcfp("FP3")
    for cid, struct in cids_dict.items()
}

# newline="" as required by the csv module for files passed to csv.writer
with open(outfile, "w", newline="") as o:
    ocsv = csv.writer(o)
    header = [""] + list(fingerprints)
    ocsv.writerow(header)
    for cid, fp1 in fingerprints.items():
        line = [cid]
        for fp2 in fingerprints.values():
            line.append(fp1 | fp2)  # similarity between the two fingerprints
        ocsv.writerow(line)

In [74]:
# Structures as a list (in cids_dict order) so they can be addressed by index.
structures = list(cids_dict.values())

In [79]:
# Sanity check: the list length, and that index 1369 refers to the same
# object as the last element (true when the list holds 1370 structures).
print(len(structures))
print(structures[-1])
print(structures[1369])

1370
<vetee.structure.Structure object at 0x7fabe6816a20>
<vetee.structure.Structure object at 0x7fabe6816a20>


In [80]:
# Make fingerprint matrices (FP2, FP3, FP4 and MACCS) for clustering.
# Entry [i][j] is `fingerprint_i | fingerprint_j` (pybel's Tanimoto
# coefficient, a similarity); row/column i corresponds to structures[i].

########################
import numpy as np

fingerprint_types = ("FP2", "FP3", "FP4", "MACCS")

# Parse every SMILES string once and compute each fingerprint once per
# structure, rather than recomputing them for every pair of structures.
fingerprints = {fp_type: [] for fp_type in fingerprint_types}
for struct in structures:
    mol = pybel.readstring("smi", struct.smiles_str)
    for fp_type in fingerprint_types:
        fingerprints[fp_type].append(mol.calcfp(fp_type))


def _pairwise_similarity(fps):
    """Return the symmetric matrix of `fps[i] | fps[j]` with 1.0 on the diagonal."""
    size = len(fps)
    # -1.0 marks "not computed"; every entry is overwritten below
    matrix = np.full((size, size), -1.0)
    for i in range(size):
        matrix[i][i] = 1.0
        # only the upper triangle is computed; the value is mirrored
        for j in range(i + 1, size):
            similarity = fps[i] | fps[j]
            matrix[i][j] = similarity
            matrix[j][i] = similarity
    return matrix


# sized from len(structures) instead of a hard-coded 1370
matrix_fp2 = _pairwise_similarity(fingerprints["FP2"])
matrix_fp3 = _pairwise_similarity(fingerprints["FP3"])
matrix_fp4 = _pairwise_similarity(fingerprints["FP4"])
matrix_maccs = _pairwise_similarity(fingerprints["MACCS"])

# pickle the data for later use
with open("fingerprints_fp2.pickle", "wb") as fout:
    pickle.dump(matrix_fp2, fout)
with open("fingerprints_fp3.pickle", "wb") as fout:
    pickle.dump(matrix_fp3, fout)
with open("fingerprints_fp4.pickle", "wb") as fout:
    pickle.dump(matrix_fp4, fout)
with open("fingerprints_maccs.pickle", "wb") as fout:
    pickle.dump(matrix_maccs, fout)

############################

In [84]:
# Write the MACCS fingerprint matrix to a CSV file, with the cids of the
# structures as both the header row and the first column of every row.
outfile = "fingerprint_distance_maccs.csv"
# newline="" as required by the csv module for files passed to csv.writer
# (prevents blank lines between rows on platforms that translate "\n")
with open(outfile, "w", newline="") as o:
    ocsv = csv.writer(o)
    header = [""] + [struct.cid for struct in structures]
    ocsv.writerow(header)
    for i, row in enumerate(matrix_maccs):
        # row i of the matrix belongs to structures[i]
        ocsv.writerow([structures[i].cid, *row])

In [87]:
# Since `fingerprint | fingerprint` is a measure of similarity, change every
# value to 1 - similarity to get a dissimilarity matrix.
# The header row and the first column (the cids) are copied through unchanged.
infile = "fingerprint_distance_maccs.csv"
outfile = "maccs_dissimilarity.csv"
# newline="" on both files, as the csv module requires for files passed to
# csv.reader / csv.writer
with open(infile, "r", newline="") as f, open(outfile, "w", newline="") as o:
    fcsv = csv.reader(f)
    ocsv = csv.writer(o)
    header = next(fcsv)
    ocsv.writerow(header)
    for line in fcsv:
        newline = [line[0]]
        newline.extend(1 - float(item) for item in line[1:])
        ocsv.writerow(newline)