In [1]:
import pubchempy as pcp
import pandas as pd
import argparse
import os
import time

In [2]:
def get_compound_from_smiles(smiles, attempts=5, time_delay=1):
    """Look up a PubChem compound from a SMILES string, retrying on errors.

    Parameters
    ----------
    smiles : str
        SMILES string passed to ``pcp.get_compounds`` with ``namespace='smiles'``.
    attempts : int, optional
        Maximum number of lookup attempts before giving up (default 5).
    time_delay : float, optional
        Seconds to sleep after a failed attempt (default 1).

    Returns
    -------
    The object returned by ``pcp.Compound.from_cid`` for the first match, or
    ``None`` when the lookup yields no record or every attempt raised.
    """
    remaining = attempts
    while remaining >= 1:
        try:
            compounds = pcp.get_compounds(smiles, namespace='smiles')
            # An empty result, or a first hit without a cid, means there is no
            # record to fetch; retrying cannot change that, so stop right away.
            # https://pubchempy.readthedocs.io/en/latest/guide/gettingstarted.html
            if not compounds or compounds[0].cid is None:
                print('No PubChem record')
                return None
            return pcp.Compound.from_cid(compounds[0].cid)
        except Exception:
            # Only ordinary errors are retried; KeyboardInterrupt / SystemExit
            # propagate so a long-running loop can still be interrupted.
            remaining -= 1
            print('Could not get compound. ' + str(remaining) + ' attempts remaining.')
            time.sleep(time_delay)

    # Reaching this point means the loop ran out of attempts.
    print('Failed to get compound from smiles after exhausting all attempts')
    return None

def compound_series(smiles_file):
    """Build one record per compound listed in a tab-separated SMILES file.

    Each non-blank line must hold exactly three tab-separated fields:
    ``smiles``, ``ncats_id`` and ``label``. The compound is looked up with
    ``get_compound_from_smiles``; lines whose lookup returns ``None`` are
    skipped.

    Parameters
    ----------
    smiles_file : iterable of str
        Open file (or any iterable of lines) to read.

    Returns
    -------
    list
        The ``compound.to_series()`` result for every compound that was found,
        each extended with ``'label'``, ``'smiles'`` and ``'ncats_id'`` entries.

    Raises
    ------
    ValueError
        If a non-blank line does not contain exactly three fields.
    """
    series_list = []
    for index, line in enumerate(smiles_file):
        stripped = line.strip()
        # Blank lines (e.g. a trailing newline at end of file) carry no record.
        if not stripped:
            continue

        # parse line
        fields = stripped.split('\t')
        if len(fields) != 3:
            raise ValueError(
                'Line ' + str(index) + ' should have 3 tab-separated fields '
                '(smiles, ncats_id, label) but has ' + str(len(fields))
            )
        smiles, ncats_id, label = fields

        # get compound
        print('Getting compound ' + str(index))
        compound = get_compound_from_smiles(smiles)

        # lookup failed or no record exists: nothing to add for this line
        if compound is None:
            continue

        # convert compound to a Series and attach the input-file metadata
        series = compound.to_series()
        series['label'] = label
        series['smiles'] = smiles
        series['ncats_id'] = ncats_id

        series_list.append(series)

    return series_list

def compound_series_to_dataframe(compound_series_list):
    """Combine a list of pandas Series into a DataFrame with one row per Series.

    Parameters
    ----------
    compound_series_list : list of pandas.Series
        Records to combine; each Series becomes one row.

    Returns
    -------
    pandas.DataFrame
        One row per input Series. An empty DataFrame is returned for an empty
        list, because ``pd.concat`` raises ``ValueError`` when given no objects.
    """
    if not compound_series_list:
        return pd.DataFrame()

    # Concatenate the Series side by side as columns, then transpose so that
    # each Series ends up as a row. The frame is not re-indexed by 'cid'.
    df = pd.concat(compound_series_list, axis=1)
    return df.T

In [3]:
# Paths: the input is <input_directory><input_filename>.smiles and the output
# is <output_directory><input_filename>.features (tab-separated).
input_filename = 'train/sr-are'
input_directory = 'data/'
output_directory = 'features-pandas/'
input_path = os.path.join(input_directory, input_filename + '.smiles')
output_filename = output_directory + input_filename + '.features'

# Create the output folder before the per-compound lookups start, so a missing
# directory cannot make the final write fail after all the fetching is done.
os.makedirs(os.path.dirname(output_filename), exist_ok=True)

# get list of pandas Series
with open(input_path, 'r') as smiles_file:
    compound_series_list = compound_series(smiles_file)

df = compound_series_to_dataframe(compound_series_list)
df.to_csv(output_filename, sep='\t')

Getting compound 0
Getting compound 1
Getting compound 2
Getting compound 3
Getting compound 4
Getting compound 5
Getting compound 6
Getting compound 7
Getting compound 8
Getting compound 9
Getting compound 10
Getting compound 11
Getting compound 12
Getting compound 13
Getting compound 14
Getting compound 15
Getting compound 16
Getting compound 17
Getting compound 18
Getting compound 19
Getting compound 20
Getting compound 21
Getting compound 22
Getting compound 23
Getting compound 24
Getting compound 25
Getting compound 26
Getting compound 27
Getting compound 28
Getting compound 29
Getting compound 30
Getting compound 31
Getting compound 32
Getting compound 33
Getting compound 34
Getting compound 35
Getting compound 36
Getting compound 37
Getting compound 38
Getting compound 39
Getting compound 40
Getting compound 41
Getting compound 42
Getting compound 43
Getting compound 44
Getting compound 45
Getting compound 46
Getting compound 47
Getting compound 48
Getting compound 49
Getting co

Getting compound 395
Getting compound 396
Getting compound 397
Getting compound 398
Getting compound 399
Getting compound 400
Getting compound 401
Getting compound 402
Getting compound 403
Getting compound 404
Getting compound 405
Getting compound 406
Getting compound 407
Getting compound 408
Getting compound 409
Getting compound 410
Getting compound 411
Getting compound 412
Getting compound 413
Getting compound 414
Getting compound 415
Getting compound 416
Getting compound 417
Getting compound 418
Getting compound 419
Getting compound 420
Getting compound 421
Getting compound 422
Getting compound 423
Getting compound 424
Getting compound 425
Getting compound 426
Getting compound 427
Getting compound 428
Getting compound 429
Getting compound 430
Getting compound 431
Getting compound 432
Getting compound 433
Getting compound 434
Getting compound 435
Getting compound 436
Getting compound 437
Getting compound 438
Getting compound 439
Getting compound 440
Getting compound 441
Getting compo

Getting compound 786
Getting compound 787
Getting compound 788
Getting compound 789
Getting compound 790
Getting compound 791
Getting compound 792
Getting compound 793
Getting compound 794
Getting compound 795
Getting compound 796
Getting compound 797
Getting compound 798
Getting compound 799
Getting compound 800
Getting compound 801
Getting compound 802
Getting compound 803
Getting compound 804
Getting compound 805
Getting compound 806
Getting compound 807
Getting compound 808
Getting compound 809
Getting compound 810
Getting compound 811
Getting compound 812
Getting compound 813
Getting compound 814
Getting compound 815
Getting compound 816
Getting compound 817
Getting compound 818
Getting compound 819
Getting compound 820
Getting compound 821
Getting compound 822
Getting compound 823
Getting compound 824
Getting compound 825
Getting compound 826
Getting compound 827
Getting compound 828
Getting compound 829
Getting compound 830
Getting compound 831
Getting compound 832
Getting compo

Getting compound 1169
Getting compound 1170
Getting compound 1171
Getting compound 1172
Getting compound 1173
Getting compound 1174
Getting compound 1175
Getting compound 1176
Getting compound 1177
Getting compound 1178
Getting compound 1179
Getting compound 1180
Getting compound 1181
Getting compound 1182
Getting compound 1183
Getting compound 1184
Getting compound 1185
Getting compound 1186
Getting compound 1187
Getting compound 1188
Getting compound 1189
Getting compound 1190
Getting compound 1191
Getting compound 1192
Getting compound 1193
Getting compound 1194
Getting compound 1195
Getting compound 1196
Getting compound 1197
Getting compound 1198
Getting compound 1199
Getting compound 1200
Getting compound 1201
Getting compound 1202
Getting compound 1203
Getting compound 1204
Getting compound 1205
Getting compound 1206
Getting compound 1207
Getting compound 1208
Getting compound 1209
Getting compound 1210
Getting compound 1211
Getting compound 1212
Getting compound 1213
Getting co

Getting compound 1542
Getting compound 1543
Getting compound 1544
Getting compound 1545
Getting compound 1546
Getting compound 1547
Getting compound 1548
Getting compound 1549
Getting compound 1550
Getting compound 1551
Getting compound 1552
Getting compound 1553
Getting compound 1554
Getting compound 1555
Getting compound 1556
Getting compound 1557
Getting compound 1558
Getting compound 1559
Getting compound 1560
Getting compound 1561
Getting compound 1562
Getting compound 1563
Getting compound 1564
Getting compound 1565
Getting compound 1566
Getting compound 1567
Getting compound 1568
Getting compound 1569
Getting compound 1570
Getting compound 1571
Getting compound 1572
Getting compound 1573
Getting compound 1574
Getting compound 1575
Getting compound 1576
Getting compound 1577
Getting compound 1578
Getting compound 1579
Getting compound 1580
Getting compound 1581
Getting compound 1582
Getting compound 1583
Getting compound 1584
Getting compound 1585
Getting compound 1586
Getting co

Getting compound 1913
Getting compound 1914
Getting compound 1915
Getting compound 1916
Getting compound 1917
Getting compound 1918
Getting compound 1919
Getting compound 1920
Getting compound 1921
Getting compound 1922
Getting compound 1923
Getting compound 1924
Getting compound 1925
Getting compound 1926
Getting compound 1927
Getting compound 1928
Getting compound 1929
Getting compound 1930
Getting compound 1931
Getting compound 1932
Getting compound 1933
Getting compound 1934
Getting compound 1935
Getting compound 1936
Getting compound 1937
Getting compound 1938
Getting compound 1939
Getting compound 1940
Getting compound 1941
Getting compound 1942
Getting compound 1943
Getting compound 1944
Getting compound 1945
Getting compound 1946
Getting compound 1947
Getting compound 1948
Getting compound 1949
Getting compound 1950
Getting compound 1951
Getting compound 1952
Getting compound 1953
Getting compound 1954
Getting compound 1955
Getting compound 1956
Getting compound 1957
Getting co

Getting compound 2286
Getting compound 2287
Getting compound 2288
Getting compound 2289
Getting compound 2290
Getting compound 2291
Getting compound 2292
Getting compound 2293
Getting compound 2294
Getting compound 2295
Getting compound 2296
Getting compound 2297
Getting compound 2298
Getting compound 2299
Getting compound 2300
Getting compound 2301
Getting compound 2302
Getting compound 2303
Getting compound 2304
Getting compound 2305
Getting compound 2306
Getting compound 2307
Getting compound 2308
Getting compound 2309
Getting compound 2310
Getting compound 2311
Getting compound 2312
Getting compound 2313
Getting compound 2314
Getting compound 2315
Getting compound 2316
Getting compound 2317
Getting compound 2318
Getting compound 2319
Getting compound 2320
Getting compound 2321
Getting compound 2322
Getting compound 2323
Getting compound 2324
Getting compound 2325
Getting compound 2326
Getting compound 2327
Getting compound 2328
Getting compound 2329
Getting compound 2330
Getting co

Getting compound 2645
Getting compound 2646
Getting compound 2647
Getting compound 2648
Getting compound 2649
Getting compound 2650
Getting compound 2651
Getting compound 2652
Getting compound 2653
Getting compound 2654
Getting compound 2655
Getting compound 2656
Getting compound 2657
Getting compound 2658
Getting compound 2659
Getting compound 2660
Getting compound 2661
Getting compound 2662
Getting compound 2663
Getting compound 2664
Getting compound 2665
Getting compound 2666
Getting compound 2667
Getting compound 2668
Getting compound 2669
Getting compound 2670
Getting compound 2671
Getting compound 2672
Getting compound 2673
Getting compound 2674
Getting compound 2675
Getting compound 2676
Getting compound 2677
Getting compound 2678
Getting compound 2679
Getting compound 2680
Getting compound 2681
Getting compound 2682
Getting compound 2683
Getting compound 2684
Getting compound 2685
Getting compound 2686
Getting compound 2687
Getting compound 2688
Getting compound 2689
Getting co

Getting compound 3018
Getting compound 3019
Getting compound 3020
Getting compound 3021
Getting compound 3022
Getting compound 3023
Getting compound 3024
Getting compound 3025
Getting compound 3026
Getting compound 3027
Getting compound 3028
Getting compound 3029
Getting compound 3030
Getting compound 3031
Getting compound 3032
Getting compound 3033
Getting compound 3034
Getting compound 3035
Getting compound 3036
Getting compound 3037
Getting compound 3038
Getting compound 3039
Getting compound 3040
Getting compound 3041
Getting compound 3042
Getting compound 3043
Getting compound 3044
Getting compound 3045
Getting compound 3046
Getting compound 3047
Getting compound 3048
Getting compound 3049
Getting compound 3050
Getting compound 3051
Getting compound 3052
Getting compound 3053
Getting compound 3054
Getting compound 3055
Getting compound 3056
Getting compound 3057
Getting compound 3058
Getting compound 3059
Getting compound 3060
Getting compound 3061
Getting compound 3062
Getting co

Getting compound 3377
Getting compound 3378
Getting compound 3379
Getting compound 3380
Getting compound 3381
Getting compound 3382
Getting compound 3383
Getting compound 3384
Getting compound 3385
Getting compound 3386
Getting compound 3387
Getting compound 3388
Getting compound 3389
Getting compound 3390
Getting compound 3391
Getting compound 3392
Getting compound 3393
Getting compound 3394
Getting compound 3395
Getting compound 3396
Getting compound 3397
Getting compound 3398
Getting compound 3399
Getting compound 3400
Getting compound 3401
Getting compound 3402
Getting compound 3403
Getting compound 3404
Getting compound 3405
Getting compound 3406
Getting compound 3407
Getting compound 3408
Getting compound 3409
Getting compound 3410
Getting compound 3411
Getting compound 3412
Getting compound 3413
Getting compound 3414
Getting compound 3415
Getting compound 3416
Getting compound 3417
Getting compound 3418
Getting compound 3419
Getting compound 3420
Getting compound 3421
Getting co

Getting compound 3750
Getting compound 3751
Getting compound 3752
Getting compound 3753
Getting compound 3754
Getting compound 3755
Getting compound 3756
Getting compound 3757
Getting compound 3758
Getting compound 3759
Getting compound 3760
Getting compound 3761
Getting compound 3762
Getting compound 3763
Getting compound 3764
Getting compound 3765
Getting compound 3766
Getting compound 3767
Getting compound 3768
Getting compound 3769
Getting compound 3770
Getting compound 3771
Getting compound 3772
Getting compound 3773
Getting compound 3774
Getting compound 3775
Getting compound 3776
Getting compound 3777
Getting compound 3778
Getting compound 3779
Getting compound 3780
Getting compound 3781
Getting compound 3782
Getting compound 3783
Getting compound 3784
Getting compound 3785
Getting compound 3786
Getting compound 3787
Getting compound 3788
Getting compound 3789
Getting compound 3790
Getting compound 3791
Getting compound 3792
Getting compound 3793
Getting compound 3794
Getting co

Getting compound 4123
Getting compound 4124
Getting compound 4125
Getting compound 4126
Getting compound 4127
Getting compound 4128
Getting compound 4129
Getting compound 4130
Getting compound 4131
Getting compound 4132
Getting compound 4133
Getting compound 4134
Getting compound 4135
Getting compound 4136
Getting compound 4137
Getting compound 4138
Getting compound 4139
Getting compound 4140
Getting compound 4141
Getting compound 4142
Getting compound 4143
Getting compound 4144
Getting compound 4145
Getting compound 4146
Getting compound 4147
Getting compound 4148
Getting compound 4149
Getting compound 4150
Getting compound 4151
Getting compound 4152
Getting compound 4153
Getting compound 4154
Getting compound 4155
Getting compound 4156
Getting compound 4157
Getting compound 4158
Getting compound 4159
Getting compound 4160
Getting compound 4161
Getting compound 4162
Getting compound 4163
Getting compound 4164
Getting compound 4165
Getting compound 4166
Getting compound 4167
Getting co

Getting compound 4496
Getting compound 4497
Getting compound 4498
Getting compound 4499
Getting compound 4500
Getting compound 4501
Getting compound 4502
Getting compound 4503
Getting compound 4504
Getting compound 4505
Getting compound 4506
Getting compound 4507
Getting compound 4508
Getting compound 4509
Getting compound 4510
Getting compound 4511
Getting compound 4512
Getting compound 4513
Getting compound 4514
Getting compound 4515
Getting compound 4516
Getting compound 4517
Getting compound 4518
Getting compound 4519
Getting compound 4520
Getting compound 4521
Getting compound 4522
No PubChem record
Getting compound 4523
Getting compound 4524
Getting compound 4525
Getting compound 4526
Getting compound 4527
Getting compound 4528
Getting compound 4529
Getting compound 4530
Getting compound 4531
Getting compound 4532
Getting compound 4533
Getting compound 4534
Getting compound 4535
Getting compound 4536
Getting compound 4537
Getting compound 4538
Getting compound 4539
Getting compou

Getting compound 4868
Getting compound 4869
Getting compound 4870
Getting compound 4871
Getting compound 4872
Getting compound 4873
Getting compound 4874
Getting compound 4875
Getting compound 4876
Getting compound 4877
Getting compound 4878
Getting compound 4879
Getting compound 4880
Getting compound 4881
Getting compound 4882
Getting compound 4883
Getting compound 4884
Getting compound 4885
Getting compound 4886
Getting compound 4887
Getting compound 4888
Getting compound 4889
Getting compound 4890
Getting compound 4891
Getting compound 4892
Getting compound 4893
Getting compound 4894
Getting compound 4895
Getting compound 4896
Getting compound 4897
Getting compound 4898
Getting compound 4899
Getting compound 4900
Getting compound 4901
Getting compound 4902
Getting compound 4903
Getting compound 4904
Getting compound 4905
Getting compound 4906
Getting compound 4907
Getting compound 4908
Getting compound 4909
Getting compound 4910
Getting compound 4911
Getting compound 4912
Getting co

Getting compound 5233
Getting compound 5234
Getting compound 5235
Getting compound 5236
Getting compound 5237
Getting compound 5238
Getting compound 5239
Getting compound 5240
Getting compound 5241
Getting compound 5242
Getting compound 5243
Getting compound 5244
Getting compound 5245
Getting compound 5246
Getting compound 5247
Getting compound 5248
Getting compound 5249
Getting compound 5250
Getting compound 5251
Getting compound 5252
Getting compound 5253
Getting compound 5254
Getting compound 5255
Getting compound 5256
Getting compound 5257
Getting compound 5258
Getting compound 5259
Getting compound 5260
Getting compound 5261
Getting compound 5262
Getting compound 5263
Getting compound 5264
Getting compound 5265
Getting compound 5266
Getting compound 5267
Getting compound 5268
Getting compound 5269
Getting compound 5270
Getting compound 5271
Getting compound 5272
Getting compound 5273
Getting compound 5274
Getting compound 5275
Getting compound 5276
Getting compound 5277
Getting co

Getting compound 5606
Getting compound 5607
Getting compound 5608
Getting compound 5609
No PubChem record
Getting compound 5610
Getting compound 5611
Getting compound 5612
Getting compound 5613
Getting compound 5614
Getting compound 5615
Getting compound 5616
Getting compound 5617
Getting compound 5618
Getting compound 5619
Getting compound 5620
Getting compound 5621
Getting compound 5622
Getting compound 5623
Getting compound 5624
Getting compound 5625
Getting compound 5626
Getting compound 5627
Getting compound 5628
Getting compound 5629
Getting compound 5630
Getting compound 5631
Getting compound 5632
Getting compound 5633
Getting compound 5634
Getting compound 5635
Getting compound 5636
Getting compound 5637
Getting compound 5638
Getting compound 5639
Getting compound 5640
Getting compound 5641
Getting compound 5642
Getting compound 5643
Getting compound 5644
Getting compound 5645
Getting compound 5646
Getting compound 5647
Getting compound 5648
Getting compound 5649
Getting compou

Getting compound 5978
Getting compound 5979
Getting compound 5980
Getting compound 5981
Getting compound 5982
Getting compound 5983
Getting compound 5984
Getting compound 5985
Getting compound 5986
Getting compound 5987
Getting compound 5988
Getting compound 5989
Getting compound 5990
Getting compound 5991
Getting compound 5992
Getting compound 5993
Getting compound 5994
Getting compound 5995
Getting compound 5996
Getting compound 5997
Getting compound 5998
Getting compound 5999
Getting compound 6000
Getting compound 6001
Getting compound 6002
Getting compound 6003
Getting compound 6004
Getting compound 6005
Getting compound 6006
Getting compound 6007
Getting compound 6008
Getting compound 6009
Getting compound 6010
Getting compound 6011
Getting compound 6012
Getting compound 6013
Getting compound 6014
Getting compound 6015
Getting compound 6016
Getting compound 6017
Getting compound 6018
Getting compound 6019
Getting compound 6020
Getting compound 6021
Getting compound 6022
Getting co

Getting compound 6351
Getting compound 6352
Getting compound 6353
Getting compound 6354
Getting compound 6355
Getting compound 6356
Getting compound 6357
Getting compound 6358
Getting compound 6359
Getting compound 6360
Getting compound 6361
Getting compound 6362
Getting compound 6363
Getting compound 6364
Getting compound 6365
Getting compound 6366
Getting compound 6367
Getting compound 6368
Getting compound 6369
Getting compound 6370
Getting compound 6371
Getting compound 6372
Getting compound 6373
Getting compound 6374
Getting compound 6375
Getting compound 6376
Getting compound 6377
Getting compound 6378
Getting compound 6379
Getting compound 6380
Getting compound 6381
Getting compound 6382
Getting compound 6383
Getting compound 6384
Getting compound 6385
Getting compound 6386
Getting compound 6387
Getting compound 6388
Getting compound 6389
Getting compound 6390
Getting compound 6391
Getting compound 6392
Getting compound 6393
Getting compound 6394
Getting compound 6395
Getting co

Getting compound 6723
Getting compound 6724
Getting compound 6725
Getting compound 6726
Getting compound 6727
Getting compound 6728
Getting compound 6729
Getting compound 6730
Getting compound 6731
Getting compound 6732
Getting compound 6733
Getting compound 6734
Getting compound 6735
Getting compound 6736
Getting compound 6737
Getting compound 6738
Getting compound 6739
Getting compound 6740
Getting compound 6741
Getting compound 6742
Getting compound 6743
Getting compound 6744
Getting compound 6745
Getting compound 6746
Getting compound 6747
Getting compound 6748
Getting compound 6749
Getting compound 6750
Getting compound 6751
Getting compound 6752
Getting compound 6753
Getting compound 6754
Getting compound 6755
Getting compound 6756
Getting compound 6757
Getting compound 6758
Getting compound 6759
Getting compound 6760
Getting compound 6761
Getting compound 6762
Getting compound 6763
Getting compound 6764
Getting compound 6765
Getting compound 6766
Getting compound 6767
Getting co

Getting compound 7096
Getting compound 7097
Getting compound 7098
Getting compound 7099
Getting compound 7100
Getting compound 7101
Getting compound 7102
Getting compound 7103
Getting compound 7104
Getting compound 7105
Getting compound 7106
Getting compound 7107
Getting compound 7108
Getting compound 7109
Getting compound 7110
Getting compound 7111
Getting compound 7112
Getting compound 7113
Getting compound 7114
Getting compound 7115
Getting compound 7116
Getting compound 7117
Getting compound 7118
Getting compound 7119
Getting compound 7120
Getting compound 7121
Getting compound 7122
Getting compound 7123
Getting compound 7124
Getting compound 7125
Getting compound 7126
Getting compound 7127
Getting compound 7128
Getting compound 7129
Getting compound 7130
Getting compound 7131
Getting compound 7132
Getting compound 7133
Getting compound 7134
Getting compound 7135
Getting compound 7136
Getting compound 7137
Getting compound 7138
Getting compound 7139
Getting compound 7140
Getting co