In [None]:
import pandas as pd
import cirpy
from tqdm.notebook import tqdm
# Load the molecule table; later cells read its "SMILES" and "IUPAC_chemname" columns.
data = pd.read_csv("data.csv")

In [None]:
# Quick sanity check: print the first SMILES string, then stop after one iteration.
for i in tqdm(data["SMILES"]):
    print(i)
    break

In [35]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0.1,Unnamed: 0,Formula,IUPAC_chemname,SMILES,# Atoms,RawFreq [cm-1],ScaledFreq [cm-1],Intensity,mu_A [D],mu_B [D],mu_C [D],mu_tot [D],A [cm-1],B [cm-1],C [cm-1]
0,0,CO,Carbon monoxide,[C-]#[O+],2,[2209.851],[2129.890310508303],[81.1924],0.1035419,0.0,0.0,0.1035419,0.0,1.934415,1.934415
1,1,O2,dioxygen,O=O,2,[1645.8692],[1611.5673552674284],[0.0],1e-07,0.0,0.0,1e-07,0.0,1.454663,1.454663
2,2,N2,dinitrogen,N#N,2,[2438.602],[2350.364242198306],[0.0],0.0,0.0,0.0,0.0,0.0,2.013151,2.013151
3,3,H2S,hydrogen sulfide,S,3,"[1208.2455, 2674.3332, 2688.5077]","[1183.0642465080284, 2577.5658040974995, 2591....","[0.458, 0.0413, 0.0459]",0.0,-0.991938,0.0,0.9919383,10.280928,8.877414,4.763881
4,4,H2O,water,O,3,"[1632.0511, 3820.4156, 3922.7919]","[1598.0372406800595, 3682.1786484947465, 3780....","[76.3428, 3.6852, 57.5517]",0.0,-1.843688,0.0,1.843688,27.294469,14.394592,9.42436


In [36]:
# Summarise column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2743 entries, 0 to 2742
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         2743 non-null   int64  
 1   Formula            2743 non-null   object 
 2   IUPAC_chemname     2743 non-null   object 
 3   SMILES             2743 non-null   object 
 4   # Atoms            2743 non-null   int64  
 5   RawFreq [cm-1]     2743 non-null   object 
 6   ScaledFreq [cm-1]  2743 non-null   object 
 7   Intensity          2743 non-null   object 
 8   mu_A [D]           2743 non-null   float64
 9   mu_B [D]           2743 non-null   float64
 10  mu_C [D]           2743 non-null   float64
 11  mu_tot [D]         2743 non-null   float64
 12  A [cm-1]           2743 non-null   float64
 13  B [cm-1]           2743 non-null   float64
 14  C [cm-1]           2743 non-null   float64
dtypes: float64(7), int64(2), object(6)
memory usage: 321.6+ KB


In [40]:
def _first_cas(resolved):
    """
    Reduce one raw ``cirpy.resolve`` result to a single cell value.

    Parameters
    ----------
    resolved : None, str or list
        Value returned by ``cirpy.resolve(..., "cas")``.

    Returns
    -------
    object
        NaN when ``resolved`` is ``None`` or an empty list, the first element
        when it is a non-empty list, otherwise ``resolved`` unchanged.
    """
    if resolved is None:
        return float("nan")
    if isinstance(resolved, list):
        # An empty list is treated like "not found" so indexing cannot raise.
        # NOTE(review): when several candidates come back only the first is
        # kept; confirm that the first entry is the registry number you want.
        return resolved[0] if resolved else float("nan")
    return resolved


def _add_cas_column(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """
    Resolve every value of ``df[column]`` and store the results in ``df["CAS"]``.

    One ``cirpy.resolve`` call is made per row. ``df`` is modified in place and
    the same object is returned.
    """
    df["CAS"] = [_first_cas(cirpy.resolve(value, "cas")) for value in df[column]]
    return df


def smiles_to_cas(df: pd.DataFrame) -> pd.DataFrame:
    """
    Converts a column of smiles strings to cas numbers

    Parameters
    ----------
    df : pd.DataFrame
        A pandas dataframe with a "SMILES" column. It is modified in place:
        a "CAS" column is added (or overwritten).

    Returns
    -------
    pd.DataFrame
        The same dataframe object, now carrying a "CAS" column. Rows that
        could not be resolved hold NaN.

    Raises
    ------
    ValueError
        If ``df`` has no "SMILES" column.
    """
    if "SMILES" not in df.columns:
        raise ValueError("SMILES column not found in data.csv")

    return _add_cas_column(df, "SMILES")


def iupac_to_cas(df: pd.DataFrame) -> pd.DataFrame:
    """
    Converts a column of iupac names to cas numbers

    Parameters
    ----------
    df : pd.DataFrame
        A pandas dataframe with an "IUPAC_chemname" column. It is modified in
        place: a "CAS" column is added (or overwritten).

    Returns
    -------
    pd.DataFrame
        The same dataframe object, now carrying a "CAS" column. Rows that
        could not be resolved hold NaN.

    Raises
    ------
    KeyError
        If ``df`` has no "IUPAC_chemname" column (raised by the column lookup).
    """
    return _add_cas_column(df, "IUPAC_chemname")

In [42]:
# Spot-check a lookup by name. A single query can return a list of several CAS
# candidates, which is why the helpers above keep only the first entry of a list.
cirpy.resolve("Carbon monoxide", "cas")

['50-00-0',
 '30525-89-4',
 '12795-06-1',
 '112068-71-0',
 '8005-38-7',
 '8006-07-3',
 '8013-13-6',
 '53026-80-5']

In [45]:
# Draw 10 random rows for a small trial run.
# NOTE(review): no random_state is passed, so this call does not pin which rows are drawn.
sample = data.sample(10)
sample

Unnamed: 0.1,Unnamed: 0,Formula,IUPAC_chemname,SMILES,# Atoms,RawFreq [cm-1],ScaledFreq [cm-1],Intensity,mu_A [D],mu_B [D],mu_C [D],mu_tot [D],A [cm-1],B [cm-1],C [cm-1]
1089,1089,C2H8NP,"N,N-dimethylphosphanamine",N(P)(C)C,12,"[167.793, 208.9358, 237.5836, 270.869, 332.242...","[167.020458208152, 207.9738311615312, 236.4897...","[2.8141, 0.5214, 1.5114, 2.5289, 0.4472, 1.181...",1.231442,-0.526854,-0.384789,1.393587,0.285857,0.155094,0.108742
535,535,C2H6O2,hydroperoxyethane,CCOO,10,"[155.0744, 218.1688, 227.6695, 370.0774, 503.7...","[154.3604163722816, 217.1643211738432, 226.621...","[3.2301, 82.3357, 26.1542, 5.6283, 5.6425, 2.9...",0.47867,-0.608249,-1.464937,1.656844,0.531931,0.181964,0.153809
1799,1799,C4H8O2,2-ethoxyacetaldehyde,CCOCC=O,14,"[60.6101, 91.0519, 121.7145, 147.0604, 245.429...","[60.331042856626404, 90.6326846693416, 121.154...","[7.0994, 7.3055, 7.3233, 1.7218, 0.5587, 11.51...",-3.091222,-0.693126,0.001005,3.167977,0.638759,0.044916,0.042983
2616,2616,C4H11NO,2-(dimethylamino)ethan-1-ol,OCCN(C)C,17,"[72.8788, 121.6678, 236.4203, 247.5394, 278.98...","[72.5432560932832, 121.10762490197921, 235.331...","[1.8276, 3.9624, 0.2385, 0.688, 0.2241, 10.473...",-2.263317,-0.381512,-1.070088,2.532438,0.192424,0.073423,0.067538
2191,2191,C3H9N3,1-(2-methylhydrazineyl)ethen-1-amine,C=C(N)NNC,15,"[113.1426, 139.3894, 190.4591, 315.5206, 329.6...","[112.6216760822064, 138.7476322454416, 189.582...","[1.2825, 1.1589, 1.4891, 21.6517, 10.4148, 11....",-0.584306,0.064924,-0.79773,0.99096,0.240686,0.07595,0.064501
1550,1550,C5H8,"penta-1,4-diene",C=CCC=C,13,"[86.6526, 101.2121, 298.3392, 370.9034, 449.42...","[86.2536396448464, 100.74610572675441, 296.965...","[0.0139, 0.0006, 0.126, 0.5387, 3.3758, 12.531...",4e-06,0.141448,-5.1e-05,0.141448,0.665657,0.078167,0.077585
2271,2271,C6H10,"hexa-2,3-diene",CC=C=CCC,16,"[80.3089, 108.6452, 156.1929, 195.125, 223.102...","[79.9391469023896, 108.1449827234528, 155.4737...","[0.3259, 0.9956, 0.834, 1.2152, 0.2641, 2.1746...",0.0794,0.202283,-0.136838,0.256802,0.242059,0.058999,0.051965
267,267,CH2N4O,"1,2-dihydro-5H-tetrazol-5-one",O=C1/N=N\NN1,8,"[258.8279, 484.0017, 513.3919, 549.17, 615.700...","[257.6362211478056, 481.7732903489688, 511.028...","[0.8888, 43.7065, 86.0779, 253.9859, 142.7972,...",-3.496937,4.202852,0.102684,5.468371,0.313298,0.135312,0.095247
307,307,C4H3N,cycloprop-2-ene-1-carbonitrile,C1=CC1C#N,8,"[216.3002, 227.7414, 554.9058, 566.9232, 641.0...","[215.3043244623728, 226.6928476215696, 552.350...","[3.8192, 1.3122, 17.6035, 3.7223, 69.7647, 1.3...",4.550212,-3e-06,-0.691521,4.60246,0.668845,0.117142,0.113325
687,687,C6H4,"hexa-1-en-3,5-diyne",C=CC#CC#C,10,"[106.1709, 136.6476, 253.2898, 323.283, 481.15...","[105.6820747371576, 136.01845586552642, 252.12...","[3.0514, 5.5788, 1.5197, 0.0496, 0.8057, 1.207...",0.77247,0.058167,6e-06,0.774657,1.385334,0.045485,0.044039


In [46]:
# iupac_to_cas writes the "CAS" column onto `sample` itself and returns that same
# frame, so `cas` and `sample` refer to one object after this call.
cas = iupac_to_cas(sample)

In [51]:
# Rebinds `tqdm` (imported from tqdm.notebook in the first cell) to the class from
# the plain `tqdm` package, then enables the `progress_apply` calls used below,
# with "Progress" as the bar description.
from tqdm import tqdm
tqdm.pandas(desc="Progress")

In [None]:
# Bare attribute access: this only references the method and does not call it.
sample.progress_apply

In [61]:
# Cut `data` into consecutive row chunks of at most `n` rows each so the slow
# lookups can be run one chunk at a time.
n = 500
list_df = [data.iloc[start:start + n] for start in range(0, len(data), n)]
# Display the chunk at position 5.
list_df[5]

Unnamed: 0.1,Unnamed: 0,Formula,IUPAC_chemname,SMILES,# Atoms,RawFreq [cm-1],ScaledFreq [cm-1],Intensity,mu_A [D],mu_B [D],mu_C [D],mu_tot [D],A [cm-1],B [cm-1],C [cm-1]
2500,2500,C5H11N,N-tert-butylmethanimine,CC(N=C)(C)C,17,"[101.485, 218.2183, 262.7917, 291.051, 316.508...","[101.01774925804, 217.21359326911121, 261.5817...","[8.4331, 0.0864, 0.238, 0.0067, 0.9609, 3.4925...",3.707720e-02,1.472936e+00,2.730000e-05,1.473403,0.147562,0.095473,0.093790
2501,2501,C5H11N,N-methylbut-2-en-1-amine,C/C=C/CNC,17,"[76.8766, 106.5772, 150.2524, 210.5005, 241.37...","[76.5226496783824, 106.0865040767008, 149.5606...","[0.9027, 1.4171, 0.9402, 0.6551, 1.0987, 2.217...",2.734289e-01,3.872101e-01,8.687273e-01,0.989637,0.542021,0.041510,0.041009
2502,2502,C5H11N,"N,N-dimethylprop-1-en-1-amine",C/C=C/N(C)C,17,"[133.8776, 161.9955, 194.5971, 204.865, 228.28...","[133.2612093222464, 161.249650686612, 193.7011...","[6.266, 2.8333, 0.3363, 2.263, 1.8714, 1.4369,...",1.061550e+00,-7.513280e-02,-2.955185e-01,1.104475,0.258414,0.063505,0.053195
2503,2503,C5H11N,2-methylbut-2-en-1-amine,C/C=C(C)/CN,17,"[83.1843, 111.0803, 124.1982, 202.8093, 253.41...","[82.8013081697352, 110.5688711918792, 123.6263...","[0.5362, 0.5589, 0.0961, 6.4898, 35.623, 2.537...",3.642980e-02,-9.618271e-01,-2.090125e-01,0.984949,0.212626,0.067952,0.057465
2504,2504,C5H11N,"1,2,2-trimethylaziridine",CC1(C)N(C)C1,17,"[174.9819, 177.1589, 197.3491, 233.9135, 293.9...","[174.1762595348616, 176.34323633078958, 196.44...","[0.1488, 0.0455, 0.5216, 1.3115, 0.4835, 0.124...",-1.388754e-01,1.038454e+00,-7.150939e-01,1.268476,0.168818,0.101111,0.086172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2738,2738,C6H14,"2,2-dimethylbutane",CC(C)(CC)C,20,"[85.7068, 203.2187, 242.9084, 251.4934, 263.97...","[85.3121942366752, 202.28305346745682, 241.790...","[0.0021, 0.0085, 0.0011, 0.0265, 0.0405, 0.0, ...",-4.641050e-02,-3.803990e-02,1.000000e-07,0.060008,0.142542,0.082673,0.082388
2739,2739,C6H14,"2,3-dimethylbutane",CC(C(C)C)C,20,"[64.9398, 204.3436, 237.2037, 237.7633, 255.12...","[64.6408083289872, 203.4027742748704, 236.1115...","[0.0067, 0.0014, 0.0004, 0.0941, 0.0104, 0.015...",1.000000e-07,0.000000e+00,1.529379e-01,0.152938,0.140878,0.084481,0.073967
2740,2740,C6H14,2-methylpentane,CC(CCC)C,20,"[72.9757, 108.6177, 177.9321, 220.4624, 240.60...","[72.6397099525048, 108.1176093371928, 177.1128...","[0.0109, 0.0177, 0.0059, 0.0111, 0.002, 0.0014...",-2.188710e-02,4.572050e-02,-1.332669e-01,0.142581,0.220327,0.056609,0.048711
2741,2741,C6H14,3-methylpentane,CC(CC)CC,20,"[82.8775, 82.9958, 198.5945, 206.4731, 221.275...","[82.49592071866, 82.61367604937121, 197.680143...","[0.0003, 0.0036, 0.0011, 0.0429, 0.0145, 0.008...",-1.800000e-06,9.668610e-02,2.568050e-02,0.100038,0.217884,0.061364,0.052830


In [62]:
# Resolve every IUPAC name of the first chunk to its raw CAS lookup result
# (one network lookup per row, so this cell is slow). Unlike iupac_to_cas,
# the raw result is kept as-is, so a cell may hold None, a string or a list.
test = list_df[0]["IUPAC_chemname"].progress_apply(
    lambda name: cirpy.resolve(name, "cas")
)

Progress: 100%|██████████| 500/500 [15:39<00:00,  1.88s/it]


In [66]:
# list_df[0] is a slice of `data`; writing a new column into it through .loc is
# the chained-assignment pattern pandas warns about. Build a new frame with the
# index-aligned "CAS" column instead and store it back in the list.
list_df[0] = list_df[0].assign(CAS=test)
list_df[0]

Unnamed: 0.1,Unnamed: 0,Formula,IUPAC_chemname,SMILES,# Atoms,RawFreq [cm-1],ScaledFreq [cm-1],Intensity,mu_A [D],mu_B [D],mu_C [D],mu_tot [D],A [cm-1],B [cm-1],C [cm-1],CAS
0,0,CO,Carbon monoxide,[C-]#[O+],2,[2209.851],[2129.890310508303],[81.1924],1.035419e-01,0.000000,0.000000e+00,1.035419e-01,0.000000,1.934415,1.934415,"[50-00-0, 30525-89-4, 12795-06-1, 112068-71-0,..."
1,1,O2,dioxygen,O=O,2,[1645.8692],[1611.5673552674284],[0.0],1.000000e-07,0.000000,0.000000e+00,1.000000e-07,0.000000,1.454663,1.454663,"[1338-93-8, 14797-70-7, 80217-98-7, 80937-33-3..."
2,2,N2,dinitrogen,N#N,2,[2438.602],[2350.364242198306],[0.0],0.000000e+00,0.000000,0.000000e+00,0.000000e+00,0.000000,2.013151,2.013151,"[7727-37-9, 156457-45-3, 161728-27-4, 93037-13-9]"
3,3,H2S,hydrogen sulfide,S,3,"[1208.2455, 2674.3332, 2688.5077]","[1183.0642465080284, 2577.5658040974995, 2591....","[0.458, 0.0413, 0.0459]",0.000000e+00,-0.991938,0.000000e+00,9.919383e-01,10.280928,8.877414,4.763881,"[11144-15-3, 13465-07-1, 7783-06-4, 12673-82-4..."
4,4,H2O,water,O,3,"[1632.0511, 3820.4156, 3922.7919]","[1598.0372406800595, 3682.1786484947465, 3780....","[76.3428, 3.6852, 57.5517]",0.000000e+00,-1.843688,0.000000e+00,1.843688e+00,27.294469,14.394592,9.424360,"[14314-42-2, 558440-22-5, 13670-17-2, 7732-18-..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,495,C5H3N,pent-2-en-4-ynenitrile,C#C\C=C\C#N,9,"[121.9232, 131.6525, 250.283, 362.8397, 514.83...","[121.3618490056448, 131.04635398526, 249.13066...","[1.3594, 2.3048, 7.2294, 10.4292, 8.0329, 0.03...",-4.133100e+00,-0.598713,1.090000e-05,4.176239e+00,1.576258,0.049032,0.047553,
496,496,CH4N2OS,1-hydroxythiourea,S=C(N)NO,9,"[173.8581, 288.9157, 323.18, 377.1545, 432.813...","[173.0576336628984, 287.5854928246648, 321.692...","[0.7872, 5.2269, 55.4559, 26.0998, 172.3834, 1...",4.297166e+00,-1.391903,4.208103e-01,4.536531e+00,0.303044,0.087437,0.068696,42008-54-8
497,497,CH4N2O2,1-hydroxyurea,ONC(N)=O,9,"[186.0801, 294.9747, 355.5738, 437.4814, 520.1...","[185.2233619127064, 293.61659636464077, 353.93...","[1.285, 8.9421, 37.2351, 224.9778, 11.3186, 11...",-3.002820e+00,-1.965330,-7.050520e-02,3.589487e+00,0.339777,0.145138,0.103734,127-07-1
498,498,CH5NO2,(hydroxyamino)methanol,OCNO,9,"[175.0207, 323.9515, 354.1051, 440.4926, 581.2...","[174.2148808943848, 322.45998323659603, 352.47...","[2.7168, 51.8725, 1.4843, 166.8201, 39.802, 55...",-1.183320e+00,-0.368517,6.715722e-01,1.409631e+00,0.555482,0.190041,0.158900,


In [64]:
# Check how many of the lookups produced a non-null result.
test.info()

<class 'pandas.core.series.Series'>
RangeIndex: 500 entries, 0 to 499
Series name: IUPAC_chemname
Non-Null Count  Dtype 
--------------  ----- 
225 non-null    object
dtypes: object(1)
memory usage: 4.0+ KB


In [54]:
# Trial run of the progress-bar lookup on the 10-row sample; raw results are
# kept as returned by the resolver.
lol2 = sample["IUPAC_chemname"].progress_apply(
    lambda name: cirpy.resolve(name, "cas")
)

Progress: 100%|██████████| 10/10 [00:14<00:00,  1.47s/it]


In [47]:
# Show the frame returned by iupac_to_cas, including its "CAS" column.
cas

Unnamed: 0.1,Unnamed: 0,Formula,IUPAC_chemname,SMILES,# Atoms,RawFreq [cm-1],ScaledFreq [cm-1],Intensity,mu_A [D],mu_B [D],mu_C [D],mu_tot [D],A [cm-1],B [cm-1],C [cm-1],CAS
1089,1089,C2H8NP,"N,N-dimethylphosphanamine",N(P)(C)C,12,"[167.793, 208.9358, 237.5836, 270.869, 332.242...","[167.020458208152, 207.9738311615312, 236.4897...","[2.8141, 0.5214, 1.5114, 2.5289, 0.4472, 1.181...",1.231442,-0.526854,-0.384789,1.393587,0.285857,0.155094,0.108742,
535,535,C2H6O2,hydroperoxyethane,CCOO,10,"[155.0744, 218.1688, 227.6695, 370.0774, 503.7...","[154.3604163722816, 217.1643211738432, 226.621...","[3.2301, 82.3357, 26.1542, 5.6283, 5.6425, 2.9...",0.47867,-0.608249,-1.464937,1.656844,0.531931,0.181964,0.153809,3031-74-1
1799,1799,C4H8O2,2-ethoxyacetaldehyde,CCOCC=O,14,"[60.6101, 91.0519, 121.7145, 147.0604, 245.429...","[60.331042856626404, 90.6326846693416, 121.154...","[7.0994, 7.3055, 7.3233, 1.7218, 0.5587, 11.51...",-3.091222,-0.693126,0.001005,3.167977,0.638759,0.044916,0.042983,22056-82-2
2616,2616,C4H11NO,2-(dimethylamino)ethan-1-ol,OCCN(C)C,17,"[72.8788, 121.6678, 236.4203, 247.5394, 278.98...","[72.5432560932832, 121.10762490197921, 235.331...","[1.8276, 3.9624, 0.2385, 0.688, 0.2241, 10.473...",-2.263317,-0.381512,-1.070088,2.532438,0.192424,0.073423,0.067538,108-01-0
2191,2191,C3H9N3,1-(2-methylhydrazineyl)ethen-1-amine,C=C(N)NNC,15,"[113.1426, 139.3894, 190.4591, 315.5206, 329.6...","[112.6216760822064, 138.7476322454416, 189.582...","[1.2825, 1.1589, 1.4891, 21.6517, 10.4148, 11....",-0.584306,0.064924,-0.79773,0.99096,0.240686,0.07595,0.064501,
1550,1550,C5H8,"penta-1,4-diene",C=CCC=C,13,"[86.6526, 101.2121, 298.3392, 370.9034, 449.42...","[86.2536396448464, 100.74610572675441, 296.965...","[0.0139, 0.0006, 0.126, 0.5387, 3.3758, 12.531...",4e-06,0.141448,-5.1e-05,0.141448,0.665657,0.078167,0.077585,39610-14-5
2271,2271,C6H10,"hexa-2,3-diene",CC=C=CCC,16,"[80.3089, 108.6452, 156.1929, 195.125, 223.102...","[79.9391469023896, 108.1449827234528, 155.4737...","[0.3259, 0.9956, 0.834, 1.2152, 0.2641, 2.1746...",0.0794,0.202283,-0.136838,0.256802,0.242059,0.058999,0.051965,592-49-4
267,267,CH2N4O,"1,2-dihydro-5H-tetrazol-5-one",O=C1/N=N\NN1,8,"[258.8279, 484.0017, 513.3919, 549.17, 615.700...","[257.6362211478056, 481.7732903489688, 511.028...","[0.8888, 43.7065, 86.0779, 253.9859, 142.7972,...",-3.496937,4.202852,0.102684,5.468371,0.313298,0.135312,0.095247,16421-52-6
307,307,C4H3N,cycloprop-2-ene-1-carbonitrile,C1=CC1C#N,8,"[216.3002, 227.7414, 554.9058, 566.9232, 641.0...","[215.3043244623728, 226.6928476215696, 552.350...","[3.8192, 1.3122, 17.6035, 3.7223, 69.7647, 1.3...",4.550212,-3e-06,-0.691521,4.60246,0.668845,0.117142,0.113325,
687,687,C6H4,"hexa-1-en-3,5-diyne",C=CC#CC#C,10,"[106.1709, 136.6476, 253.2898, 323.283, 481.15...","[105.6820747371576, 136.01845586552642, 252.12...","[3.0514, 5.5788, 1.5197, 0.0496, 0.8057, 1.207...",0.77247,0.058167,6e-06,0.774657,1.385334,0.045485,0.044039,


In [None]:
# Collect the rows that still have no CAS number.
# NOTE(review): none of the cells shown assigns a "CAS" column to `data` itself
# (only to `sample` and `list_df[0]`) — confirm it exists before running this.
nulls = data.loc[data["CAS"].isna()]

In [32]:
# Spot-check a lookup by SMILES string.
cirpy.resolve("OCC1CSC1", "cas")

In [None]:
# Write the table to disk without the index column.
# NOTE(review): none of the cells shown assigns a "CAS" column to `data` itself —
# confirm it was added before saving under this file name.
data.to_csv("data_with_cas.csv", index=False)