In [18]:
import pandas as pd, numpy as np

In [19]:
# Load the raw table, using the 'Index' column as the row index.
# NOTE(review): some 'Polymer' values printed later in this notebook start with
# 'ï»¿', which is what a UTF-8 byte-order mark looks like when decoded as
# ISO-8859-1 — confirm the file's real encoding and strip the marker if needed.
df = pd.read_csv("data.csv", index_col='Index', encoding='ISO-8859-1')

In [20]:
# (rows, columns) of the raw table.
df.shape

(655, 304)

In [21]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that only appends a stray "None" line).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 655 entries, 0 to 654
Columns: 304 entries, Polymer system Notes to approxMW(kDa)
dtypes: float64(190), int64(72), object(42)
memory usage: 1.5+ MB
None


In [22]:
# Columns carried forward from the raw table: polymer name, its SMILES string,
# and the two conductivity measurements.
feature_set = [
    'Polymer',
    'SMILES descriptor 1',
    'Conductivity at 30C',
    'Conductivity at 60C',
]

In [23]:
# Keep only rows whose two SMILES descriptors are identical; rows where either
# descriptor is missing compare unequal and are dropped.
same_descriptor = df['SMILES descriptor 1'].eq(df['SMILES descriptor 2'])
homopolymers = df.loc[same_descriptor]

In [24]:
# Summarise the filtered table (row count, dtypes) after the descriptor filter.
homopolymers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 448 entries, 33 to 591
Columns: 304 entries, Polymer system Notes to approxMW(kDa)
dtypes: float64(190), int64(72), object(42)
memory usage: 1.0+ MB


In [25]:
# Independent copy of the selected columns, so later edits to `db` never touch
# `homopolymers` or `df`.
db = homopolymers.loc[:, feature_set].copy()

In [26]:
# Move both conductivity columns to log10 space.
# The values are read from `homopolymers` (the untransformed source of `db`)
# rather than from `db` itself, so re-running this cell does not take the log of
# an already-logged column. Assignment aligns on the shared row index.
# np.log10 is applied to the whole Series at once; missing values stay NaN.
for column in ('Conductivity at 30C', 'Conductivity at 60C'):
    db[column] = np.log10(homopolymers[column])

In [27]:
# Plain-text dump of `db` after the log10 transform (pandas truncates long frames).
print(db)

                                                 Polymer  \
Index                                                      
33                                polyethylene carbonate   
34                                polyethylene carbonate   
35                                polyethylene carbonate   
36                                polyethylene carbonate   
37              poly(butyl ether 1,2-glycerol carbonate)   
...                                                  ...   
587    ï»¿poly[bis(2-(2-methoxyethoxy)ethoxy)phosphaz...   
588    ï»¿poly[bis(2-(2-methoxyethoxy)ethoxy)phosphaz...   
589    ï»¿poly[bis(2-(2-methoxyethoxy)ethoxy)phosphaz...   
590    ï»¿poly[bis(2-(2-methoxyethoxy)ethoxy)phosphaz...   
591    ï»¿poly[bis(2-(2-methoxyethoxy)ethoxy)phosphaz...   

           SMILES descriptor 1  Conductivity at 30C  Conductivity at 60C  
Index                                                                     
33                   COC(=O)OC           -10.970616            -8.440

In [28]:
# Per-column non-null counts: shows how many rows have each conductivity measured.
db.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 448 entries, 33 to 591
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Polymer              448 non-null    object 
 1   SMILES descriptor 1  448 non-null    object 
 2   Conductivity at 30C  201 non-null    float64
 3   Conductivity at 60C  262 non-null    float64
dtypes: float64(2), object(2)
memory usage: 17.5+ KB


In [29]:
# Discard rows that have neither conductivity measurement; rows with at least
# one of the two are kept.
db = db.dropna(
    subset=['Conductivity at 30C', 'Conductivity at 60C'],
    how='all',
)

In [30]:
# Confirm the row count after removing rows with no conductivity data.
db.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 271 entries, 33 to 591
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Polymer              271 non-null    object 
 1   SMILES descriptor 1  271 non-null    object 
 2   Conductivity at 30C  201 non-null    float64
 3   Conductivity at 60C  262 non-null    float64
dtypes: float64(2), object(2)
memory usage: 10.6+ KB


In [31]:
# Order best-first: highest 30C conductivity, ties broken by 60C.
# Rows with a missing sort value are placed last (pandas default).
db = db.sort_values(
    ['Conductivity at 30C', 'Conductivity at 60C'],
    ascending=False,
)

In [32]:
# Same rows as before the sort; only the index order changes.
db.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 271 entries, 227 to 459
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Polymer              271 non-null    object 
 1   SMILES descriptor 1  271 non-null    object 
 2   Conductivity at 30C  201 non-null    float64
 3   Conductivity at 60C  262 non-null    float64
dtypes: float64(2), object(2)
memory usage: 10.6+ KB


In [33]:
# Separate the PEO reference rows from `db` and derive the benchmark numbers.
# regex=False: "PEO" is matched literally; na=False: a missing polymer name is
# treated as "not PEO" instead of breaking the boolean mask.
peo_mask = db['Polymer'].str.contains("PEO", regex=False, na=False)

# Every PEO row label (including rows missing one measurement) is removed from
# `db` below.
every_peo = db.index[peo_mask]

# Only PEO rows measured at BOTH temperatures feed the statistics. Sorting
# best-first (highest 30C, then 60C) puts the top performer in the first row.
peo = (
    db.loc[peo_mask]
    .dropna(axis=0, how='any', subset=['Conductivity at 30C', 'Conductivity at 60C'])
    .sort_values(by=['Conductivity at 30C', 'Conductivity at 60C'], ascending=False)
)
if peo.empty:
    raise ValueError(
        "No PEO rows with both conductivities found in `db`; "
        "was this cell already run (it removes the PEO rows from `db`)?"
    )

# Best PEO system = first row of the sorted table, instead of a hard-coded row
# label that silently goes stale when the data changes.
benchmark = peo.iloc[0]

db = db.drop(every_peo)
print(peo)

# Means are taken over log10 values (the columns were log-transformed earlier).
typical_conductivity_at_30C = peo['Conductivity at 30C'].mean()
typical_conductivity_at_60C = peo['Conductivity at 60C'].mean()

print("The best performing PEO system, ", benchmark)
print(f"The typical PEO has conductivities, {typical_conductivity_at_30C} and {typical_conductivity_at_60C}\nat 30 and 60C respectively.")

      Polymer SMILES descriptor 1  Conductivity at 30C  Conductivity at 60C
Index                                                                      
392       PEO                 COC            -5.062482            -3.558844
279       PEO                 COC            -6.251812            -4.204120
391       PEO                 COC            -6.378824            -3.970275
378       PEO                 COC            -6.596879            -5.344862
278       PEO                 COC            -6.879426            -3.855943
276       PEO                 COC            -6.892790            -3.590928
285       PEO                 COC            -6.920819            -3.352546
269       PEO                 COC            -6.924453            -5.261219
455       PEO                 COC            -6.946922            -4.454693
267       PEO                 COC            -6.946922            -4.514279
466       PEO                 COC            -7.015023            -4.417937
266       PE

In [34]:
# Plain-text dump of `db` once the PEO rows have been dropped.
print(db)

                                                 Polymer  \
Index                                                      
227    poly(1,4-bis(bromomethyl)-2,3,5,6-teramethylbe...   
69                          poly carbosilane dicarbonate   
590    ï»¿poly[bis(2-(2-methoxyethoxy)ethoxy)phosphaz...   
589    ï»¿poly[bis(2-(2-methoxyethoxy)ethoxy)phosphaz...   
588    ï»¿poly[bis(2-(2-methoxyethoxy)ethoxy)phosphaz...   
...                                                  ...   
205                           poly(N-methylethylenimine)   
208                           poly(N-methylethylenimine)   
150                            Poly (ethylene succinate)   
145                             poly(vinylene carbonate)   
144                             poly(vinylene carbonate)   

                                     SMILES descriptor 1  Conductivity at 30C  \
Index                                                                           
227    C1(=C(C)C(C)=CC(C)=C1(C))COCP(=S)(CC1=CC=CC=C1... 

In [35]:
# Row and non-null counts for the non-PEO candidates.
db.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 188 entries, 227 to 144
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Polymer              188 non-null    object 
 1   SMILES descriptor 1  188 non-null    object 
 2   Conductivity at 30C  158 non-null    float64
 3   Conductivity at 60C  184 non-null    float64
dtypes: float64(2), object(2)
memory usage: 7.3+ KB


In [36]:
# One row per SMILES string. The first occurrence is kept (pandas default), and
# because `db` was sorted by descending conductivity earlier, that is the
# best-performing measurement for each structure.
db = db.drop_duplicates(subset='SMILES descriptor 1')

In [38]:
# Row and non-null counts after keeping one row per SMILES string.
db.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52 entries, 227 to 150
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Polymer              52 non-null     object 
 1   SMILES descriptor 1  52 non-null     object 
 2   Conductivity at 30C  43 non-null     float64
 3   Conductivity at 60C  52 non-null     float64
dtypes: float64(2), object(2)
memory usage: 2.0+ KB


In [39]:
# Label: True only when a polymer exceeds the mean PEO conductivity at both
# temperatures.
# NOTE(review): a missing value compares as False, so rows with no 30C
# measurement are labelled False regardless of their 60C value — confirm that
# this is the intended treatment of missing data.
above_peo_at_60C = db['Conductivity at 60C'].gt(typical_conductivity_at_60C)
above_peo_at_30C = db['Conductivity at 30C'].gt(typical_conductivity_at_30C)
Y = above_peo_at_60C & above_peo_at_30C
print(Y.value_counts(normalize=True))

True     0.673077
False    0.326923
dtype: float64


In [40]:
# Attach the label as a new last column, rebinding `db` to the extended frame.
db = db.assign(Y=Y)

In [791]:
# Plain-text dump of `db` including the new 'Y' label column.
print(db)

                                                 Polymer  \
Index                                                      
227    poly(1,4-bis(bromomethyl)-2,3,5,6-teramethylbe...   
69                          poly carbosilane dicarbonate   
590    ï»¿poly[bis(2-(2-methoxyethoxy)ethoxy)phosphaz...   
58                            poly carbosilane carbonate   
558                                    polyepoxide ether   
556                                      polyvinyl ether   
559             oxymethylene-linked poly(ethylene oxide)   
537                          poly (allyl glycidyl ether)   
547    ï»¿poly(methoxy polyethylene glycol monomethac...   
541     poly (allyl glycidyl ether tri(ethylene glycol))   
555                                      polyvinyl ether   
105                      Poly(1,6-hexamethylene adipate)   
540      poly (allyl glycidyl ether di(ethylene glycol))   
552                                   polyacrylate ether   
94                                Poly(e

In [792]:
# Check that 'Y' was added and has no missing values.
db.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 52 entries, 227 to 150
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Polymer              52 non-null     object 
 1   SMILES descriptor 1  52 non-null     object 
 2   Conductivity at 30C  43 non-null     float64
 3   Conductivity at 60C  52 non-null     float64
 4   Y                    52 non-null     bool   
dtypes: bool(1), float64(2), object(2)
memory usage: 2.1+ KB


In [43]:
# Encode the boolean label as integers: True -> 1, False -> 0.
db['Y'] = db['Y'].astype('int64')

In [44]:
# Model input table: the SMILES string and its binary label only.
data = db.loc[:, ['SMILES descriptor 1', 'Y']]
print(data)

                                     SMILES descriptor 1  Y
Index                                                      
227    C1(=C(C)C(C)=CC(C)=C1(C))COCP(=S)(CC1=CC=CC=C1...  1
69                 C[Si](CCC1COC(=O)O1)(CCC2COC(=O)O2)CC  1
590                              P(OCCOCCOC)(OCCOCCOC)=N  1
58                             C[Si](C)(CCC1COC(=O)O1)CC  1
558                              C(COCCOCCOCCOCCOCCOC)OC  1
556                                C(OCCOCCOCCOCCOCCOC)C  1
559                        COCCOCCOCCOCCOCCOCCOCCOCCOCOC  1
537                                          C(COCC=C)OC  1
547    CC(C(=O)OCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOCCOC...  1
541                                C(COCCCSCCOCCOCCOC)OC  1
555                                   C(OCCOCCOCCOCCOC)C  1
105                               COC(=O)CCCCC(=O)OCCCCC  1
540                                   C(COCCCSCCOCCOC)OC  1
552                            C(C(=O)OCCOCCOCCOCCOCCOC)  1
94                                    CO

In [45]:
# Preview of the rows in shuffled order; the result is only displayed, `data`
# itself is unchanged. The seed is fixed so the displayed output is the same on
# every run.
data.sample(frac=1, random_state=40)

Unnamed: 0_level_0,SMILES descriptor 1,Y
Index,Unnamed: 1_level_1,Unnamed: 2_level_1
58,C[Si](C)(CCC1COC(=O)O1)CC,1
559,COCCOCCOCCOCCOCCOCCOCCOCCOCOC,1
87,COC(=O)CC(=O)OC,1
185,CC(C(=O)OCCO),0
51,CC(C(=O)OCCCC),1
101,COC(=O)CCCCC(=O)OCCC,1
541,C(COCCCSCCOCCOCCOC)OC,1
244,[Si](C)(CCSCCCC(=O)NCCN1C=NC=C1)O,0
154,C1C(N(C(=O)C)C(=O)N1(C(=O)C)),1
259,[Si](C)(CCSCCCCCCCN1C=NC=C1)O,1


In [46]:
# Save the SMILES/label table; index=False leaves the row index out of the file.
data.to_csv("ml_ready.csv", index=False)

In [47]:
# Reload the saved table. The file has no index column, so `data` now carries a
# fresh default integer index instead of the original 'Index' labels.
data = pd.read_csv("ml_ready.csv")

In [48]:
# (rows, columns) of the reloaded table.
data.shape

(52, 2)

In [49]:
# Shuffle all rows (frac=1) with a fixed seed so the train/test split taken from
# this order is reproducible.
data = data.sample(frac=1, random_state=40)

In [50]:
# Training split: the first 39 shuffled rows (75% of the 52 rows), by position.
training_set = data.iloc[:39]

In [51]:
# Class balance of the training split, as fractions.
training_set['Y'].value_counts(normalize=True)

1    0.666667
0    0.333333
Name: Y, dtype: float64

In [52]:
# Test split: every shuffled row after the first 39, selected by position.
testing_set = data.iloc[39:]

In [53]:
# Class balance of the test split, as fractions.
testing_set['Y'].value_counts(normalize=True)

1    0.692308
0    0.307692
Name: Y, dtype: float64

In [54]:
# Save the training split; index=False leaves the row index out of the file.
training_set.to_csv("training_set.csv", index=False)

In [55]:
# Save the test split with the same two-column layout as training_set.csv.
# The index here is only the post-shuffle row number of the reloaded table, so
# writing it would add an unnamed extra column that the training file lacks.
testing_set.to_csv("testing_set.csv", index=False)