Set Up Environment and Import Family Data

In [44]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.feature_selection import SelectFromModel, RFE, RFECV
from sklearn.linear_model import ElasticNetCV, Lasso, LassoCV
from sklearn.metrics import r2_score
from abess import LinearRegression
from kennard_stone import train_test_split
# from genetic_selection import GeneticSelectionCV
import pandas as pd
import numpy as np

In [68]:
def extractDragonDescriptors(family_Y_df, descriptorFileString):
    """Load a tab-separated descriptor table and re-index it like the label frame.

    The file is read with its first column as the index, then the rows are
    re-labelled *by position* with ``family_Y_df.index`` (the file's own index
    is discarded).  Literal ``'na'`` cells are treated as missing, the first
    remaining column is skipped, and every column that still contains a
    missing value is removed.

    Parameters
    ----------
    family_Y_df : pandas.DataFrame
        Frame whose index labels the rows of the result.  It must have the
        same number of rows as the descriptor file, in the same order
        (pandas raises if the row counts differ).
    descriptorFileString : str
        Path to the tab-separated descriptor file.

    Returns
    -------
    pandas.DataFrame
        Descriptor columns without any missing values, indexed like
        ``family_Y_df``.
    """
    family_X = pd.read_csv(descriptorFileString, sep='\t', index_col=0)
    colsX = family_X.columns
    family_X_resetIndex = (
        pd.DataFrame(family_X.to_numpy(), index=family_Y_df.index, columns=colsX)
        # `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
        .replace('na', np.nan)
        # `to_numpy()` on a mixed table yields object dtype; restore numeric dtypes
        # explicitly instead of relying on `replace` to downcast implicitly.
        .infer_objects()
    )
    # colsX[0] is skipped — presumably a non-descriptor column such as a
    # molecule name; TODO confirm against the descriptor files.
    return family_X_resetIndex[colsX[1:]].dropna(axis=1, how='any')

In [39]:
# Read every family's label table in one pass; the first CSV column becomes the index.
(
    triph_Y,
    porph_Y,
    pheno_Y,
    indol_Y,
    couma_Y,
    carba_Y,
    diphe_Y,
) = (
    pd.read_csv(label_csv, index_col=0)
    for label_csv in (
        'Data/exactPLS/triphenylamines.csv',
        'Data/exactPLS/porphyrins.csv',
        'Data/exactPLS/phenothiazines.csv',
        'Data/exactPLS/indolines.csv',
        'Data/exactPLS/coumarins.csv',
        'Data/exactPLS/carbazoles.csv',
        'Data/exactPLS/diphenylamines.csv',
    )
)

Unnamed: 0_level_0,Molecule SMILE,PCE
S.No,Unnamed: 1_level_1,Unnamed: 2_level_1
1,CCCCCCC(Cc1cc(sc1c1cc2c(s1)c1sc(cc1c1c2non1)c1...,6.18
2,N#C/C(=C\c1ccc(cc1)/C=C/c1cccc(c1)N(c1ccccc1)c...,2.23
3,N#CC(=Cc1ccc(cc1)C=Cc1ccc(cc1)N(c1ccccc1)c1ccc...,4.12
4,N#CC(=Cc1cccc(c1)N(c1ccccc1)c1ccccc1)C(=O)O,1.27
5,N#CC(=Cc1ccc(cc1)N(c1ccccc1)c1ccccc1)C(=O)O,2.92
...,...,...
240,Oc1nc2ccc(cc2nc1O)c1ccc(cc1)N(c1ccccc1)c1ccccc1,1.18
241,Oc1nc2ccc(cc2nc1O)c1ccc(s1)c1ccc(cc1)N(c1ccccc...,1.58
242,CCCCC(COc1cc(OCC(CCCC)CC)ccc1c1ccc(cc1)N(c1ccc...,5.87
243,CCCCC(CC1(CC(CCCC)CC)c2cc(sc2c2c1cc(s2)c1ccc(c...,6.69


In [71]:
# Build each family's descriptor matrix, aligned to the index of its label table.
(
    triph_X,
    porph_X,
    pheno_X,
    indol_X,
    couma_X,
    carba_X,
    diphe_X,
) = (
    extractDragonDescriptors(label_frame, descriptor_txt)
    for label_frame, descriptor_txt in (
        (triph_Y, 'Data/exactPLS/desc_triphenylamines.txt'),
        (porph_Y, 'Data/exactPLS/desc_porphyrins.txt'),
        (pheno_Y, 'Data/exactPLS/desc_phenothiazines.txt'),
        (indol_Y, 'Data/exactPLS/desc_indolines.txt'),
        (couma_Y, 'Data/exactPLS/desc_coumarins.txt'),
        (carba_Y, 'Data/exactPLS/desc_carbazoles.txt'),
        (diphe_Y, 'Data/exactPLS/desc_diphenylamines.txt'),
    )
)

Unnamed: 0_level_0,MW,AMW,Se,Sp,Si,Me,Mp,Mi,GD,nAT,...,TPSA(NO),TPSA(Tot),LOGP99,LOGPcons,ESOL,SAacc,SAdon,Vx,VvdwMG,SAscore
S.No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,926.16,10.406292,92.7410,67.8244,100.5316,1.042034,0.762072,1.129569,0.036210,89,...,71.90,71.90,13.64730,10.274202,-12.507319,236.040349,42.683343,947.757475,391.802244,6.966613
2,974.70,7.273881,131.3807,94.8079,148.7023,0.980453,0.707522,1.109719,0.032712,134,...,71.90,71.90,15.65250,11.648990,-12.829856,101.399329,42.683343,1272.524917,524.903654,6.710840
3,1323.28,6.892083,187.8383,132.2403,213.9269,0.978324,0.688752,1.114203,0.023684,192,...,97.68,97.68,23.22050,16.095679,-18.089831,134.970388,42.683343,1770.431894,728.963891,7.544736
5,992.62,7.877937,124.2827,92.2509,139.2077,0.986371,0.732150,1.104823,0.032473,126,...,97.68,97.68,15.38834,10.642230,-13.174928,134.970388,42.683343,1241.029900,511.995861,7.051546
7,1230.14,6.910899,174.0857,122.9504,198.1830,0.978010,0.690733,1.113388,0.025281,178,...,75.14,75.14,21.56210,15.721231,-16.562790,104.523643,42.683343,1656.644518,682.329721,7.341659
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,837.30,8.907447,93.3449,73.6312,102.5574,0.993031,0.783311,1.091036,0.038798,94,...,75.14,75.14,12.12650,9.051484,-11.076743,104.523643,42.683343,1001.295681,413.744132,6.521043
297,853.30,8.982105,94.6722,74.0857,103.7668,0.996549,0.779849,1.092282,0.038075,95,...,95.37,95.37,11.83210,8.785619,-10.983116,147.206986,85.366686,1011.046512,417.740374,6.518551
298,888.35,8.973232,98.4467,77.6369,108.0557,0.994411,0.784211,1.091472,0.036058,99,...,98.93,98.93,12.41988,8.857075,-11.128602,135.582999,42.683343,1066.661130,440.533250,6.635670
299,907.35,8.983663,100.8831,78.3016,110.3914,0.998843,0.775263,1.092984,0.035431,101,...,112.44,112.44,11.98090,8.674081,-11.011094,172.351913,85.366686,1076.727575,444.658842,6.634920


In [72]:
# Show the (rows, columns) shape of every family's descriptor matrix.
tuple(
    descriptors.shape
    for descriptors in (triph_X, porph_X, pheno_X, indol_X, couma_X, carba_X, diphe_X)
)

((229, 851),
 (281, 723),
 (207, 673),
 (160, 554),
 (56, 630),
 (179, 603),
 (35, 481))

Divide Family Datasets Into Train and Test Sets (Kennard-Stone Split, 30% Test)

In [81]:
from kennard_stone import train_test_split

# Kennard-Stone split of each family into train/test descriptors and PCE targets,
# holding out 30% of the samples. The diphenylamine family is not split here.
(
    (triph_X_train, triph_X_test, triph_Y_train, triph_Y_test),
    (porph_X_train, porph_X_test, porph_Y_train, porph_Y_test),
    (pheno_X_train, pheno_X_test, pheno_Y_train, pheno_Y_test),
    (indol_X_train, indol_X_test, indol_Y_train, indol_Y_test),
    (couma_X_train, couma_X_test, couma_Y_train, couma_Y_test),
    (carba_X_train, carba_X_test, carba_Y_train, carba_Y_test),
) = (
    train_test_split(descriptors, labels['PCE'], test_size=0.3)
    for descriptors, labels in (
        (triph_X, triph_Y),
        (porph_X, porph_Y),
        (pheno_X, pheno_Y),
        (indol_X, indol_Y),
        (couma_X, couma_Y),
        (carba_X, carba_Y),
    )
)

((196, 723), (69, 851))

In [85]:
# Show the training-set shapes for the porphyrin and triphenylamine families.
tuple(train_frame.shape for train_frame in (porph_X_train, triph_X_train))

((196, 723), (160, 851))

Remove Intercorrelated Descriptors

In [95]:
def returnNextRow(rowDf, to_drop):
    """Pick the column holding the largest usable value in the first row of ``rowDf``.

    Parameters
    ----------
    rowDf : pandas.DataFrame
        Frame whose first row is scanned (``vWSPFeatureSelect`` passes a
        one-row slice of the correlation matrix).
    to_drop : iterable
        Column labels that must not be returned.

    Returns
    -------
    The label of the first column with the strictly largest non-missing value
    that is not listed in ``to_drop``, or the string ``'None'`` when no column
    qualifies.
    """
    excluded = set(to_drop)
    # Floor kept from the original implementation: values <= -200 are never chosen.
    best_value = -200
    nextRow = 'None'
    # `.iloc[0]` reads the first row by position; plain `[0]` on a label-indexed
    # Series relies on a deprecated positional fallback.
    for column, value in rowDf.iloc[0].items():
        # Missing values are skipped explicitly: `value != np.nan` is always True,
        # so it never filtered anything (and `np.NaN` no longer exists in NumPy 2).
        if column in excluded or pd.isna(value):
            continue
        if value > best_value:
            best_value = value
            nextRow = column
    return nextRow

def getTo_dropOfRow(rowDf, to_drop=None, threshold=0.95):
    """Append every column whose first-row value exceeds ``threshold`` to ``to_drop``.

    Parameters
    ----------
    rowDf : pandas.DataFrame
        Frame whose first row is scanned (``vWSPFeatureSelect`` passes a
        one-row slice of the correlation matrix).
    to_drop : list, optional
        Labels collected so far.  A list passed in is extended in place and
        returned; when omitted, a fresh list is created for this call.
    threshold : float, default 0.95
        Values strictly greater than this mark a column for removal.

    Returns
    -------
    list
        ``to_drop`` with the newly flagged column labels appended in column order.
    """
    # A `[]` default would be shared by every call and accumulate labels across them.
    if to_drop is None:
        to_drop = []
    already_listed = set(to_drop)
    # `.iloc[0]` reads the first row by position; plain `[0]` on a label-indexed
    # Series relies on a deprecated positional fallback.
    for column, value in rowDf.iloc[0].items():
        # Missing values need no separate check: NaN > threshold is always False.
        if value > threshold and column not in already_listed:
            to_drop.append(column)
            already_listed.add(column)
    return to_drop

def vWSPFeatureSelect(rowName, corr_matrix, to_drop=None, thresh=0.95):
    """Walk the correlation matrix from ``rowName`` and collect the columns to drop.

    For the current row, every column whose value exceeds ``thresh`` is added
    to ``to_drop`` (via ``getTo_dropOfRow``); the walk then moves to the row
    named by ``returnNextRow`` and repeats until that helper returns ``'None'``.

    Parameters
    ----------
    rowName : label
        Row of ``corr_matrix`` to start from; ``'None'`` returns immediately.
    corr_matrix : pandas.DataFrame
        Correlation matrix to walk.  ``vWSP`` passes the strictly upper
        triangle, so each step moves to a later column and the walk ends.
    to_drop : list, optional
        Labels collected so far; a fresh list is used when omitted.
    thresh : float, default 0.95
        Threshold applied to *every* visited row.

    Returns
    -------
    list
        Column labels selected for removal.

    Raises
    ------
    ValueError
        If the walk returns to a row without having flagged anything new,
        which means it would never finish.
    """
    # A `[]` default would be shared by every call and accumulate labels across them.
    if to_drop is None:
        to_drop = []
    # Iterating instead of recursing keeps the walk independent of Python's
    # recursion limit, which a chain through several hundred columns can approach.
    seen_states = set()
    while rowName != 'None':
        # `to_drop` only grows, so the same row with the same number of flagged
        # columns means every later step would repeat exactly.
        state = (rowName, len(to_drop))
        if state in seen_states:
            raise ValueError(
                "vWSPFeatureSelect cannot terminate: row %r was revisited without "
                "progress; corr_matrix should be strictly upper-triangular" % (rowName,)
            )
        seen_states.add(state)
        current_row = corr_matrix.loc[[rowName]]
        # Forward the caller's threshold on every step; the recursive version
        # reset it to 0.95 after the first row.
        to_drop = getTo_dropOfRow(current_row, to_drop, threshold=thresh)
        rowName = returnNextRow(current_row, to_drop)
    return to_drop
    
def vWSP(X_train, threshold=0.95, seed='MW'):
    """Drop inter-correlated descriptor columns from ``X_train``.

    The absolute correlation matrix of ``X_train`` is reduced to its strictly
    upper triangle and handed to ``vWSPFeatureSelect``, starting at the
    ``seed`` column; the columns it returns are removed.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Descriptor matrix (samples in rows, descriptors in columns).
    threshold : float, default 0.95
        Absolute-correlation threshold passed to ``vWSPFeatureSelect``.
    seed : label, default 'MW'
        Column from which the selection walk starts.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X_train`` without the flagged columns.

    Raises
    ------
    KeyError
        If ``seed`` is not a column of the correlation matrix.
    """
    cor_matrix = X_train.corr().abs()
    if seed not in cor_matrix.index:
        raise KeyError(
            "seed descriptor %r is not a column of the correlation matrix" % (seed,)
        )
    # Keep only entries above the diagonal so each pair is considered once and
    # a column is never compared with itself.
    above_diagonal = np.triu(np.ones(cor_matrix.shape), k=1).astype(bool)
    upper_tri = cor_matrix.where(above_diagonal)
    to_drop = vWSPFeatureSelect(seed, corr_matrix=upper_tri, to_drop=[], thresh=threshold)
    return X_train.drop(to_drop, axis=1)

In [96]:
# Remove inter-correlated descriptors (threshold 0.95) from each family's training set.
(
    porph_X_train_rmCorr,
    triph_X_train_rmCorr,
    pheno_X_train_rmCorr,
    indol_X_train_rmCorr,
    couma_X_train_rmCorr,
    carba_X_train_rmCorr,
) = (
    vWSP(train_frame, threshold=0.95)
    for train_frame in (
        porph_X_train,
        triph_X_train,
        pheno_X_train,
        indol_X_train,
        couma_X_train,
        carba_X_train,
    )
)

In [94]:
# Show the training-set shapes that remain after dropping correlated descriptors.
tuple(
    reduced.shape
    for reduced in (porph_X_train_rmCorr, triph_X_train_rmCorr, pheno_X_train_rmCorr)
)

((196, 674), (160, 802), (144, 620))