In [13]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.feature_selection import SelectFromModel, RFE, RFECV
from sklearn.linear_model import ElasticNet, ElasticNetCV, Lasso, LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from kennard_stone import train_test_split
from sklearn.model_selection import train_test_split as sktrain_test_split
# from genetic_selection import GeneticSelectionCV
import pandas as pd
import numpy as np

In [2]:
def extractDragonDescriptors(family_Y_df, descriptorFileString):
    family_X = pd.read_csv(descriptorFileString, sep='\t', index_col=0)
    colsX = family_X.columns
    family_X_resetIndex = pd.DataFrame(family_X.to_numpy(), index=family_Y_df.index, columns=family_X.columns).replace('na', np.NaN)
    return family_X_resetIndex[colsX[1:]].dropna(axis=1, how='any')

In [8]:
from sklearn.feature_selection import VarianceThreshold

def remove_low_variance(input_data, threshold=0.1):
    selection = VarianceThreshold(threshold)
    selection.fit(input_data)
    return input_data[input_data.columns[selection.get_support(indices=True)]]

In [3]:
triph_Y = pd.read_csv('Data/exactPLS/triphenylamines.csv', index_col=0)
porph_Y = pd.read_csv('Data/exactPLS/porphyrins.csv', index_col=0)
pheno_Y = pd.read_csv('Data/exactPLS/phenothiazines.csv', index_col=0)
indol_Y = pd.read_csv('Data/exactPLS/indolines.csv', index_col=0)
couma_Y = pd.read_csv('Data/exactPLS/coumarins.csv', index_col=0)
carba_Y = pd.read_csv('Data/exactPLS/carbazoles.csv', index_col=0)
diphe_Y = pd.read_csv('Data/exactPLS/diphenylamines.csv', index_col=0)

In [4]:
triph_X = extractDragonDescriptors(triph_Y, 'Data/exactPLS/desc_triphenylamines.txt')
porph_X = extractDragonDescriptors(porph_Y, 'Data/exactPLS/desc_porphyrins.txt')
pheno_X = extractDragonDescriptors(pheno_Y, 'Data/exactPLS/desc_phenothiazines.txt')
indol_X = extractDragonDescriptors(indol_Y, 'Data/exactPLS/desc_indolines.txt')
couma_X = extractDragonDescriptors(couma_Y, 'Data/exactPLS/desc_coumarins.txt')
carba_X = extractDragonDescriptors(carba_Y, 'Data/exactPLS/desc_carbazoles.txt')
diphe_X = extractDragonDescriptors(diphe_Y, 'Data/exactPLS/desc_diphenylamines.txt')

In [5]:
triph_X.shape,porph_X.shape,pheno_X.shape,indol_X.shape,couma_X.shape,carba_X.shape,diphe_X.shape

((229, 851),
 (281, 723),
 (207, 673),
 (160, 554),
 (56, 630),
 (179, 603),
 (35, 481))

In [19]:
triph_X_var = remove_low_variance(triph_X, 0.01)
porph_X_var = remove_low_variance(porph_X, 0.01)
pheno_X_var = remove_low_variance(pheno_X, 0.01)
indol_X_var = remove_low_variance(indol_X, 0.01)
couma_X_var = remove_low_variance(couma_X, 0.01)
carba_X_var = remove_low_variance(carba_X, 0.01)
diphe_X_var = remove_low_variance(diphe_X, 0.01)

triph_X_var.shape,porph_X_var.shape,pheno_X_var.shape,indol_X_var.shape,couma_X_var.shape,carba_X_var.shape,diphe_X_var.shape

((229, 657),
 (281, 602),
 (207, 568),
 (160, 492),
 (56, 590),
 (179, 512),
 (35, 439))

In [20]:
def returnNextRow(rowDf, to_drop):
    max = -200
    nextRow='None'
    for i in range(len(rowDf.columns)):
        if (rowDf[rowDf.columns[i]][0] > max) and (rowDf.columns[i] not in to_drop) and (rowDf[rowDf.columns[i]][0] != np.NaN):
            max = rowDf[rowDf.columns[i]][0]
            nextRow = rowDf.columns[i]
    return nextRow

def getTo_dropOfRow(rowDf, to_drop=[], threshold=0.95):
    for i in range(len(rowDf.columns)):
        if (rowDf[rowDf.columns[i]][0] > threshold) and (rowDf[rowDf.columns[i]][0] != np.NaN) and (rowDf.columns[i] not in to_drop):
            to_drop.append(rowDf.columns[i])
    return to_drop

def vWSPFeatureSelect(rowName,corr_matrix,to_drop=[],thresh=0.95):
    if rowName=='None':
        return to_drop
    else:
        to_DropThisRow = getTo_dropOfRow(corr_matrix.loc[[rowName]], to_drop, threshold=thresh) #array of column names
        nextRow = returnNextRow(corr_matrix.loc[[rowName]], to_DropThisRow)
        updateTo_drop = to_DropThisRow
        return vWSPFeatureSelect(rowName=nextRow, corr_matrix=corr_matrix,to_drop=updateTo_drop, thresh=0.95)
    
def vWSP(X_train, threshold=0.95):
    cor_matrix = X_train.corr().abs()
    upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
    to_drop=vWSPFeatureSelect('MW', corr_matrix=upper_tri, to_drop=[], thresh=threshold) #MW chosen as seed
    X_train_rm = X_train.drop(to_drop, axis=1)
    return X_train_rm

In [42]:
triph_X_sansCorr = vWSP(triph_X_var, threshold=0.5)
porph_X_sansCorr = vWSP(porph_X_var, threshold=0.5)
pheno_X_sansCorr = vWSP(pheno_X_var, threshold=0.5)
indol_X_sansCorr = vWSP(indol_X_var, threshold=0.5)
couma_X_sansCorr = vWSP(couma_X_var, threshold=0.5)
carba_X_sansCorr = vWSP(carba_X_var, threshold=0.5)
diphe_X_sansCorr = vWSP(diphe_X_var, threshold=0.5)

triph_X_train_sansCorr.shape,porph_X_train_sansCorr.shape,pheno_X_train_sansCorr.shape,indol_X_train_sansCorr.shape,couma_X_train_sansCorr.shape,carba_X_train_sansCorr.shape,diphe_X_train_sansCorr.shape

((229, 488),
 (281, 454),
 (207, 414),
 (160, 346),
 (56, 434),
 (179, 398),
 (35, 265))

In [25]:
def multipleSplit(input_X, input_Y, test_size=0.3):
    familySplits=[]
    # 0 --> input_X_train, 1 --> input_X_test, 2 --> input_Y_train, 3 --> input_Y_test
    for i in range(10):
        familySplits.append(sktrain_test_split(input_X, input_Y['PCE'], test_size=0.3, random_state=i))
    return familySplits

In [43]:
triph_split_sets = multipleSplit(triph_X_sansCorr, triph_Y)
porph_split_sets = multipleSplit(porph_X_sansCorr, porph_Y)
pheno_split_sets = multipleSplit(pheno_X_sansCorr, pheno_Y)
indol_split_sets = multipleSplit(indol_X_sansCorr, indol_Y)
couma_split_sets = multipleSplit(couma_X_sansCorr, couma_Y)
carba_split_sets = multipleSplit(carba_X_sansCorr, carba_Y)
diphe_split_sets = multipleSplit(diphe_X_sansCorr, diphe_Y)

triph_split_sets[0][0].shape


(160, 488)

In [48]:
def trainOneRF(X_train_X_test_Y_train_Y_test, n_trees=500):
    model = RandomForestRegressor(n_estimators=n_trees)
    model.fit(X_train_X_test_Y_train_Y_test[0], X_train_X_test_Y_train_Y_test[2])
    predict_train = model.predict(X_train_X_test_Y_train_Y_test[0])
    predict_test = model.predict(X_train_X_test_Y_train_Y_test[1])
    r2_train = r2_score(X_train_X_test_Y_train_Y_test[2],predict_train)
    r2_test= r2_score(X_train_X_test_Y_train_Y_test[3],predict_test)
    return r2_train, r2_test

def applyRF(multiSplit):
    r2_train_scores=[]
    r2_test_scores=[]
    for i in range(len(multiSplit)):
        r2_both = trainOneRF(multiSplit[i])
        r2_train_scores.append(r2_both[0])
        r2_test_scores.append(r2_both[1])
    print(f"All R2 Train:\n{r2_train_scores}\n\nAverage: {np.average(r2_train_scores)}, Std. Dev: {np.std(r2_train_scores)}, Variance {np.var(r2_train_scores)}\n\nAll R2 Test:\n{r2_test_scores}\n\nAverage: {np.average(r2_test_scores)}, Std. Dev: {np.std(r2_test_scores)}, Variance {np.var(r2_test_scores)}")
        

In [49]:
applyRF(triph_split_sets)

All R2 Train:
[0.9076208040824673, 0.9155136510916342, 0.9300335308316516, 0.9220246300225923, 0.9263571415544483, 0.9347266712221804, 0.9151309604955882, 0.925525721605722, 0.9248930169057817, 0.9328199548755177]

Average: 0.9234646082687584, Std. Dev: 0.008111286443286198, Variance 6.579296776503846e-05

All R2 Test:
[0.5390656829465347, 0.5614405065825772, 0.3008020110107522, 0.5355753382527924, 0.4170205031798476, 0.4014962951180394, 0.3761165785660019, 0.6042856450041034, 0.48359758989842405, 0.3736603979996891]

Average: 0.4593060548558762, Std. Dev: 0.09419577694742219, Variance 0.008872844394728515


In [50]:
applyRF(porph_split_sets)

All R2 Train:
[0.9475844914295827, 0.9474025802936217, 0.952190636132594, 0.9448290277704248, 0.9444824500597089, 0.9399562267345891, 0.9444803543722374, 0.9453175062188899, 0.9502176880123949, 0.9487491074979133]

Average: 0.9465210068521956, Std. Dev: 0.003292571529426981, Variance 1.0841027276393131e-05

All R2 Test:
[0.5920042529496501, 0.6070179773899489, 0.4608648404546757, 0.6837114246367801, 0.6195415042887735, 0.6932695309676967, 0.6819835941082599, 0.6417732777542344, 0.5976273931062218, 0.6265479556230344]

Average: 0.6204341751279274, Std. Dev: 0.06368802048081781, Variance 0.004056163952765069
