In [47]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.feature_selection import SelectFromModel, RFE, RFECV
from sklearn.linear_model import ElasticNet, ElasticNetCV, Lasso, LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVC
from sklearn.metrics import r2_score
from kennard_stone import train_test_split
from sklearn.model_selection import train_test_split as sktrain_test_split
from sklearn.pipeline import Pipeline
# from genetic_selection import GeneticSelectionCV
import pandas as pd
import numpy as np

In [2]:
def extractDragonDescriptors(family_Y_df, descriptorFileString):
    """Load a tab-separated descriptor table and re-index it like ``family_Y_df``.

    Parameters
    ----------
    family_Y_df : pandas.DataFrame
        Frame whose index becomes the row index of the result. Rows are paired
        with the descriptor file by position, so both must have the same
        number of rows.
    descriptorFileString : str
        Path of the tab-separated descriptor file. Its first column is read as
        the index and then replaced by ``family_Y_df.index``.

    Returns
    -------
    pandas.DataFrame
        All descriptor columns except the first remaining one, minus every
        column that contains a missing value (NaN or the literal string
        ``'na'``).
    """
    family_X = pd.read_csv(descriptorFileString, sep='\t', index_col=0)
    colsX = family_X.columns
    # Use np.nan: the capitalised alias np.NaN was removed in NumPy 2.0.
    family_X_resetIndex = pd.DataFrame(
        family_X.to_numpy(), index=family_Y_df.index, columns=colsX
    ).replace('na', np.nan)
    # colsX[0] is skipped on purpose, exactly as the slice colsX[1:] states.
    return family_X_resetIndex[colsX[1:]].dropna(axis=1, how='any')

In [3]:
from sklearn.feature_selection import VarianceThreshold

def remove_low_variance(input_data, threshold=0.1):
    """Return the columns of ``input_data`` that pass a VarianceThreshold filter.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Feature table to filter.
    threshold : float, default 0.1
        Value handed to ``VarianceThreshold``.

    Returns
    -------
    pandas.DataFrame
        ``input_data`` restricted to the columns the fitted selector supports.
    """
    selector = VarianceThreshold(threshold)
    selector.fit(input_data)
    kept_positions = selector.get_support(indices=True)
    kept_names = input_data.columns[kept_positions]
    return input_data[kept_names]

In [4]:
# Target tables, one per family; the first CSV column is used as the index.
(triph_Y, porph_Y, pheno_Y, indol_Y, couma_Y, carba_Y, diphe_Y) = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'Data/exactPLS/triphenylamines.csv',
        'Data/exactPLS/porphyrins.csv',
        'Data/exactPLS/phenothiazines.csv',
        'Data/exactPLS/indolines.csv',
        'Data/exactPLS/coumarins.csv',
        'Data/exactPLS/carbazoles.csv',
        'Data/exactPLS/diphenylamines.csv',
    )
]

In [5]:
# Descriptor tables, each re-indexed to match its family's target table.
(triph_X, porph_X, pheno_X, indol_X, couma_X, carba_X, diphe_X) = [
    extractDragonDescriptors(targets, descriptor_path)
    for targets, descriptor_path in (
        (triph_Y, 'Data/exactPLS/desc_triphenylamines.txt'),
        (porph_Y, 'Data/exactPLS/desc_porphyrins.txt'),
        (pheno_Y, 'Data/exactPLS/desc_phenothiazines.txt'),
        (indol_Y, 'Data/exactPLS/desc_indolines.txt'),
        (couma_Y, 'Data/exactPLS/desc_coumarins.txt'),
        (carba_Y, 'Data/exactPLS/desc_carbazoles.txt'),
        (diphe_Y, 'Data/exactPLS/desc_diphenylamines.txt'),
    )
]

In [6]:
# (rows, columns) of each descriptor table, in loading order.
tuple(
    frame.shape
    for frame in (triph_X, porph_X, pheno_X, indol_X, couma_X, carba_X, diphe_X)
)

((229, 851),
 (281, 723),
 (207, 673),
 (160, 554),
 (56, 630),
 (179, 603),
 (35, 481))

In [7]:
# Apply the variance filter (threshold 0.01) to every descriptor table.
(triph_X_var, porph_X_var, pheno_X_var, indol_X_var,
 couma_X_var, carba_X_var, diphe_X_var) = [
    remove_low_variance(descriptors, 0.01)
    for descriptors in (triph_X, porph_X, pheno_X, indol_X, couma_X, carba_X, diphe_X)
]

# Shapes after filtering, in the same order.
tuple(
    filtered.shape
    for filtered in (triph_X_var, porph_X_var, pheno_X_var, indol_X_var,
                     couma_X_var, carba_X_var, diphe_X_var)
)

((229, 657),
 (281, 602),
 (207, 568),
 (160, 492),
 (56, 590),
 (179, 512),
 (35, 439))

In [8]:
def returnNextRow(rowDf, to_drop):
    """Pick the column with the largest value in a one-row frame.

    Parameters
    ----------
    rowDf : pandas.DataFrame
        Frame whose first row holds the values to compare (for example one
        row of a correlation matrix).
    to_drop : iterable of column labels
        Columns that must not be selected.

    Returns
    -------
    label or str
        Label of the column whose first-row value is the largest among columns
        not in ``to_drop``; ties go to the left-most column. The string
        ``'None'`` is returned when no column qualifies (every candidate is
        NaN, excluded, or not greater than -200).
    """
    best_value = -200  # floor: only values strictly above it can be selected
    nextRow = 'None'
    excluded = set(to_drop)  # constant-time membership checks
    # Read the first row by position; label-based ``series[0]`` is deprecated
    # for non-integer indexes.
    first_row = rowDf.iloc[0].to_numpy()
    for column_name, value in zip(rowDf.columns, first_row):
        # NaN must be tested with notna(): ``value != nan`` is always True.
        if pd.notna(value) and value > best_value and column_name not in excluded:
            best_value = value
            nextRow = column_name
    return nextRow

def getTo_dropOfRow(rowDf, to_drop=None, threshold=0.95):
    """Add to ``to_drop`` every column whose first-row value exceeds ``threshold``.

    Parameters
    ----------
    rowDf : pandas.DataFrame
        Frame whose first row holds the values to test.
    to_drop : list, optional
        Labels collected so far. When given, the list is extended in place and
        returned; when omitted, a fresh list is created on every call (the
        previous ``[]`` default was shared between calls).
    threshold : float, default 0.95
        Values strictly greater than this are collected.

    Returns
    -------
    list
        ``to_drop`` with the newly collected labels appended in column order,
        without duplicates.
    """
    if to_drop is None:
        to_drop = []
    already_listed = set(to_drop)  # mirrors to_drop for fast lookups
    # Positional access to the first row; NaN entries never qualify.
    first_row = rowDf.iloc[0].to_numpy()
    for column_name, value in zip(rowDf.columns, first_row):
        if pd.notna(value) and value > threshold and column_name not in already_listed:
            to_drop.append(column_name)
            already_listed.add(column_name)
    return to_drop

def vWSPFeatureSelect(rowName, corr_matrix, to_drop=None, thresh=0.95):
    """Walk the correlation matrix row by row, collecting columns to drop.

    Starting at ``rowName``, each visited row contributes the columns whose
    value exceeds ``thresh`` (via ``getTo_dropOfRow``); the walk then moves to
    the column chosen by ``returnNextRow`` and stops when that helper returns
    the sentinel string ``'None'``.

    Parameters
    ----------
    rowName : label
        Row of ``corr_matrix`` to start from; ``'None'`` returns immediately.
    corr_matrix : pandas.DataFrame
        Matrix indexed by the same labels as its columns.
    to_drop : list, optional
        Labels collected so far; extended in place. A fresh list is created
        when omitted, so separate calls never share state.
    thresh : float, default 0.95
        Threshold forwarded to ``getTo_dropOfRow``.

    Returns
    -------
    list
        All collected column labels.
    """
    if to_drop is None:
        to_drop = []
    # Iterative form of the original tail recursion: same visiting order, but
    # no risk of exceeding the interpreter's recursion limit on wide tables.
    while rowName != 'None':
        current_row = corr_matrix.loc[[rowName]]  # one-row DataFrame
        to_drop = getTo_dropOfRow(current_row, to_drop, threshold=thresh)
        rowName = returnNextRow(current_row, to_drop)
    return to_drop
    
def vWSP(X_train, threshold=0.95, seed='MW'):
    """Drop correlated columns of ``X_train`` using ``vWSPFeatureSelect``.

    The absolute correlation matrix is reduced to its strict upper triangle
    (everything else becomes NaN) and walked from the ``seed`` column.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature table.
    threshold : float, default 0.95
        Absolute correlation above which a column is dropped.
    seed : label, default 'MW'
        Column at which the walk starts. The default keeps the previously
        hard-coded choice.

    Returns
    -------
    pandas.DataFrame
        ``X_train`` without the dropped columns.

    Raises
    ------
    KeyError
        If ``seed`` is not a row of the correlation matrix.
    """
    cor_matrix = X_train.corr().abs()
    if seed not in cor_matrix.index:
        # Same exception type the .loc lookup would raise, with a clearer message.
        raise KeyError(f"seed descriptor {seed!r} is not a column of the correlation matrix")
    # k=1 keeps only entries strictly above the diagonal.
    strict_upper = np.triu(np.ones(cor_matrix.shape), k=1).astype(bool)
    upper_tri = cor_matrix.where(strict_upper)
    to_drop = vWSPFeatureSelect(seed, corr_matrix=upper_tri, to_drop=[], thresh=threshold)
    X_train_rm = X_train.drop(to_drop, axis=1)
    return X_train_rm

In [90]:
# Remove correlated descriptors from every family (threshold 0.5).
(triph_X_sansCorr, porph_X_sansCorr, pheno_X_sansCorr, indol_X_sansCorr,
 couma_X_sansCorr, carba_X_sansCorr, diphe_X_sansCorr) = [
    vWSP(filtered, threshold=0.5)
    for filtered in (triph_X_var, porph_X_var, pheno_X_var, indol_X_var,
                     couma_X_var, carba_X_var, diphe_X_var)
]

# Shapes after the correlation filter, in the same order.
tuple(
    reduced.shape
    for reduced in (triph_X_sansCorr, porph_X_sansCorr, pheno_X_sansCorr, indol_X_sansCorr,
                    couma_X_sansCorr, carba_X_sansCorr, diphe_X_sansCorr)
)

((229, 486),
 (281, 444),
 (207, 407),
 (160, 345),
 (56, 424),
 (179, 395),
 (35, 257))

In [12]:
def multipleSplit(input_X, input_Y, test_size=0.3, n_splits=10):
    """Build several random train/test splits of one family.

    Parameters
    ----------
    input_X : pandas.DataFrame
        Feature table.
    input_Y : pandas.DataFrame
        Target table; only its ``'PCE'`` column is used.
    test_size : float, default 0.3
        Forwarded to the splitter. It was previously ignored in favour of a
        hard-coded 0.3.
    n_splits : int, default 10
        Number of splits; split ``i`` uses ``random_state=i``.

    Returns
    -------
    list
        One entry per split, each laid out as
        0 --> input_X_train, 1 --> input_X_test, 2 --> input_Y_train, 3 --> input_Y_test.
    """
    return [
        sktrain_test_split(input_X, input_Y['PCE'], test_size=test_size, random_state=seed)
        for seed in range(n_splits)
    ]

In [91]:
# Repeated train/test splits for every family.
(triph_split_sets, porph_split_sets, pheno_split_sets, indol_split_sets,
 couma_split_sets, carba_split_sets, diphe_split_sets) = [
    multipleSplit(descriptors, targets)
    for descriptors, targets in (
        (triph_X_sansCorr, triph_Y),
        (porph_X_sansCorr, porph_Y),
        (pheno_X_sansCorr, pheno_Y),
        (indol_X_sansCorr, indol_Y),
        (couma_X_sansCorr, couma_Y),
        (carba_X_sansCorr, carba_Y),
        (diphe_X_sansCorr, diphe_Y),
    )
]

# Shape of the training features in the first split.
triph_split_sets[0][0].shape


(160, 486)

In [129]:
def trainOneRF(X_train_X_test_Y_train_Y_test, n_trees=500):
    """Fit a depth-4 random forest on one split and score it with R2.

    Parameters
    ----------
    X_train_X_test_Y_train_Y_test : sequence
        Split laid out as (X_train, X_test, Y_train, Y_test).
    n_trees : int, default 500
        Number of trees in the forest.

    Returns
    -------
    tuple
        ``(r2_train, r2_test)``.
    """
    split = X_train_X_test_Y_train_Y_test
    forest = RandomForestRegressor(n_estimators=n_trees, max_depth=4)
    forest.fit(split[0], split[2])
    train_predictions = forest.predict(split[0])
    test_predictions = forest.predict(split[1])
    return r2_score(split[2], train_predictions), r2_score(split[3], test_predictions)

def applyMultiRF(multiSplit):
    """Run ``trainPPL_RF`` on every split and print a summary of the R2 scores.

    Parameters
    ----------
    multiSplit : sequence
        Splits, each laid out as (X_train, X_test, Y_train, Y_test).

    Prints all train and test R2 values followed by their average, standard
    deviation and variance. Returns nothing.
    """
    r2_train_scores = []
    r2_test_scores = []
    for one_split in multiSplit:
        scores = trainPPL_RF(one_split)
        r2_train_scores.append(scores[0])
        r2_test_scores.append(scores[1])
    print(f"All R2 Train:\n{r2_train_scores}\n\nAverage: {np.average(r2_train_scores)}, Std. Dev: {np.std(r2_train_scores)}, Variance {np.var(r2_train_scores)}\n\nAll R2 Test:\n{r2_test_scores}\n\nAverage: {np.average(r2_test_scores)}, Std. Dev: {np.std(r2_test_scores)}, Variance {np.var(r2_test_scores)}")

def trainPPL_RF(X_train_X_test_Y_train_Y_test):
    """Fit a feature-selection + random-forest pipeline on one split.

    The pipeline first keeps features chosen by ``SelectFromModel`` around a
    default ``RandomForestRegressor``, then fits a 500-tree, depth-6 forest.

    Parameters
    ----------
    X_train_X_test_Y_train_Y_test : sequence
        Split laid out as (X_train, X_test, Y_train, Y_test).

    Returns
    -------
    tuple
        ``(r2_train, r2_test)`` of the fitted pipeline.
    """
    X_train = X_train_X_test_Y_train_Y_test[0]
    X_test = X_train_X_test_Y_train_Y_test[1]
    Y_train = X_train_X_test_Y_train_Y_test[2]
    Y_test = X_train_X_test_Y_train_Y_test[3]

    # Keep at most one feature per training row, but never ask for more
    # features than exist: SelectFromModel rejects max_features > n_features.
    feature_cap = min(X_train.shape[0], X_train.shape[1])
    rfRegressor = Pipeline([
        ('feature_selection', SelectFromModel(RandomForestRegressor(), max_features=feature_cap)),
        ('regression', RandomForestRegressor(n_estimators=500, max_depth=6))
    ])
    rfRegressor.fit(X_train, Y_train)
    predict_train = rfRegressor.predict(X_train)
    predict_test = rfRegressor.predict(X_test)
    r2_train = r2_score(Y_train, predict_train)
    r2_test = r2_score(Y_test, predict_test)
    return r2_train, r2_test

In [130]:
# Print train/test R2 statistics over the triphenylamine splits.
applyMultiRF(triph_split_sets)

All R2 Train:
[0.8397504298420176, 0.8455377745157084, 0.8784896671155281, 0.8669908573080264, 0.8714959265633708, 0.8727318492854703, 0.8557125131997672, 0.8735486034029318, 0.8736882667864991, 0.8900404016410693]

Average: 0.8667986289660389, Std. Dev: 0.014607276490478044, Variance 0.00021337252646927256

All R2 Test:
[0.5144202176817128, 0.5294728249073122, 0.2949728139291896, 0.5048082960072011, 0.3998660821829436, 0.3573067102898221, 0.35244150348297376, 0.57074141413201, 0.45670770389571513, 0.3516269078278016]

Average: 0.4332364474336682, Std. Dev: 0.08922834117477889, Variance 0.007961696868802742


In [131]:
# Print train/test R2 statistics over the porphyrin splits.
applyMultiRF(porph_split_sets)

All R2 Train:
[0.9236253758632269, 0.9228552080408673, 0.9286371443027728, 0.9204178006355683, 0.9232681288989085, 0.914355645925328, 0.9165947745217463, 0.9237069012364062, 0.9228772985769708, 0.9191001651130112]

Average: 0.9215438443114806, Std. Dev: 0.003862647489515929, Variance 1.4920045628263708e-05

All R2 Test:
[0.5604066938758197, 0.5919986313629078, 0.45746038801552813, 0.6717152608193084, 0.6216752742678225, 0.6817070124690945, 0.6757632949212096, 0.6166584883602305, 0.5832720704578758, 0.6122768334543334]

Average: 0.6072933948004131, Std. Dev: 0.06336743255903299, Variance 0.004015431509123594
