In [1]:
import pandas as pd
import numpy as np
from padelpy import padeldescriptor
import glob



Load in Dye Dataset with Descriptors

In [2]:
qikPharmDyeData = pd.read_csv('QikPropAnd3PointPharmFPDyeData.csv', index_col=0)

qikPharmDyeData

Unnamed: 0,#stars,#amine,#amidine,#acid,#amide,#rotor,#rtvFG,CNS,mol MW,dipole,...,Semiconductor,Solar simulator,Article DOI,Molecule keywords,Molecule spectrum absorption maxima,Molecule spectrum emission maxima,VOC,JSC,FF,PCE
1,17.0,0.0,0.0,2.0,0.0,25.0,2.0,-2.0,907.090,5.627,...,TiO2,AM 1.5G 100mW/cm2,10.1016/j.dyepig.2012.02.011,phenothiazine,439.0,576.0,687.0,10.79,0.70,5.19
8,15.0,0.0,0.0,1.0,0.0,19.0,3.0,-2.0,987.198,12.071,...,TiO2,AM 1.5G 100mW/cm2,10.1039/c0ee00218f,"coumarin, triphenylamine",465.0,640.0,678.0,13.20,0.67,6.00
9,15.0,0.0,0.0,1.0,0.0,19.0,3.0,-2.0,905.078,12.937,...,TiO2,AM 1.5G 100mW/cm2,10.1039/c0ee00218f,"coumarin, triphenylamine",460.0,590.0,695.0,12.20,0.74,6.20
10,14.0,0.0,0.0,1.0,0.0,19.0,3.0,-2.0,822.959,11.899,...,TiO2,AM 1.5G 100mW/cm2,10.1039/c0ee00218f,"coumarin, triphenylamine",454.0,525.0,800.0,9.00,0.76,5.50
11,11.0,1.0,0.0,2.0,0.0,11.0,0.0,-2.0,781.945,10.301,...,TiO2,AM 1.5G 100mW/cm2,10.1016/j.jphotochem.2014.06.001,coumarin,465.0,490.0,560.0,3.41,0.73,1.39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4411,6.0,0.0,0.0,1.0,0.0,10.0,1.0,-2.0,595.687,5.938,...,TiO2,AM 1.5G 100mW/cm2,10.1016/j.dyepig.2012.02.014,triphenylamine,505.0,640.0,558.0,1.42,0.51,0.41
4412,7.0,0.0,0.0,1.0,0.0,8.0,1.0,-2.0,563.688,5.176,...,TiO2,AM 1.5G 100mW/cm2,10.1016/j.dyepig.2012.02.014,triphenylamine,503.0,611.0,519.0,1.81,0.46,0.44
4413,12.0,0.0,0.0,1.0,0.0,19.0,2.0,-2.0,665.787,13.889,...,TiO2,AM 1.5G 100mW/cm2,10.1021/am403668d,triphenylamine,488.0,,720.0,12.61,0.65,5.94
4414,9.0,0.0,0.0,1.0,0.0,20.0,2.0,-2.0,707.868,9.403,...,TiO2,AM 1.5G 100mW/cm2,10.1021/am403668d,triphenylamine,486.0,,700.0,12.60,0.67,5.88


In [31]:
qikPharmDyeData = qikPharmDyeData[qikPharmDyeData['#stars'].notna()]


Split Entire Set into Model Set and External Evaluation Set

In [39]:
from sklearn.model_selection import train_test_split

modelSet, evalSet = train_test_split(qikPharmDyeData, test_size=0.2, random_state=0)

evalSetCols = list(evalSet.columns)

evalSet_X = evalSet[evalSetCols[:251]]
evalSet_Y = evalSet['PCE']

modelSet.shape, evalSet.shape



((1072, 263), (268, 263))

Split Model Set into Compound Family Sets

In [32]:

triphModelSet = modelSet.loc[(modelSet['Molecule keywords'].str.contains('triphenylamine'))]
phenModelSet = modelSet.loc[(modelSet['Molecule keywords'].str.contains('phenothiazine'))]
carbModelSet = modelSet.loc[(modelSet['Molecule keywords'].str.contains('carbazole'))]
indoModelSet = modelSet.loc[(modelSet['Molecule keywords'].str.contains('indoline'))]
coumModelSet = modelSet.loc[(modelSet['Molecule keywords'].str.contains('coumarin'))]
diphModelSet = modelSet.loc[(modelSet['Molecule keywords'].str.contains('diphenylamine'))]

othrModelSet = modelSet.loc[(~modelSet['Molecule keywords'].str.contains('triphenylamine')) 
                            & (~modelSet['Molecule keywords'].str.contains('phenothiazine')) 
                            & (~modelSet['Molecule keywords'].str.contains('carbazole')) 
                            & (~modelSet['Molecule keywords'].str.contains('indoline')) 
                            & (~modelSet['Molecule keywords'].str.contains('coumarin')) 
                            & (~modelSet['Molecule keywords'].str.contains('diphenylamine'))
                            ]

othrModelSet.shape

modelSet.groupby('Molecule keywords').count().sort_values('PCE', ascending=False).head(10)

triphModelSet.shape


(458, 263)

In [37]:
from sklearn.feature_selection import VarianceThreshold

def remove_low_variance(input_data, threshold=0.0):
    selection = VarianceThreshold(threshold)
    selection.fit(input_data)
    return input_data[input_data.columns[selection.get_support(indices=True)]]

In [40]:
def getPadelDescriptors(dataframe):
    cols = list(dataframe.columns)
    X = dataframe[cols[:251]] # start at 51 for just 3PointFPs
    return X

Split Compound Family Sets into Training and Test Sets

In [41]:
glob_train_X, glob_test_X, glob_train_Y, glob_test_Y = train_test_split(remove_low_variance(getDescriptors(modelSet)), modelSet['PCE'], test_size=0.2, random_state=0)
triph_train_X, triph_test_X, triph_train_Y, triph_test_Y = train_test_split(remove_low_variance(getDescriptors(triphModelSet)), triphModelSet['PCE'], test_size=0.2, random_state=0)
phen_train_X, phen_test_X, phen_train_Y, phen_test_Y = train_test_split(remove_low_variance(getDescriptors(phenModelSet)), phenModelSet['PCE'], test_size=0.2, random_state=0)
carb_train_X, carb_test_X, carb_train_Y, carb_test_Y = train_test_split(remove_low_variance(getDescriptors(carbModelSet)), carbModelSet['PCE'], test_size=0.2, random_state=0)
indo_train_X, indo_test_X, indo_train_Y, indo_test_Y = train_test_split(remove_low_variance(getDescriptors(indoModelSet)), indoModelSet['PCE'], test_size=0.2, random_state=0)
coum_train_X, coum_test_X, coum_train_Y, coum_test_Y = train_test_split(remove_low_variance(getDescriptors(coumModelSet)), coumModelSet['PCE'], test_size=0.2, random_state=0)
diph_train_X, diph_test_X, diph_train_Y, diph_test_Y = train_test_split(remove_low_variance(getDescriptors(diphModelSet)), diphModelSet['PCE'], test_size=0.2, random_state=0)
othr_train_X, othr_test_X, othr_train_Y, othr_test_Y = train_test_split(remove_low_variance(getDescriptors(othrModelSet)), othrModelSet['PCE'], test_size=0.2, random_state=0)

triph_train_X
phen_train_X

Unnamed: 0,BIT8151720,BIT18475850,BIT22127076,BIT35877664,BIT39553675,BIT67883118,BIT98575424,BIT160201318,BIT165768516,BIT190396141,...,BIT3991753262,BIT4030205765,BIT4037040228,BIT4043098439,BIT4155789727,BIT4165710193,BIT4188580987,BIT4216389085,BIT4257993391,BIT4271161317
293,1,0,0,0,1,1,0,0,1,1,...,1,1,0,0,1,1,1,0,0,0
404,0,0,1,0,0,0,0,1,0,1,...,1,1,1,0,0,0,0,1,1,0
386,1,1,0,1,1,1,1,0,0,1,...,1,1,1,0,1,1,1,1,1,1
208,1,0,1,1,1,0,1,1,0,1,...,1,1,1,0,1,0,0,1,0,1
164,0,0,1,1,1,0,0,1,0,0,...,0,1,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176,0,1,0,0,1,1,0,0,0,0,...,1,0,1,0,1,1,1,0,1,0
424,0,1,1,1,0,0,0,0,1,1,...,1,1,1,1,1,1,1,1,1,0
115,0,1,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,1,0,0,0
432,1,1,0,1,1,0,1,1,0,1,...,1,1,1,1,1,1,1,1,0,1


For Each Training Set, Perform n-Cross-Fold Validation to tune Parameters of Family Models

In [43]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score, LeaveOneOut


model = RandomForestRegressor(n_estimators=200)
scores = cross_val_score(model, glob_train_X, glob_train_Y, cv=20, scoring='neg_mean_absolute_error')


mean = scores.mean()
stdDev = scores.std()

scores, mean, stdDev


KeyboardInterrupt: 

GTM Because I am Frustrated

In [54]:
from ugtm import eGTR
from sklearn.model_selection import GridSearchCV

tuned_params = {'regul': [0.0001, 0.001, 0.01],
                's': [0.1, 0.2, 0.3],
                'k': [16],
                'm': [4]}

gs = GridSearchCV(eGTR(), tuned_params, cv=3, scoring='r2')
gs.fit(glob_train_X, glob_train_Y)
print(gs.best_params_)

{'k': 16, 'm': 4, 'regul': 0.0001, 's': 0.3}


In [3]:
from ugtm import eGTR

gtr = eGTR(k=16, m=4, regul=0.0001, s=0.3)

gtr = gtr.fit(glob_train_X, glob_train_Y)

Y_pred_train = gtr.predict(glob_train_X)
Y_pred_test = gtr.predict(glob_test_X)


NameError: name 'glob_train_X' is not defined

In [56]:
from sklearn.metrics import mean_absolute_error, r2_score
import altair as alt

mae_train_GTR = mean_absolute_error(glob_train_Y, Y_pred_train)
mae_test_GTR = mean_absolute_error(glob_test_Y, Y_pred_test)

r2_train = r2_score(glob_train_Y, Y_pred_train)
r2_test = r2_score(glob_test_Y, Y_pred_test)

mae_train_GTR, mae_test_GTR, r2_train, r2_test

dfclassmap = pd.DataFrame(gtr.optimizedModel.matX, columns=["x1","x2"])
dfclassmap["predicted_node_label"] = gtr.node_label

alt.Chart(dfclassmap).mark_square().encode(
    x='x1',
    y='x2',
    color=alt.Color('predicted_node_label:Q',
    scale = alt.Scale(scheme='greenblue'),
    legend=alt.Legend(title="PCE")),
    size=alt.value(50),
    tooltip=['x1','x2', 'predicted_node_label:Q']
).properties(title="Dye-Sensitised Solar Cells", width=200, height=200)

In [57]:
mae_train_GTR, mae_test_GTR, r2_train, r2_test

(1.3577256862533427,
 1.7288704321185642,
 0.3295678129769981,
 -0.046925079229013233)

Get Performance for Each Family Using Test Sets

Design Consensus or "Winner-Takes-All Algorithm" To Develop Global Model

Test Model on External Validation Set