In [8]:
import pandas as pd
import numpy as np
from numpy import nan
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.decomposition import PCA, KernelPCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import calinski_harabasz_score, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
import helper_funcs as my_funcs
import math
%matplotlib inline

In [6]:
# Read the raw DSSC table and keep just the PCE and co-sensitizer columns.
# Cells holding "-" are turned into NaN (presumably the database's marker
# for "no entry" -- confirm against the data source).
co_sensitizers = (
    pd.read_csv("DBVersions/rawDSSCDBdata.csv")
    .loc[:, ["PCE", "Co-sensitizer"]]
    .replace({"-": np.nan})
)
co_sensitizers

Unnamed: 0,PCE,Co-sensitizer
0,5.19,
1,2.60,
2,3.10,
3,2.20,
4,5.00,
...,...,...
4421,8.79,
4422,4.52,
4423,6.09,
4424,5.65,


In [11]:
# Restrict to the rows whose "Co-sensitizer" entry is not missing.
total_set = co_sensitizers.loc[co_sensitizers["Co-sensitizer"].notna()]
total_set

Unnamed: 0,PCE,Co-sensitizer
595,8.01,N719
596,8.12,N719
788,6.71,"N719, Ruthenium"
789,8.01,"N719, Ruthenium"
802,11.60,Zinc-porphyrin as cosensitizer
...,...,...
3724,10.41,WS-5
3725,9.02,WS-5
3746,3.82,bodipy
4407,2.21,D35 dye


In [15]:
# Number of non-null PCE records per co-sensitizer label, most frequent first.
(
    total_set
    .groupby("Co-sensitizer")
    .count()
    .sort_values(by="PCE", ascending=False)
)

Unnamed: 0_level_0,PCE
Co-sensitizer,Unnamed: 1_level_1
N719,12
C1,12
WS-5,10
Zn-3,7
FNE46 cosensitizer,7
Y123,6
D35,5
TC2,4
XW4,4
S2,4


In [17]:
dye_library = {"Y123": "CCCCCCC1(CCCCCC)C2=C(SC(/C=C(C#N)/C(O)=O)=C2)C(S3)=C1C=C3C(C=C4)=CC=C4N(C5=CC=C(C6=CC=C(OCCCCCC)C=C6OCCCCCC)C=C5)C7=CC=C(C8=C(OCCCCCC)C=C(OCCCCCC)C=C8)C=C7",
                "Zinc-porphyrin": "CCCCCCCCOC1=C(C2=C3C=CC(C(C#CC4=CC=C(C(O)=O)C=C4)=C5C=C6)=[N]3[Zn]78N5C6=C(C9=C(OCCCCCCCC)C=CC=C9OCCCCCCCC)C%10=[N]7C(C=C%10)=C(N(C%11=CC=C(CCCCCC)C=C%11)C%12=CC=C(CCCCCC)C=C%12)C%13=CC=C2N%138)C(OCCCCCCCC)=CC=C1",
                "N719": "S=C=N[Ru]12(N=C=S)(N3CCC(CC3C3N2CCC(C3)C(=O)[O-])C(=O)O)N2CCC(CC2C2N1CCC(C2)C(=O)[O-])C(=O)O",
                "C1": "COC(C=C1)=CC=C1N(C2=C(C=CC=C3)C3=C(C#CC4=CC=C(C(O)=O)C=C4)C5=C2C=CC=C5)C6=CC=C(OC)C=C6",
                "WS-5": "CC(C=C1)=CC=C1N2C(C=CC(C3=CC=C(C4=CC=C(/C=C(C(O)=O)/C#N)S4)C5=NN(CCCCCCCC)N=C35)=C6)=C6C7C2CCC7",
                "Zn-3": "CC(C=C1C)=CC(C)=C1C2=C3C=CC(C(C4=C(C)C=C(C)C=C4C)=C5C=C6)=[N]3[Zn]78N5C6=C(C9=CC=C(C(O)=O)C=C9)C%10=[N]7C(C=C%10)=C(C%11=C(C)C=C(C)C=C%11C)C%12=CC=C2N%128",
                "FNE46 cosensitizer":"O=C(/C(C#N)=C/C(S1)=CC=C1C(C2=C3N=CC=N2)=CC=C3C(S4)=CC=C4C(C=C5)=CC=C5N(C6=CC=C(OCCCCCCCC)C=C6)C7=CC=C(OCCCCCCCC)C=C7)O",
                "D35": "CCCCOC(C=C(OCCCC)C=C1)=C1C(C=C2)=CC=C2N(C3=CC=C(C4=CC=C(/C=C(C#N)/C(O)=O)S4)C=C3)C5=CC=C(C6=CC=C(OCCCC)C=C6OCCCC)C=C5",
                "TC2": "CCCCCCOC(C=C1)=CC=C1N(C2=CC=C(C3=CC=C(/C=C(C#N)/C(O)=O)S3)C=C2)C4=CC=C(OCCCCCC)C=C4",
                "XW4": "CCCCCCCCCCCCOC1=C(C2=C3C=CC(C(C#CC4=CC=C(C(O)=O)C=C4)=C5C=C6)=[N]3[Zn]78N5C6=C(C9=C(OCCCCCCCCCCCC)C=CC=C9OCCCCCCCCCCCC)C%10=[N]7C(C=C%10)=C(C#CC%11=C(OCCCCCC)C=C(N%12C(C=CC=C%13)=C%13C%14=C%12C=CC=C%14)C(OCCCCCC)=C%11)C%15=CC=C2N%158)C(OCCCCCCCCCCCC)=CC=C1",
                "S2": "CCCCC(CC)COC(C=C1)=CC=C1N(C2=CC=C(C3=CC=C(C4=CC=C(/C=C(C(O)=O)/C#N)S4)S3)C=C2)C5=CC=C(OCC(CC)CCCC)C=C5",
                "TC1": "CCCCCCOC(C=C1)=CC=C1N(C2=CC=C(/C=C(C#N)/C(O)=O)C=C2)C3=CC=C(OCCCCCC)C=C3",
                "S1\ttriphenylamine" : "CCCCC(CC)COC(C=C1)=CC=C1N(C2=CC=C(C3=CC=C(CC(C#N)C(O)=O)S3)C=C2)C4=CC=C(OCC(CC)CCCC)C=C4",
                "D149": "O=C(O)CN(C/1=O)/C(SC1=C\C2=CC3=C(N(C4=CC=C(/C=C(C5=CC=CC=C5)\C6=CC=CC=C6)C=C4)C7C3CCC7)C=C2)=C(SC(N8CC)=S)\C8=O",
                "D131": "O=C(O)/C(C#N)=C/C1=CC2=C(N(C3=CC=C(/C=C(C4=CC=CC=C4)\C5=CC=CC=C5)C=C3)C6C2CCC6)C=C1",
                "D149+D131": "O=C(O)CN(C/1=O)/C(SC1=C\C2=CC3=C(N(C4=CC=C(/C=C(C5=CC=CC=C5)\C6=CC=CC=C6)C=C4)C7C3CCC7)C=C2)=C(SC(N8CC)=S)\C8=O.O=C(O)/C(C#N)=C/C1=CC2=C(N(C3=CC=C(/C=C(C4=CC=CC=C4)\C5=CC=CC=C5)C=C3)C6C2CCC6)C=C1",
                "XS-3": "O=C1N(C(C(C=C2)=CC=C2C(C=C3)=CC4=C3N(CCCCCC)C(C=C5)=C(S4)C=C5C6=CC=C(OCCCCCC)C=C6)=C7C1=C(N(CCCCCC)C7=O)C8=CC=C(C#CC9=CC(CCCCCC)=C(/C=C(C#N)/C(O)=O)S9)C=C8)CCCCCC",
                "IQ21": "CC(C=C1)=CC=C1N(C2C3CCC2)C(C3=C4)=CC=C4C5=CC=C(C6=CC(C(CCCCCCCC)(CCCCCCCC)C7=C8SC(/C=C(C(O)=O)/C#N)=C7)=C8S6)C9=C5N=C(C%10=CC=CC=C%10)C(C%11=CC=CC=C%11)=N9",
                "SQ2": "[O-]C1=C(/C=C2N(CC)C(C=CC3=C4C=CC=C3)=C4C/2(C)C)C(/C1=C/C5=[N+](CCCCCCCC)C(C=CC(C(O)=O)=C6)=C6C5(C)C)=O",
                "MK2": "CCN1C2=C(C=CC=C2)C3=C1C=CC(C(S4)=CC(CCCCCC)=C4C5=CC(CCCCCC)=C(C6=CC(CCCCCC)=C(C7=CC(CCCCCC)=C(/C=C(C(O)=O)/C#N)S7)S6)S5)=C3",
                "QX20": "CCCCCCC1(CCCCCC)C2=CC=CC=C2C3=C1C4=C(N3CCCCCC)C=CC(C#CC5=CC=C(/C=C(C#N)/C(O)=O)S5)=C4",
                "bodipy": "FB1(F)[N]2=C(C)C(CC)=C(C)C2=C(C3=C(C)C(CC)=C(C)N31)C4=CC=C(C#CCN5C=NC=C5)C=C4",
                "H2LD14": "O=C(O)C(C=C1)=CC=C1C#C/C2=C3C=CC(/C(C4=C(OCCCCCCCCCCCC)C=CC=C4OCCCCCCCCCCCC)=C5C=C/C(N/5)=C(C#CC6=CC=C(N(C)C)C=C6)/C7=N/C(C=C7)=C(C8=C(OCCCCCCCCCCCC)C=CC=C8OCCCCCCCCCCCC)\C9=CC=C2N9)=N\3",
                "AN-3": "O=C(O)C(C=C1)=CC=C1C#CC2=C3C(C=CC=C3)=C(C#CC4=CC=C(N(CCCCCCCC)CCCCCCCC)C=C4)C5=C2C=CC=C5",
                "H2LD14+AN-3": "O=C(O)C(C=C1)=CC=C1C#C/C2=C3C=CC(/C(C4=C(OCCCCCCCCCCCC)C=CC=C4OCCCCCCCCCCCC)=C5C=C/C(N/5)=C(C#CC6=CC=C(N(C)C)C=C6)/C7=N/C(C=C7)=C(C8=C(OCCCCCCCCCCCC)C=CC=C8OCCCCCCCCCCCC)\C9=CC=C2N9)=N\3.O=C(O)C(C=C1)=CC=C1C#CC2=C3C(C=CC=C3)=C(C#CC4=CC=C(N(CCCCCCCC)CCCCCCCC)C=C4)C5=C2C=CC=C5",
                "NT35": "O=C(/C(C#N)=C/C(C=C1)=CC=C1N(C2=CC=C(C3=C(OCCCCCC)C=C(OCCCCCC)C=C3)C=C2)C4=CC=C(C5=CC=C(OCCCCCC)C=C5OCCCCCC)C=C4)O",
                "HD2-mono": "CC(C=C1)=CC2=[N]1[Ru]3([N]4=C(C5=[N]3C=CC(C(O)=O)=C5)C=C(C(O)=O)C=C4)(N=C=S)(N=C=S)[N]6=C2C=C(/C=C/C7=CC=C(OCCO8)C8=C7)C=C6",
                "D35+HD2-mono": "CCCCOC(C=C(OCCCC)C=C1)=C1C(C=C2)=CC=C2N(C3=CC=C(C4=CC=C(/C=C(C#N)/C(O)=O)S4)C=C3)C5=CC=C(C6=CC=C(OCCCC)C=C6OCCCC)C=C5.CC(C=C1)=CC2=[N]1[Ru]3([N]4=C(C5=[N]3C=CC(C(O)=O)=C5)C=C(C(O)=O)C=C4)(N=C=S)(N=C=S)[N]6=C2C=C(/C=C/C7=CC=C(OCCO8)C8=C7)C=C6",
                "D3": "CC(C=C1)=CC=C1N(C2C3CCC2)C(C3=C4)=CC=C4C#CC5=CC=C(C#CC6=CC=C(C(O)=O)C=C6)C7=NSN=C57",
                "D205": "O=C1N(CC(O)=O)C(C2C(N(CCCCCCCC)C(S2)=S)=O)S/C1=C\C3=CC4=C(N(C5=CC=C(/C=C(C6=CC=CC=C6)\C7=CC=CC=C7)C=C5)C8C4CCC8)C=C3"
            }