In [351]:
import pandas as pd
import numpy as np

In [352]:
from rdkit import Chem

In [353]:
# Load the raw DSSCDB table from CSV; the path is relative to the directory
# the notebook is run from.
raw_DSSCDB = pd.read_csv('Data/rawDSSCDBdata.csv')

In [354]:
# (rows, columns) of the raw table as loaded.
raw_DSSCDB.shape

(4426, 30)

In [355]:
# Names of the columns to discard: every column whose name mentions
# 'Article', 'comment' or 'spectrum'.
article_markers = ('Article', 'comment', 'spectrum')
article_columns = [
    name
    for name in raw_DSSCDB.columns
    if any(marker in name for marker in article_markers)
]

In [356]:
# New frame without the columns listed in article_columns; raw_DSSCDB itself
# is left untouched.
trim_article_info = raw_DSSCDB.drop(columns=article_columns)

In [357]:
# Columns that remain after the drop.
trim_article_info.columns

Index(['VOC', 'JSC', 'FF', 'PCE', 'Electrolyte', 'Active area', 'Co-adsorbent',
       'Co-sensitizer', 'Semiconductor', 'Dye loading', 'Exposure time',
       'Solar simulator', 'Molecule SMILE', 'Molecule keywords'],
      dtype='object')

In [358]:
# Row count should be unchanged by the column drop; only the width shrinks.
trim_article_info.shape

(4426, 14)

In [359]:
# Lower-case the keyword labels so that spellings differing only in
# capitalisation fall into the same group.
keywords_lower = trim_article_info['Molecule keywords'].str.lower()
trim_article_info['Molecule keywords'] = keywords_lower

In [360]:
# Non-null value count of every column per keyword label, ordered by the PCE
# count (largest group first).
trim_article_info.groupby(['Molecule keywords']).count().sort_values('PCE', ascending=False)

Unnamed: 0_level_0,VOC,JSC,FF,PCE,Electrolyte,Active area,Co-adsorbent,Co-sensitizer,Semiconductor,Dye loading,Exposure time,Solar simulator,Molecule SMILE
Molecule keywords,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
triphenylamine,1471,1471,1471,1471,1471,1471,1451,1471,1471,1471,1471,1471,1471
porphyrin,512,512,512,512,512,512,511,512,512,512,512,512,512
phenothiazine,384,384,384,384,384,384,384,384,384,384,384,384,384
indoline,376,376,376,376,376,376,376,376,376,376,376,376,376
"n719, ruthenium",331,331,331,331,331,326,328,329,331,331,331,331,331
...,...,...,...,...,...,...,...,...,...,...,...,...,...
"triphenylamine, dithienopyrrolobenzothiadiazole",1,1,1,1,1,1,1,1,1,1,1,1,1
indoloquinoxaline,1,1,1,1,1,1,1,1,1,1,1,1,1
"phenoxazine,bodipy",1,1,1,1,1,1,1,1,1,1,1,1,1
phenylenebisthiophene,1,1,1,1,1,1,1,1,1,1,1,1,1


In [361]:
# Drop every record whose SMILES string contains a '.', which in SMILES
# separates disconnected components (mixtures, salts, counter-ions).
# The dot is matched literally (regex=False) rather than through the regex
# '\.', whose backslash is an invalid escape sequence in a normal (non-raw)
# Python string literal and triggers a warning on recent Python versions.
has_multiple_components = trim_article_info['Molecule SMILE'].str.contains('.', regex=False)
no_mixes = trim_article_info.loc[~has_multiple_components]

In [362]:
# Rows left once the '.'-containing SMILES entries are removed.
no_mixes.shape

(4003, 14)

In [363]:
# For every distinct SMILES string keep only the record with the highest PCE
# (first row after a descending sort), then restore the original row order.
# NOTE(review): duplicates are detected by exact string comparison, so one
# molecule written in two different SMILES notations would not be merged —
# confirm whether canonicalisation is wanted before this step.
pce_descending = no_mixes.sort_values(by='PCE', ascending=False)
best_per_smiles = pce_descending.drop_duplicates(subset='Molecule SMILE')
no_duplicates = best_per_smiles.sort_index()

In [364]:
# Rows left after keeping one record per SMILES string.
no_duplicates.shape

(2318, 14)

In [365]:
# Remove every dye whose SMILES string contains 'Ru', 'Zn' or 'se'. One regex
# alternation is equivalent to the three separate negated tests it replaces.
# NOTE(review): the match is case-sensitive, so 'se' does not catch '[Se]' —
# confirm that this is the intended selenium filter.
#
# .copy() makes no_metals an independent frame. Later cells assign into it
# with .loc; without the copy pandas treats it as a slice of no_duplicates and
# emits SettingWithCopyWarning on each of those assignments.
has_excluded_element = no_duplicates['Molecule SMILE'].str.contains('Ru|Zn|se')
no_metals = no_duplicates.loc[~has_excluded_element].copy()

In [366]:
# Rows left after the Ru / Zn / se filter.
no_metals.shape

(2007, 14)

In [367]:
# Same per-keyword count as before, now on the filtered frame, to see which
# dye classes remain.
no_metals.groupby(['Molecule keywords']).count().sort_values('PCE', ascending=False)

Unnamed: 0_level_0,VOC,JSC,FF,PCE,Electrolyte,Active area,Co-adsorbent,Co-sensitizer,Semiconductor,Dye loading,Exposure time,Solar simulator,Molecule SMILE
Molecule keywords,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
triphenylamine,805,805,805,805,805,805,799,805,805,805,805,805,805
phenothiazine,263,263,263,263,263,263,263,263,263,263,263,263,263
indoline,180,180,180,180,180,180,180,180,180,180,180,180,180
carbazole,175,175,175,175,175,175,175,175,175,175,175,175,175
coumarin,58,58,58,58,58,58,58,58,58,58,58,58,58
...,...,...,...,...,...,...,...,...,...,...,...,...,...
"phenothiazine, indole",1,1,1,1,1,1,1,1,1,1,1,1,1
"phenothiazine, pyridinium",1,1,1,1,1,1,1,1,1,1,1,1,1
"phenothiazine,bodipy",1,1,1,1,1,1,1,1,1,1,1,1,1
"phenoxazine,bodipy",1,1,1,1,1,1,1,1,1,1,1,1,1


In [368]:
# Canonical illumination label -> the spellings that are folded into it.
# Matching is by exact string equality, so any spelling not listed here is
# left unchanged. Entries that were listed twice have been removed; the
# double spaces inside some spellings are intentional and must be kept.
solar_sim_spellings = {
    "AM 1.5G 100mW/cm2": [
        "AM 1.5G 100 mW/cm2",
        "AM 1.5 100 mW/cm2",
        "AM 1.5 100mW/cm2",
        "AM 1.5G 100mW/cm2",
        "AM 1.5G, 100 mW/cm2",
        "AM 1.5G, 100mW/cm2",
        "AM 1.5G, 100 mw/cm2",
        "AM1.5G, 100 mW/cm2",
        "AM1.5 G 100 mW/cm2",
        "AM 1.5G  100 mW/cm2",
    ],
    "AM 1.5G": ["AM 1.5", "AM 1.5G", "AM 1.5 G", "AM1.5G"],
    "AM 1.5G 85mW/cm2": [
        "AM 1.5G 85mW/cm2",
        "AM 1.5 85mW/cm2",
        "AM 1.5G, 85mW/cm2",
        "AM 1.5G, 85 mW/cm2",
        "AM 1.5 85 mW/cm2",
    ],
    "AM 1.5G 96mW/cm2": ["AM 1.5G 96 mW/cm2", "AM 1.5 96mW/cm2"],
    "AM 1.5G 50mW/cm2": ["AM 1.5G, 50mW/cm2", "AM 1.5G 50mW/cm2", "AM 1.5G 50 mW/cm2"],
    "AM 1.5G 42mW/cm2": ["AM 1.5G, 42 mW/cm2", "AM 1.5G 42mW/cm2"],
    "AM 1.5G 80mW/cm2": ["AM 1.5G  80 mW/cm2", "AM 1.5G 80mW/cm2"],
}

# One pass per canonical label replaces the seven copy-pasted index loops.
# The spelling groups do not overlap, so the order of the passes is irrelevant.
for canonical_label, spellings in solar_sim_spellings.items():
    uses_spelling = no_metals['Solar simulator'].isin(spellings)
    no_metals.loc[uses_spelling, 'Solar simulator'] = canonical_label


In [369]:
# Records per illumination label after normalisation, largest group first.
no_metals.groupby(['Solar simulator']).count().sort_values('PCE', ascending=False)

Unnamed: 0_level_0,VOC,JSC,FF,PCE,Electrolyte,Active area,Co-adsorbent,Co-sensitizer,Semiconductor,Dye loading,Exposure time,Molecule SMILE,Molecule keywords
Solar simulator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AM 1.5G 100mW/cm2,1818,1818,1818,1818,1818,1818,1812,1818,1818,1818,1818,1818,1818
AM 1.5G,83,83,83,83,83,83,83,83,83,83,83,83,83
AM 1.5G 85mW/cm2,30,30,30,30,30,30,30,30,30,30,30,30,30
AM 1.5G 80mW/cm2,13,13,13,13,13,13,13,13,13,13,13,13,13
AM 1.5G 96mW/cm2,9,9,9,9,9,9,9,9,9,9,9,9,9
AM 1.5G 70mW/cm2,8,8,8,8,8,8,8,8,8,8,8,8,8
AM 1.5 75mW/cm2,6,6,6,6,6,6,6,6,6,6,6,6,6
AM 1.5G 50mW/cm2,5,5,5,5,5,5,5,5,5,5,5,5,5
AM 1.5G 0.51 sun,5,5,5,5,5,5,5,5,5,5,5,5,5
AM 1.5G 42mW/cm2,4,4,4,4,4,4,4,4,4,4,4,4,4


In [370]:
# Peek at the first rows to check the normalised 'Solar simulator' values.
no_metals.head()

Unnamed: 0,VOC,JSC,FF,PCE,Electrolyte,Active area,Co-adsorbent,Co-sensitizer,Semiconductor,Dye loading,Exposure time,Solar simulator,Molecule SMILE,Molecule keywords
0,687.0,10.79,0.7,5.19,0.6 M 1-methyl-3-propylimidazolium iodide (PMI...,-,-,-,TiO2 (20 nm particle size),-,15 hours,AM 1.5G 100mW/cm2,N#C/C(=C\c1ccc(cc1)N(c1ccccc1)CCCCCCN1c2ccccc2...,phenothiazine
7,678.0,13.2,0.67,6.0,"1.0 M 1,3-dimethylimidazolium iodide, 0.03 M I...",0.158,-,-,TiO2 film (8 uM thick + 5 uM scattering layer),-,5 hours,AM 1.5G 100mW/cm2,N#C/C(=C\c1ccc(s1)c1ccc(s1)c1ccc(cc1)N(c1ccc(c...,"coumarin, triphenylamine"
8,695.0,12.2,0.74,6.2,"1.0 M 1,3-dimethylimidazolium iodide, 0.03 M I...",0.158,-,-,TiO2 film (8 uM thick + 5 uM scattering layer),-,5 hours,AM 1.5G 100mW/cm2,N#C/C(=C\c1ccc(s1)c1ccc(cc1)N(c1ccc(cc1)/C=C/c...,"coumarin, triphenylamine"
9,800.0,9.0,0.76,5.5,"1.0 M 1,3-dimethylimidazolium iodide, 0.03 M I...",0.158,-,-,TiO2 film (8 uM thick + 5 uM scattering layer),-,5 hours,AM 1.5G 100mW/cm2,CCN(c1ccc2c(c1)oc(=O)c(c2)/C=C/c1ccc(cc1)N(c1c...,"coumarin, triphenylamine"
10,560.0,3.41,0.73,1.39,OPV-AN-I (iodide-triiodide),0.16,-,-,TiO2 film,-,24 hours,AM 1.5G 100mW/cm2,CCN([C@@H]1C=Cc2c(C1)oc(=O)c(c2)c1ccc(cc1)c1cc...,coumarin


In [371]:
# Records per free-text electrolyte description before any grouping.
no_metals.groupby(['Electrolyte']).count().sort_values('PCE', ascending=False)

Unnamed: 0_level_0,VOC,JSC,FF,PCE,Active area,Co-adsorbent,Co-sensitizer,Semiconductor,Dye loading,Exposure time,Solar simulator,Molecule SMILE,Molecule keywords
Electrolyte,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
iodide/triiodide,59,59,59,59,59,59,59,59,59,59,59,59,59
"LiI (500 mM), iodine (50 mM), 4-tBu pyridine (580 mM) and ethyl-methyl-imidazolium dicyanoimide (600 mM) in MeCN",50,50,50,50,50,50,50,50,50,50,50,50,50
"LiI (100 mM), iodine (30 mM), 4-tert-butyl pyridine (500 mM) and 1-butyl-3-methyl-imidazolium iodide (600 mM) in mixed solvent of acetonitrile and varelonitrile (volume ratio of 85:15)",29,29,29,29,29,29,29,29,29,29,29,29,29
"0.05 M I2, 0.5 M LiI, 0.5 M tert-butylpyridine in acetonitrile",18,18,18,18,18,18,18,18,18,18,18,18,18
"LiI (0.5 M), I2 (0.05 M), and 4-tert-butylpyridine (0.5 M) in acetonitrile",16,16,16,16,16,16,16,16,16,16,16,16,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...
"0.6 M DMPImI, 0.05 M LiI, and 0.03 M I2 in acetonitrile/valeronitrile (85/15, v/v)",1,1,1,1,1,1,1,1,1,1,1,1,1
"0.6 M DMPImI, 0.1 M LiI, 0.05 M I2 , and 0.1 M TBP",1,1,1,1,1,1,1,1,1,1,1,1,1
"0.10 M lithium iodide, 0.60 M butylmethylimidazolium iodide, 0.05 M I2, and 0.05 M 4-tert-butylpyridine in AcCN:valeronitrile (v/v, 85:15).",1,1,1,1,1,1,1,1,1,1,1,1,1
"0.6 M DMPImI, 0.1 M LiI, 0.05 M I2 , and 0.5 M TBP",1,1,1,1,1,1,1,1,1,1,1,1,1


In [372]:
# Category label -> substrings that identify it inside the free-text
# electrolyte description.
#
# Order matters: a row relabelled by an earlier group no longer contains any
# later substring, so earlier groups win when a description mentions several
# species (e.g. both iodine and cobalt compounds).
#
# NOTE(review): the Cu pattern ends with a comma — confirm that is intended.
electrolyte_groups = [
    ("Iodide_Triiodide", ["iod", "Iod", "LiI", "I2", "AN-I", "Mosalyte", "HI-30"]),
    ("Co(II)_Co(III)", ["Co", "cobalt"]),
    ("Bromide_Tribromide", ["bromide", "Br"]),
    ("Spiro-OMeTAD", ["Spiro", "spiro"]),
    ("DMPIC_DMPIDC", ["DMPIC"]),
    ("Cu(I)_Cu(II)", ["CuI(dmp)2TFSI,"]),
]

for category_label, substrings in electrolyte_groups:
    for substring in substrings:
        # regex=False: every pattern is a plain substring, and literal
        # matching keeps characters such as '(' from being read as regex
        # syntax. na=False: rows with a missing description are skipped
        # instead of producing a NaN mask that .loc rejects.
        mentions_substring = no_metals['Electrolyte'].str.contains(
            substring, regex=False, na=False
        )
        no_metals.loc[mentions_substring, 'Electrolyte'] = category_label

In [373]:
# Records per electrolyte category after grouping; descriptions that matched
# no substring keep their original text.
no_metals.groupby(['Electrolyte']).count().sort_values('PCE', ascending=False)

Unnamed: 0_level_0,VOC,JSC,FF,PCE,Active area,Co-adsorbent,Co-sensitizer,Semiconductor,Dye loading,Exposure time,Solar simulator,Molecule SMILE,Molecule keywords
Electrolyte,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Iodide_Triiodide,1868,1868,1868,1868,1868,1867,1868,1868,1868,1868,1868,1868,1868
Co(II)_Co(III),97,97,97,97,97,97,97,97,97,97,97,97,97
Spiro-OMeTAD,12,12,12,12,12,12,12,12,12,12,12,12,12
not specified,7,7,7,7,7,7,7,7,7,7,7,7,7
"DHS-Z23, Heptachroma",6,6,6,6,6,1,6,6,6,6,6,6,6
Bromide_Tribromide,5,5,5,5,5,5,5,5,5,5,5,5,5
Z-50 (Solaronix),5,5,5,5,5,5,5,5,5,5,5,5,5
0.005 M isopropanol solution of H2PtCl6·6H2O,3,3,3,3,3,3,3,3,3,3,3,3,3
"EL-HSE, Dyesol",3,3,3,3,3,3,3,3,3,3,3,3,3
solid organic ionic conductors,1,1,1,1,1,1,1,1,1,1,1,1,1


In [374]:
# Mean of the numeric measurement columns per semiconductor description,
# best average PCE first.
# numeric_only=True is required: the frame also holds text columns, and
# pandas >= 2.0 raises a TypeError instead of silently dropping them (older
# versions emitted a FutureWarning about exactly this).
no_metals.groupby(['Semiconductor']).mean(numeric_only=True).sort_values('PCE', ascending=False)

  no_metals.groupby(['Semiconductor']).mean().sort_values('PCE', ascending=False)


Unnamed: 0_level_0,VOC,JSC,FF,PCE
Semiconductor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TiO2 film (5.2 um + 5 um scattering layer),956.000000,17.030000,0.770000,12.500000
TiO2 film + 3D photonic crystal layer,811.000000,18.790000,0.733100,11.180000
TiO2 film (4 um thick + 3.5 um scattering layer),827.000000,17.550000,0.752933,10.943333
TiO2 film (5 um thick + 4.1 um scattering layer),790.000000,3.880000,0.780000,10.300000
TiO2 film (8 uM thick),803.333333,13.123333,0.742000,9.486667
...,...,...,...,...
TiO2 film 2.1 um thick,502.500000,1.194500,0.459000,0.284000
TiO2 16uM,215.000000,1.505000,0.455000,0.180000
TiO2 film (15 um transparent + 10 um scat­tering layer),197.500000,0.972500,0.540000,0.129700
NiO film 2 um thick,115.000000,2.700000,0.360000,0.113000


In [375]:
# Collapse each free-text semiconductor description onto the oxide it
# mentions. The oxides are tried in this order, and a relabelled row no longer
# contains the later names, so TiO2 takes precedence over ZnO over NiO.
for oxide in ("TiO2", "ZnO", "NiO"):
    mentions_oxide = no_metals['Semiconductor'].str.contains(oxide, regex=False)
    no_metals.loc[mentions_oxide, 'Semiconductor'] = oxide

In [376]:
# Records per semiconductor label after collapsing; descriptions naming none
# of the three oxides keep their original text.
no_metals.groupby(['Semiconductor']).count().sort_values('PCE', ascending=False)

Unnamed: 0_level_0,VOC,JSC,FF,PCE,Electrolyte,Active area,Co-adsorbent,Co-sensitizer,Dye loading,Exposure time,Solar simulator,Molecule SMILE,Molecule keywords
Semiconductor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
TiO2,1947,1947,1947,1947,1947,1947,1941,1947,1947,1947,1947,1947,1947
ZnO,40,40,40,40,40,40,40,40,40,40,40,40,40
NiO,17,17,17,17,17,17,17,17,17,17,17,17,17
zinc titanium oxide film,2,2,2,2,2,2,2,2,2,2,2,2,2
Zn2SnO4 film 12-14 um,1,1,1,1,1,1,1,1,1,1,1,1,1


In [377]:
# Records per raw co-adsorbent entry; rows with a missing co-adsorbent are
# excluded from groupby and therefore do not appear here.
no_metals.groupby(['Co-adsorbent']).count().sort_values('PCE', ascending=False)

Unnamed: 0_level_0,VOC,JSC,FF,PCE,Electrolyte,Active area,Co-sensitizer,Semiconductor,Dye loading,Exposure time,Solar simulator,Molecule SMILE,Molecule keywords
Co-adsorbent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
-,1390,1390,1390,1390,1390,1390,1390,1390,1390,1390,1390,1390,1390
10 mM CDCA,92,92,92,92,92,92,92,92,92,92,92,92,92
5 mM CDCA,88,88,88,88,88,88,88,88,88,88,88,88,88
1 mM CDCA,40,40,40,40,40,40,40,40,40,40,40,40,40
20 mM CDCA,37,37,37,37,37,37,37,37,37,37,37,37,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20 mM diphenyphosphinic acid,1,1,1,1,1,1,1,1,1,1,1,1,1
1 mM HC-A1 (hole conductor),1,1,1,1,1,1,1,1,1,1,1,1,1
1mM CDCA,1,1,1,1,1,1,1,1,1,1,1,1,1
150 mM DCA,1,1,1,1,1,1,1,1,1,1,1,1,1


In [378]:
# Treat both the '-' placeholder and missing values as "no co-adsorbent" and
# store them under the single string label 'None'.
without_coadsorbent = no_metals['Co-adsorbent'].eq("-") | no_metals['Co-adsorbent'].isnull()
no_metals.loc[without_coadsorbent, 'Co-adsorbent'] = 'None'

# Re-count per co-adsorbent entry to check the merge.
no_metals.groupby(['Co-adsorbent']).count().sort_values(by='PCE', ascending=False)

Unnamed: 0_level_0,VOC,JSC,FF,PCE,Electrolyte,Active area,Co-sensitizer,Semiconductor,Dye loading,Exposure time,Solar simulator,Molecule SMILE,Molecule keywords
Co-adsorbent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
,1396,1396,1396,1396,1396,1396,1396,1396,1396,1396,1396,1396,1396
10 mM CDCA,92,92,92,92,92,92,92,92,92,92,92,92,92
5 mM CDCA,88,88,88,88,88,88,88,88,88,88,88,88,88
1 mM CDCA,40,40,40,40,40,40,40,40,40,40,40,40,40
20 mM CDCA,37,37,37,37,37,37,37,37,37,37,37,37,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...
12 mM CDCA,1,1,1,1,1,1,1,1,1,1,1,1,1
120 mM CDCA,1,1,1,1,1,1,1,1,1,1,1,1,1
20 mM diphenyphosphinic acid,1,1,1,1,1,1,1,1,1,1,1,1,1
0.75 mM CDCA,1,1,1,1,1,1,1,1,1,1,1,1,1


In [379]:
# Every entry that mentions CDCA, whatever concentration is written next to
# it, becomes the bare label "CDCA".
mentions_cdca = no_metals['Co-adsorbent'].str.contains("CDCA")
no_metals.loc[mentions_cdca, 'Co-adsorbent'] = "CDCA"

In [380]:
# Records per co-adsorbent entry after the CDCA merge, to see which spellings
# still need grouping.
no_metals.groupby(['Co-adsorbent']).count().sort_values('PCE', ascending=False)

Unnamed: 0_level_0,VOC,JSC,FF,PCE,Electrolyte,Active area,Co-sensitizer,Semiconductor,Dye loading,Exposure time,Solar simulator,Molecule SMILE,Molecule keywords
Co-adsorbent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
,1396,1396,1396,1396,1396,1396,1396,1396,1396,1396,1396,1396,1396
CDCA,472,472,472,472,472,472,472,472,472,472,472,472,472
20 mM DCA,13,13,13,13,13,13,13,13,13,13,13,13,13
10 mM DCA,12,12,12,12,12,12,12,12,12,12,12,12,12
1 mM cholic acid,12,12,12,12,12,12,12,12,12,12,12,12,12
1 mM DCA,11,11,11,11,11,11,11,11,11,11,11,11,11
2 mM cholic acid,10,10,10,10,10,10,10,10,10,10,10,10,10
1 mmol/dm3 cholic acid,6,6,6,6,6,6,6,6,6,6,6,6,6
0.5 mM DCA,6,6,6,6,6,6,6,6,6,6,6,6,6
0.5 mmol/dm3 cholic acid,6,6,6,6,6,6,6,6,6,6,6,6,6


In [381]:
# Label -> text fragments that identify it in the co-adsorbent entry.
# Groups and fragments are applied in the order written. ' DCA' carries a
# leading space, so it cannot match inside 'CDCA'.
coadsorbent_groups = (
    ("DCA", (" DCA", "DCA 20 uM")),
    ("cholic acid", ("cholic acid",)),
    ("isooctyltrimethoxysilane", ("isooctyltrimethoxysilane",)),
    ("isooctyltriethoxysilane", ("isooctyltriethoxysilane",)),
    ("Saturated", ("Saturated", "saturated")),
    ("HC-A1", ("HC-A1",)),
)

for coadsorbent_label, fragments in coadsorbent_groups:
    for fragment in fragments:
        mentions_fragment = no_metals['Co-adsorbent'].str.contains(fragment)
        no_metals.loc[mentions_fragment, 'Co-adsorbent'] = coadsorbent_label

# Records per co-adsorbent label after grouping.
no_metals.groupby(['Co-adsorbent']).count().sort_values(by='PCE', ascending=False)

Unnamed: 0_level_0,VOC,JSC,FF,PCE,Electrolyte,Active area,Co-sensitizer,Semiconductor,Dye loading,Exposure time,Solar simulator,Molecule SMILE,Molecule keywords
Co-adsorbent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
,1396,1396,1396,1396,1396,1396,1396,1396,1396,1396,1396,1396,1396
CDCA,472,472,472,472,472,472,472,472,472,472,472,472,472
DCA,85,85,85,85,85,85,85,85,85,85,85,85,85
cholic acid,39,39,39,39,39,39,39,39,39,39,39,39,39
HC-A1,4,4,4,4,4,4,4,4,4,4,4,4,4
1 mM cholanic acid,3,3,3,3,3,3,3,3,3,3,3,3,3
isooctyltrimethoxysilane,3,3,3,3,3,3,3,3,3,3,3,3,3
isooctyltriethoxysilane,2,2,2,2,2,2,2,2,2,2,2,2,2
20 mM diphenyphosphinic acid,1,1,1,1,1,1,1,1,1,1,1,1,1
7 mM CC1(C2=CC=CC=C2C=2C=CC(=CC12)N(C1=CC=C(C(=O)O)C=C1)C1=CC=2C(C3=CC=CC=C3C2C=C1)(C)C)C,1,1,1,1,1,1,1,1,1,1,1,1,1


In [382]:
# Records per raw co-sensitizer entry, largest group first.
no_metals.groupby(['Co-sensitizer']).count().sort_values('PCE', ascending=False)

Unnamed: 0_level_0,VOC,JSC,FF,PCE,Electrolyte,Active area,Co-adsorbent,Semiconductor,Dye loading,Exposure time,Solar simulator,Molecule SMILE,Molecule keywords
Co-sensitizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
-,1980,1980,1980,1980,1980,1980,1980,1980,1980,1980,1980,1980,1980
N719,11,11,11,11,11,11,11,11,11,11,11,11,11
TC2,4,4,4,4,4,4,4,4,4,4,4,4,4
"N719, Ruthenium",2,2,2,2,2,2,2,2,2,2,2,2,2
S2,2,2,2,2,2,2,2,2,2,2,2,2,2
0.73e-04 mol/cm3,1,1,1,1,1,1,1,1,1,1,1,1,1
0.89e-04 mol/cm3,1,1,1,1,1,1,1,1,1,1,1,1,1
0.92e-04 mol/cm3,1,1,1,1,1,1,1,1,1,1,1,1,1
D3,1,1,1,1,1,1,1,1,1,1,1,1,1
D35,1,1,1,1,1,1,1,1,1,1,1,1,1


In [383]:
# Treat both the '-' placeholder and missing values as "no co-sensitizer" and
# store them under the single string label 'None'.
without_cosensitizer = no_metals['Co-sensitizer'].eq("-") | no_metals['Co-sensitizer'].isnull()
no_metals.loc[without_cosensitizer, 'Co-sensitizer'] = 'None'

# Re-count per co-sensitizer entry to check the merge.
no_metals.groupby(['Co-sensitizer']).count().sort_values(by='PCE', ascending=False)

Unnamed: 0_level_0,VOC,JSC,FF,PCE,Electrolyte,Active area,Co-adsorbent,Semiconductor,Dye loading,Exposure time,Solar simulator,Molecule SMILE,Molecule keywords
Co-sensitizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
,1980,1980,1980,1980,1980,1980,1980,1980,1980,1980,1980,1980,1980
N719,11,11,11,11,11,11,11,11,11,11,11,11,11
TC2,4,4,4,4,4,4,4,4,4,4,4,4,4
"N719, Ruthenium",2,2,2,2,2,2,2,2,2,2,2,2,2
S2,2,2,2,2,2,2,2,2,2,2,2,2,2
0.73e-04 mol/cm3,1,1,1,1,1,1,1,1,1,1,1,1,1
0.89e-04 mol/cm3,1,1,1,1,1,1,1,1,1,1,1,1,1
0.92e-04 mol/cm3,1,1,1,1,1,1,1,1,1,1,1,1,1
D3,1,1,1,1,1,1,1,1,1,1,1,1,1
D35,1,1,1,1,1,1,1,1,1,1,1,1,1


In [384]:
# Label -> text fragments that identify it in the co-sensitizer entry,
# applied in the order written. Entries that contain 'mol/cm3' are relabelled
# "Unknown".
cosensitizer_groups = (
    ("D35", ("D35",)),
    ("D149/D131", ("D149", "D131")),
    ("Unknown", ("mol/cm3",)),
)

for cosensitizer_label, fragments in cosensitizer_groups:
    for fragment in fragments:
        mentions_fragment = no_metals['Co-sensitizer'].str.contains(fragment)
        no_metals.loc[mentions_fragment, 'Co-sensitizer'] = cosensitizer_label

# Records per co-sensitizer label after grouping.
no_metals.groupby(['Co-sensitizer']).count().sort_values(by='PCE', ascending=False)

Unnamed: 0_level_0,VOC,JSC,FF,PCE,Electrolyte,Active area,Co-adsorbent,Semiconductor,Dye loading,Exposure time,Solar simulator,Molecule SMILE,Molecule keywords
Co-sensitizer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
,1980,1980,1980,1980,1980,1980,1980,1980,1980,1980,1980,1980,1980
N719,11,11,11,11,11,11,11,11,11,11,11,11,11
TC2,4,4,4,4,4,4,4,4,4,4,4,4,4
Unknown,3,3,3,3,3,3,3,3,3,3,3,3,3
"N719, Ruthenium",2,2,2,2,2,2,2,2,2,2,2,2,2
S2,2,2,2,2,2,2,2,2,2,2,2,2,2
D3,1,1,1,1,1,1,1,1,1,1,1,1,1
D35,1,1,1,1,1,1,1,1,1,1,1,1,1
IQ21,1,1,1,1,1,1,1,1,1,1,1,1,1
QX20 cosensitizer,1,1,1,1,1,1,1,1,1,1,1,1,1


In [385]:
# Column inventory of the cleaned frame before choosing what to keep.
no_metals.columns

Index(['VOC', 'JSC', 'FF', 'PCE', 'Electrolyte', 'Active area', 'Co-adsorbent',
       'Co-sensitizer', 'Semiconductor', 'Dye loading', 'Exposure time',
       'Solar simulator', 'Molecule SMILE', 'Molecule keywords'],
      dtype='object')

In [386]:
# The relabelling cells only overwrite values, so the shape should match the
# one reported right after the Ru / Zn / se filter.
no_metals.shape

(2007, 14)

In [387]:
# The experimental-condition columns are left out of the feature space for
# now; no_metals itself is not modified.
experimental_columns = ['Active area', 'Dye loading', 'Exposure time', 'Solar simulator']
dye_device_activity = no_metals.drop(columns=experimental_columns)

In [388]:
# Show the reduced frame (pandas truncates the display to its head and tail).
dye_device_activity

Unnamed: 0,VOC,JSC,FF,PCE,Electrolyte,Co-adsorbent,Co-sensitizer,Semiconductor,Molecule SMILE,Molecule keywords
0,687.0,10.79,0.70,5.19,Iodide_Triiodide,,,TiO2,N#C/C(=C\c1ccc(cc1)N(c1ccccc1)CCCCCCN1c2ccccc2...,phenothiazine
7,678.0,13.20,0.67,6.00,Iodide_Triiodide,,,TiO2,N#C/C(=C\c1ccc(s1)c1ccc(s1)c1ccc(cc1)N(c1ccc(c...,"coumarin, triphenylamine"
8,695.0,12.20,0.74,6.20,Iodide_Triiodide,,,TiO2,N#C/C(=C\c1ccc(s1)c1ccc(cc1)N(c1ccc(cc1)/C=C/c...,"coumarin, triphenylamine"
9,800.0,9.00,0.76,5.50,Iodide_Triiodide,,,TiO2,CCN(c1ccc2c(c1)oc(=O)c(c2)/C=C/c1ccc(cc1)N(c1c...,"coumarin, triphenylamine"
10,560.0,3.41,0.73,1.39,Iodide_Triiodide,,,TiO2,CCN([C@@H]1C=Cc2c(C1)oc(=O)c(c2)c1ccc(cc1)c1cc...,coumarin
...,...,...,...,...,...,...,...,...,...,...
4419,740.0,14.56,0.68,7.39,Iodide_Triiodide,,,TiO2,C(CCCCC)N1C=2C=CC(=CC2C=2N(C=3C=CC(=CC3C21)/C=...,indoline
4422,651.0,9.93,0.70,4.52,Iodide_Triiodide,,,TiO2,C(#N)/C(/C(=O)O)=C\C=1SC(=CC1)C=1C=CC=2N(C3=CC...,phenothiazine
4423,667.0,12.85,0.71,6.09,Iodide_Triiodide,,,TiO2,CSC1=CC=C(C=C1)C=1C=C2SC=3C=C(C=CC3N(C2=CC1)CC...,phenothiazine
4424,659.0,12.46,0.69,5.65,Iodide_Triiodide,,,TiO2,COC1=CC=C(C=C1)C=1C=C2SC=3C=C(C=CC3N(C2=CC1)CC...,phenothiazine


In [389]:
# Split the cleaned table into three frames that share the same row index:
# measured quantities, dye description, and device description.
# 'Molecule keywords' is deliberately present in both the dye and the device
# frame.
activity_columns = ['VOC', 'JSC', 'FF', 'PCE']
dye_columns = ['Molecule SMILE', 'Molecule keywords']
device_columns = ['Electrolyte', 'Co-adsorbent', 'Co-sensitizer', 'Semiconductor', 'Molecule keywords']

activity_df = dye_device_activity[activity_columns]
dye_df = dye_device_activity[dye_columns]
device_df = dye_device_activity[device_columns]

In [390]:
# First rows of the measurement frame; the index still carries the row labels
# of the raw table.
activity_df.head()

Unnamed: 0,VOC,JSC,FF,PCE
0,687.0,10.79,0.7,5.19
7,678.0,13.2,0.67,6.0
8,695.0,12.2,0.74,6.2
9,800.0,9.0,0.76,5.5
10,560.0,3.41,0.73,1.39


In [391]:
# Should have the same number of rows as the other two frames.
dye_df.shape

(2007, 2)

In [392]:
# Should have the same number of rows as the other two frames.
device_df.shape

(2007, 5)

In [393]:
# Write the three frames next to the raw data. The index is written too, so
# rows can be re-aligned across the three files.
# NOTE(review): the 'None' placeholder strings stored in 'Co-adsorbent' and
# 'Co-sensitizer' may be parsed back as missing values by pandas.read_csv with
# its default NA handling — TODO confirm and read with keep_default_na=False
# if the placeholder must survive.
csv_exports = (
    (activity_df, 'Data/fp_activity.csv'),
    (dye_df, 'Data/fp_dye.csv'),
    (device_df, 'Data/fp_device.csv'),
)
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path)