In [1]:
import os
os.chdir("..")
assert 'PromptDataExtract' in os.getcwd()

In [2]:
# Load settings
from backend import sett, postgres
from backend.utils.frame import Frame
sett.load_settings()
postgres.load_settings()

Load OK: settings.yaml


In [3]:
# Connect to database
db = postgres.connect('polylet')

[1;36m      --[0m postgres_ SSH tunnel established.
[1;36mNOTE  --[0m postgres_ Connected to PostGres DB: polylet (took 0.048 s)


In [4]:
import pandas as pd
import matplotlib.pyplot as plt
try:
    plt.style.use("PromptDataExtraction/notebooks/matplotlib.mplstyle")
except:
    try:
        plt.style.use("~/matplotlib.mplstyle")
    except: pass

In [5]:
def execute(sql, **kwargs) -> pd.DataFrame:
    """ Query the database using raw sql.
        Return a pandas dataframe containing the results.
    """
    results = postgres.raw_sql(sql, kwargs)
    if not results:
        return None
    return pd.DataFrame.from_records(results, columns=results[0]._fields)

# Property Specific vs. General

In [22]:
# Methods IDs for property specific similar GPT pipeline.
gpt_similar_id_name = {
    164: 'glass transition temperature',
    171: 'melting temperature',
    155: 'thermal decomposition temperature',
    169: 'thermal conductivity',
    174: 'tensile strength',
    159: 'youngs modulus',
    130: 'compressive strength', 
    133: 'elongation at break',
    127: 'flexural strength',
    172: 'impact strength',
    157: 'ion exchange capacity',
    163: 'ionic conductivity',
    156: 'water contact angle',
    129: 'dielectric constant',
    134: 'density',
    131:  'bandgap',
    158: 'limiting oxygen index',
    170: 'hardness',
    173: 'lower critical solution temperature',
    166: 'upper critical solution temperature',
    132: 'CO_{2} permeability',
    167: 'O_{2} permeability',
    168: 'H_{2} permeability',
    128: 'crystallization temperature', 
    162: 'refractive index', 
    161: 'water uptake', 
    160: 'swelling degree', 
    165: 'methanol permeability'    
}

In [44]:
# All extracted properties by the P-GPT pipeline.
propnames = []
for _id, _item in gpt_similar_id_name.items():
    # print(_item)
    rows = postgres.raw_sql("""
                SELECT DISTINCT(ep.entity_name) FROM extracted_properties ep
                WHERE ep.method_id = :mid""", mid = _id)
    for row in rows:
        propnames.append(row.entity_name.lower())
propnames = list(set(propnames))
propnames = sorted(propnames)

In [48]:
print("Properties extracted by the property specific pipeline:", len(propnames))
propnames

Properties extracted by the property specific pipeline: 911


['%-elongation at break',
 '%coverage',
 '%χc',
 '0-1 transition',
 '0-2 transition',
 '1% mass loss temperature',
 '1% weight loss temperature',
 '1-0 transition',
 '10 % weight loss temperature',
 '10 wt % weight loss temperature',
 '10% decomposition temperature',
 '10% degradation temperature',
 '10% thermal decomposition temperature',
 '10% weight loss temperature',
 '10% weight-loss temperature',
 '2-0 transition',
 '20% weight loss decomposition temperature',
 '5 wt % decomposition temperature',
 '5% decomposition temperature',
 '5% weight loss temperature',
 '5% weight loss temperature (t5%)',
 '50% cp change during vitrification',
 '50% weight loss decomposition temperature',
 '50% weight loss temperature',
 'abs max',
 'abs onset',
 'absorption band',
 'absorption band edges',
 'absorption bands',
 'absorption coefficient',
 'absorption edge',
 'absorption edge optical band gap',
 'absorption maxima',
 'absorption maximum',
 'absorption maximum in solution',
 'absorption maxi

In [47]:
# Known property names.
knownprops = []
for row in postgres.raw_sql("""
    Select * From property_metadata;
"""):
    knownprops += row.other_names

knownprops = sorted([k.lower() for k in knownprops])
knownprops = list(set(knownprops))

print("Known polymer property names:", len(knownprops))
knownprops

Known polymer property names: 396


['energy density',
 't_{c}',
 'bandgaps',
 'compressive stress',
 'calculated molecular weight',
 'refractive indices',
 'glassy transition temperature (t g)',
 'band gap energies',
 'h2/co2 selectivity',
 'specific capacity',
 'highest occupied molecular orbital levels',
 'light power conversion efficiency',
 'relative crystallinity',
 'm_{w}/m_{n}',
 'current efficiency',
 't_{g})',
 'bending strength',
 'm_{n}',
 'polydispersity indices',
 '% crystallinity',
 'optical-band-gap energies',
 'rejections',
 'glass transition',
 'lcst',
 'power conversion efficiencies pces',
 'water contact angle',
 'glass transition temperature (tg)',
 't g',
 'compressive strengths',
 'ocv',
 'short-circuit density',
 'areal power density',
 'mn',
 'bend strength',
 't_d',
 'j_{sc}',
 'optical band gap energy',
 'refractive index',
 'tc',
 "young's modulus",
 'homo energy level',
 'degrees of crystallinity (xc)',
 'band-gaps',
 'optical bandgaps',
 'swelling degree',
 'co2 permeance',
 'minimum water a

In [49]:
# General extracted names.
method = postgres.raw_sql("""
    Select * From extraction_methods Where name = 'g-ner-gpt35-similar-sel1k';
""")[0]

gprops = postgres.raw_sql("""
SELECT DISTINCT(ep.entity_name) FROM extracted_properties ep
WHERE ep.method_id = :mid;""", mid = method.id)
gprops = [p.entity_name.lower() for p in gprops]
gprops = sorted(gprops)

print("Properties extracted by the general pipeline:", len(gprops))
gprops

Properties extracted by the general pipeline: 5341


['%',
 '% crystallinity',
 '% rejection',
 '% xylose rejection',
 '% χ c',
 '%-elongation at break',
 '%coverage',
 '%de_{30}',
 '%de_{90}',
 '%i\\mu',
 '(010) diffraction peak',
 '(010) i-i stacking peak position',
 '(010) scattering peak',
 '(100) diffraction peaks',
 '(100) scattering peak',
 '(300) scattering peak',
 '(dn/dc)μ',
 '(oi)m',
 '(δ m/ δ e)',
 '(δ q/ δ e)',
 '(τ_{ac} + τ_{br})',
 '-(ch_{2}) n-protons of dodecyloxy group',
 '-ch resonance',
 '-och_{2} -groups attached to thiophene ring',
 '-och_{2} ch_{2} peak',
 '-so_{3} h absorption intensity',
 '-δ s_{m}',
 '0-0 absorption peak',
 '0-0 transition',
 '0-0 transition in film',
 '0-0 transition in solution',
 '0-1 absorption peak',
 '0-1 transition',
 '0-2 absorption peak',
 '0-2 transition',
 '1 % mass loss temperature',
 '1 % weight loss temperature',
 '1% weight loss temperature',
 '1,4-cyclohexanedimethanol terephthalate content',
 '1-0 transition',
 '1/ c_{1}',
 '1/n',
 '10 % weight loss temperature',
 '10 wt % weigh

In [50]:
# Extra properties.
extraprops = []
for prop in gprops:
    if prop not in propnames:
        extraprops.append(prop)

print("Properties extracted by the general pipeline, but not by the property specific pipeline:", len(extraprops))
extraprops

Properties extracted by the general pipeline, but not by the property specific pipeline: 4634


['%',
 '% crystallinity',
 '% rejection',
 '% xylose rejection',
 '% χ c',
 '%de_{30}',
 '%de_{90}',
 '%i\\mu',
 '(010) diffraction peak',
 '(010) i-i stacking peak position',
 '(010) scattering peak',
 '(100) diffraction peaks',
 '(100) scattering peak',
 '(300) scattering peak',
 '(dn/dc)μ',
 '(oi)m',
 '(δ m/ δ e)',
 '(δ q/ δ e)',
 '(τ_{ac} + τ_{br})',
 '-(ch_{2}) n-protons of dodecyloxy group',
 '-ch resonance',
 '-och_{2} -groups attached to thiophene ring',
 '-och_{2} ch_{2} peak',
 '-so_{3} h absorption intensity',
 '-δ s_{m}',
 '0-0 absorption peak',
 '0-0 transition',
 '0-0 transition in film',
 '0-0 transition in solution',
 '0-1 absorption peak',
 '0-2 absorption peak',
 '1 % mass loss temperature',
 '1 % weight loss temperature',
 '1,4-cyclohexanedimethanol terephthalate content',
 '1/ c_{1}',
 '1/n',
 '13c nmr',
 '1^{31} p-nmr',
 '1h nmr spectra',
 '1h nmr spectrum',
 '1hnmr',
 '2,6-dfbn (mmol)',
 '250 nm',
 '2d flory exponent (v)',
 '2θ',
 '2θ angle',
 '2θ d value',
 '2θ s

In [51]:
# Extra unknown properties.
unknownprops = []
for prop in gprops:
    if prop not in propnames and prop not in knownprops:
        unknownprops.append(prop)

print("Properties that are not known, and extracted by the general pipeline:", len(unknownprops))
unknownprops

Properties that are not known, and extracted by the general pipeline: 4511


['%',
 '% xylose rejection',
 '% χ c',
 '%de_{30}',
 '%de_{90}',
 '%i\\mu',
 '(010) diffraction peak',
 '(010) i-i stacking peak position',
 '(010) scattering peak',
 '(100) diffraction peaks',
 '(100) scattering peak',
 '(300) scattering peak',
 '(dn/dc)μ',
 '(oi)m',
 '(δ m/ δ e)',
 '(δ q/ δ e)',
 '(τ_{ac} + τ_{br})',
 '-(ch_{2}) n-protons of dodecyloxy group',
 '-ch resonance',
 '-och_{2} -groups attached to thiophene ring',
 '-och_{2} ch_{2} peak',
 '-so_{3} h absorption intensity',
 '-δ s_{m}',
 '0-0 absorption peak',
 '0-0 transition',
 '0-0 transition in film',
 '0-0 transition in solution',
 '0-1 absorption peak',
 '0-2 absorption peak',
 '1 % mass loss temperature',
 '1 % weight loss temperature',
 '1,4-cyclohexanedimethanol terephthalate content',
 '1/ c_{1}',
 '1/n',
 '13c nmr',
 '1^{31} p-nmr',
 '1h nmr spectra',
 '1h nmr spectrum',
 '1hnmr',
 '2,6-dfbn (mmol)',
 '250 nm',
 '2d flory exponent (v)',
 '2θ',
 '2θ angle',
 '2θ d value',
 '2θ scan data range',
 '2θ value',
 '2θ v