In [2]:
import pandas as pd
from pathlib import Path

In [7]:
flat_path = Path.home() / "projects" / "xray-atlas" /  "data"

molecules = [f.name for f in flat_path.glob("*.json") if f.stem != "registry"]
_ = [print(f) for f in molecules]

P3HT.json
PC71BM.json
PBDB-T.json
PC61BM.json
ITIC.json
D18.json
Y11.json
PBTTT.json
Y6.json
N2200.json


In [16]:
import boto3

boto3.setup_default_session(profile_name='lbl')

s3 = boto3.resource('s3')

for bucket in s3.buckets.all():
    if 'data' in bucket.name:
        data_bucket = bucket
        break


P3HT.json already exists on s3
PC71BM.json already exists on s3
PBDB-T.json already exists on s3
PC61BM.json already exists on s3
ITIC.json already exists on s3
D18.json already exists on s3
Y11.json already exists on s3
PBTTT.json already exists on s3
Y6.json already exists on s3
N2200.json already exists on s3


In [15]:
flat_files = [f for f in flat_path.rglob("*.json") if f.stem in [f.stem for f in data_files]]
flat_files

NameError: name 'data_files' is not defined

In [83]:
def unpack_key(key:str):
    split = key.split("_")
    return {
        "edge": f"{split[1]} ({2})",
        "type": split[3],
        "syncrotron": split[4],
        "endstation": split[5],
    }

In [86]:
import json

def add_to_flat_file(file, flat_file):
    # get data from flat file
    flat_data = json.load(flat_file.open())
    xcl = pd.ExcelFile(file)
    experiments = []
    for i, key in enumerate(xcl.sheet_names):
        # check if the key ends in En
        split = key.split("_")
        experiment = flat_data["experiments"][0] | {
                    "edge": f"{split[1]} ({split[2]})",
                    "type": split[4],
                    "synchrotron": split[5],
                    "endstation": split[6],
                    }
        print(experiment)
        data = {}
        sheet = xcl.parse(key)
        # get degrees from every other column
        en = sheet.columns[1::2]
        mu = sheet.columns[0::2]
        for (e, m) in zip(en, mu):
            deg = int(e.split("_")[-1].split("d")[0])
            e_list = sheet[e].tolist()
            m_list = sheet[m].tolist()
            nexafs = {
                "energy": e_list,
                "mu": m_list,
            }
            data[deg] = nexafs
        experiment['data'] = data
        experiments.append(experiment)
    flat_data["experiments"] = experiments
    with flat_file.open("w") as f:
        json.dump(flat_data, f, indent=4)


for file in data_files:
    flat_file = flat_path / f"{file.stem}.json"
    add_to_flat_file(file, flat_file)
    print(f"Added {file} to {flat_file}")

{'edge': 'C (K)', 'type': 'Ps-TEY', 'synchrotron': 'ANSTO', 'endstation': 'SXR', 'group': {'name': '', 'link': ''}, 'units': {'en': 'eV', 'theta': 'deg.', 'mu': 'arb. u.'}, 'data': {'30': {'energy': [0.094234467, 0.09302941, 0.094274715, 0.092356943, 0.090430386, 0.088830099, 0.082513399, 0.085180581, 0.080953352, 0.080396019, 0.081615537, 0.075170755, 0.080290899, 0.074743696, 0.07152418, 0.07052429, 0.07268659, 0.072405584, 0.073373042, 0.074487172, 0.065085635, 0.067382194, 0.06570784, 0.063952647, 0.061055552, 0.062234346, 0.062761106, 0.060392823, 0.062986284, 0.057397876, 0.061698675, 0.054391615, 0.058109157, 0.057217605, 0.053246219, 0.054229036, 0.055944666, 0.052330531, 0.048491895, 0.052361388, 0.05267825, 0.046177372, 0.048599001, 0.049718551, 0.045557018, 0.043214928, 0.044265769, 0.041390207, 0.039973818, 0.040337309, 0.043568753, 0.037364513, 0.038624972, 0.040107667, 0.038101301, 0.038190302, 0.037527021, 0.039090496, 0.029968929, 0.035666712, 0.033115413, 0.029633688, 

In [13]:
import json

# Read the registry.json file
with open(r'C:\Users\hduva\.projects\xray-atlas\data\regestry.json', 'r') as file:
    registry_data = json.load(file)

# Iterate through the molecule list
for molecule in registry_data['molecule']:
    # Construct the new JSON file
    new_data = {
        'header': {
            'name': molecule['name'],
            'formula': molecule['formula'],
            'image': molecule['image'],
            'vendor': molecule['vendor'],
            'cid': molecule['cid'],
            'cas': molecule['cas']
        },
        "prep": "",
        "experiments": [
            {
                "edge": "",
                "type": "",
                "synchrotron": "",
                "endstation": "",
                "group": {
                    "name": "",
                    "link": ""
                },
                 "units": {
                    "en": "eV",
                    "theta": "deg.",
                    "mu": "arb. u."
                },
                "data" : [{
                    "theta": 55,
                    "en": [],
                    "mu" : [],
                }]
                }
            ]
    }

    # Save the new JSON file
    with open(fr'C:\Users\hduva\.projects\xray-atlas\data\{molecule["name"]}.json', 'w') as new_file:
        json.dump(new_data, new_file, indent=4)

In [37]:
from dataclasses import dataclass
# construct datastructures to describe molecule data to be stored in a

@dataclass
class Molecule:
    name: str
    formula: str
    image: str
    vendor: str
    cid: str
    cas: str
    experiments: list[Experiment]

@dataclass
class Prep:
    method: str
    specification: SpinCoat | Pvd

@dataclass
class SpinCoat:
    solvent: str
    concentration: float
    speed: float
    time: float

@dataclass
class Pvd:
    substrate: Molecule
    temp: float
    rate: float
    pressure: float
    thickness: float

@dataclass
class Experiment:
    molecule: Molecule
    prep: Prep
    source: str
    endstation: str
    atom: str
    edge: str
    method: str
    experimentalist: {
        "name": str,
        "email": str,
        "institution": str
    }
    data_path: str

@dataclass
class Manifest:
    molecules: list[Molecule]

In [29]:
import dis

Mol = []
Atom = []
Edge = []
Method = []
Source = []
Endstation = []

for flat_data in data_files:
    xlf = pd.ExcelFile(flat_data)
    sheet_names = xlf.sheet_names

    sheets = []
    for sheet in sheet_names:
        df_ = xlf.parse(sheet)
        sheets.append(df_)
    df_ = pd.concat(sheets, axis=1)
    col = df_.columns
    for c in col:
        split = c.split("_")
        if len(split) == 9:
            split = split[:-1]
        mol, atom, edge, _, method, source, endstation, _ = split
        Mol.append(mol)
        Atom.append(atom)
        Edge.append(edge)
        Method.append(method)
        Source.append(source)
        Endstation.append(endstation)

xcl = pd.DataFrame({"Molecule": Mol, "Atom": Atom, "Edge": Edge, "Method": Method, "Source": Source, "Endstation": Endstation})

In [42]:
xcl.loc[xcl["Molecule"] == "P3HT", "Formula"] = "(C10H14S)n"
xcl.loc[xcl["Molecule"] == "P3HT", "Vendor"] = "https://weiyougroup.org/"
xcl.loc[xcl["Molecule"] == "P3HT", "CID"] = None
xcl.loc[xcl["Molecule"] == "P3HT", "CAS"] = "104934-50-1"
xcl.loc[xcl["Molecule"] == "P3HT", "Image"] = "https://www.sigmaaldrich.com/deepweb/assets/sigmaaldrich/product/structures/168/794/c83bc969-4b4a-4531-b2d4-580788e2989f/800/c83bc969-4b4a-4531-b2d4-580788e2989f.png"

xcl.loc[xcl["Molecule"] == "Y6", "Formula"] = "C82H86F4N8O2S5"
xcl.loc[xcl["Molecule"] == "Y6", "Vendor"] = "https://www.1-material.com/y6-btp-4f-pce157-nfa157/"
xcl.loc[xcl["Molecule"] == "Y6", "CID"] = "146037665"
xcl.loc[xcl["Molecule"] == "Y6", "CAS"] = "2304444-49-1"
xcl.loc[xcl["Molecule"] == "Y6", "Image"] = "https://pubchem.ncbi.nlm.nih.gov/image/imagefly.cgi?cid=146037665&width=300&height=300"

xcl.loc[xcl["Molecule"] == "PBDB-T", "Formula"] = ""


Unnamed: 0,Molecule,Atom,Edge,Method,Source,Endstation,Formula,Vendor,CID,CAS,Image
0,D18,C,K,TEY,ANSTO,SXR,C18H37OH,,,,
1,D18,C,K,TEY,ANSTO,SXR,C18H37OH,,,,
2,D18,C,K,TEY,ANSTO,SXR,C18H37OH,,,,
3,D18,C,K,TEY,ANSTO,SXR,C18H37OH,,,,
4,D18,C,K,TEY,ANSTO,SXR,C18H37OH,,,,
...,...,...,...,...,...,...,...,...,...,...,...
197,Y6,S,K,FY,NSLSII,SMI,C82H86F4N8O2S5,https://www.1-material.com/y6-btp-4f-pce157-nf...,146037665,2304444-49-1,https://pubchem.ncbi.nlm.nih.gov/image/imagefl...
198,Y6,S,K,FY,NSLSII,SMI,C82H86F4N8O2S5,https://www.1-material.com/y6-btp-4f-pce157-nf...,146037665,2304444-49-1,https://pubchem.ncbi.nlm.nih.gov/image/imagefl...
199,Y6,S,K,FY,NSLSII,SMI,C82H86F4N8O2S5,https://www.1-material.com/y6-btp-4f-pce157-nf...,146037665,2304444-49-1,https://pubchem.ncbi.nlm.nih.gov/image/imagefl...
200,Y6,C,K,Trans,ALS,BL532,C82H86F4N8O2S5,https://www.1-material.com/y6-btp-4f-pce157-nf...,146037665,2304444-49-1,https://pubchem.ncbi.nlm.nih.gov/image/imagefl...


In [4]:
import decimal
import pyodbc
import xml.etree.ElementTree as ET


conn = pyodbc.connect(
    "DRIVER={ODBC Driver 18 for SQL Server};SERVER=HDUVALLH;DATABASE=NEXAFS;Trusted_Connection=yes;TrustServerCertificate=yes"
)


def table_to_xml(conn: pyodbc.Connection, table: str) -> str:
    cursor = conn.cursor()
    cursor.execute(f"SELECT * FROM {table}")
    columns = [column[0] for column in cursor.description]
    rows = cursor.fetchall()
    # check if the rows are decimal
    for i, row in enumerate(rows):
        for j, value in enumerate(row):
            if isinstance(value, decimal.Decimal):
                rows[i][j] = float(value)
    root = ET.Element(table)
    for row in rows:
        row_element = ET.SubElement(root, "row")
        for col, value in zip(columns, row):
            col_element = ET.SubElement(row_element, col)
            col_element.text = str(value)
    xml_data = ET.tostring(root, encoding="utf-8", method="xml")
    return xml_data


def db_to_xml(conn: pyodbc.Connection):
    # get all tables from the database
    cursor = conn.cursor()
    cursor.execute(
        "SELECT * FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE='BASE TABLE'"
    )
    tables = [row[2] for row in cursor.fetchall()]
    for table in tables:
        xml_data = table_to_xml(conn, table)
        with open(f"{table}.xml", "wb") as f:
            f.write(xml_data)


db_to_xml(conn)

In [8]:
deg = {col.split("deg")[0][-2:] for col in xcl.columns}

dfs = []
for d in deg:
    for key, id in key_pairs.items():
        e = xcl[f"{key}_{d}deg_En"].dropna()
        i = xcl[f"{key}_{d}deg"].dropna()
        deg = [d] * len(e)

        dfs.append(pd.DataFrame({"exp_id": key, "e": e, "mu": i, "deg": deg}))

new_df = pd.concat(dfs)
new_df


Unnamed: 0,exp_id,e,mu,deg
0,P3HT_C_K_edge_TEY_ANSTO_SXR,270.66074,0.049305,90
1,P3HT_C_K_edge_TEY_ANSTO_SXR,270.76413,0.048886,90
2,P3HT_C_K_edge_TEY_ANSTO_SXR,270.86069,0.042668,90
3,P3HT_C_K_edge_TEY_ANSTO_SXR,270.95999,0.043200,90
4,P3HT_C_K_edge_TEY_ANSTO_SXR,271.06006,0.046153,90
...,...,...,...,...
90,P3HT_S_K_edge_FY_NSLSII_SMI,2510.00000,0.934421,40
91,P3HT_S_K_edge_FY_NSLSII_SMI,2515.00000,0.951659,40
92,P3HT_S_K_edge_FY_NSLSII_SMI,2520.00000,0.985252,40
93,P3HT_S_K_edge_FY_NSLSII_SMI,2525.00000,1.053501,40


In [9]:
def add_to_db(df, conn):
    for i, row in df.iterrows():
        id = row.exp_id
        split = id.split("_")
        mol = split[0]
        atom = split[1]
        edge = split[2]
        nex_type = split[4]
        source = split[5]
        endstation = split[6]

        query = f"""
        DECLARE @exp_id UNIQUEIDENTIFIER

        SELECT @exp_id = ExperimentID
        FROM EXP_LIST
        WHERE name = '{mol}'
        AND atom = '{atom}'
        AND edge = '{edge}'
        AND NEXAFSMethod = '{nex_type}'
        AND source = '{source}'
        AND endstation = '{endstation}'

        INSERT INTO Nexafs (exp_id, e, mu, deg)
        VALUES (@exp_id, {row.e}, {row.mu}, {row.deg})
        """
        conn.execute(query)
    conn.commit()

add_to_db(new_df, conn)