In [1]:
import pyref as ref
import polars as pl
from pathlib import Path
import pandas as pd

# Data cleaning and processing for reflectivity data

In [2]:
beamtime = Path(
    "C:/Users/hduva/Washington State University (email.wsu.edu)/Carbon Lab Research Group - Documents/Synchrotron Logistics and Data/ALS - Berkeley/Data/BL1101/2024Apr/XRR/Processed"
)

ccd_dirs = [file for file in beamtime.glob("**/CCD*") if file.is_dir()]
print(ccd_dirs)

[WindowsPath('C:/Users/hduva/Washington State University (email.wsu.edu)/Carbon Lab Research Group - Documents/Synchrotron Logistics and Data/ALS - Berkeley/Data/BL1101/2024Apr/XRR/Processed/ZnPcOLD_/CCD Scan 83703'), WindowsPath('C:/Users/hduva/Washington State University (email.wsu.edu)/Carbon Lab Research Group - Documents/Synchrotron Logistics and Data/ALS - Berkeley/Data/BL1101/2024Apr/XRR/Processed/ZnPcOLD_/CCD Scan 83704'), WindowsPath('C:/Users/hduva/Washington State University (email.wsu.edu)/Carbon Lab Research Group - Documents/Synchrotron Logistics and Data/ALS - Berkeley/Data/BL1101/2024Apr/XRR/Processed/ZnPcOLD_/CCD Scan 83705'), WindowsPath('C:/Users/hduva/Washington State University (email.wsu.edu)/Carbon Lab Research Group - Documents/Synchrotron Logistics and Data/ALS - Berkeley/Data/BL1101/2024Apr/XRR/Processed/ZnPcOLD_/CCD Scan 83706'), WindowsPath('C:/Users/hduva/Washington State University (email.wsu.edu)/Carbon Lab Research Group - Documents/Synchrotron Logistics

In [80]:
def process_all(ccd_dirs: list[Path]) -> pl.DataFrame | None:
    from warnings import filterwarnings

    filterwarnings("ignore")
    all_data = []
    for ccd_dir in ccd_dirs:
        scan_id = ccd_dir.stem
        data_files = list(ccd_dir.glob("*/*"))
        for files in data_files:
            sample = files.parent.parent.parent.stem
            energy = files.parent.stem
            pol = files.stem
            scan_id = files.parent.parent.stem
            try:
                data = ref.Refl(files)
                df = data.refl
            except Exception as e:
                df = pd.DataFrame()
            if not df.empty:
                df["scan_id"] = scan_id
                df["sample"] = sample
                df["energy"] = float(energy)
                df["polarization"] = pol
                df = pl.from_pandas(
                    df[
                        [
                            "Refl",
                            "Err",
                            "Q",
                            "scan_id",
                            "sample",
                            "energy",
                            "polarization",
                        ]
                    ]
                )
                all_data.append(df)
    if len(all_data) > 0:
        return pl.concat(all_data)
    else:
        return None


df = process_all(ccd_dirs)
print(df.head())

shape: (5, 7)
┌──────────┬──────────┬────────┬────────────────┬──────────┬────────┬──────────────┐
│ Refl     ┆ Err      ┆ Q      ┆ scan_id        ┆ sample   ┆ energy ┆ polarization │
│ ---      ┆ ---      ┆ ---    ┆ ---            ┆ ---      ┆ ---    ┆ ---          │
│ f64      ┆ f64      ┆ f64    ┆ str            ┆ str      ┆ f64    ┆ str          │
╞══════════╪══════════╪════════╪════════════════╪══════════╪════════╪══════════════╡
│ 0.863552 ┆ 0.002024 ┆ 0.0027 ┆ CCD Scan 83710 ┆ ZnPcOLD_ ┆ 250.0  ┆ 100          │
│ 0.675314 ┆ 0.001583 ┆ 0.0054 ┆ CCD Scan 83710 ┆ ZnPcOLD_ ┆ 250.0  ┆ 100          │
│ 0.628422 ┆ 0.001474 ┆ 0.0082 ┆ CCD Scan 83710 ┆ ZnPcOLD_ ┆ 250.0  ┆ 100          │
│ 0.78342  ┆ 0.001837 ┆ 0.0109 ┆ CCD Scan 83710 ┆ ZnPcOLD_ ┆ 250.0  ┆ 100          │
│ 0.671482 ┆ 0.001575 ┆ 0.0136 ┆ CCD Scan 83710 ┆ ZnPcOLD_ ┆ 250.0  ┆ 100          │
└──────────┴──────────┴────────┴────────────────┴──────────┴────────┴──────────────┘


In [6]:
import hvplot.polars

plot = df.plot(
    x="Q",
    y="Refl",
    by=["polarization", "scan_id"],
    kind="scatter",
    title="April Beam Time",
    widget_location="top",
    logy=True,
    size=1,
)

plot

BokehModel(combine_events=True, render_bundle={'docs_json': {'bcf2bbd2-ed00-4044-9d7b-fec50ea6ddd6': {'version…

In [93]:
import numpy as np
from refnx.dataset import ReflectDataset


def refl_mask(data: pl.DataFrame) -> pl.DataFrame:
    """Mask the data based on the reflectivity."""

    data = data.filter(pl.col("Refl").is_not_nan())
    data = data.filter(pl.col("Refl") < 1 / pl.col("Q")**2)
    data = data.filter(pl.col("Refl") > 0)
    data = data.filter(pl.col("Err") < 1)

    # Mask pol 190 between 44 and 46 degrees for brewster's angle
    data = data.filter(
        ~(
            (data["polarization"] == "190")
            & (data["Q"] > 0.17)
            & (data["Q"] < 0.19)
        )
    )

    print(len(data))
    return data


def df_to_refnx(df: pl.DataFrame) -> pl.DataFrame:
    """Convert a pandas DataFrame to a refnx DataFrame."""

    combined_data = []

    for idx, df_slice in df.groupby(["energy", "sample"]):
        pol_100 = df_slice.filter(pl.col("polarization") == "100")
        pol_190 = df_slice.filter(pl.col("polarization") == "190")
        filter = []
        pol = ""

        if len(pol_100) > 0:
            pol += "s"
            filter.append(
                pol_100.groupby("scan_id")
                .agg(pl.count("Refl").sort(descending=True))
                .item(0, "scan_id")
            )

        if len(pol_190) > 0:
            pol += "p"
            filter.append(
                pol_190.groupby("scan_id")
                .agg(pl.count("Refl").sort(descending=True))
                .item(0, "scan_id")
            )

        data = df.filter(
            (pl.col("scan_id").is_in(filter)) & (pl.col("energy") == idx[0])
        ).sort("polarization")
        data = refl_mask(data)
        combined = pd.DataFrame(
            {
                "dataset": ReflectDataset(
                    data=(
                        data["Q"].to_numpy(),
                        data["Refl"].to_numpy(),
                        .1*data["Refl"].to_numpy(),
                    )
                ),
                "scan_id": ",".join(filter),
                "sample": idx[1],
                "energy": idx[0],
                "pol": pol,
            },
            index=[0],
        )

        combined_data.append(combined)
    return pd.concat(
        combined_data,
    )


preped_df = df_to_refnx(df)

import pickle as pkl

pkl.dump(preped_df, open(beamtime / "data.pkl", "wb"))


433
194
495
91
82
181
171
195
102
20
274
91
497


In [5]:
def reconstruct_runfile(ccd_dirs: list[Path]):
    from astropy.io import fits

    rf_headers = [
        "Sample X",
        "Sample Y",
        "Sample Z",
        "Sample Theta",
        "CCD Theta",
        "Higher Order Suppressor",
        "Horizontal Exit Slit Size",
        "Beamline Energy",
        "Exposure",
    ]
    all_data = []
    for ccd_dir in ccd_dirs:
        scan_id = ccd_dir.stem
        label = ccd_dir.parent.stem
        data_files = list(ccd_dir.glob("*/*/*.fits"))
        for file in data_files:
            pol = file.parent.stem
            try:
                with fits.open(file) as hdul:
                    header = hdul[0].header
                    data = [header[rf] for rf in rf_headers]
                    data.append(scan_id)
                    data.append(pol)
                    data.append(label)
                    all_data.append(data)
            except Exception as e:
                print(f"Error: {e}")
                print(f"File {file}")
    return pd.DataFrame(
        all_data, columns=rf_headers + ["scan_id", "polarization", "descriptor"]
    )


rf_test = reconstruct_runfile(ccd_dirs)
rf_test.head(10)

Error: No SIMPLE card found, this file does not appear to be a valid FITS file. If this is really a FITS file, try with ignore_missing_simple=True
File C:\Users\hduva\Washington State University (email.wsu.edu)\Carbon Lab Research Group - Documents\Synchrotron Logistics and Data\ALS - Berkeley\Data\BL1101\2024Apr\XRR\Processed\ZnPc_40nm\CCD Scan 83784\284.1\190.0\ZnPc_40nm83784-00694.fits


Unnamed: 0,Sample X,Sample Y,Sample Z,Sample Theta,CCD Theta,Higher Order Suppressor,Horizontal Exit Slit Size,Beamline Energy,Exposure,scan_id,polarization,descriptor
0,22.0,-11.0,-1.0,0.0,0.0,4.999514,1500.0,250.000479,0.001,CCD Scan 83703,100,ZnPcOLD_
1,22.0,-11.0,-0.99,0.0,0.0,4.999514,1500.0,250.000479,0.001,CCD Scan 83703,100,ZnPcOLD_
2,22.0,-11.0,-1.5,0.0,0.0,8.898891,1500.0,250.000479,0.001,CCD Scan 83704,100,ZnPcOLD_
3,22.0,-11.0,-1.49,0.0,0.0,8.898891,1500.0,250.000479,0.001,CCD Scan 83704,100,ZnPcOLD_
4,22.0,-11.0,-1.48,0.0,0.0,8.898891,1500.0,250.000479,0.001,CCD Scan 83704,100,ZnPcOLD_
5,22.0,-11.0,-1.47,0.0,0.0,8.898891,1500.0,250.000479,0.001,CCD Scan 83704,100,ZnPcOLD_
6,22.0,-11.0,-1.46,0.0,0.0,8.898891,1500.0,250.000479,0.001,CCD Scan 83704,100,ZnPcOLD_
7,22.0,-11.0,-1.45,0.0,0.0,8.898891,1500.0,250.000479,0.001,CCD Scan 83704,100,ZnPcOLD_
8,22.0,-11.0,-1.44,0.0,0.0,8.898891,1500.0,250.000479,0.001,CCD Scan 83704,100,ZnPcOLD_
9,22.0,-11.0,-1.43,0.0,0.0,8.898891,1500.0,250.000479,0.001,CCD Scan 83704,100,ZnPcOLD_


In [6]:
als_path = Path(
    "C:/Users/hduva/Washington State University (email.wsu.edu)/Carbon Lab Research Group - Documents/Synchrotron Logistics and Data/ALS - Berkeley/Data/BL1101/"
)

beamtimes = {"2024Apr", "2023Dec", "2023Nov"}

beamtime_list = [als_path / beamtime / "XRR/Processed" for beamtime in beamtimes]

runfile_summary = []
for beamtime in beamtime_list:
    ccd_dirs = [file for file in beamtime.glob("**/CCD*") if file.is_dir()]
    rf = reconstruct_runfile(ccd_dirs)
    runfile_summary.append(rf)

rf_all = pd.concat(runfile_summary)
rf_all.head(10)

Error: No SIMPLE card found, this file does not appear to be a valid FITS file. If this is really a FITS file, try with ignore_missing_simple=True
File C:\Users\hduva\Washington State University (email.wsu.edu)\Carbon Lab Research Group - Documents\Synchrotron Logistics and Data\ALS - Berkeley\Data\BL1101\2024Apr\XRR\Processed\ZnPc_40nm\CCD Scan 83784\284.1\190.0\ZnPc_40nm83784-00694.fits
Error: [Errno 22] Invalid argument
File C:\Users\hduva\Washington State University (email.wsu.edu)\Carbon Lab Research Group - Documents\Synchrotron Logistics and Data\ALS - Berkeley\Data\BL1101\2023Nov\XRR\Processed\PS-Film\CCD Scan 82264\283.0\100.0\PS-Film82264-00864.fits
Error: [Errno 22] Invalid argument
File C:\Users\hduva\Washington State University (email.wsu.edu)\Carbon Lab Research Group - Documents\Synchrotron Logistics and Data\ALS - Berkeley\Data\BL1101\2023Nov\XRR\Processed\ZnPc\CCD Scan 82261\287.0\190.0\ZnPc82261-00662.fits


Unnamed: 0,Sample X,Sample Y,Sample Z,Sample Theta,CCD Theta,Higher Order Suppressor,Horizontal Exit Slit Size,Beamline Energy,Exposure,scan_id,polarization,descriptor
0,22.0,-11.0,-1.0,0.0,0.0,4.999514,1500.0,250.000479,0.001,CCD Scan 83703,100,ZnPcOLD_
1,22.0,-11.0,-0.99,0.0,0.0,4.999514,1500.0,250.000479,0.001,CCD Scan 83703,100,ZnPcOLD_
2,22.0,-11.0,-1.5,0.0,0.0,8.898891,1500.0,250.000479,0.001,CCD Scan 83704,100,ZnPcOLD_
3,22.0,-11.0,-1.49,0.0,0.0,8.898891,1500.0,250.000479,0.001,CCD Scan 83704,100,ZnPcOLD_
4,22.0,-11.0,-1.48,0.0,0.0,8.898891,1500.0,250.000479,0.001,CCD Scan 83704,100,ZnPcOLD_
5,22.0,-11.0,-1.47,0.0,0.0,8.898891,1500.0,250.000479,0.001,CCD Scan 83704,100,ZnPcOLD_
6,22.0,-11.0,-1.46,0.0,0.0,8.898891,1500.0,250.000479,0.001,CCD Scan 83704,100,ZnPcOLD_
7,22.0,-11.0,-1.45,0.0,0.0,8.898891,1500.0,250.000479,0.001,CCD Scan 83704,100,ZnPcOLD_
8,22.0,-11.0,-1.44,0.0,0.0,8.898891,1500.0,250.000479,0.001,CCD Scan 83704,100,ZnPcOLD_
9,22.0,-11.0,-1.43,0.0,0.0,8.898891,1500.0,250.000479,0.001,CCD Scan 83704,100,ZnPcOLD_


In [17]:
len(rf_all)

15512

In [16]:
rf_all.write_parquet("runfile_summary.parquet")

In [16]:
df_all.plot(
    x="Q",
    y="Refl",
    by=["scan_id", "polarization"],
    groupby="energy",
    kind="scatter",
    title="ALS Beam Times",
    widget_location="top",
    logy=True,
    size=1,
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'17b40b70-38d1-45e1-a54a-cfb9b296660d': {'version…