In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import joblib

from pathlib import Path
import matplotlib.pyplot as plt
from scipy.interpolate import interp1d
from pathlib import Path
from pymatgen.core import Structure, Composition, Element

# user-friendly print
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# see https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
import torch
torch.set_float32_matmul_precision('high')

### Load data

In [2]:
qc_ac_te_mp_dataset = pd.read_pickle('../data/raw/qc_ac_te_mp_rebuild_T=290K_20250202.pd.xz')

qc_ac_te_mp_dataset.head(3)
qc_ac_te_mp_dataset.shape

Unnamed: 0_level_0,formula,hypermaterial_type,temperature,composition,elements,Thermal conductivity,Seebeck coefficient,Carrier concentration,Electrical conductivity,Thermopower,...,Density,Efermi,Final energy per atom,Formation energy per atom,Total magnetization,Volume,Magnetic susceptibility,Hall coefficient,Electronic contribution,Electronic thermal conductivity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
starry_0,(ErAs)0.003InGaAs,others,290,"(Er, As, In, Ga)","(Er, As, In, Ga)",3.2670491071158483,,,,,...,,,,,,,,,,
qa_1,Al61.5Cu26.5Fe12,IQC,290,"(Al, Cu, Fe)","(Al, Cu, Fe)",1.4258964578008275,-1.1863527516689362e-05,,,,...,,,,,,,,,,
starry_2,Bi2Te2.4Se0.6,others,290,"(Bi, Te, Se)","(Bi, Te, Se)",,,-5.436073908332581e+24,1063.8779641515926,,...,,,,,,,,,,


(41017, 26)

All properties

In [3]:
starry_props = [
    "Thermal conductivity",
    "Carrier concentration",
    "Electrical conductivity",
    "Thermopower",
    "Electrical resistivity",
    "Power factor",
    "Seebeck coefficient",
    "Lattice thermal conductivity",
    "ZT",
    "Hall mobility",
    "Electronic contribution",  # Nan after QuantileTransformer
    "Electronic thermal conductivity",
]

ac_qc_props = [
    "Seebeck coefficient",
    "Thermal conductivity",
    "Electrical resistivity",
    "Magnetic susceptibility",
    # "Specific heat capacity",
    "Hall coefficient",
    "ZT",
    "Power factor",
]

mp_props = [
    "Band gap",
    "Density",
    "Efermi",
    "Final energy per atom",
    "Formation energy per atom",
    "Total magnetization",
    "Volume",  
]

ac_qc_starry_shared_props = [
    "Seebeck coefficient",
    "Thermal conductivity",
    "Electrical resistivity",
    "Power factor",
    "ZT",
]

ac_qc_starry_props = [
    "Seebeck coefficient",
    "Thermal conductivity",
    "Electrical resistivity",
    "Magnetic susceptibility",
    # "Specific heat capacity",
    "Hall coefficient",
    "ZT",
    "Power factor",
    "Carrier concentration",
    "Electrical conductivity",
    "Thermopower",
    "Lattice thermal conductivity",
    "Hall mobility",
    "Electronic contribution",
    "Electronic thermal conductivity",
]

all_props = ac_qc_starry_props + mp_props

qc_ac_te_mp_props = qc_ac_te_mp_dataset[all_props]

### Load logs

In [4]:
logs_dir = '../results/20250206_logs/common_logs/'

In [5]:
all_losses = []
for log in Path(logs_dir).iterdir():
    if log.name != '.ipynb_checkpoints':
        mp_rate, _ = log.name.split('-')
        _, mp_rate = mp_rate.split('_')
        mp_rate = float(mp_rate)
    
        if mp_rate == 0.8:
            continue
        print(f'MP rate: {mp_rate}')
    
        for version in log.iterdir():
            if version.name != '.ipynb_checkpoints':
                print(f'Version: {version.name}')
        
                metrics = pd.read_csv(version / 'metrics.csv')
                loss = metrics.iloc[-1, ["(test_loss)" in col for col in metrics.columns]].to_frame().T.assign(mp_rate=mp_rate, version=int(version.name.split('_')[1]))
                all_losses.append(loss)

all_losses = pd.concat(all_losses)
all_losses.to_csv('all_losses.csv', index=True)                

MP rate: 1.0
Version: version_0
Version: version_1
Version: version_2
Version: version_3
Version: version_4
MP rate: 0.6
Version: version_0
Version: version_1
Version: version_2
Version: version_3
Version: version_4
MP rate: 0.4
Version: version_0
Version: version_1
Version: version_2
Version: version_3
Version: version_4
MP rate: 0.2
Version: version_0
Version: version_1
Version: version_2
Version: version_3
Version: version_4
MP rate: 0.0
Version: version_0
Version: version_1
Version: version_2
Version: version_3
Version: version_4
MP rate: 0.1
Version: version_0
Version: version_1
Version: version_2
Version: version_3
Version: version_4
MP rate: 0.7
Version: version_0
Version: version_1
Version: version_2
Version: version_3
Version: version_4
MP rate: 0.95
Version: version_0
Version: version_1
Version: version_2
Version: version_3
Version: version_4
MP rate: 0.3
Version: version_0
Version: version_1
Version: version_2
Version: version_3
Version: version_4
MP rate: 0.5
Version: versi

In [33]:
metrics.iloc[-1, ["(test_loss)" in col for col in metrics.columns]].to_frame().T

Unnamed: 0,Carrier concentration (test_loss),Electrical conductivity (test_loss),Electrical resistivity (test_loss),Electronic contribution (test_loss),Electronic thermal conductivity (test_loss),Hall coefficient (test_loss),Hall mobility (test_loss),Lattice thermal conductivity (test_loss),Magnetic susceptibility (test_loss),Power factor (test_loss),Seebeck coefficient (test_loss),Thermal conductivity (test_loss),Thermopower (test_loss),ZT (test_loss)
648,1.674612,0.875944,0.634669,1.73968,1.47688,0.038775,1.800642,0.457433,0.117374,0.612136,0.671038,0.635638,1.252157,0.655596


In [None]:
test_losses.append(np.mean(losses))
test_losses_std.append(np.std(losses))

# Plotting with confidence intervals
plt.figure(figsize=(10, 6))
sns.set_style("whitegrid")

# Plot mean loss
plt.plot(fractions, test_losses, marker="o", label="Mean Loss")

# Add confidence intervals
ci = 1.96 * np.array(test_losses_std) / np.sqrt(num_runs)
plt.fill_between(
    fractions,
    np.array(test_losses) - ci,
    np.array(test_losses) + ci,
    alpha=0.3,
    label="95% CI",
)

plt.xlabel("Data Fraction")
plt.ylabel(f"Test Loss for {target_property}")
title = (
    f"Effect of {'Target' if mode == 'vary_target' else 'Other Properties'} "
    f"Data Size on {target_property} Prediction"
)
plt.title(title)
plt.legend()

# Save plot
save_dir = Path("images/multi_tasks/scaling_laws")
save_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(
    save_dir / f"scaling_law_{mode}_{target_property}.png",
    bbox_inches="tight",
    dpi=300,
)
plt.close()