### Imports + repo paths

In [1]:
import os, json, re, sys, shutil, shlex, subprocess
from pathlib import Path
from datetime import datetime

# If you have helper.utils in your repo, use it; otherwise fallback to local finder.
def find_repo_root(start: Path = None) -> Path:
    p = start or Path.cwd()
    for parent in [p, *p.resolve().parents]:
        if (parent / ".git").exists():
            return parent
    raise RuntimeError("No git repo root found (.git not present). Run this inside the repo folder.")

repo_root = find_repo_root()
print("‚úÖ Repo root:", repo_root)

payload_dir = repo_root / "notebooks" / "release_payload"
meta_dir    = repo_root / "metadata"
wf_dir      = repo_root / ".github" / "workflows"
helper_dir  = repo_root / "helper"

payload_dir.mkdir(parents=True, exist_ok=True)
meta_dir.mkdir(parents=True, exist_ok=True)
wf_dir.mkdir(parents=True, exist_ok=True)

print("payload_dir:", payload_dir)
print("meta_dir:", meta_dir)
print("wf_dir:", wf_dir)


‚úÖ Repo root: /home/097e80f6-6687-4e65-aab6-9abf7b887006/GreenInformationFactory_Prototype
payload_dir: /home/097e80f6-6687-4e65-aab6-9abf7b887006/GreenInformationFactory_Prototype/notebooks/release_payload
meta_dir: /home/097e80f6-6687-4e65-aab6-9abf7b887006/GreenInformationFactory_Prototype/metadata
wf_dir: /home/097e80f6-6687-4e65-aab6-9abf7b887006/GreenInformationFactory_Prototype/.github/workflows


### Clean payload directory

Files will be deleted locally after the upload to avoid glutter on the repo. For now: alawys start clean.

In [2]:
# Clean payload
for p in payload_dir.glob("*"):
    if p.is_file():
        p.unlink()
    elif p.is_dir():
        shutil.rmtree(p)

print("‚úÖ Cleaned payload dir:", payload_dir)


‚úÖ Cleaned payload dir: /home/097e80f6-6687-4e65-aab6-9abf7b887006/GreenInformationFactory_Prototype/notebooks/release_payload


Define what goes into the release

In [3]:
# --- Core processed data files ---
Files = [
    "data/processed/Train/BioFairNet_Pilot1_Testrun_Train_in.csv",
    "data/processed/Train/BioFairNet_Pilot1_Testrun_Train_out.csv",
    "data/processed/Test/BioFairNet_Pilot1_Testrun_Test_in.csv",
    "data/processed/Test/BioFairNet_Pilot1_Testrun_Test_out.csv",
]

# --- Models (you said they are in notebooks/models) ---
Models = [
    "notebooks/models/all_models.pkl",
]

# --- Automatically collect ALL outputs from Notebooks 04 & 05 ---
results_dir = repo_root / "data" / "results"

Results = []

# Sustainability evaluation outputs (Notebook 04)
Results += [str(p.relative_to(repo_root)) for p in results_dir.glob("sustainability*.*")]
Results += [str(p.relative_to(repo_root)) for p in results_dir.glob("compare_*.*")]
Results += [str(p.relative_to(repo_root)) for p in results_dir.glob("tradeoff*.*")]
Results += [str(p.relative_to(repo_root)) for p in results_dir.glob("dist_*.*")]
Results += [str(p.relative_to(repo_root)) for p in results_dir.glob("test_vs_validation*.*")]
Results += [str(p.relative_to(repo_root)) for p in results_dir.glob("method_comparison*.*")]
Results += [str(p.relative_to(repo_root)) for p in results_dir.glob("correlation*.*")]

# Scenario analysis outputs (Notebook 05)
Results += [str(p.relative_to(repo_root)) for p in results_dir.glob("scenario*.*")]
Results += [str(p.relative_to(repo_root)) for p in results_dir.glob("sensitivity*.*")]
Results += [str(p.relative_to(repo_root)) for p in results_dir.glob("response_surface*.*")]
Results += [str(p.relative_to(repo_root)) for p in results_dir.glob("oneway*.*")]
Results += [str(p.relative_to(repo_root)) for p in results_dir.glob("twofactor*.*")]

# Remove duplicates & sort
Results = sorted(set(Results))

# --- Combine everything ---
sources = Files + Results + Models

print("üì¶ Release payload will include:")
for s in sources:
    print(" -", s)


üì¶ Release payload will include:
 - data/processed/Train/BioFairNet_Pilot1_Testrun_Train_in.csv
 - data/processed/Train/BioFairNet_Pilot1_Testrun_Train_out.csv
 - data/processed/Test/BioFairNet_Pilot1_Testrun_Test_in.csv
 - data/processed/Test/BioFairNet_Pilot1_Testrun_Test_out.csv
 - data/results/compare_co2_methods_test_rf.png
 - data/results/compare_mci_methods_test_rf.png
 - data/results/compare_methods_correlation_test_rf.png
 - data/results/dist_co2_test_vs_val_rf.png
 - data/results/dist_mci_test_vs_val_rf.png
 - data/results/scenario_co2_Stiring_rf.png
 - data/results/scenario_co2_time_s_rf.png
 - data/results/scenario_mci_Stiring_rf.png
 - data/results/scenario_mci_time_s_rf.png
 - data/results/scenario_results_oneway_rf.csv
 - data/results/scenario_sustainable_region_pca_Stiring_rf.png
 - data/results/scenario_sustainable_region_pca_time_s_rf.png
 - data/results/scenario_y_pred_Stiring_rf.png
 - data/results/scenario_y_pred_time_s_rf.png
 - data/results/sustainability_metho

### Collect all files

everything for upload will go into `notebooks/release_payload/`

In [4]:
missing = []
copied = []

for rel in sources:
    src = repo_root / rel
    if src.exists():
        shutil.copy2(src, payload_dir / src.name)
        copied.append(src.name)
    else:
        missing.append(rel)

print("‚úÖ Copied files:")
for f in sorted(copied):
    print("  -", f)

if missing:
    print("\n‚ö†Ô∏è Missing (not copied):")
    for m in missing:
        print("  -", m)


‚úÖ Copied files:
  - BioFairNet_Pilot1_Testrun_Test_in.csv
  - BioFairNet_Pilot1_Testrun_Test_out.csv
  - BioFairNet_Pilot1_Testrun_Train_in.csv
  - BioFairNet_Pilot1_Testrun_Train_out.csv
  - all_models.pkl
  - compare_co2_methods_test_rf.png
  - compare_mci_methods_test_rf.png
  - compare_methods_correlation_test_rf.png
  - dist_co2_test_vs_val_rf.png
  - dist_mci_test_vs_val_rf.png
  - scenario_co2_Stiring_rf.png
  - scenario_co2_time_s_rf.png
  - scenario_mci_Stiring_rf.png
  - scenario_mci_time_s_rf.png
  - scenario_results_oneway_rf.csv
  - scenario_sustainable_region_pca_Stiring_rf.png
  - scenario_sustainable_region_pca_time_s_rf.png
  - scenario_y_pred_Stiring_rf.png
  - scenario_y_pred_time_s_rf.png
  - sustainability_method_comparison_test_rf.csv
  - sustainability_method_comparison_test_vs_val_rf.csv
  - sustainability_pca_test_rf.csv
  - sustainability_pca_val_rf.csv
  - test_vs_validation_shift_summary_rf.csv
  - tradeoff_test_vs_validation_pca_rf.png


### Set Zenodo params

`metadata/zenodo_params.json` as single source of truth. This avoids having tons of workflow inputs.

<div class="alert alert-block alert-warning">
<b>Note:</b> Sandbox defaults to true.
</div>

In [15]:
zenodo_params = {
    "use_sandbox": False,                 # ‚úÖ default safe
    "community": "biofairnet",
    "license": "MIT",
    "upload_type": "dataset",
    "title": "GreenInformationFactory ‚Äì Release Payload (Processed Data, Models, Results)",
    "description": (
        "Release payload generated by the GreenInformationFactory pipeline. "
        "Contains processed data splits, trained model bundle, evaluation outputs, "
        "sustainability proxy results (v1/PCA/assumptions), and scenario analysis artifacts. "
        "Raw data source: 10.5281/zenodo.16256961."
    ),
    "creators": [
        {
            "name": "Rosnitschek, Tobias",
            "affiliation": "University of Bayreuth",
            "orcid": "0000-0002-4876-2536"  # optional
        }
    ],
    "keywords": [
        "FAIR", "machine learning", "sustainability assessment"
    ],
    "related_doi": "10.5281/zenodo.16256961",
    "payload_dir": "notebooks/release_payload"
}

params_path = meta_dir / "zenodo_params.json"
params_path.write_text(json.dumps(zenodo_params, indent=2), encoding="utf-8")
print("‚úÖ Wrote:", params_path.relative_to(repo_root))


‚úÖ Wrote: metadata/zenodo_params.json


### Generate the upload workflow from template

In [16]:
tpl_path = helper_dir / "zenodo-upload-template.yml"
assert tpl_path.exists(), f"Template not found: {tpl_path}"

tpl = tpl_path.read_text(encoding="utf-8")

# Insert path to params json into workflow
filled = tpl.replace("__PARAMS_JSON__", "metadata/zenodo_params.json")

# Slugged workflow filename
slug = "greeninformationfactory-release-zenodo-upload"
wf_path = wf_dir / f"{slug}.yml"

wf_path.write_text(filled, encoding="utf-8")
print("‚úÖ Wrote workflow:", wf_path.relative_to(repo_root))


‚úÖ Wrote workflow: .github/workflows/greeninformationfactory-release-zenodo-upload.yml


### Git commit params + workflow + payload

In [13]:
def run(cmd, check=True):
    print("$", cmd)
    subprocess.run(shlex.split(cmd), cwd=repo_root, check=check)

run("git pull --rebase origin main", check=False)

run("git add notebooks/release_payload metadata/zenodo_params.json .github/workflows", check=True)

msg = 'chore: prepare Zenodo release payload (sandbox default)'
run(f'git commit -m "{msg}"', check=False)

run("git push origin main", check=False)
print("‚úÖ Pushed changes to main.")


$ git pull --rebase origin main


From github.com:Tobi-Wan-Kenob1/GreenInformationFactory_Prototype
 * branch            main       -> FETCH_HEAD


Already up to date.
Current branch main is up to date.
$ git add notebooks/release_payload metadata/zenodo_params.json .github/workflows
$ git commit -m "chore: prepare Zenodo release payload (sandbox default)"
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean
$ git push origin main
‚úÖ Pushed changes to main.


Everything up-to-date


### Trigger upload

This avoids triggering all actions at onec. Here we use a tag prefix that only the upload workflow istens to.

In [17]:
doi = zenodo_params.get("related_doi", "10.5281/zenodo.16256961")
ts = datetime.utcnow().strftime("%Y%m%d-%H%M%S")
tag = f"zenodo-ul-release-{ts}"

# ensure uniqueness
existing = subprocess.run(shlex.split("git tag"), cwd=repo_root, capture_output=True, text=True).stdout.splitlines()
if tag in existing:
    tag = f"{tag}-{datetime.utcnow().strftime('%f')}"

run("git pull --rebase origin main", check=False)
run(f"git tag {tag}", check=True)
run("git push origin --tags", check=True)

print(f"‚úÖ Triggered Zenodo upload with tag: {tag}")
print("‚û°Ô∏è  Go to GitHub ‚Üí Actions ‚Üí 'Zenodo Upload' run logs.")


$ git pull --rebase origin main
$ git tag zenodo-ul-release-20260129-075226
$ git push origin --tags


error: cannot pull with rebase: You have unstaged changes.
error: please commit or stash them.


‚úÖ Triggered Zenodo upload with tag: zenodo-ul-release-20260129-075226
‚û°Ô∏è  Go to GitHub ‚Üí Actions ‚Üí 'Zenodo Upload' run logs.


To github.com:Tobi-Wan-Kenob1/GreenInformationFactory_Prototype.git
 * [new tag]         zenodo-ul-release-20260129-075226 -> zenodo-ul-release-20260129-075226


### Local Clean up

We do this since the data is curated on Zenodo, not within the Git Repository.

<div class="alert alert-block alert-danger">
<b>Attention:</b> Run this cell ONLY AFTER you see the Zenodo deposition succeeded in the Actions logs.
</div>

In [None]:
# Optional: remove local payload after successful upload
# (Do this only AFTER you see the Zenodo deposition succeeded in Actions logs.)
for p in payload_dir.glob("*"):
    if p.is_file():
        p.unlink()
print("‚úÖ Local payload cleaned.")
