In [1]:
import os
import random
import pandas as pd

# Configuration
ROOT = "data/Bayou_Freeze"
SCENARIO = "S1"
NUM_ROWS = 200  # number of BOM items to generate

# Scraped or reference pipe schedule data (ASME B36.10M SCH 40 & 80)
# Normally pandas.read_html could scrape from EngineeringToolBox.
# Here we define a sample for demonstration.
pipe_dims = pd.DataFrame([
    {"nps": "1/2", "od_mm": 21.3, "sch40_thk_mm": 2.87, "sch80_thk_mm": 3.73},
    {"nps": "3/4", "od_mm": 26.7, "sch40_thk_mm": 2.87, "sch80_thk_mm": 3.73},
    {"nps": "1", "od_mm": 33.4, "sch40_thk_mm": 3.38, "sch80_thk_mm": 4.55},
    {"nps": "2", "od_mm": 60.3, "sch40_thk_mm": 3.91, "sch80_thk_mm": 6.02},
    {"nps": "4", "od_mm": 114.3, "sch40_thk_mm": 6.02, "sch80_thk_mm": 8.56},
    {"nps": "6", "od_mm": 168.3, "sch40_thk_mm": 6.55, "sch80_thk_mm": 10.97},
])

# Material specs relevant for cryogenic service
specs = ["304L SS", "9% Ni Alloy"]

# Generate synthetic BOM
records = []
for i in range(1, NUM_ROWS + 1):
    dim = pipe_dims.sample(1).iloc[0]
    spec = random.choice(specs)
    qty = random.randint(1, 20)
    need_by = pd.Timestamp("2025-08-01") - pd.Timedelta(days=random.randint(0, 30))
    records.append({
        "item": f"Line {i:03d}",
        "description": f"{dim['nps']} NPS pipe, {spec}",
        "spec": spec,
        "qty": qty,
        "unit": "pcs",
        "nps": dim["nps"],
        "schedule": random.choice(["SCH 40", "SCH 80"]),
        "od_mm": dim["od_mm"],
        "thickness_mm": dim["sch40_thk_mm"] if random.random() < 0.5 else dim["sch80_thk_mm"],
        "need_by": need_by.date(),
        "source_file": "synthetic_bom_generator",
        "scenario_id": SCENARIO
    })

df_bom = pd.DataFrame.from_records(records)
os.makedirs(ROOT, exist_ok=True)
df_bom.to_csv(os.path.join(ROOT, "scope.csv"), index=False)

print(f"Generated {len(df_bom)} BOM rows at {os.path.join(ROOT, 'scope.csv')}")


Generated 200 BOM rows at data/Bayou_Freeze/scope.csv


In [5]:
#!/usr/bin/env python3
"""
bayou_freeze_data_synthesis.py

1) Scrape ASME B36.10M pipe schedule table for OD & wall-thickness
2) Load supplier master mapping cryogenic-spec → vendors
3) Generate synthetic BOM entries with realistic quantities & need-by dates
4) Enrich each row with part codes and supplier recommendations
5) Output consolidated scope.csv for RAG ingestion
"""

import os
import random
import pandas as pd
import requests
from bs4 import BeautifulSoup

# -------- Configuration -------- #
ROOT_DIR       = "data/Bayou_Freeze"
SCENARIO_ID    = "S1"
SUPPLIER_CSV   = os.path.join(ROOT_DIR, "supplier_master.csv")
OUTPUT_CSV     = os.path.join(ROOT_DIR, "scope.csv")
NUM_BOM_ROWS   = 200
LEAD_TIME_DAYS = (20, 40)  # realistic lead-time window
# -------------------------------- #


def scrape_pipe_schedule() -> pd.DataFrame:
    """
    Scrape ASME B36.10M table from EngineeringToolBox for OD and wall thickness.
    """
    url = "https://www.engineeringtoolbox.com/asme-steel-pipes-sizes-d_42.html"
    resp = requests.get(url)
    resp.raise_for_status()
    tables = pd.read_html(resp.text)
    print(tables)
    # The first table is the metric schedule
    df = tables[0]
    df = df.rename(columns={
        "Pipe Size": "nps",
        "Outside Diameter (mm)": "od_mm",
        "Wall Thickness (mm)": "thickness_mm"
    })
    df = df[["nps", "od_mm", "thickness_mm"]]
    return df


def load_supplier_master(path: str) -> pd.DataFrame:
    """
    Load supplier master mapping cryogenic specs to pre-qualified vendors.
    Expect columns: spec, part_code, vendor_list (semicolon-separated)
    """
    df = pd.read_csv(path, dtype=str)
    # explode vendor_list into a Python list
    df["vendors"] = df["vendor_list"].str.split(";")
    return df.set_index("spec")


def generate_synthetic_bom(pipe_df: pd.DataFrame,
                           supplier_df: pd.DataFrame) -> pd.DataFrame:
    """
    Create synthetic BOM by sampling pipe sizes and mapping to OD/thickness.
    Enrich with part codes, supplier recommendations, and need-by dates.
    """
    specs = list(supplier_df.index)
    records = []
    for i in range(1, NUM_BOM_ROWS + 1):
        # pick a random spec and matching pipe size
        spec = random.choice(specs)
        # filter pipe_df for common NPS values (e.g., 1/2" to 6")
        candidate = pipe_df[pipe_df["nps"].isin(
            ["1/2\"", "3/4\"", "1\"", "2\"", "4\"", "6\""]
        )]
        row = candidate.sample(1).iloc[0]
        qty = random.randint(1, 20)
        need_by = pd.Timestamp("2025-08-01") - pd.Timedelta(
            days=random.randint(*LEAD_TIME_DAYS)
        )

        records.append({
            "item":           f"Line {i:03d}",
            "description":    f"{row['nps']} pipe, {spec}",
            "spec":           spec,
            "qty":            qty,
            "unit":           "pcs",
            "nps":            row["nps"],
            "od_mm":          row["od_mm"],
            "thickness_mm":   row["thickness_mm"],
            "need_by":        need_by.date(),
            # metadata enrichment
            "part_code":      supplier_df.at[spec, "part_code"],
            "vendors":        ", ".join(supplier_df.at[spec, "vendors"]),
            "scenario_id":    SCENARIO_ID,
        })

    return pd.DataFrame.from_records(records)


def main():
    os.makedirs(ROOT_DIR, exist_ok=True)

    # 1. Scrape pipe dimensions
    pipe_schedule_df = scrape_pipe_schedule()

    # 2. Load supplier master
    supplier_df = load_supplier_master(SUPPLIER_CSV)

    # 3. Generate synthetic BOM with metadata enrichment
    bom_df = generate_synthetic_bom(pipe_schedule_df, supplier_df)

    # 4. Write out scope.csv
    bom_df.to_csv(OUTPUT_CSV, index=False)
    print(f"[INFO] Generated {len(bom_df)} BOM rows → {OUTPUT_CSV}")


if __name__ == "__main__":
    main()


[    Diameter Nominal      Schedule            Outside Diameter - D - (mm)  \
            (inches) (mm) Schedule Schedule.1 Outside Diameter - D - (mm)   
0                1/8    6      10S        NaN                      10.300   
1                1/8    6      Std       40.0                      10.300   
2                1/8    6       XS       80.0                      10.300   
3                1/4    8      10S        NaN                      13.700   
4                1/4    8      Std       40.0                      13.700   
..               ...  ...      ...        ...                         ...   
109                8  200      NaN      120.0                     219.075   
110                8  200      NaN      140.0                     219.075   
111                8  200      NaN      160.0                     219.075   
112                8  200      NaN        NaN                     219.075   
113                8  200      NaN        NaN                     219.075  

  tables = pd.read_html(resp.text)


KeyError: "['nps' 'od_mm' 'thickness_mm'] not in index"

In [3]:
pip install lxml

Defaulting to user installation because normal site-packages is not writeable
Collecting lxml
  Downloading lxml-5.4.0-cp39-cp39-macosx_10_9_universal2.whl.metadata (3.5 kB)
Downloading lxml-5.4.0-cp39-cp39-macosx_10_9_universal2.whl (8.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: lxml
Successfully installed lxml-5.4.0
Note: you may need to restart the kernel to use updated packages.
