In [1]:
# Load OpenAI API-Key
from dotenv import load_dotenv
import os

# Run python-dotenv's loader first, then look the OpenAI key up in the process
# environment; api_key is None when OPENAI_API_KEY is not set.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

In [2]:
from baml_client.sync_client import b
from baml_client.types import FaultReport

In [None]:
# few-shot 20 cases for evaluation

import json
from pathlib import Path

# Path to the few-shot cases JSONL file
input_path = Path("/Users/wbm/Documents/BIT/Research Topics/Potential Datasets/20_cases_for_baml_fewshot.jsonl")

# Load the log texts: every non-blank line is decoded with json.loads.
# Blank lines (for example a trailing newline at the end of the file) are
# skipped, because json.loads raises JSONDecodeError on an empty document.
with open(input_path, "r", encoding="utf-8") as f:
    log_entries = [json.loads(line) for line in f if line.strip()]

# Collect the results: one dict per case, holding either "result" or "error"
extracted_results = []

for i, log_text in enumerate(log_entries):
    print(f"🛠️ Processing case {i+1}/{len(log_entries)}")

    # Derive the case id once, before the API call: the first line of the log
    # with the "Case-ID:" label removed. Doing it here keeps the success and
    # failure records consistent and surfaces a malformed entry before any
    # request is sent.
    case_id = log_text.split("\n")[0].replace("Case-ID:", "").strip()

    try:
        result: FaultReport = b.ExtractFaultInfo(log_text=log_text)
        extracted_results.append({
            "case_id": case_id,
            "result": result.model_dump()
        })
    except Exception as e:
        # Best effort: record the failure and carry on with the next case.
        print(f" Failed to process case {i+1}: {e}")
        extracted_results.append({
            "case_id": case_id,
            "error": str(e)
        })

# Save the results to file
output_path = Path("baml_extracted_20_cases.json")
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(extracted_results, f, indent=2, ensure_ascii=False)

print(f" Extraction completed! Saved to: {output_path}")


🛠️ Processing case 1/20
2025-04-23T17:45:49.314 [BAML [92mINFO[0m] [35mFunction ExtractFaultInfo[0m:
    [33mClient: CustomGPT4o (gpt-4o-2024-08-06) - 2884ms. StopReason: stop. Tokens(in/out): 2028/87[0m
    [34m---PROMPT---[0m
    [2m[43msystem: [0m[2mYou are an expert in analyzing technical maintenance logs.
    
        Extract the following entities:
        - fault_location: the component with issue (add machine: IBM3/IBM4 if mentioned)
        - fault_symptoms: all observable problems or symptoms.
        - fault_reason: the stated causes of the issue (if any) + certainty
        - fault_measures: steps taken to resolve the issue + resolution_status
    
        Here are examples of extracting structured data:
    
        Examples 1:
            log_text: "Case-ID: IBM3_C22_03-Aug-16_04-Aug-16\nSources: storing, Storing, reparatie\nIssues:\n- Probleem: Beamcurrent tussen 300 - 200mA  (als je langzaam met de hand zakt tot 150mA)  Niet lager.        | Beamboard getest.

In [None]:
import json
import time
from pathlib import Path

# Define file paths
input_path = Path("/Users/wbm/Documents/BIT/Research Topics/Potential Datasets/remaining_cases_for_baml_fewshot.jsonl")
output_path = Path("baml_extracted_remaining_cases.json")

# Load log entries: every non-blank line is decoded with json.loads.
# Blank lines (for example a trailing newline at the end of the file) are
# skipped, because json.loads raises JSONDecodeError on an empty document.
with open(input_path, "r", encoding="utf-8") as f:
    log_entries = [json.loads(line) for line in f if line.strip()]

# Retry logic
def extract_with_retry(log_text, retries=3, delay=30):
    """Run ``b.ExtractFaultInfo`` on one log text, retrying on rate limits.

    Args:
        log_text: Text passed through as the ``log_text`` keyword argument.
        retries: Maximum number of attempts; must be at least 1.
        delay: Seconds to sleep before the next attempt after a rate-limit
            error.

    Returns:
        Whatever ``model_dump()`` returns for the extraction result.

    Raises:
        ValueError: If ``retries`` is smaller than 1. Previously the loop body
            never ran in that case and the function silently returned ``None``.
        Exception: The original error, re-raised unchanged, when it is not a
            rate-limit error or when the last attempt has failed.
    """
    if retries < 1:
        raise ValueError(f"retries must be >= 1, got {retries}")

    for attempt in range(retries):
        try:
            return b.ExtractFaultInfo(log_text=log_text).model_dump()
        except Exception as e:
            # Only errors whose message mentions "rate limit" are retried,
            # and only while attempts remain.
            out_of_attempts = attempt >= retries - 1
            if out_of_attempts or "rate limit" not in str(e).lower():
                # Bare raise keeps the original traceback intact.
                raise
            print(f"⚠️ Rate limited. Waiting {delay}s before retrying (attempt {attempt+1}/{retries})...")
            time.sleep(delay)

# Walk through the logs in order, building exactly one record per case.
# A record always carries "case_id", plus "result" on success or "error"
# (the exception message) on failure.
extracted_results = []
total_cases = len(log_entries)
for position, log_text in enumerate(log_entries, start=1):
    # The case id is the first line of the log with the "Case-ID:" label removed.
    first_line = log_text.split("\n")[0]
    case_id = first_line.replace("Case-ID:", "").strip()
    print(f" Processing case {position}/{total_cases}: {case_id}")

    record = {"case_id": case_id}
    try:
        record["result"] = extract_with_retry(log_text)
    except Exception as e:
        # Keep going: a failed case is logged and stored alongside the others.
        print(f" Failed to process case {case_id}: {e}")
        record["error"] = str(e)
    extracted_results.append(record)

# Write every record to the output file in one go.
with open(output_path, mode="w", encoding="utf-8") as out_file:
    json.dump(extracted_results, out_file, ensure_ascii=False, indent=2)

print(f"\n Extraction completed! Saved to: {output_path}")

🛠️ Processing case 1/87: IBM3_C14_15-Jan-15_15-Jan-15
🛠️ Processing case 2/87: IBM3_C15_25-Mar-15_25-Mar-152025-04-23T17:54:33.832 [BAML [92mINFO[0m] [35mFunction ExtractFaultInfo[0m:
    [33mClient: CustomGPT4o (gpt-4o-2024-08-06) - 8966ms. StopReason: stop. Tokens(in/out): 2243/209[0m
    [34m---PROMPT---[0m
    [2m[43msystem: [0m[2mYou are an expert in analyzing technical maintenance logs.
    
        Extract the following entities:
        - fault_location: the component with issue (add machine: IBM3/IBM4 if mentioned)
        - fault_symptoms: all observable problems or symptoms.
        - fault_reason: the stated causes of the issue (if any) + certainty
        - fault_measures: steps taken to resolve the issue + resolution_status
    
        Here are examples of extracting structured data:
    
        Examples 1:
            log_text: "Case-ID: IBM3_C22_03-Aug-16_04-Aug-16\nSources: storing, Storing, reparatie\nIssues:\n- Probleem: Beamcurrent tussen 300 - 200mA  

In [12]:
with open("baml_extracted_remaining_cases.json", mode="r", encoding="utf-8") as f:
    data = json.load(f)

# An entry counts as successful when it has a 'result' key and as failed when
# it has an 'error' key; the two lists are built independently of each other.
successful = list(filter(lambda entry: "result" in entry, data))
failed = list(filter(lambda entry: "error" in entry, data))

print(f"Successfully extracted: {len(successful)}")
print(f"Failed cases: {len(failed)}")
print(f"Total entries in file: {len(data)}")

Successfully extracted: 87
Failed cases: 0
Total entries in file: 87


EXTRACTING ENTITIES FROM MACHINE MANUAL BOOKS

In [24]:
# Extracting Entity from Machine Manual Books

import os
import base64

image_folder = "/Users/wbm/Documents/BIT/Research Topics/Potential Datasets/Manual Book/Images"
output = {}

# Keep only JPEG/PNG files (case-insensitive suffix match) and sort the names
# lexicographically so the processing order is stable.
image_files = [
    name for name in os.listdir(image_folder)
    if name.lower().endswith(('.jpg', '.jpeg', '.png'))
]
image_files.sort()

# Map each file name to the base64 text of its raw bytes.
for img_file in image_files:
    with open(os.path.join(image_folder, img_file), "rb") as fh:
        output[img_file] = base64.b64encode(fh.read()).decode("utf-8")
    print(f"Encoded: {img_file}")


Encoded: Compressor_9600_Brooks-images-0.jpg
Encoded: Compressor_9600_Brooks-images-1.jpg
Encoded: Cryopump_Brooks_Installation and Maintenance_001.jpg
Encoded: Cryopump_Brooks_Installation and Maintenance_002.jpg
Encoded: Helix_On-Board controller-2_page-0001.jpg
Encoded: Helix_On-Board controller-2_page-0002.jpg
Encoded: Ion Beam Drive-images-0.jpg
Encoded: Ion Beam Drive-images-1.jpg
Encoded: Ion Beam Drive-images-2.jpg
Encoded: Ion Beam Drive-images-3.jpg
Encoded: Ion Beam Drive-images-4.jpg


In [7]:
import os
import base64
import time
import json
from baml_client.sync_client import b
from baml_py import Image

# Settings for the manual-book image extraction run
image_folder = "/Users/wbm/Documents/BIT/Research Topics/Potential Datasets/Manual Book/Images"
output_path = "manual_book_fault_reports.json"
max_retries = 3   # attempts per image
retry_delay = 30  # seconds between attempts

# JPEG/PNG file names found in the folder, sorted lexicographically
image_files = sorted(
    filter(
        lambda name: name.lower().endswith(('.jpg', '.jpeg', '.png')),
        os.listdir(image_folder),
    )
)

# Accumulates one dict per extracted fault across all images
all_faults = []

def extract_with_retry(img_obj, image_name, retries=max_retries, delay=retry_delay):
    """Run ``b.ExtractFaultsFromImage`` on one image, retrying selected errors.

    This is a best-effort helper: it never raises for extraction errors and
    always hands back a list, so the caller can iterate over the result.

    Args:
        img_obj: Image object passed through as the ``img`` keyword argument.
        image_name: File name, used only in the progress/error messages.
        retries: Maximum number of attempts.
        delay: Seconds to sleep before the next attempt.

    Returns:
        The value returned by ``b.ExtractFaultsFromImage`` on success, or an
        empty list when the error is not retryable, when all attempts are
        used up, or when ``retries`` is less than 1.
    """
    # An error is retried when its lower-cased message contains any of these.
    # NOTE(review): context-length errors are presumably deterministic, so
    # retrying them may only add waiting time; confirm before keeping them.
    retryable_markers = (
        "rate limit",
        "context_length_exceeded",
        "bad mac",
        "token",
        "context length",
        "request failed",
    )

    for attempt in range(retries):
        try:
            return b.ExtractFaultsFromImage(img=img_obj)
        except Exception as e:
            error_str = str(e).lower()
            is_retryable = any(marker in error_str for marker in retryable_markers)
            if is_retryable and attempt < retries - 1:
                print(f" Error with {image_name}: {e}")
                print(f" Waiting {delay}s before retrying (attempt {attempt+1}/{retries})...")
                time.sleep(delay)
            else:
                # Report the error itself; otherwise the reason for giving up
                # is lost, because nothing is raised from here.
                print(f" Giving up on {image_name} after {attempt+1} attempt(s): {e}\n")
                return []

    # Reached only when the loop body never ran (retries < 1): keep the
    # "always returns a list" contract instead of falling through to None.
    return []
            

# Main loop: send every image to the extractor and collect the returned faults.

# Media type to declare for each extension accepted by the file filter.
# Previously every image was declared as "image/jpeg", including .png files.
media_type_by_extension = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
}

for image_name in image_files:
    img_path = os.path.join(image_folder, image_name)
    print(f"\n🔍 Processing: {image_name}")

    try:
        # Load and encode
        with open(img_path, "rb") as f:
            img_bytes = f.read()
        img_b64 = base64.b64encode(img_bytes).decode("utf-8")

        # Pick the media type from the file extension; unknown extensions fall
        # back to the old "image/jpeg" behavior.
        extension = os.path.splitext(image_name)[1].lower()
        media_type = media_type_by_extension.get(extension, "image/jpeg")
        img_obj = Image.from_base64(media_type, img_b64)

        # Extract faults; treat a None result as "no faults" so the loop below
        # cannot fail on a non-iterable.
        results = extract_with_retry(img_obj, image_name) or []

        # Append results with image source tag
        for fault in results:
            fault_dict = fault.model_dump()
            fault_dict["source_image"] = image_name
            all_faults.append(fault_dict)

    except Exception as e:
        # Best effort: report the problem and continue with the next image.
        print(f"Unhandled error on {image_name}: {e}")



🔍 Processing: Compressor_9600_Brooks-images-0.jpg
2025-04-25T15:44:15.218 [BAML [92mINFO[0m] [35mFunction ExtractFaultsFromImage[0m:
    [33mClient: CustomGPT4o (gpt-4o-2024-08-06) - 20875ms. StopReason: stop. Tokens(in/out): 1219/455[0m
    [34m---PROMPT---[0m
    [2m[43msystem: [0m[2mYou are an expert in industrial machine maintenance and repair.
    
    The following image is a troubleshooting table. Your task is to extract all distinct fault cases.
    
    **Important rules**:
    - The overall component/system being diagnosed (e.g., from the table title like “Compressor Troubleshooting Procedures”) should be used as the **fault_location**, unless a more specific subcomponent is clearly the root of the issue.
    - If no machine (IBM3 or IBM4) is mentioned in the image, omit it from the output.
    - Do NOT use the component mentioned in the symptom (e.g., a breaker or connector) as the fault location unless it is clearly the root cause.
    - For each `fault_reason`

In [8]:
# Write all collected fault dicts to the output file as indented, non-ASCII-escaped JSON
with open(output_path, mode="w", encoding="utf-8") as out_file:
    json.dump(all_faults, out_file, ensure_ascii=False, indent=2)

# Report how many faults came out of how many images, and where they went
summary = f"\n Done! Extracted {len(all_faults)} faults from {len(image_files)} images → {output_path}"
print(summary)


 Done! Extracted 28 faults from 11 images → manual_book_fault_reports.json
