In [52]:
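# Pinned dependencies for this notebook. Uncomment and run once in a fresh
# environment (%pip installs into the environment of the running kernel).
# %pip install langchain==0.2.12
# %pip install langgraph==0.2.2
# %pip install langchain-ollama==0.1.1
# %pip install langsmith==0.1.98
# %pip install langchain_community==0.2.11
# %pip install duckduckgo-search==6.2.13
# %pip install pandas
# %pip install tqdm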

In [53]:
from langchain_community.chat_models import ChatOllama
import time
import pandas as pd
from tqdm import tqdm
import re
import string
import json

In [54]:
# Fixed sampling seed handed to the chat model below. The previous wall-clock
# seed (int(time.time())) produced a different, unrecorded value on every run,
# so results could not be reproduced after a kernel restart.
seed = 42

In [55]:
# Options for the shared chat-model client used by call_llm / batch_call_llm.
# format="json" is presumably what lets replies be decoded with json.loads
# downstream -- confirm against the ChatOllama documentation.
# NOTE(review): mirostat_tau is set but no `mirostat` mode is passed here, so
# it may have no effect -- confirm against the Ollama parameter reference.
_OLLAMA_OPTIONS = {
    "model": "llama3.2:3b",
    "format": "json",
    "temperature": 0.1,
    "top_k": 100,
    "top_p": 0.1,
    "seed": seed,
    "mirostat_tau": 0.1,
}

llama3 = ChatOllama(**_OLLAMA_OPTIONS)

In [56]:
def call_llm(question):
    """Send a single prompt to the shared `llama3` client and return its reply.

    Args:
        question: Prompt passed unchanged to ``llama3.invoke``.

    Returns:
        Whatever ``llama3.invoke`` returns for this prompt.
    """
    reply = llama3.invoke(question)
    return reply

In [57]:
def call_llm_with_retry(prompt, max_retries=3):
    """Call the LLM and decode its reply as JSON, retrying on unusable replies.

    Args:
        prompt: Prompt forwarded unchanged to ``call_llm``.
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The decoded JSON value of the first reply that parses, or ``None`` if
        every attempt failed (or ``max_retries`` is less than 1).
    """
    retries = 0
    while retries < max_retries:
        response = call_llm(prompt)
        try:
            return json.loads(response.content)  # First valid JSON reply wins
        except (json.JSONDecodeError, AttributeError, TypeError) as e:
            # JSONDecodeError: the reply text is not valid JSON.
            # AttributeError:  the reply object has no `.content`.
            # TypeError:       `.content` is not str/bytes (e.g. a list), which
            #                  json.loads rejects; treat it as a failed attempt
            #                  instead of letting it abort the caller's loop.
            retries += 1
            print(f"Retry {retries} for prompt due to: {e}")
    return None  # All attempts failed


In [58]:
def batch_call_llm(prompts, batch_size=10):
    """Run prompts through ``llama3.batch`` in fixed-size chunks.

    Args:
        prompts: Sequence of prompts (must support ``len`` and slicing).
        batch_size: Number of prompts submitted per ``llama3.batch`` call.

    Returns:
        A flat list holding the ``.content`` of every reply, in prompt order.
    """
    collected = []
    total = len(prompts)
    for start in range(0, total, batch_size):
        chunk = prompts[start:start + batch_size]
        # One batch call per chunk; keep only the text content of each reply.
        for reply in llama3.batch(chunk):
            collected.append(reply.content)
    return collected


In [59]:
# Hazard vocabulary counted by calculate_risk_score to produce the keyword-based
# risk score. Order and duplicates (e.g. "equipment") are kept as authored.
# NOTE(review): preprocess_text lower-cases the text and strips punctuation and
# digits before scoring, so entries containing hyphens or digits
# ("high-energy", "third-degree", "30 mph") cannot match preprocessed text
# verbatim, and upper-case entries ("IDLH", "pH") only match if the matching is
# case-insensitive -- confirm whether these entries should be normalised.
SIF_KEYWORDS = [
    "suspend", "load", "fall", "elevation",
    "mobile", "equipment", "traffic", "motor", "vehicle", "heavy",
    "rotating", "equipment", "machine", "mechanical",
    "temperature", "high", "steam", "fire", "fuel", "explosion",
    "trench", "excavation",
    "electrical", "contact", "arc", "flash",
    "toxic", "chemical", "radiation",
    "high-energy", "pressure", "unsupported", "soil", "depth",
    "voltage", "shock", "burn", "third-degree", "burns", "combustion",
    "IDLH", "oxygen depletion", "pH", "corrosive", "exposure",
    "crane", "hoist", "lifting",
    "work zone", "pedestrian", "struck", "vehicle speed", "30 mph",
]

In [60]:
# Text normalisation applied to every observation before keyword scoring
def preprocess_text(text):
    """Return a lower-cased, punctuation-free, digit-free version of `text`.

    The input is coerced with ``str()``, newlines and every character in
    ``string.punctuation`` become single spaces, and runs of digits are
    removed. Whitespace is not collapsed or trimmed.
    """
    punctuation_to_space = str.maketrans({mark: " " for mark in string.punctuation})
    normalized = str(text).lower().replace('\n', ' ')
    normalized = normalized.translate(punctuation_to_space)  # Punctuation -> space
    return re.sub(r'\d+', '', normalized)  # Drop digits

In [61]:
def calculate_risk_score(text, keywords=None):
    """Count whole-word occurrences of hazard keywords in `text`.

    Args:
        text: String to scan (typically the output of ``preprocess_text``).
        keywords: Iterable of keyword strings. Defaults to ``SIF_KEYWORDS``,
            looked up at call time so re-running the keyword cell takes effect
            without redefining this function.

    Returns:
        The number of non-overlapping keyword matches; 0 when there are no
        usable keywords.
    """
    if keywords is None:
        keywords = SIF_KEYWORDS

    # Empty entries would turn into an empty alternative that matches at every
    # word boundary and inflates the score, so they are skipped.
    escaped_terms = [re.escape(term) for term in keywords if term]
    if not escaped_terms:
        return 0

    # Case-insensitive: the scanned text is lower-cased upstream, so mixed-case
    # keywords such as "IDLH" or "pH" could otherwise never match.
    pattern = re.compile(r'\b(?:' + '|'.join(escaped_terms) + r')\b', re.IGNORECASE)
    return len(pattern.findall(text))


In [62]:
# Input CSV of safety observations, resolved relative to the working directory.
DATA_FILE = "CORE_HackOhio_subset_cleaned_downsampled 1.csv"

In [63]:
# Load the observations; every later cell reads from and writes to this frame.
dataframe = pd.read_csv(filepath_or_buffer=DATA_FILE)

In [64]:
# Result columns start out empty (None); the main loop fills them in for the
# rows that are sent to the LLM.
for _result_column in (
    'RESPONSE',
    'HIGH_ENERGY',
    'INCIDENT',
    'INJURY',
    'CONTROLS_PRESENT',
    'SEVERITY_SCORE',
    'LLM_STATUS',
    'CONFIDENCE_SCORE',
):
    dataframe[_result_column] = None

In [65]:
# Threshold for the keyword score: the main loop only sends a row to the LLM
# when its computed risk_score is strictly greater than this value. Not to be
# confused with the per-row 'RISK_SCORE' dataframe column written by that loop.
RISK_SCORE: int = 15

In [66]:
# Accumulates one dict per row whose LLM call or JSON parsing failed
# (keys: "index", "prompt", "error"); written to failed_responses.json later.
failed_responses: list = []

In [67]:
# Main loop: compute a keyword risk score for every row and send only the rows
# whose score exceeds RISK_SCORE to the LLM for structured classification.

# Maps each key expected in the LLM's JSON answer to the column that stores it.
LLM_FIELD_TO_COLUMN = {
    "high_energy_present": 'HIGH_ENERGY',
    "high_energy_incident": 'INCIDENT',
    "serious_injury_sustained": 'INJURY',
    "direct_controls_present": 'CONTROLS_PRESENT',
    "severity_score": 'SEVERITY_SCORE',
    "confidence_score": 'CONFIDENCE_SCORE',
}

for index, row in tqdm(dataframe.iterrows(), desc="Assembling Prompts", total=len(dataframe), unit="row"):
    point_name = row['PNT_NM']
    qualifier_txt = row['QUALIFIER_TXT']
    atrisk_notes = row['PNT_ATRISKNOTES_TX']
    followup_notes = row['PNT_ATRISKFOLWUPNTS_TX']

    if pd.isna(followup_notes):
        followup_notes = "No follow-up notes provided."

    combined_text = preprocess_text(f"{point_name} {qualifier_txt} {atrisk_notes} {followup_notes}")
    risk_score = calculate_risk_score(combined_text)

    dataframe.at[index, 'RISK_SCORE'] = risk_score

    if risk_score > RISK_SCORE:
        prompt = f"""
        Safety Observation: {combined_text}.
        Based on the safety observation provided, please answer the following questions by providing a **valid JSON object**. All answers should be in exact field names and integer values only (1 for Yes, 0 for No). Ensure consistency with the safety guidelines provided below.

        JSON format:
        {{
        "high_energy_present": 1 or 0, 
        "high_energy_incident": 1 or 0, 
        "serious_injury_sustained": 1 or 0, 
        "direct_controls_present": 1 or 0, 
        "severity_score": 1 to 5, 
        "confidence_score": 0 to 1
        }}

        ### Guidelines for Responses:
        1. **Is high-energy present?** 
        (High-energy refers to any condition where energy exceeds 500 ft-lbs, such as electrical sources >50 volts, falling from heights >4 feet, mechanical equipment, etc.)
        
        2. **Was there a high-energy incident?**
        (An incident is defined by the release of high energy where a worker was in contact or proximity, within 6 feet without restricted egress.)

        3. **Was a serious injury sustained?** 
        (Serious injuries are life-threatening or life-altering injuries, as defined by EEI criteria: fractures, amputations, concussions, third-degree burns, etc.)

        4. **Were direct controls present?** 
        (Direct controls refer to barriers that specifically mitigate high-energy hazards. Examples include LOTO, fall protection, or machine guarding, not general safety equipment like PPE unless specifically designed to target the high-energy source.)

        5. **Provide a severity score** from 1 (low severity) to 5 (high severity). 
        (Assess the potential severity of the incident based on proximity, injury risk, and energy magnitude.)

        6. **Confidence score** (0-1): 
        Provide a confidence score from 0 (low) to 1 (high), reflecting your certainty in the presence of high-energy, control effectiveness, and the accuracy of this classification.

        Ensure the response is clear, concise, and structured as a JSON object.
        """

        # Call the LLM with retry logic; None means every attempt failed.
        response_content = call_llm_with_retry(prompt)
        if response_content is None:
            print(f"Error processing prompt for row {index}")
            dataframe.at[index, 'LLM_STATUS'] = "LLM Failed"
            failed_responses.append({"index": index, "prompt": prompt, "error": "LLM Error"})
            continue

        # The reply is normally an already-parsed dict. If it is anything else
        # (e.g. a JSON string wrapping the object), try to decode it once more.
        json_response = response_content
        if not isinstance(json_response, dict):
            try:
                json_response = json.loads(json_response)
            except (TypeError, ValueError):
                # TypeError: not str/bytes (list, number, ...);
                # ValueError covers json.JSONDecodeError.
                json_response = None

        # Anything that still is not a JSON object cannot be read with .get();
        # record it as a parsing failure instead of crashing the whole run.
        if not isinstance(json_response, dict):
            print(f"Error parsing JSON for row {index}")
            dataframe.at[index, 'LLM_STATUS'] = "Failed"
            failed_responses.append({"index": index, "prompt": prompt, "error": "JSON Parsing Error"})
            continue

        # Copy each expected field into its column (None when the key is absent).
        for field_name, column_name in LLM_FIELD_TO_COLUMN.items():
            dataframe.at[index, column_name] = json_response.get(field_name, None)
        dataframe.at[index, 'LLM_STATUS'] = "Success"

    # Save progress every 100 rows (rows that hit `continue` above skip this).
    if index % 100 == 0:
        dataframe.to_csv("output_partial.csv", index=False)


Assembling Prompts: 100%|██████████| 20000/20000 [00:50<00:00, 397.40row/s]


In [68]:
# Persist the final frame (keyword scores, LLM answers and per-row status) as
# CSV without the index column, then report where it was written.
output_file = "output_with_llm_json_responses.csv"
dataframe.to_csv(path_or_buf=output_file, index=False)

print(f"Processing complete. LLM responses saved to {output_file}")


Processing complete. LLM responses saved to output_with_llm_json_responses.csv


In [69]:
# Write the collected failure records next to the CSV for later inspection.
with open("failed_responses.json", "w") as outfile:
    outfile.write(json.dumps(failed_responses, indent=4))

In [70]:
# Repeats the completion message already printed right after the CSV was saved.
print(f"Processing complete. LLM responses saved to {output_file}")

Processing complete. LLM responses saved to output_with_llm_json_responses.csv
