In [1]:
# !pip install langchain==0.2.12
# !pip install langgraph==0.2.2
# !pip install langchain-ollama==0.1.1
# !pip install langsmith==0.1.98
# !pip install langchain_community==0.2.11
# !pip install duckduckgo-search==6.2.13
# !pip install pandas
# !pip install tqdm

In [2]:
from langchain_community.chat_models import ChatOllama
import time
import pandas as pd
from tqdm import tqdm
import re
import string
import json

In [3]:
# Fixed sampling seed handed to the chat model below. The previous time-based
# value changed on every kernel restart and was never recorded, so a run could
# not be repeated with the same seed. Change this constant to draw a different sample.
seed = 42

In [4]:
# Chat model used for every LLM call in this notebook (see call_llm).
# NOTE(review): mirostat_tau is set but no `mirostat` mode is passed here --
# confirm against the Ollama parameter docs that it has any effect.
llama3 = ChatOllama(
    model="llama3.2:3b",
    format="json",      # the main loop json.loads() the reply, so JSON output is requested
    temperature=0.1,
    top_k=100,
    top_p=0.1,
    seed=seed,          # value defined in the seed cell above
    mirostat_tau=0.1,
)

In [5]:
def call_llm(question):
    """Send ``question`` to the notebook-level ``llama3`` model and return its reply.

    Args:
        question: The prompt passed straight through to ``llama3.invoke``.

    Returns:
        Whatever ``llama3.invoke`` returns, unmodified.
    """
    reply = llama3.invoke(question)
    return reply

In [6]:
# Input CSV of safety observations; a relative path, so it is looked up from the
# kernel's current working directory.
DATA_FILE = "CORE_HackOhio_subset_cleaned_downsampled 1.csv"

In [7]:
# Load the observations; later cells add scoring and LLM-result columns to this frame.
dataframe = pd.read_csv(filepath_or_buffer=DATA_FILE)

In [8]:
# Hazard keywords counted by calculate_risk_score.
#
# The text they are matched against has already been through preprocess_text,
# which lowercases it, replaces punctuation with spaces and removes digits.
# Every entry is therefore written in that same normalized form (lowercase,
# spaces instead of hyphens) and listed only once; entries such as "IDLH",
# "pH", "high-energy" and "third-degree" could never match as originally spelled.
#
# NOTE(review): "30 mph" still cannot match preprocessed text because the digits
# are stripped before matching. It is left unchanged on purpose -- decide whether
# it should become "mph" or whether preprocessing should keep digits.
SIF_KEYWORDS = [
    "suspend", "load", "fall", "elevation", "mobile", "equipment", "traffic", "motor", "vehicle", "heavy",
    "rotating", "machine", "mechanical", "temperature", "high", "steam", "fire", "fuel",
    "explosion", "trench", "excavation", "electrical", "contact", "arc", "flash", "toxic", "chemical",
    "radiation", "high energy", "pressure", "unsupported", "soil", "depth", "voltage", "shock", "burn",
    "third degree", "burns", "combustion", "idlh", "oxygen depletion", "ph", "corrosive", "exposure",
    "crane", "hoist", "lifting", "work zone", "pedestrian", "struck", "vehicle speed", "30 mph",
]

In [9]:
# Text normalisation applied to every observation before keyword scoring.
# Every ASCII punctuation character, and the newline, is mapped to a single space.
_TO_SPACE = str.maketrans({ch: ' ' for ch in string.punctuation + '\n'})
# One or more consecutive digits, removed outright (not replaced by a space).
_DIGIT_RUN = re.compile(r'\d+')


def preprocess_text(text):
    """Return a lowercased copy of ``text`` with punctuation blanked and digits removed.

    ``text`` is converted with ``str()`` first, so non-string values (for
    example ``None`` or a float) are accepted. Whitespace is not collapsed:
    each punctuation character or newline becomes exactly one space.
    """
    lowered = str(text).lower()
    spaced = lowered.translate(_TO_SPACE)
    return _DIGIT_RUN.sub('', spaced)

In [10]:
def calculate_risk_score(text, keywords=None):
    """Count whole-word keyword occurrences in ``text``.

    Args:
        text: The string to scan. An empty or ``None`` value scores 0.
        keywords: Iterable of keyword strings. When omitted (``None``), the
            notebook-level ``SIF_KEYWORDS`` list is looked up at call time, so
            re-running the keyword cell takes effect without redefining this
            function.

    Returns:
        int: Number of non-overlapping matches of any keyword, delimited by
        word boundaries. Matching is case-insensitive, so a keyword such as
        "IDLH" matches text that has already been lowercased.

    Note:
        Keywords are matched literally. If ``text`` comes from
        ``preprocess_text`` (punctuation replaced by spaces, digits removed),
        keywords containing punctuation or digits still cannot match.
    """
    if keywords is None:
        keywords = SIF_KEYWORDS

    # Drop empty keywords: an empty alternative (or an empty list) would make the
    # pattern match the empty string at every word boundary and inflate the score.
    escaped = [re.escape(k) for k in keywords if k]
    if not escaped or not text:
        return 0

    pattern = re.compile(r'\b(?:' + '|'.join(escaped) + r')\b', re.IGNORECASE)
    return len(pattern.findall(text))


In [11]:
# Create the result columns up front, filled with None; the main loop writes to
# them only for rows it sends to the LLM.
for _column in ('RESPONSE', 'HIGH_ENERGY', 'INCIDENT', 'INJURY', 'CONTROLS_PRESENT', 'SEVERITY_SCORE'):
    dataframe[_column] = None

In [12]:
# Score every observation with the keyword heuristic and, for rows scoring above
# the threshold, ask the LLM for a structured assessment.
#
# Per row this writes RISK_SCORE and COMBINED_TEXT. For rows sent to the LLM it
# also writes RESPONSE (the raw reply on success, or an error marker on failure)
# and, on success, the five assessment columns.

# Rows whose keyword risk score is strictly greater than this are sent to the LLM.
RISK_SCORE_THRESHOLD = 15

# JSON key requested from the LLM -> dataframe column that stores its value.
LLM_FIELD_COLUMNS = {
    "high_energy_present": 'HIGH_ENERGY',
    "high_energy_incident": 'INCIDENT',
    "serious_injury_sustained": 'INJURY',
    "direct_controls_present": 'CONTROLS_PRESENT',
    "severity_score": 'SEVERITY_SCORE',
}

for index, row in tqdm(dataframe.iterrows(), desc="Assembling Prompts", total=len(dataframe), unit="row"):
    point_name = row['PNT_NM']
    qualifier_txt = row['QUALIFIER_TXT']
    atrisk_notes = row['PNT_ATRISKNOTES_TX']
    followup_notes = row['PNT_ATRISKFOLWUPNTS_TX']

    if pd.isna(followup_notes):
        followup_notes = "No follow-up notes provided."

    # Preprocess and clean the combined text
    combined_text = preprocess_text(f"{point_name} {qualifier_txt} {atrisk_notes} {followup_notes}")
    risk_score = calculate_risk_score(combined_text)

    dataframe.at[index, 'RISK_SCORE'] = risk_score
    dataframe.at[index, 'COMBINED_TEXT'] = combined_text

    if risk_score > RISK_SCORE_THRESHOLD:
        # Generate the prompt requesting a structured JSON response
        prompt = f"""
        Safety Observation: {combined_text}.
        Please provide the answers to the following questions in the form of a structured JSON object with the following fields:

        {{
          "high_energy_present": 1 or 0, 
          "high_energy_incident": 1 or 0, 
          "serious_injury_sustained": 1 or 0, 
          "direct_controls_present": 1 or 0, 
          "severity_score": 1 to 5
        }}
        Questions:
        1. Is high-energy present? (1 for Yes, 0 for No).
        2. Was there a high-energy incident? (1 for Yes, 0 for No).
        3. Was a serious injury sustained? (1 for Yes, 0 for No).
        4. Were direct controls present? (1 for Yes, 0 for No).
        5. Provide a severity score from 1 (low severity) to 5 (high severity) based on energy management and the outcome.
        """

        # Call the LLM. Errors raised by the call itself are deliberately not
        # caught here, exactly as before: a backend that is down stops the run.
        response = call_llm(question=prompt)

        # Access the actual content of the LLM response
        try:
            response_content = response.content  # Access the content attribute from AIMessage object
        except AttributeError as e:
            print(f"Error accessing response content for row {index}: {e}")
            dataframe.at[index, 'RESPONSE'] = "Error accessing response content"
            # Nothing to parse or store; previously this fell through and used an
            # unbound (or previous-row) response_content.
            continue

        print(response_content)

        # Parse the JSON response. TypeError covers content that is not a string.
        try:
            json_response = json.loads(response_content)
        except (json.JSONDecodeError, TypeError):
            json_response = None

        # Valid JSON that is not an object (e.g. a list or a number) has no fields to read.
        if not isinstance(json_response, dict):
            print(f"Error parsing JSON for row {index}")
            # Keep the marker in RESPONSE so failed rows can be found in the CSV;
            # the raw reply was printed above. Previously the marker was always
            # overwritten by the raw content.
            dataframe.at[index, 'RESPONSE'] = "Error parsing JSON"
            continue

        # Missing keys are stored as None.
        for json_key, column in LLM_FIELD_COLUMNS.items():
            dataframe.at[index, column] = json_response.get(json_key, None)

        # Store the raw LLM response in the dataframe
        dataframe.at[index, 'RESPONSE'] = response_content  # Store the content
        print(f"{combined_text}: {response_content}")

Assembling Prompts:  18%|█▊        | 3657/20000 [00:03<00:13, 1192.04row/s]

{
  "high_energy_present": 1,
  "high_energy_incident": 0,
  "serious_injury_sustained": 0,
  "direct_controls_present": 0,
  "severity_score": 4
}
drilled excavations appropriately protected drilling excavations  e g   guardrail system  fall protection  hole covers  etc    fall protection utilized inside guardrail area  guardrail system erected within  feet the crew was placing concrete as backfill for a pole butt that was being set  when i came around the concrete truck there was an employee  who i believe to be the crew lead  leaning across the excavation with one hand on the pole attempting to level it without fall protection or guardrails in place  i asked said individual how far the concrete was to the top of excavation  in which he replied      i cautiously approached the hole to investigate  in which i determined that distance to be      i then stopped the crew and asked them to step back away from the excavation while i investigated further  to give some backstory  it was dete

Assembling Prompts:  82%|████████▏ | 16461/20000 [00:08<00:03, 1077.30row/s]

{
  "high_energy_present": 0,
  "high_energy_incident": 0,
  "serious_injury_sustained": 0,
  "direct_controls_present": 1,
  "severity_score": 2
}
material handling  motorized mechanical equipment designated spotter and or qualified observer in place and safe work distance maintained to structures  equipment and overhead hazards   mechanical  employees completed a truck loading unloading assessment for all materials being loaded and unloaded where equipment or rigging is required to assist in moving the material   mechanical  load properly secured and equipment in use not left unattended   mechanical  operators properly trained and deemed competent qualified to operate equipment   mechanical  other   material handling  motorized mechanical equipment   telecom   mechanical  weight of the load verified and equipment rigging rated for the load   mechanical workers in bucket exhibited safety precautions as did those on the ground  it looked like they all are very experienced and adhered t

Assembling Prompts: 100%|██████████| 20000/20000 [00:08<00:00, 2356.87row/s]


In [13]:
# Write the full frame -- original columns plus the scoring and LLM columns --
# to CSV, leaving out the pandas index.
output_file = "output_with_llm_json_responses.csv"
dataframe.to_csv(path_or_buf=output_file, index=False)

print(f"Processing complete. LLM responses saved to {output_file}")

Processing complete. LLM responses saved to output_with_llm_json_responses.csv
