In [10]:
import pandas as pd

#Fetch data from given dataset for hazardous event prediction

df = pd.read_excel("C:/Users/017721457/Downloads/Example_HARA1.xlsx", sheet_name="Operating scenarios")

scenario_col = df.columns[0]           
hazard_cols = df.columns[7:20]        
possible_hazardevents = {
    "collisions with pedestrians",
    "collisions with cyclists",
    "electric shock",
    "front-end collisions",
    "side collisions",
    "rear-end collisions"
}

#iterate over rows to fetch operating scenario and hazard
rows = []
for _, row in df.iterrows():
    scenario_text = row[scenario_col]
    
    for hazard_col in hazard_cols:
        hazard_text = hazard_col 
        cell_value = str(row[hazard_col]).strip().lower()
      
        if cell_value in possible_hazardevents:
            event_label = cell_value
        else:
            event_label = "no event"
        
        rows.append({
            "Operating Scenario": scenario_text,
            "Hazard": hazard_text,
            "Hazardous Event": event_label
        })

# Convert to dataframe
pairs_df = pd.DataFrame(rows)
print(pairs_df.head())

output_path = "C:/Users/017721457/Downloads/scenario_hazard_pairs.xlsx"
pairs_df.to_excel(output_path, index=False)


                                  Operating Scenario  \
0  Vehicle Stationary (Vehicle in standstill, par...   
1  Vehicle Stationary (Vehicle in standstill, par...   
2  Vehicle Stationary (Vehicle in standstill, par...   
3  Vehicle Stationary (Vehicle in standstill, par...   
4  Vehicle Stationary (Vehicle in standstill, par...   

                                              Hazard Hazardous Event  
0                       Loss of vehicle acceleration        no event  
1  Unintended vehicle movement in the opposite di...        no event  
2                        Unintended vehicle movement        no event  
3                 Unintended/excessive acceleration         no event  
4        Vehicle acceleration in opposite direction         no event  


In [11]:
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sentence_transformers import SentenceTransformer

#Train the model from the obtained dataset

def clean_text(text):
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()


pairs_df = pd.read_excel("C:/Users/017721457/Downloads/scenario_hazard_pairs.xlsx")

pairs_df['scenario_clean'] = pairs_df['Operating Scenario'].apply(clean_text)
pairs_df['hazard_clean'] = pairs_df['Hazard'].apply(clean_text)

#combine scenario + hazard to single string
pairs_df['combined_text'] = pairs_df['scenario_clean'] + " " + pairs_df['hazard_clean']

#encode the target
le_event = LabelEncoder()
pairs_df['event_label'] = le_event.fit_transform(pairs_df['Hazardous Event'])

X_train, X_test, y_train, y_test = train_test_split(
    pairs_df['combined_text'], 
    pairs_df['event_label'], 
    test_size=0.2, 
    random_state=42
)

#generate embeddings of train and test data using sentence transformer
embedder = SentenceTransformer('all-MiniLM-L6-v2')
X_train_emb = embedder.encode(X_train.tolist(), show_progress_bar=True)
X_test_emb  = embedder.encode(X_test.tolist(), show_progress_bar=True)

rf_clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_clf.fit(X_train_emb, y_train)

y_pred = rf_clf.predict(X_test_emb)

unique_labels = np.unique(y_test)
target_names = le_event.inverse_transform(unique_labels)

print("Classification Report:", classification_report(y_test, y_pred, labels=unique_labels, target_names=target_names))
print("Test Accuracy:", accuracy_score(y_test, y_pred))

test_results_df = pairs_df.loc[X_test.index, ["Operating Scenario", "Hazard", "Hazardous Event"]].copy()
test_results_df["Predicted Hazardous Event"] = le_event.inverse_transform(y_pred)
output_file = "C:/Users/017721457/Downloads/predicted_hazardous_events_rf.xlsx"
test_results_df.to_excel(output_file, index=False)
print(f"Results saved to {output_file}")


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 25/25 [00:03<00:00,  8.32it/s]
Batches: 100%|███████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00,  9.65it/s]


Classification Report:                              precision    recall  f1-score   support

collisions with pedestrians       0.00      0.00      0.00         1
       front-end collisions       1.00      0.17      0.29         6
                   no event       0.94      1.00      0.97       185
        rear-end collisions       0.00      0.00      0.00         6

                   accuracy                           0.94       198
                  macro avg       0.48      0.29      0.31       198
               weighted avg       0.91      0.94      0.91       198

Test Accuracy: 0.9393939393939394
Results saved to C:/Users/017721457/Downloads/predicted_hazardous_events_rf.xlsx


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
import pandas as pd
import re
from sentence_transformers import SentenceTransformer

#Run the model on new dataset for predictions

def clean_text(text):
    if not isinstance(text, str):
        text = str(text)
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
    
def predict_hazard_event(scenario, hazard):
   
    #combine the given operating scenario and hazard, generate embedding & predict
    combined_text = clean_text(scenario) + " " + clean_text(hazard)
    
    embedding = embedder.encode([combined_text])
    
    pred_label = rf_clf.predict(embedding)
    
    #convert numeric label back to text using the label encoder
    predicted_event = le_event.inverse_transform(pred_label)[0]
    return predicted_event

#new dataset
new_file_path = "C:/Users/017721457/Downloads/new_data.xlsx"

df_new = pd.read_excel(new_file_path, sheet_name="Sheet1")

scenario_col = df_new.columns[0]
exposure_col = df_new.columns[1]
hazard_cols = list(df_new.columns[2:15])

results = []

# For each operating scenario, iterate over all hazard columns and check predictions
for idx, row in df_new.iterrows():
    scenario = row[scenario_col]
    exposure = row[exposure_col]
    
    for hazard in hazard_cols:
        # cell_value = str(row[hazard]).strip().lower()
        
        predicted_event = predict_hazard_event(scenario, hazard)
        
        if predicted_event.lower() != "no event":
            results.append({
                "Operating Scenario": scenario,
                "E": exposure,
                "Hazard": hazard,
                "Predicted Hazardous Event": predicted_event
            })

#store as dataframe
results_df = pd.DataFrame(results)

output_file = "C:/Users/017721457/Downloads/prepared_data.xlsx"
results_df.to_excel(output_file, index=False)
print(f"Saved predicted hazardous events to {output_file}")


Saved predicted hazardous events to C:/Users/017721457/Downloads/prepared_data.xlsx
