In [13]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier

def clean_text(text):
    if pd.isnull(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def parse_delta_v(delta_str):
    #fetch first number in the string
    if pd.isnull(delta_str):
        return np.nan
    delta_str = str(delta_str).strip()
    match = re.search(r'(\d+\.?\d*)', delta_str)
    if match:
        return float(match.group(1))
    return np.nan

file_path = "C:/Users/017721457/Downloads/Example_HARA1.xlsx"
risk_df = pd.read_excel(file_path, sheet_name="Risk Assessment")

#drop rows missing required columns
risk_df = risk_df.dropna(subset=["ID", "Hazard", "Hazardous Event", "S", "C", "ASIL"])

cols = ['Operating Scenario', 'E', 'Hazard', 'Hazardous Event', 'S', 'Δv', 'C', 'ASIL', 'people at risk', 'Severity Rational', 
        'Controllability Rational']
risk_df = risk_df[cols]


risk_df['op_scenario_clean'] = risk_df['Operating Scenario'].apply(clean_text)
risk_df['hazard_clean']       = risk_df['Hazard'].apply(clean_text)
risk_df['hazard_event_clean'] = risk_df['Hazardous Event'].apply(clean_text)

#convert Exposure to string for prompt
risk_df['exposure_str'] = risk_df['E'].apply(lambda x: str(x))


#create a combined text prompt for input to the model
risk_df['combined_text'] = (risk_df['op_scenario_clean'] + " " +
                            risk_df['exposure_str'] + " " +
                            risk_df['hazard_clean'] + " " +
                            risk_df['hazard_event_clean']) 

# convert Δv into numeric value
risk_df['delta_v_numeric'] = risk_df['Δv'].apply(parse_delta_v)

# convert S and C to integers
risk_df['S'] = risk_df['S'].astype(int)
risk_df['C'] = risk_df['C'].astype(int)


#we'll use the same split for all.
dataset = risk_df.copy()
train_df, test_df = train_test_split(dataset, test_size=0.2, random_state=42)

#inputs for model training and testing
X_train = train_df['combined_text']
X_test  = test_df['combined_text']

#targets for classification tasks
y_train_s = train_df['S']
y_test_s  = test_df['S']
y_train_c = train_df['C']
y_test_c  = test_df['C']
y_train_delta = train_df['delta_v_numeric']
y_test_delta  = test_df['delta_v_numeric']
y_test_asil = test_df['ASIL']
y_train_pr = train_df['people at risk']
y_test_pr  = test_df['people at risk']


embedder = SentenceTransformer('all-MiniLM-L6-v2')

X_train_emb = embedder.encode(list(X_train), show_progress_bar=True)
X_test_emb  = embedder.encode(list(X_test), show_progress_bar=True)

clf_severity = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
clf_severity.fit(X_train_emb, y_train_s)
y_pred_s = clf_severity.predict(X_test_emb)
print("Severity Classification Report:")
print(classification_report(y_test_s, y_pred_s))
print("Severity Accuracy:", accuracy_score(y_test_s, y_pred_s))


clf_controllability = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
clf_controllability.fit(X_train_emb, y_train_c)
y_pred_c = clf_controllability.predict(X_test_emb)
print("Controllability Classification Report:")
print(classification_report(y_test_c, y_pred_c))
print("Controllability Accuracy:", accuracy_score(y_test_c, y_pred_c))


asil_lookup = {
    (3, 4, 3): "ASIL D", (3, 4, 2): "ASIL C", (3, 4, 1): "ASIL B",
    (3, 3, 3): "ASIL C", (3, 3, 2): "ASIL B", (3, 3, 1): "ASIL A",
    (3, 2, 3): "ASIL B", (3, 2, 2): "ASIL A", (3, 2, 1): "QM",
    (2, 4, 3): "ASIL C", (2, 4, 2): "ASIL B", (2, 4, 1): "ASIL A",
    (2, 3, 3): "ASIL B", (2, 3, 2): "ASIL A", (2, 3, 1): "QM",
    (1, 4, 3): "ASIL B", (1, 4, 2): "ASIL A", (1, 4, 1): "QM",
    (1, 3, 3): "ASIL A", (1, 3, 2): "QM",      (1, 3, 1): "QM",
}

def determine_asil(s, e, c):
    if s == 0:
        return "No ASIL"
    return asil_lookup.get((s, e, c), "QM")


s_preds = y_pred_s
c_preds = y_pred_c
e_vals = test_df['E'].astype(int).values
asil_preds = [determine_asil(s, e, c) for s, e, c in zip(s_preds, e_vals, c_preds)]
asil_accuracy = accuracy_score(y_test_asil, asil_preds)
print("ASIL Accuracy:", asil_accuracy)



X_train_extra = train_df[['E', 'S', 'C']].to_numpy()
X_test_extra  = test_df[['E', 'S', 'C']].to_numpy()

#combine text embeddings with extra features above
X_train_combined = np.hstack([X_train_emb, X_train_extra])
X_test_combined  = np.hstack([X_test_emb, X_test_extra])

regressor = RandomForestRegressor(n_estimators=100, random_state=42)
regressor.fit(X_train_combined, y_train_delta)
y_pred_delta = regressor.predict(X_test_combined)

mse = mean_squared_error(y_test_delta, y_pred_delta)
r2 = r2_score(y_test_delta, y_pred_delta)
print("Δv Regression Mean Squared Error:", mse)
print("Δv Regression R2 Score:", r2)


clf_people = LogisticRegression(max_iter=1000, class_weight="balanced")
clf_people.fit(X_train_emb, y_train_pr)
y_pred_pr = clf_people.predict(X_test_emb)
print("People at Risk Classification Report:")
print(classification_report(y_test_pr, y_pred_pr))
print("People at Risk Accuracy:", accuracy_score(y_test_pr, y_pred_pr))


result_df = test_df.copy()
result_df['Predicted_S'] = s_preds
result_df['Predicted_C'] = c_preds
result_df['Predicted_ASIL'] = asil_preds
result_df['Predicted_Δv'] = y_pred_delta
result_df['Predicted_People_at_risk'] = y_pred_pr


print(result_df[['Operating Scenario', 'E', 'S', 'Predicted_S', 'C', 'Predicted_C', 'ASIL', 'Predicted_ASIL', 'delta_v_numeric', 
                 'Predicted_Δv', 'people at risk', 'Predicted_People_at_risk']])


result_df.to_excel("C:/Users/017721457/Downloads/Test_Predictions.xlsx", index=False)
print("Results saved to 'Combined_Predictions.xlsx'.")


Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.77it/s]
Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.91it/s]


Severity Classification Report:
              precision    recall  f1-score   support

           1       0.75      0.75      0.75         4
           3       0.67      0.67      0.67         3

    accuracy                           0.71         7
   macro avg       0.71      0.71      0.71         7
weighted avg       0.71      0.71      0.71         7

Severity Accuracy: 0.7142857142857143
Controllability Classification Report:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         2
           3       0.71      1.00      0.83         5

    accuracy                           0.71         7
   macro avg       0.36      0.50      0.42         7
weighted avg       0.51      0.71      0.60         7

Controllability Accuracy: 0.7142857142857143
ASIL Accuracy: 0.8571428571428571


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Δv Regression Mean Squared Error: 2233.684628571428
Δv Regression R2 Score: 0.09073084438241463
People at Risk Classification Report:
                                               precision    recall  f1-score   support

 driver & passengers,
people in other vehicle       0.67      1.00      0.80         2
driver & passengers, 
people in other vehicle       1.00      0.50      0.67         2
driver & passengers,  people in other vehicle       1.00      1.00      1.00         1
                         pedestrians/cyclists       0.50      0.50      0.50         2

                                     accuracy                           0.71         7
                                    macro avg       0.79      0.75      0.74         7
                                 weighted avg       0.76      0.71      0.70         7

People at Risk Accuracy: 0.7142857142857143
                                   Operating Scenario  E  S  Predicted_S  C  \
29  Vehicle Stationary  - Start Up\n\ndriver

In [17]:
import pandas as pd
import numpy as np
import re
from sentence_transformers import SentenceTransformer

def clean_text(text):
    if pd.isnull(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


#upload prepared data
new_file_path = "C:/Users/017721457/Downloads/prepared_data.xlsx" 
new_df = pd.read_excel(new_file_path, sheet_name="Sheet1") 


results_df = new_df.copy()
new_df['op_scenario_clean'] = new_df['Operating Scenario'].apply(clean_text)
new_df['hazard_clean']       = new_df['Hazard'].apply(clean_text)
new_df['hazard_event_clean'] = new_df['Hazardous Event'].apply(clean_text)
new_df['exposure_str']       = new_df['E'].apply(lambda x: str(x))


new_df['combined_text'] = (new_df['op_scenario_clean'] + " " +
                           new_df['exposure_str'] + " " +
                           new_df['hazard_clean'] + " " +
                           new_df['hazard_event_clean'])


X_new = new_df['combined_text']
X_new_emb = embedder.encode(list(X_new), show_progress_bar=True)

#use trained models here
pred_severity = clf_severity.predict(X_new_emb)
pred_controllability = clf_controllability.predict(X_new_emb)

results_df['Predicted_S'] = pred_severity
results_df['Predicted_C'] = pred_controllability


X_new_extra = results_df[['E', 'Predicted_S', 'Predicted_C']].to_numpy()

X_new_combined = np.hstack([X_new_emb, X_new_extra])
pred_delta_v = regressor.predict(X_new_combined)
results_df['Predicted_Δv'] = pred_delta_v


pred_people = clf_people.predict(X_new_emb)
results_df['Predicted_People_at_risk'] = pred_people


asil_predictions = [
    determine_asil(s, int(e), c)
    for s, e, c in zip(pred_severity, new_df['E'], pred_controllability)
]
results_df['Predicted_ASIL'] = asil_predictions


def determine_hazardous_event_details(hazardous_event):
    if 'front-end' in hazardous_event.lower():
        return "front-end collisions with the barrier or collision with the traffic in the next lane"
    elif 'side' in hazardous_event.lower():
        return "side collisions with the traffic in the next lane"
    elif 'rear-end' in hazardous_event.lower():
        return "rear-end collisions with traffic behind"
    elif 'pedestrians' in hazardous_event.lower():
        return "collisions with pedestrians"
    elif 'cyclist' in hazardous_event.lower():
        return "collisions with cyclists in bike lanes"
    else:
        return hazardous_event

results_df['Predicted_Details_of_Hazardous_event'] = results_df['Hazardous Event'].apply(determine_hazardous_event_details)


def determine_severity_rationale(hazardous_event, delta_v, severity):
    if delta_v > 20:
        speed_desc = "high impact speed"
    elif delta_v < 10:
        speed_desc = "low impact speed"
    else:
        speed_desc = "impact speed"

    return f"{hazardous_event} at {speed_desc} (Δv = {delta_v:.1f} km/h) driver: S{severity}"


def determine_controllability_rationale(controllability):
    
    if controllability == 'C3':
        return ("drivers controllability: hardly controllable\n"
                "less than 90% of all drivers or other traffic participants are usually able or barely able to avoid harm")
    elif controllability == 'C2':
        return ("drivers controllability: enough time for braking, keeping within the lane is possible, normally controllable\n"
                "90% or more of all drivers or other traffic participants are usually able to avoid harm")
    else:
        return ("drivers controllability: easily controllable\n"
                "more than 99% of all drivers can avoid harm")


results_df['Predicted_Severity_Rational'] = results_df.apply(
    lambda row: determine_severity_rationale(row['Hazardous Event'], row['Predicted_Δv'], row['Predicted_S']),
    axis=1
)

results_df['Controllability_Label'] = results_df['Predicted_C'].apply(lambda x: f"C{x}")

results_df['Predicted_Controllability_Rationale'] = results_df['Controllability_Label'].apply(determine_controllability_rationale)

results_df = results_df.drop("Controllability_Label", axis=1)

output_file = "C:/Users/017721457/Downloads/new_predictions.xlsx"
results_df.to_excel(output_file, index=False)
print(f"Predictions saved to {output_file}")


Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12.51it/s]

Predictions saved to C:/Users/017721457/Downloads/new_predictions.xlsx



