In [None]:
%pip install pandas matplotlib seaborn plotly dash scikit-learn joblib
import pandas as pd
from pptx import Presentation
from pptx.util import Inches
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import plotly.express as px
import dash
from dash import dcc, html, Input, Output
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
import re

In [None]:
# Load the accident records from a CSV path relative to the working directory.
accidents = pd.read_csv("cleaned_data/accidents.csv")

In [None]:
# Preview worker_age with missing values replaced by 0 and cast to int.
# The result is only displayed, not assigned, so `accidents` itself is unchanged.
# NOTE(review): if the intent was to persist this conversion, it needs an assignment — confirm.
accidents['worker_age'].fillna(0).astype(int)

In [None]:
# Summary statistics for every column (include='all'), transposed so each
# column becomes a row; NaN cells are blanked out for display.
accidents.describe(include='all').T.fillna('')

# Reports

In [None]:
# Output directory prefix (relative to the working directory) for files saved by this notebook.
path = "report/dataset2/"

### Geographic Trends

In [None]:
# Count accidents per (province, city) pair, largest counts first.
accident_counts = accidents.groupby(['organization_province_code', 'city']).size()
combined_geographic_data = accident_counts.reset_index(name='Accident_Count')
combined_geographic_data = combined_geographic_data.sort_values(
    by='Accident_Count', ascending=False
)

# Only the 20 busiest province/city pairs are charted.
top_combined_data = combined_geographic_data.head(20)

# One bar series per province so each province gets its own colour and legend entry.
fig, ax = plt.subplots(figsize=(14, 8))
# sort=False keeps provinces in order of first appearance in the ranked table.
for province, subset in top_combined_data.groupby('organization_province_code', sort=False):
    bar_labels = subset['city'] + f" ({province})"
    ax.bar(
        bar_labels,
        subset['Accident_Count'],
        label=f"Province: {province}",
        alpha=0.7
    )

ax.set_title("Top Cities with Workplace Accidents by Province", fontsize=16)
ax.set_xlabel("City (Province)", fontsize=12)
ax.set_ylabel("Accident Count", fontsize=12)
plt.xticks(rotation=45, ha='right', fontsize=10)
# Legend is anchored outside the axes so it does not cover the bars.
ax.legend(title="Province", bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
fig.savefig(path+"combined_geographic_bar.png")
# The figure is written to disk only; close it so it is not rendered inline or kept in memory.
plt.close(fig)

In [None]:
# Parse the 'date' column as datetimes; entries that cannot be parsed become NaT.
parsed_dates = pd.to_datetime(accidents['date'], errors='coerce')
accidents['date'] = parsed_dates
# Monthly period of each accident, stored as text.
accidents['Year-Month'] = parsed_dates.dt.to_period('M').astype(str)

# Distinct non-null values, intended for the dropdown filters.
unique_industries = accidents['industry_sector_description'].dropna().unique()
unique_accident_categories = accidents['accident_category_description'].dropna().unique()


### Severity Analysis Dashboard + Prediction Model

In [None]:
# Hand-assigned severity level (1-3) per accident category
# (presumably higher means more severe — TODO confirm with the data owner).
severity_mapping = {
    'FALL ON SAME LEVEL': 1,
    'FALL TO LOWER LEVEL': 2,
    'STRUCK BY OBJECT': 2,
    'CAUGHT IN OR COMPRESSED BY EQUIP./OBJECTS': 3,
    'HIGHWAY ACCIDENT': 3,
    'OVEREXERTION': 1,
    'PEDESTRIAN STRUCK BY VEHICLE, MOBILE EQUIPMENT': 3,
    'EXPLOSION': 3,
    'CONTACT WITH ELECTRIC CURRENT': 3,
    'FIRE--UNINTEDED OR UNCONTROLLED': 3,
}

# Categories missing from the mapping (including missing categories) fall back to level 1.
mapped_severity = accidents['accident_category_description'].map(severity_mapping)
accidents['Severity_Score'] = mapped_severity.fillna(1)

def _dropdown_options(column):
    """Return Dash dropdown options for the distinct non-null values of an ``accidents`` column."""
    distinct_values = accidents[column].dropna().unique()
    return [{'label': entry, 'value': entry} for entry in distinct_values]


app = dash.Dash(__name__)

# Multi-select filters; nothing is pre-selected.
industry_filter = dcc.Dropdown(
    id='industry-dropdown',
    options=_dropdown_options('industry_sector_description'),
    value=None,
    placeholder="Select an industry",
    multi=True,
)
accident_filter = dcc.Dropdown(
    id='accident-dropdown',
    options=_dropdown_options('accident_category_description'),
    value=None,
    placeholder="Select an accident category",
    multi=True,
)

# Page structure: title, the two filters, then the two severity figures.
app.layout = html.Div([
    html.H1("Workplace Accident Severity Dashboard", style={'textAlign': 'center'}),
    industry_filter,
    accident_filter,
    # dcc.Graph(id='accident-trends-graph'),
    dcc.Graph(id='severity-industry-bar'),
    dcc.Graph(id='severity-occupation-heatmap'),
])

@app.callback(
    [Output('severity-industry-bar', 'figure'),
     Output('severity-occupation-heatmap', 'figure')],
    [Input('industry-dropdown', 'value'),
     Input('accident-dropdown', 'value')]
)
def update_graphs(selected_industries, selected_accidents):
    """Rebuild both severity figures for the current dropdown selections.

    Args:
        selected_industries: Values chosen in the industry dropdown; ``None``
            or an empty list means "no industry filter".
        selected_accidents: Values chosen in the accident-category dropdown;
            ``None`` or an empty list means "no category filter".

    Returns:
        A ``(industry_bar, occupation_heatmap)`` pair of Plotly figures, in the
        same order as the callback's outputs.
    """
    # Boolean indexing returns new frames and nothing below mutates the data,
    # so the shared `accidents` frame does not need to be copied per callback.
    filtered_data = accidents
    if selected_industries:
        filtered_data = filtered_data[
            filtered_data['industry_sector_description'].isin(selected_industries)
        ]
    if selected_accidents:
        filtered_data = filtered_data[
            filtered_data['accident_category_description'].isin(selected_accidents)
        ]

    # The monthly trend figure is disabled (its Graph and Output are commented
    # out), so no 'Year-Month' aggregation is computed here.

    # Mean severity per industry sector, drawn as a horizontal bar chart.
    industry_severity = (
        filtered_data.groupby('industry_sector_description')['Severity_Score']
        .mean()
        .reset_index()
    )
    industry_fig = px.bar(industry_severity, x='Severity_Score', y='industry_sector_description',
                          title="Average Severity Score by Industry", color='Severity_Score')

    # Mean severity per occupation; only the 30 highest-scoring occupations are shown.
    occupation_severity = (
        filtered_data.groupby('occupation_description')['Severity_Score']
        .mean()
        .reset_index()
    )
    top_occupations = occupation_severity.sort_values(by='Severity_Score', ascending=False).head(30)
    # Transposed so the heatmap is a single row with one cell per occupation.
    occupation_fig = px.imshow(top_occupations.set_index('occupation_description').T,
                               labels=dict(color="Severity_Score"),
                               title="Top 30 Occupations by Severity Score")

    return industry_fig, occupation_fig


# In a notebook kernel __name__ is '__main__', so this launches the dashboard
# when the cell runs. `app.run` is the current Dash entry point; the older
# `app.run_server` alias is deprecated and not available in recent Dash releases.
if __name__ == '__main__':
    app.run(debug=True)



In [None]:
# Train a Prediction Model
features = ['worker_age', 'worker_experience_in_years', 'NOC']
dataset2_clean = accidents.dropna(subset=features + ['Severity_Score'])
# dataset2_clean['NOC'] = dataset2_clean['NOC'].astype(str)
# dataset2_clean['NOC'] = dataset2_clean['NOC'].apply(lambda x: pd.NA if re.findall("\D", x) else x)
# dataset2_clean = dataset2_clean.dropna(subset=['NOC'])
# dataset2_clean['NOC'].unique()
# dataset2_clean

In [None]:
# ML
# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X = dataset2_clean[features]
y = dataset2_clean['Severity_Score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Why a Random Forest classifier: it is well suited to structured (tabular)
# data like this, it reduces overfitting through ensemble learning, and it
# exposes feature importances, which is the main goal of using ML on this dataset.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Cast the predicted labels to plain ints. The loop variable is deliberately
# not called `y`, so it cannot be confused with the target Series above.
y_pred = [int(label) for label in model.predict(X_test)]


In [None]:
# Distinct severity classes the model predicted on the test split.
set(y_pred)

In [None]:

# Fraction of test rows whose predicted severity class matches the true one.
model_accuracy = accuracy_score(y_test, y_pred)
# The cell's last statement is the server launch, so print the metric explicitly.
print(f"Severity model accuracy: {model_accuracy:.3f}")

# Persist the fitted model in the same report directory as the saved figure.
joblib.dump(model, "report/dataset2/severity_prediction_model.pkl")

# `app.run` is the current Dash entry point; the older `app.run_server` alias
# is deprecated and not available in recent Dash releases.
# NOTE(review): the dashboard is also launched in the cell that defines `app`;
# one of the two launches is probably redundant — confirm which to keep.
if __name__ == '__main__':
    app.run(debug=True)