In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# --- Load -------------------------------------------------------------------
# Read the cleaned incident records and convert the 'Date' column to datetimes.
wsdp = (
    pd.read_csv('workshop_incident_cleaned.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

# --- Feature selection ------------------------------------------------------
# Numeric columns that are standardized and then clustered.
# Alternative column sets kept for reference:
#   ['Age', 'Experience', 'Reliability', 'Sleep', 'Temperature', 'Humidity']
#   ['Age', 'Year of Tech', 'Experience', 'Reliability', 'Sleep']
features = [
    'Age',
    'Year of Tech',
    'Experience',
    'Reliability',
    'Sleep',
    'Temperature',
]

# --- Standardization --------------------------------------------------------
# Rescale every selected column to zero mean and unit standard deviation so
# that no column dominates the Euclidean distances used by K-means.
scaler = StandardScaler().fit(wsdp[features])
wsdp_scaled = scaler.transform(wsdp[features])
print(wsdp_scaled[:5])

# --- K-means clustering -----------------------------------------------------
# Three clusters, ten initialisations, fixed seed for reproducibility.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(wsdp_scaled)
clusters = kmeans.predict(wsdp_scaled)  # cluster index assigned to each row

# Cluster centres, expressed in the standardized feature space.
centroids = kmeans.cluster_centers_

# Store each row's cluster label on the original DataFrame.
wsdp['Cluster'] = clusters

# --- Distance to own centroid -----------------------------------------------
# Euclidean distance between each standardized row and the centre of the
# cluster that row was assigned to.
distances = np.sqrt(np.square(wsdp_scaled - centroids[clusters]).sum(axis=1))
wsdp['Distance_to_Centroid'] = distances

# --- Feature groups ---------------------------------------------------------
# Columns that feed the likelihood score and the severity score respectively.
likelihood_features = ['Age', 'Experience', 'Reliability', 'Sleep','Temperature', 'Year of Tech']
severity_features = ['Distance_to_Centroid', 'Difficulty Level']

# --- Standardization --------------------------------------------------------
# A single scaler object is fitted twice in sequence; after this step it holds
# the statistics of the severity columns (the second fit replaces the first).
scaler = StandardScaler()
wsdp_scaled_likelihood = scaler.fit_transform(wsdp[likelihood_features])
wsdp_scaled_severity = scaler.fit_transform(wsdp[severity_features])

# Likelihood columns first, severity columns second, side by side.
wsdp_scaled_combined = np.concatenate(
    [wsdp_scaled_likelihood, wsdp_scaled_severity],
    axis=1,
)

# Labelled DataFrame views of the scaled arrays, for readability.
wsdp_scaled_likelihood_df = pd.DataFrame(data=wsdp_scaled_likelihood, columns=likelihood_features)
wsdp_scaled_severity_df = pd.DataFrame(data=wsdp_scaled_severity, columns=severity_features)
wsdp_scaled_combined_df = pd.DataFrame(
    data=wsdp_scaled_combined,
    columns=[*likelihood_features, *severity_features],
)

# --- PCA --------------------------------------------------------------------
# Keep as many components as there are input columns (full decomposition).
pca = PCA(n_components=len(likelihood_features) + len(severity_features)).fit(wsdp_scaled_combined)

# Loadings matrix: one row per feature, one column per principal component.
loadings = pca.components_.T

# Smallest number of leading components whose cumulative explained-variance
# ratio reaches 95%.
num_components = np.argmax(pca.explained_variance_ratio_.cumsum() >= 0.95) + 1

# A feature's weight is the sum of its absolute loadings over those leading
# components, rescaled so that all weights add up to 1.
weights = np.abs(loadings[:, :num_components]).sum(axis=1)
weights = weights / weights.sum()

# Map every feature name (likelihood first, then severity) to its weight.
pca_weights = {
    name: weight
    for name, weight in zip(likelihood_features + severity_features, weights)
}
print("Feature weights from PCA:", pca_weights)

# --- Per-feature scores -----------------------------------------------------
# Each feature's score is its original (unscaled) value times its PCA weight.
# NOTE(review): the weights were derived from standardized columns but are
# applied to raw values here, so columns with larger magnitudes contribute
# more to the sums below -- confirm this is intended.
for feature in likelihood_features + severity_features:
    wsdp[f'{feature}_Score'] = wsdp[feature] * pca_weights[feature]

# --- Aggregate scores -------------------------------------------------------
# Row-wise sum of the per-feature scores within each group.
wsdp['Likelihood_Score'] = wsdp[[f'{feature}_Score' for feature in likelihood_features]].sum(axis=1)
wsdp['Severity_Score'] = wsdp[[f'{feature}_Score' for feature in severity_features]].sum(axis=1)

# Total risk is the product of likelihood and severity.
wsdp['Total_Risk_Score'] = wsdp['Likelihood_Score'] * wsdp['Severity_Score']

# --- Normalise to the 0-100 range -------------------------------------------
# Min-max rescaling. When every total score is identical the range is zero and
# the plain formula would evaluate 0/0 (NaN for every row); in that case all
# rows get 0.0 instead, while rows whose total score is missing stay NaN.
total_risk = wsdp['Total_Risk_Score']
risk_min = total_risk.min()
risk_range = total_risk.max() - risk_min
if risk_range == 0:
    wsdp['Normalized_Risk_Score'] = total_risk.where(total_risk.isna(), 0.0)
else:
    wsdp['Normalized_Risk_Score'] = (total_risk - risk_min) / risk_range * 100

# --- Rank and display -------------------------------------------------------
# Highest normalised risk first.
sorted_wsdp = wsdp.sort_values(by='Normalized_Risk_Score', ascending=False)

# Show every row and column (this changes the pandas display options globally).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Final cell expression: inputs and computed scores, ordered by risk.
sorted_wsdp[[
    'Age',
    'Year of Tech',
    'Experience',
    'Reliability',
    'Sleep',
    'Difficulty Level',
    'Temperature',
    'Distance_to_Centroid',
    'Likelihood_Score',
    'Severity_Score',
    'Total_Risk_Score',
    'Normalized_Risk_Score',
]]


[[ 4.7116097   2.38047614  3.79212135 -0.31448545  0.16552118 -1.12266699]
 [-0.23496216 -0.42008403 -1.00803226 -0.31448545  0.16552118 -1.12266699]
 [-0.48229076 -0.42008403  0.19200614 -0.31448545  0.16552118 -1.12266699]
 [-0.23496216 -0.42008403  0.19200614  3.17979734  0.16552118  0.31664967]
 [-0.23496216 -0.42008403  0.19200614 -0.31448545  0.16552118  0.31664967]]




Feature weights from PCA: {'Age': 0.0931591113088183, 'Experience': 0.15068139454579316, 'Reliability': 0.1487009988259718, 'Sleep': 0.08658913451652926, 'Temperature': 0.15284267994120065, 'Year of Tech': 0.10594762049021703, 'Distance_to_Centroid': 0.09704480090670654, 'Difficulty Level': 0.1650342594647633}


Unnamed: 0,Age,Year of Tech,Experience,Reliability,Sleep,Difficulty Level,Temperature,Distance_to_Centroid,Likelihood_Score,Severity_Score,Total_Risk_Score,Normalized_Risk_Score
92,21,1,2,1,6,4,31,4.054549,7.770011,1.05361,8.186561,100.0
97,20,1,1,1,5,2,30,8.108983,7.286738,1.117003,8.139309,99.265286
78,21,2,1,1,7,4,32,3.135767,7.964709,0.964447,7.681539,92.147381
85,35,2,3,1,7,4,31,1.122649,9.417456,0.769084,7.242818,85.325677
82,30,2,3,1,7,4,31,0.695654,8.951661,0.727647,6.513646,73.987733
11,30,2,5,1,7,3,32,1.980427,9.405866,0.687293,6.464585,73.224887
59,21,1,1,2,7,4,31,1.423048,7.854619,0.798236,6.269843,70.196834
90,19,1,1,1,7,4,32,1.590438,7.672443,0.814481,6.249057,69.873634
66,22,1,1,1,7,4,31,1.198248,7.799077,0.776421,6.055366,66.861915
71,21,1,1,1,7,4,31,1.150444,7.705918,0.771782,5.947287,65.181382


### 