In [267]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gs
import numpy as np
import pandas as pd
import seaborn as sns
import pickle
import joblib
from sklearn.svm import SVC
import shap

In [268]:
df = pd.read_csv("CleanedColumn.csv", delimiter=",")

In [269]:
len(df)

229835

In [270]:
df.columns

Index(['idart', 'idrt', 'weight_final', 'PSU', 'STRATA', 'province', 'gender',
       'age', 'education', 'work_status', 'injured_past_year', 'head_injury',
       'chest_injury', 'back_injury', 'stomach_injury', 'upper_body_injury',
       'lower_body_injury', 'bruises', 'cuts', 'sprains', 'broken_bones',
       'severed_limbs', 'eye_injury', 'brain_damage', 'internal_damage',
       'burns', 'other', 'impaired', 'missing_body_parts', 'permanent_injury',
       'place_of_injury', 'emotional_mental_health_disorder', 'weight_normal',
       'filter_$', 'time', 'has_injury'],
      dtype='object')

In [271]:
df.drop(columns=['idart','idrt','weight_final','weight_normal','PSU','STRATA','province','time','filter_$'], inplace=True)

In [272]:
df.columns

Index(['gender', 'age', 'education', 'work_status', 'injured_past_year',
       'head_injury', 'chest_injury', 'back_injury', 'stomach_injury',
       'upper_body_injury', 'lower_body_injury', 'bruises', 'cuts', 'sprains',
       'broken_bones', 'severed_limbs', 'eye_injury', 'brain_damage',
       'internal_damage', 'burns', 'other', 'impaired', 'missing_body_parts',
       'permanent_injury', 'place_of_injury',
       'emotional_mental_health_disorder', 'has_injury'],
      dtype='object')

In [273]:
injury_cols = [
    "head_injury", 
    "chest_injury", 
    "back_injury", 
    "stomach_injury", 
    "upper_body_injury", 
    "lower_body_injury", 
    "bruises", 
    "cuts", 
    "sprains", 
    "broken_bones", 
    "severed_limbs", 
    "eye_injury", 
    "brain_damage", 
    "internal_damage", 
    "burns", 
    "other"
]

df['total_injuries'] = df[injury_cols].gt(0).sum(axis=1)

In [274]:
severity_weights = {
    'head_injury': 5,          # Very severe
    'chest_injury': 4,         # Severe
    'back_injury': 3,          # Moderate
    'stomach_injury': 3,       # Moderate
    'upper_body_injury': 2,    # Mild
    'lower_body_injury': 2,    # Mild
    'bruises': 1,              # Minor
    'cuts': 1,                 # Minor
    'sprains': 2,              # Mild
    'broken_bones': 4,         # Severe
    'severed_limbs': 5,        # Very severe
    'eye_injury': 4,           # Severe
    'brain_damage': 5,         # Very severe
    'internal_damage': 5,      # Very severe
    'burns': 4,                # Severe
    'other': 1                 # Minor
}
df['weighted_injury_severity'] = df[severity_weights.keys()].mul(severity_weights.values()).sum(axis=1)

In [275]:
df['total_injury_severity'] = df[['head_injury', 'chest_injury', 'back_injury', 'stomach_injury',
                                  'upper_body_injury', 'lower_body_injury', 'bruises', 'cuts', 
                                  'sprains', 'broken_bones', 'severed_limbs', 'eye_injury', 
                                  'brain_damage', 'internal_damage', 'burns', 'other']].sum(axis=1)

In [276]:
df['injured_body_parts_count'] = df[['head_injury', 'chest_injury', 'back_injury', 'stomach_injury',
                                     'upper_body_injury', 'lower_body_injury', 'bruises', 'cuts', 
                                     'sprains', 'broken_bones', 'severed_limbs', 'eye_injury', 
                                     'brain_damage', 'internal_damage', 'burns', 'other']].gt(0).sum(axis=1)

In [277]:
df['has_injury'] = df[['head_injury', 'chest_injury', 'back_injury', 'stomach_injury',
                                     'upper_body_injury', 'lower_body_injury', 'bruises', 'cuts', 
                                     'sprains', 'broken_bones', 'severed_limbs', 'eye_injury', 
                                     'brain_damage', 'internal_damage', 'burns', 'other']].gt(0).any(axis=1).astype(int)

In [278]:
df['severe_injuries_count'] = df[['head_injury', 'chest_injury', 'back_injury', 'stomach_injury',
                                  'upper_body_injury', 'lower_body_injury', 'bruises', 'cuts', 
                                  'sprains', 'broken_bones', 'severed_limbs', 'eye_injury', 
                                  'brain_damage', 'internal_damage', 'burns', 'other']].eq(2).sum(axis=1)

In [279]:
injury_columns = [
    'head_injury', 
    'chest_injury', 
    'back_injury', 
    'stomach_injury', 
    'upper_body_injury', 
    'lower_body_injury', 
    'bruises', 
    'cuts', 
    'sprains', 
    'broken_bones', 
    'severed_limbs', 
    'eye_injury', 
    'brain_damage', 
    'internal_damage', 
    'burns', 
    'other'
]



In [280]:
df['injured_past_year'].dtypes

dtype('int64')

In [None]:
blue_collar_jobs = [6, 7, 8]

df['blue_collar'] = np.where(df['work_status'].isin(blue_collar_jobs), 1, 0)

print(df)

        gender  age  education  work_status  injured_past_year  head_injury  \
0            1   52          3            7                  2            0   
1            2   24          5            1                  2            0   
2            2   17          3            1                  2            0   
3            1   32          4            5                  2            0   
4            1   58          3            7                  2            0   
...        ...  ...        ...          ...                ...          ...   
229830       1   23          4            4                  1            2   
229831       1   17          3            2                  1            2   
229832       2   65          4            8                  1            2   
229833       2   39          6            4                  1            2   
229834       1   66          7            1                  1            2   

        chest_injury  back_injury  stomach_injury  

In [282]:
# df = df.drop(df[df['injured_past_year'] == 2].index)
# df = df.drop(df[df['age'] > 65].index)
# df = df.drop(df[df['total_injury_severity'] < 26].index)
# df = df.drop(df[df['severe_injuries_count'] < 10].index)
# df = df.drop(df[df['severed_limbs'] == 0].index)
# df = df.drop(df[df['back_injury'] <= 1].index)
# df = df.drop(df[df['broken_bones'] <= 1].index)
# df = df.drop(df[df['place_of_injury'].isin([0, 1])].index)
# df = df.drop(df[df['education'] > 4].index)
# df = df.drop(df[df['work_status'].isin([0, 1, 2, 7 ,8, 9])].index)
# df_men = df.drop(df[df['gender'] == 2].index)
# df = df.drop(df[df['gender'] == 1].index)

In [283]:
df['injured_past_year'].value_counts()

injured_past_year
2    210173
1     19662
Name: count, dtype: int64

In [284]:
# df['work_status'].value_counts()

In [285]:
# df_0 = df[df['injured_past_year'] == 1]
# df_1 = df[df['injured_past_year'] == 2]

# df_0_sample = df_0.sample(n=1668)
# df_1_sample = df_1.sample(n=1668)
# df_mental = pd.concat([df_0_sample, df_1_sample])

In [286]:
# df_0_m = df_men[df_men['has_injury'] == 1]
# df_1_m = df_women[df_women['has_injury'] == 1]

# df_0_sample_m = df_0_m.sample(n=8000)
# df_1_sample_m = df_1_m.sample(n=8000)
# df_mental_m = pd.concat([df_0_sample_m, df_1_sample_m])

In [287]:
corr_matrix = df.corr()
low_corr_features = corr_matrix['emotional_mental_health_disorder'].abs().sort_values(ascending=False)
print(low_corr_features.head(40))
weak_features = low_corr_features[low_corr_features < 0.01].index
df.drop(columns=weak_features, inplace=True)

emotional_mental_health_disorder    1.000000
severed_limbs                       0.106764
total_injuries                      0.106635
has_injury                          0.106635
injured_body_parts_count            0.106635
injured_past_year                   0.106635
missing_body_parts                  0.106557
brain_damage                        0.106553
eye_injury                          0.106546
burns                               0.106473
impaired                            0.106383
chest_injury                        0.106035
weighted_injury_severity            0.105892
broken_bones                        0.105699
total_injury_severity               0.105596
internal_damage                     0.105519
other                               0.105399
stomach_injury                      0.104996
cuts                                0.104854
head_injury                         0.104584
severe_injuries_count               0.104227
back_injury                         0.103732
permanent_

In [288]:
df.columns

Index(['gender', 'age', 'education', 'work_status', 'injured_past_year',
       'head_injury', 'chest_injury', 'back_injury', 'stomach_injury',
       'upper_body_injury', 'lower_body_injury', 'bruises', 'cuts', 'sprains',
       'broken_bones', 'severed_limbs', 'eye_injury', 'brain_damage',
       'internal_damage', 'burns', 'other', 'impaired', 'missing_body_parts',
       'permanent_injury', 'place_of_injury',
       'emotional_mental_health_disorder', 'has_injury', 'total_injuries',
       'weighted_injury_severity', 'total_injury_severity',
       'injured_body_parts_count', 'severe_injuries_count', 'blue_collar'],
      dtype='object')

In [289]:
# df_mental

In [290]:
df.to_csv("dataFinal.csv", index=False)