In [479]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gs
import numpy as np
import pandas as pd
import seaborn as sns
import pickle
import joblib
from sklearn.svm import SVC


In [480]:
# Load the cleaned dataset from CSV; the comma separator is spelled out explicitly.
INPUT_CSV = "CleanedColumn.csv"
df = pd.read_csv(INPUT_CSV, sep=",")

In [481]:
# Number of rows in the freshly loaded frame.
df.shape[0]

229835

In [482]:
# List the column names available right after loading.
df.columns

Index(['idart', 'idrt', 'weight_final', 'PSU', 'STRATA', 'province', 'gender',
       'age', 'education', 'work_status', 'injured_past_year', 'head_injury',
       'chest_injury', 'back_injury', 'stomach_injury', 'upper_body_injury',
       'lower_body_injury', 'bruises', 'cuts', 'sprains', 'broken_bones',
       'severed_limbs', 'eye_injury', 'brain_damage', 'internal_damage',
       'burns', 'other', 'impaired', 'missing_body_parts', 'permanent_injury',
       'place_of_injury', 'emotional_mental_health_disorder', 'weight_normal',
       'filter_$', 'time', 'has_injury'],
      dtype='object')

In [483]:
# Remove columns that none of the later visible cells read. The frame is
# rebound rather than mutated in place.
cols_to_discard = [
    'idart', 'idrt', 'weight_final', 'weight_normal',
    'PSU', 'STRATA', 'province', 'time', 'filter_$',
]
df = df.drop(columns=cols_to_discard)

In [484]:
# Confirm the drop by listing the columns that remain.
df.columns

Index(['gender', 'age', 'education', 'work_status', 'injured_past_year',
       'head_injury', 'chest_injury', 'back_injury', 'stomach_injury',
       'upper_body_injury', 'lower_body_injury', 'bruises', 'cuts', 'sprains',
       'broken_bones', 'severed_limbs', 'eye_injury', 'brain_damage',
       'internal_damage', 'burns', 'other', 'impaired', 'missing_body_parts',
       'permanent_injury', 'place_of_injury',
       'emotional_mental_health_disorder', 'has_injury'],
      dtype='object')

In [485]:
# The 16 injury columns that the derived features below are built from.
injury_cols = [
    "head_injury", "chest_injury", "back_injury", "stomach_injury",
    "upper_body_injury", "lower_body_injury", "bruises", "cuts",
    "sprains", "broken_bones", "severed_limbs", "eye_injury",
    "brain_damage", "internal_damage", "burns", "other",
]

# Per row, count how many injury columns hold a value greater than zero.
# NOTE(review): the df_mental preview further down only shows 0 or 16 in this
# column, so "> 0" may be true for every injury column of an injured
# respondent -- confirm the value coding of these columns against the codebook.
is_positive = df[injury_cols] > 0
df['total_injuries'] = is_positive.sum(axis=1)

In [486]:
# Hand-assigned weight per injury column, from 1 (minor) to 5 (very severe).
severity_weights = {
    'head_injury': 5,          # very severe
    'chest_injury': 4,         # severe
    'back_injury': 3,          # moderate
    'stomach_injury': 3,       # moderate
    'upper_body_injury': 2,    # mild
    'lower_body_injury': 2,    # mild
    'bruises': 1,              # minor
    'cuts': 1,                 # minor
    'sprains': 2,              # mild
    'broken_bones': 4,         # severe
    'severed_limbs': 5,        # very severe
    'eye_injury': 4,           # severe
    'brain_damage': 5,         # very severe
    'internal_damage': 5,      # very severe
    'burns': 4,                # severe
    'other': 1,                # minor
}

# Multiply the stored value of each injury column by its weight and add the
# products per row. The weights are held in a Series so they are matched to
# the columns by label.
weight_by_column = pd.Series(severity_weights)
weighted_values = df[weight_by_column.index].mul(weight_by_column, axis='columns')
df['weighted_injury_severity'] = weighted_values.sum(axis=1)

In [487]:
# Row-wise sum of the raw values stored in the injury columns. Reuses
# injury_cols from the earlier cell, which lists the same 16 columns.
raw_injury_values = df[injury_cols]
df['total_injury_severity'] = raw_injury_values.sum(axis=1)

In [488]:
# Per row, the number of injury columns whose value exceeds zero.
# NOTE(review): this repeats the computation behind 'total_injuries'; the two
# columns carry the same values.
positive_flags = df[injury_cols] > 0
df['injured_body_parts_count'] = positive_flags.sum(axis=1)

In [489]:
# 1 when at least one injury column is above zero, otherwise 0. This assignment
# replaces the has_injury column that was loaded from the CSV.
any_positive = (df[injury_cols] > 0).any(axis=1)
df['has_injury'] = any_positive.astype(int)

In [490]:
# Per row, the number of injury columns whose value is exactly 2.
# NOTE(review): that the code 2 stands for a severe injury is suggested only by
# the column name -- confirm against the codebook.
equals_two = df[injury_cols] == 2
df['severe_injuries_count'] = equals_two.sum(axis=1)

In [491]:
# Second listing of the 16 injury columns (same names and order as injury_cols).
injury_columns = [
    "head_injury", "chest_injury", "back_injury", "stomach_injury",
    "upper_body_injury", "lower_body_injury", "bruises", "cuts",
    "sprains", "broken_bones", "severed_limbs", "eye_injury",
    "brain_damage", "internal_damage", "burns", "other",
]



In [492]:
# Check the storage type of the injured_past_year column.
df['injured_past_year'].dtype

dtype('int64')

In [493]:
# Row filters, applied one after another on the progressively reduced frame:
#   1. discard rows with age above 75
#   2. discard rows whose work_status code is 1, 2, 3, 4, 5 or 9
#   3. discard rows whose gender code is 1
# NOTE(review): the meaning of the work_status / gender codes is not visible
# in this notebook -- confirm against the codebook.
rows_over_75 = df.loc[df['age'] > 75].index
df = df.drop(index=rows_over_75)

rows_excluded_work_status = df.loc[df['work_status'].isin([1, 2, 3, 4, 5, 9])].index
df = df.drop(index=rows_excluded_work_status)

rows_gender_1 = df.loc[df['gender'] == 1].index
df = df.drop(index=rows_gender_1)

In [494]:
# Class balance of has_injury after the row filters above.
df['has_injury'].value_counts()

has_injury
0    22489
1     1606
Name: count, dtype: int64

In [495]:
# Balance the two has_injury classes by down-sampling each one to the size of
# the smaller class.
#  - The per-class size is derived from the data instead of being hard-coded,
#    so the cell keeps working (and stays balanced) if the filters or the
#    input file change.
#  - A fixed seed makes the sample, and therefore the exported CSV, identical
#    on every Restart & Run All.
SAMPLE_SEED = 42

df_0 = df[df['has_injury'] == 0]
df_1 = df[df['has_injury'] == 1]

n_per_class = min(len(df_0), len(df_1))

df_0_sample = df_0.sample(n=n_per_class, random_state=SAMPLE_SEED)
df_1_sample = df_1.sample(n=n_per_class, random_state=SAMPLE_SEED)
df_mental = pd.concat([df_0_sample, df_1_sample])

In [496]:
# NOTE(review): disabled experiment. df_men only appears in a commented-out
# line and df_women is not defined in any visible cell, so this block cannot
# run as written -- delete it or restore the missing frames.
# df_0_m = df_men[df_men['has_injury'] == 1]
# df_1_m = df_women[df_women['has_injury'] == 1]

# df_0_sample_m = df_0_m.sample(n=8000)
# df_1_sample_m = df_1_m.sample(n=8000)
# df_mental_m = pd.concat([df_0_sample_m, df_1_sample_m])

In [497]:
# Absolute correlation of every df column with the target, strongest first.
# Despite its name, low_corr_features holds the full ranking, not only the
# weak columns.
corr_matrix = df.corr()
target_abs_corr = corr_matrix['emotional_mental_health_disorder'].abs()
low_corr_features = target_abs_corr.sort_values(ascending=False)
print(low_corr_features.head(40))

# Remove columns whose absolute correlation is below 0.01. A NaN correlation
# compares False against the threshold, so such columns are kept.
# NOTE(review): df_mental was sampled from df before this cell runs, so the
# columns removed here are still present in df_mental and in the CSV written
# from it -- confirm whether the pruning was meant to apply there too.
below_threshold = low_corr_features.lt(0.01)
weak_features = low_corr_features[below_threshold].index
df = df.drop(columns=weak_features)

emotional_mental_health_disorder    1.000000
burns                               0.115907
head_injury                         0.115875
impaired                            0.115683
eye_injury                          0.115630
missing_body_parts                  0.115628
severed_limbs                       0.115628
injured_past_year                   0.115518
total_injuries                      0.115518
has_injury                          0.115518
injured_body_parts_count            0.115518
chest_injury                        0.115423
cuts                                0.115365
brain_damage                        0.115356
other                               0.115210
weighted_injury_severity            0.114997
broken_bones                        0.114800
total_injury_severity               0.114614
stomach_injury                      0.113560
internal_damage                     0.113504
severe_injuries_count               0.113432
back_injury                         0.112514
permanent_

In [498]:
# Columns left in df after the weak-correlation drop.
df.columns

Index(['gender', 'education', 'work_status', 'injured_past_year',
       'head_injury', 'chest_injury', 'back_injury', 'stomach_injury',
       'upper_body_injury', 'lower_body_injury', 'bruises', 'cuts', 'sprains',
       'broken_bones', 'severed_limbs', 'eye_injury', 'brain_damage',
       'internal_damage', 'burns', 'other', 'impaired', 'missing_body_parts',
       'permanent_injury', 'place_of_injury',
       'emotional_mental_health_disorder', 'has_injury', 'total_injuries',
       'weighted_injury_severity', 'total_injury_severity',
       'injured_body_parts_count', 'severe_injuries_count'],
      dtype='object')

In [499]:
# Display the balanced sample (pandas abbreviates the middle rows and columns).
df_mental

Unnamed: 0,gender,age,education,work_status,injured_past_year,head_injury,chest_injury,back_injury,stomach_injury,upper_body_injury,...,missing_body_parts,permanent_injury,place_of_injury,emotional_mental_health_disorder,has_injury,total_injuries,weighted_injury_severity,total_injury_severity,injured_body_parts_count,severe_injuries_count
65974,2,74,2,8,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22936,2,59,3,6,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
174619,2,52,3,6,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
76608,2,35,4,8,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
162785,2,63,2,6,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214038,2,53,3,6,1,2,2,1,2,2,...,2,2,2,1,1,16,97,30,16,14
220911,2,49,3,6,1,2,2,2,2,2,...,2,1,5,1,1,16,99,30,16,14
217422,2,40,3,6,1,2,2,2,2,2,...,2,2,2,0,1,16,98,30,16,14
220447,2,44,3,8,1,2,2,2,2,2,...,2,2,1,1,1,16,97,29,16,13


In [500]:
# Write the balanced sample to disk without the index column.
OUTPUT_CSV = "dataFinal.csv"
df_mental.to_csv(OUTPUT_CSV, index=False)