**Table of Contents** <br>

* [1.Import Libraries](#importlibraries)
* [2.Import Dataset](#importdataset)
* [3.Initial Analysis](#initialanalysis)
* [4. Data Pre-Processing](#datapreprocessing)
    * [4.1 Data Partition](#datapartition)
    * [4.2 Missing Values](#missingvalues)

<hr>
<a class="anchor" id="importlibraries">
    
# 1. Import libraries
    
</a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

#data partition
from sklearn.model_selection import train_test_split

#filter methods
# spearman 
# chi-square
import scipy.stats as stats
from scipy.stats import chi2_contingency

#wrapper methods
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import RFE

# embedded methods
from sklearn.linear_model import LassoCV

from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings('ignore')

<hr>
<a class="anchor" id="importdataset">
    
# 2. Import Dataset
    
</a>

In [2]:
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

<hr>
<a class="anchor" id="initialanalysis">
    
# 3. Initial Analysis
    
</a>

In [3]:
train.head(20)

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,...,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Agreement Reached,WCB Decision,Number of Dependents
0,2019-12-30,31.0,N,2020-01-01,N,0.0,1988.0,2019-12-31,,NEW HAMPSHIRE INSURANCE CO,...,27.0,FROM LIQUID OR GREASE SPILLS,10.0,CONTUSION,62.0,BUTTOCKS,13662.0,0.0,Not Work Related,1.0
1,2019-08-30,46.0,N,2020-01-01,Y,1745.93,1973.0,2020-01-01,2020-01-14,ZURICH AMERICAN INSURANCE CO,...,97.0,REPETITIVE MOTION,49.0,SPRAIN OR TEAR,38.0,SHOULDER(S),14569.0,1.0,Not Work Related,4.0
2,2019-12-06,40.0,N,2020-01-01,N,1434.8,1979.0,2020-01-01,,INDEMNITY INSURANCE CO OF,...,79.0,OBJECT BEING LIFTED OR HANDLED,7.0,CONCUSSION,10.0,MULTIPLE HEAD INJURY,12589.0,0.0,Not Work Related,6.0
3,,,,2020-01-01,,,,,,,...,,,,,,,,,,
4,2019-12-30,61.0,N,2020-01-01,N,,1958.0,2019-12-31,,STATE INSURANCE FUND,...,16.0,"HAND TOOL, UTENSIL; NOT POWERED",43.0,PUNCTURE,36.0,FINGER(S),12603.0,0.0,Not Work Related,1.0
5,2019-12-26,67.0,N,2020-01-01,N,0.0,1952.0,2019-12-31,,INDEMNITY INS. OF N AMERICA,...,31.0,"FALL, SLIP OR TRIP, NOC",10.0,CONTUSION,38.0,SHOULDER(S),11772.0,0.0,Not Work Related,5.0
6,2019-12-28,48.0,N,2020-01-01,N,0.0,1971.0,2019-12-31,,LM INSURANCE CORP,...,19.0,"CUT, PUNCTURE, SCRAPE, NOC",40.0,LACERATION,36.0,FINGER(S),13029.0,0.0,Not Work Related,1.0
7,2019-12-30,33.0,N,2020-01-01,N,0.0,1986.0,2019-12-31,2020-03-04,STATE INSURANCE FUND,...,99.0,"OTHER - MISCELLANEOUS, NOC",71.0,"ALL OTHER OCCUPATIONAL DISEASE INJURY, NOC",38.0,SHOULDER(S),10305.0,0.0,Not Work Related,6.0
8,2019-12-23,55.0,N,2020-01-01,N,0.0,1964.0,2020-01-01,,"ROCHESTER, UNIVERSITY OF",...,99.0,"OTHER - MISCELLANEOUS, NOC",59.0,"ALL OTHER SPECIFIC INJURIES, NOC",60.0,LUNGS,14620.0,0.0,Not Work Related,6.0
9,2019-12-29,20.0,N,2020-01-01,N,225.0,,2019-12-31,,LM INSURANCE CORP,...,81.0,"STRUCK OR INJURED, NOC",59.0,"ALL OTHER SPECIFIC INJURIES, NOC",14.0,EYE(S),11231.0,0.0,Not Work Related,6.0


In [4]:
test.head(20)

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,...,Medical Fee Region,OIICS Nature of Injury Description,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Number of Dependents
0,2022-12-24,19,N,2023-01-02,N,,2003.0,2023-01-02,,INDEMNITY INSURANCE CO OF,...,IV,,31.0,"FALL, SLIP OR TRIP, NOC",10.0,CONTUSION,54.0,LOWER LEG,10466,1
1,2022-11-20,19,N,2023-01-02,N,,2003.0,2023-01-02,,A I U INSURANCE COMPANY,...,IV,,75.0,FALLING OR FLYING OBJECT,10.0,CONTUSION,10.0,MULTIPLE HEAD INJURY,11691,1
2,2022-12-26,59,N,2023-01-02,N,0.0,1963.0,2022-12-31,,AMGUARD INSURANCE COMPANY,...,III,,68.0,STATIONARY OBJECT,49.0,SPRAIN OR TEAR,62.0,BUTTOCKS,10604,0
3,2022-12-28,55,N,2023-01-02,N,0.0,0.0,2023-01-02,,INDEMNITY INS. OF N AMERICA,...,IV,,25.0,FROM DIFFERENT LEVEL (ELEVATION),10.0,CONTUSION,53.0,KNEE,11411,6
4,2022-12-20,25,N,2023-01-02,N,0.0,1997.0,2022-12-31,,NEW HAMPSHIRE INSURANCE CO,...,IV,,79.0,OBJECT BEING LIFTED OR HANDLED,40.0,LACERATION,37.0,THUMB,11212,5
5,2022-12-28,36,N,2023-01-02,N,0.0,1986.0,2023-01-02,,NYC TRANSIT AUTHORITY,...,III,,90.0,OTHER THAN PHYSICAL CAUSE OF INJURY,77.0,MENTAL STRESS,66.0,NO PHYSICAL INJURY,10941,4
6,2022-12-22,19,N,2023-01-02,N,688.2,2003.0,2022-12-30,,"WAL-MART ASSOCIATES, INC.",...,I,,56.0,LIFTING,49.0,SPRAIN OR TEAR,38.0,SHOULDER(S),14131,6
7,2022-12-13,43,N,2023-01-02,N,0.0,0.0,2023-01-02,,ERIE INSURANCE CO OF NY,...,I,,27.0,FROM LIQUID OR GREASE SPILLS,49.0,SPRAIN OR TEAR,53.0,KNEE,13357,4
8,2022-12-28,40,N,2023-01-02,N,0.0,1982.0,2022-12-31,,STARR INDEMNITY & LIABILITY CO,...,IV,,87.0,FOREIGN MATTER (BODY) IN EYE(S),25.0,FOREIGN BODY,14.0,EYE(S),11735,3
9,2022-11-01,48,N,2023-01-02,Y,1180.74,1974.0,2023-01-02,2023-01-09,STATE INSURANCE FUND,...,I,,25.0,FROM DIFFERENT LEVEL (ELEVATION),49.0,SPRAIN OR TEAR,38.0,SHOULDER(S),14720,0


In [5]:
train.shape

(593471, 33)

In [6]:
test.shape

(387975, 30)

In [7]:
columns_name = train.columns
columns_name

Index(['Accident Date', 'Age at Injury', 'Alternative Dispute Resolution',
       'Assembly Date', 'Attorney/Representative', 'Average Weekly Wage',
       'Birth Year', 'C-2 Date', 'C-3 Date', 'Carrier Name', 'Carrier Type',
       'Claim Identifier', 'Claim Injury Type', 'County of Injury',
       'COVID-19 Indicator', 'District Name', 'First Hearing Date', 'Gender',
       'IME-4 Count', 'Industry Code', 'Industry Code Description',
       'Medical Fee Region', 'OIICS Nature of Injury Description',
       'WCIO Cause of Injury Code', 'WCIO Cause of Injury Description',
       'WCIO Nature of Injury Code', 'WCIO Nature of Injury Description',
       'WCIO Part Of Body Code', 'WCIO Part Of Body Description', 'Zip Code',
       'Agreement Reached', 'WCB Decision', 'Number of Dependents'],
      dtype='object')

In [8]:
test.columns

Index(['Accident Date', 'Age at Injury', 'Alternative Dispute Resolution',
       'Assembly Date', 'Attorney/Representative', 'Average Weekly Wage',
       'Birth Year', 'C-2 Date', 'C-3 Date', 'Carrier Name', 'Carrier Type',
       'Claim Identifier', 'County of Injury', 'COVID-19 Indicator',
       'District Name', 'First Hearing Date', 'Gender', 'IME-4 Count',
       'Industry Code', 'Industry Code Description', 'Medical Fee Region',
       'OIICS Nature of Injury Description', 'WCIO Cause of Injury Code',
       'WCIO Cause of Injury Description', 'WCIO Nature of Injury Code',
       'WCIO Nature of Injury Description', 'WCIO Part Of Body Code',
       'WCIO Part Of Body Description', 'Zip Code', 'Number of Dependents'],
      dtype='object')

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 593471 entries, 0 to 593470
Data columns (total 33 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   Accident Date                       570337 non-null  object 
 1   Age at Injury                       574026 non-null  float64
 2   Alternative Dispute Resolution      574026 non-null  object 
 3   Assembly Date                       593471 non-null  object 
 4   Attorney/Representative             574026 non-null  object 
 5   Average Weekly Wage                 545375 non-null  float64
 6   Birth Year                          544948 non-null  float64
 7   C-2 Date                            559466 non-null  object 
 8   C-3 Date                            187245 non-null  object 
 9   Carrier Name                        574026 non-null  object 
 10  Carrier Type                        574026 non-null  object 
 11  Claim Identifier          

In [10]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 387975 entries, 0 to 387974
Data columns (total 30 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   Accident Date                       385531 non-null  object 
 1   Age at Injury                       387975 non-null  int64  
 2   Alternative Dispute Resolution      387975 non-null  object 
 3   Assembly Date                       387975 non-null  object 
 4   Attorney/Representative             387975 non-null  object 
 5   Average Weekly Wage                 368771 non-null  float64
 6   Birth Year                          368505 non-null  float64
 7   C-2 Date                            378841 non-null  object 
 8   C-3 Date                            85216 non-null   object 
 9   Carrier Name                        387975 non-null  object 
 10  Carrier Type                        387975 non-null  object 
 11  Claim Identifier          

In [11]:
train.describe(include ='all')

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,...,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Agreement Reached,WCB Decision,Number of Dependents
count,570337,574026.0,574026,593471,574026,545375.0,544948.0,559466,187245,574026,...,558386.0,558386,558369.0,558369,556944.0,556944,545389.0,574026.0,574026,574026.0
unique,5539,,3,1096,2,,,2475,1648,2046,...,,74,,56,,54,10060.0,,1,
top,2020-03-01,,N,2020-03-06,N,,,2021-05-11,2021-04-21,STATE INSURANCE FUND,...,,LIFTING,,STRAIN OR TEAR,,LOWER BACK AREA,11236.0,,Not Work Related,
freq,1245,,571412,1422,392291,,,1847,350,111144,...,,46610,,153373,,51862,3302.0,,574026,
mean,,42.11427,,,,491.0883,1886.767604,,,,...,54.381143,,41.013839,,39.738146,,,0.046665,,3.006559
std,,14.256432,,,,6092.918,414.644423,,,,...,25.874281,,22.207521,,22.36594,,,0.210921,,2.000801
min,,0.0,,,,0.0,0.0,,,,...,1.0,,1.0,,-9.0,,,0.0,,0.0
25%,,31.0,,,,0.0,1965.0,,,,...,31.0,,16.0,,33.0,,,0.0,,1.0
50%,,42.0,,,,0.0,1977.0,,,,...,56.0,,49.0,,38.0,,,0.0,,3.0
75%,,54.0,,,,841.0,1989.0,,,,...,75.0,,52.0,,53.0,,,0.0,,5.0


In [12]:
test.describe(include = 'all')

Unnamed: 0,Accident Date,Age at Injury,Alternative Dispute Resolution,Assembly Date,Attorney/Representative,Average Weekly Wage,Birth Year,C-2 Date,C-3 Date,Carrier Name,...,Medical Fee Region,OIICS Nature of Injury Description,WCIO Cause of Injury Code,WCIO Cause of Injury Description,WCIO Nature of Injury Code,WCIO Nature of Injury Description,WCIO Part Of Body Code,WCIO Part Of Body Description,Zip Code,Number of Dependents
count,385531,387975.0,387975,387975,387975,368771.0,368505.0,378841,85216,387975,...,387975,0.0,377627.0,377627,377415.0,377415,378426.0,378426,368633.0,387975.0
unique,3438,,3,434,2,,,1048,626,1598,...,5,,,74,,56,,51,6276.0,
top,2024-01-16,,N,2023-09-21,N,,,2023-10-11,2023-10-04,STATE INSURANCE FUND,...,IV,,,"FELLOW WORKER, PATIENT OR OTHER PERSON",,STRAIN OR TEAR,,MULTIPLE,11368.0,
freq,1263,,386314,1789,306476,,,1687,341,66189,...,182276,,,33293,,108326,,67465,2068.0,
mean,,41.414944,,,,183.3438,1875.383466,,,,...,,,53.335678,,38.373674,,31.516109,,,3.000284
std,,14.501056,,,,3542.31,444.659075,,,,...,,,26.176833,,20.9661,,23.35995,,,1.997982
min,,0.0,,,,0.0,0.0,,,,...,,,1.0,,1.0,,-9.0,,,0.0
25%,,30.0,,,,0.0,1967.0,,,,...,,,29.0,,10.0,,15.0,,,1.0
50%,,40.0,,,,0.0,1980.0,,,,...,,,56.0,,43.0,,36.0,,,3.0
75%,,53.0,,,,0.0,1992.0,,,,...,,,74.0,,52.0,,51.0,,,5.0


In [13]:
for i in columns_name:
    print(i)
    print(train[i].unique())
    print(train[i].value_counts())
    print('\n')

Accident Date
['2019-12-30' '2019-08-30' '2019-12-06' ... '2000-05-24' '2007-05-01'
 '1980-05-17']
Accident Date
2020-03-01    1245
2020-12-18    1001
2022-02-07     977
2022-01-05     883
2021-02-18     851
              ... 
2017-05-17       1
2015-05-28       1
2002-08-18       1
2017-09-01       1
1980-05-17       1
Name: count, Length: 5539, dtype: int64


Age at Injury
[ 31.  46.  40.  nan  61.  67.  48.  33.  55.  20.  21.  51.  62.  35.
  54.  32.  34.  38.  30.  36.  39.  44.  56.  29.  60.  49.  50.  18.
  58.  24.  45.  53.  52.  47.  25.  23.  22.  64.  70.  41.  69.  57.
  27.  42.  77.  72.  19.  28.  63.  65.  59.  43.  26.  74.  66.  76.
  68.  37.   0.  16.  71.  82.  75.  17.  78.  73.  89.  85.  84.  81.
  87.  80.  11.  88.  79.  15.  83.  86.  90.   1.  10. 102.  94. 117.
  14. 113.  95.  92. 104.   5.   8. 115. 110.  91.  96.  99.  12.  13.
  93. 101.   7. 111.   9.  97. 112. 109. 100.  98. 114.]
Age at Injury
31.0     14041
30.0     14022
32.0     13994
29.0     

In [14]:
train.duplicated().sum()

0

In [15]:
train['Claim Injury Type'].value_counts()

Claim Injury Type
2. NON-COMP        291078
4. TEMPORARY       148507
3. MED ONLY         68906
5. PPD SCH LOSS     48280
1. CANCELLED        12477
6. PPD NSL           4211
8. DEATH              470
7. PTD                 97
Name: count, dtype: int64

<hr>
<a class="anchor" id="datapreprocessing">
    
# 4. Data Pre-Processing
    
</a>

<hr>
<a class="anchor" id="missingvalues">
    
## 4.2 Missing Values
    
</a>

In [16]:
train.isna().sum()

Accident Date                          23134
Age at Injury                          19445
Alternative Dispute Resolution         19445
Assembly Date                              0
Attorney/Representative                19445
Average Weekly Wage                    48096
Birth Year                             48523
C-2 Date                               34005
C-3 Date                              406226
Carrier Name                           19445
Carrier Type                           19445
Claim Identifier                           0
Claim Injury Type                      19445
County of Injury                       19445
COVID-19 Indicator                     19445
District Name                          19445
First Hearing Date                    442673
Gender                                 19445
IME-4 Count                           460668
Industry Code                          29403
Industry Code Description              29403
Medical Fee Region                     19445
OIICS Natu

In [17]:
train.drop('OIICS Nature of Injury Description', inplace = True, axis = 1)

In [18]:
# for metric columns
#fill with median as it does not get influenced by extreme outliers
for col in train.columns:
    if pd.api.types.is_numeric_dtype(train[col]): 
        median_to_fill = train[col].median()
        train[col].fillna(median_to_fill, inplace=True)

In [19]:
train.dropna(subset=['Claim Injury Type'], inplace=True)
train['Claim Injury Type']

0          2. NON-COMP
1         4. TEMPORARY
2         4. TEMPORARY
4          2. NON-COMP
5          3. MED ONLY
              ...     
593451     2. NON-COMP
593455     2. NON-COMP
593456    4. TEMPORARY
593457     2. NON-COMP
593467     2. NON-COMP
Name: Claim Injury Type, Length: 574026, dtype: object

In [20]:
#for non-metric columns
#replace NaN with Unknown
object_columns = train.select_dtypes(include=['object']).columns

for col in object_columns:
    train[col] = train[col].fillna('Unknown')

In [21]:
train.isna().sum()

Accident Date                        0
Age at Injury                        0
Alternative Dispute Resolution       0
Assembly Date                        0
Attorney/Representative              0
Average Weekly Wage                  0
Birth Year                           0
C-2 Date                             0
C-3 Date                             0
Carrier Name                         0
Carrier Type                         0
Claim Identifier                     0
Claim Injury Type                    0
County of Injury                     0
COVID-19 Indicator                   0
District Name                        0
First Hearing Date                   0
Gender                               0
IME-4 Count                          0
Industry Code                        0
Industry Code Description            0
Medical Fee Region                   0
WCIO Cause of Injury Code            0
WCIO Cause of Injury Description     0
WCIO Nature of Injury Code           0
WCIO Nature of Injury Des

In [22]:
train['Claim Injury Type_float'] = train['Claim Injury Type'].str.extract('(\d+)')
train['Claim Injury Type_float']

0         2
1         4
2         4
4         2
5         3
         ..
593451    2
593455    2
593456    4
593457    2
593467    2
Name: Claim Injury Type_float, Length: 574026, dtype: object

<hr>
<a class="anchor" id="datapartition">
    
## 4.1 Data Partition
    
</a>

In [23]:
X = train.drop(['Claim Injury Type' ,'Claim Injury Type_float'], axis = 1)
y = train['Claim Injury Type_float']

In [24]:
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.3, 
                                                  random_state = 0, 
                                                  stratify = y, 
                                                  shuffle = True)

In [25]:
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression()

In [26]:
log_model.fit(X_train, y_train)

ValueError: could not convert string to float: '2021-05-16'