In [39]:

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,Normalizer,LabelEncoder
from category_encoders.target_encoder import TargetEncoder
from sklearn.model_selection import train_test_split,cross_val_score,RandomizedSearchCV,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier,ExtraTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier,RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix


## Feature Engineering

In [40]:
data = pd.read_csv('Cleaned_Insuarance_1',index_col=0)

In [41]:
data['Abortion'].value_counts()

N    1041265
Name: Abortion, dtype: int64

In [42]:
data = data.drop('Abortion',axis=1)

In [43]:
data['Cultural_group'].value_counts()

White                     777034
Black/African American    134127
Other Race                126454
Unknown                     3650
Name: Cultural_group, dtype: int64

In [44]:
data = data.replace({'Unknown':'Other Race'})

In [45]:
data['ethnicity'].value_counts()

Not Span/Hispanic    904866
Spanish/Hispanic      86918
Other Race            49481
Name: ethnicity, dtype: int64

In [46]:
data['Payment_Typology'].value_counts()

2    428217
1    340415
3    269067
4      3500
5        66
Name: Payment_Typology, dtype: int64

In [47]:
data = data[(data['Payment_Typology']!=5)&(data['Payment_Typology']!=4)]

In [48]:
data = data[data['Gender']!= 'U']

In [49]:
data['Admission_type'].value_counts()

Emergency        604218
Elective         234245
Urgent           109560
Newborn           86383
Trauma             2252
Not Available      1014
Name: Admission_type, dtype: int64

In [50]:
data = data[data['Admission_type']!='Not Available']

In [51]:
data = data.replace({'Trauma':'Urgent'})

In [52]:
data['Home or self care,'].value_counts()

Home or Self Care                        685088
Home w/ Home Health Services             131278
Skilled Nursing Home                     108456
Expired                                   22645
Short-term Hospital                       21303
Inpatient Rehabilitation Facility         17227
Left Against Medical Advice               16928
Psychiatric Hospital or Unit of Hosp       6658
Hospice - Medical Facility                 4770
Hospice - Home                             4695
Another Type Not Listed                    4519
Facility w/ Custodial/Supportive Care      3475
Court/Law Enforcement                      3254
Medicare Cert Long Term Care Hospital      2124
Cancer Center or Children's Hospital       1874
Hosp Basd Medicare Approved Swing Bed      1871
Federal Health Care Facility                454
Critical Access Hospital                     32
Medicaid Cert Nursing Facility                7
Name: Home or self care,, dtype: int64

In [53]:
data = data.replace({'Hospice - Home':'Home w/ Home Health Services',
                     'Hospice - Medical Facility':'Medical Facility',
                    'Medicaid Cert Nursing Facility':'Medical Facility',
                     'Critical Access Hospital':'Medical Facility',
                    'Federal Health Care Facility':'Medical Facility',
                     "Cancer Center or Children's Hospital":'Medical Facility',
                    'Hosp Basd Medicare Approved Swing Bed':'Medical Facility',
                     'Medicare Cert Long Term Care Hospital':'Medical Facility',
                     'Facility w/ Custodial/Supportive Care':'Medical Facility',
                     'Psychiatric Hospital or Unit of Hosp':'Medical Facility'})

In [54]:
data['Home or self care,'].value_counts()

Home or Self Care                    685088
Home w/ Home Health Services         135973
Skilled Nursing Home                 108456
Expired                               22645
Short-term Hospital                   21303
Medical Facility                      21265
Inpatient Rehabilitation Facility     17227
Left Against Medical Advice           16928
Another Type Not Listed                4519
Court/Law Enforcement                  3254
Name: Home or self care,, dtype: int64

In [55]:
data['Area_Service'].value_counts()

Hudson Valley      258458
Western NY         178503
Central NY         169922
Capital/Adirond    168512
Finger Lakes       155350
New York City       74307
Southern Tier       31606
Name: Area_Service, dtype: int64

In [56]:
np.sort(data['ccs_diagnosis_code'].unique())

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  76,
        77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
        90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
       103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
       116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
       129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141,
       142, 143, 144, 145, 146, 147, 148, 149, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 181, 182, 183, 184, 185,
       186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 19

In [57]:
np.sort(data['ccs_procedure_code'].unique())

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 123, 124, 125, 127, 128, 129, 130, 131, 132,
       133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145,
       146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
       159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171,
       172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 18

In [58]:
data3 = data

In [59]:
data = data.drop(['Area_Service','Admission_type','ethnicity','Hospital County'],axis=1)

## Encoding


In [60]:
l_encoder = LabelEncoder()

In [61]:
t_encoder = TargetEncoder()

In [62]:
pd.set_option('display.max_columns',500)

In [63]:
data

Unnamed: 0,Hospital Id,Age,Gender,Cultural_group,Days_spend_hsptl,"Home or self care,",ccs_diagnosis_code,ccs_procedure_code,apr_drg_description,Code_illness,Mortality risk,Surg_Description,Weight_baby,Emergency dept_yes/No,Tot_charg,Tot_cost,ratio_of_total_costs_to_total_charges,Result,Payment_Typology
0,37.0,3,F,White,4,Home or Self Care,122,0,Other pneumonia,1,1.0,Medical,0,Y,5511.95,5582.49,1.012798,1,1
1,37.0,5,F,White,4,Short-term Hospital,197,0,Cellulitis & other skin infections,3,2.0,Medical,0,Y,4783.20,5162.82,1.079365,1,1
2,37.0,3,F,White,3,Home or Self Care,122,0,Other pneumonia,1,1.0,Medical,0,Y,3829.15,4056.52,1.059379,1,1
3,37.0,1,F,White,1,Home or Self Care,122,0,Other pneumonia,1,1.0,Medical,0,Y,1108.20,1644.75,1.484167,0,1
4,37.0,5,F,White,3,Home or Self Care,122,0,Other pneumonia,2,3.0,Medical,0,Y,3418.18,3370.87,0.986161,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,1169.0,5,F,Other Race,6,Skilled Nursing Home,106,29,Cardiac arrhythmia & conduction disorders,2,3.0,Medical,0,Y,69650.41,13896.82,0.199522,1,3
1048571,1169.0,1,F,Other Race,2,Home or Self Care,8,231,Other infectious & parasitic diseases,2,1.0,Medical,0,N,13587.38,3918.35,0.288382,1,3
1048572,1169.0,4,M,Other Race,13,Medical Facility,151,88,Hepatic coma & other major acute liver disorders,4,4.0,Medical,0,Y,104629.78,24746.64,0.236516,0,3
1048573,1169.0,3,F,Black/African American,2,Home or Self Care,143,86,"Inguinal, femoral & umbilical hernia procedures",1,1.0,Surgical,0,Y,27521.41,6766.04,0.245846,1,3


In [65]:
columns =['Hospital Id', 'Gender',
       'Cultural_group','apr_drg_description',
       'Surg_Description', 'Emergency dept_yes/No']

In [66]:
def ecoder(x):
    for col in x:
        data[col] = l_encoder.fit_transform(data[col])


In [67]:
ecoder(columns)

In [68]:
data

Unnamed: 0,Hospital Id,Age,Gender,Cultural_group,Days_spend_hsptl,"Home or self care,",ccs_diagnosis_code,ccs_procedure_code,apr_drg_description,Code_illness,Mortality risk,Surg_Description,Weight_baby,Emergency dept_yes/No,Tot_charg,Tot_cost,ratio_of_total_costs_to_total_charges,Result,Payment_Typology
0,5,3,0,2,4,Home or Self Care,122,0,237,1,1.0,0,0,1,5511.95,5582.49,1.012798,1,1
1,5,5,0,2,4,Short-term Hospital,197,0,38,3,2.0,0,0,1,4783.20,5162.82,1.079365,1,1
2,5,3,0,2,3,Home or Self Care,122,0,237,1,1.0,0,0,1,3829.15,4056.52,1.059379,1,1
3,5,1,0,2,1,Home or Self Care,122,0,237,1,1.0,0,0,1,1108.20,1644.75,1.484167,0,1
4,5,5,0,2,3,Home or Self Care,122,0,237,2,3.0,0,0,1,3418.18,3370.87,0.986161,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,138,5,0,1,6,Skilled Nursing Home,106,29,28,2,3.0,0,0,1,69650.41,13896.82,0.199522,1,3
1048571,138,1,0,1,2,Home or Self Care,8,231,227,2,1.0,0,0,0,13587.38,3918.35,0.288382,1,3
1048572,138,4,1,1,13,Medical Facility,151,88,99,4,4.0,0,0,1,104629.78,24746.64,0.236516,0,3
1048573,138,3,0,0,2,Home or Self Care,143,86,109,1,1.0,1,0,1,27521.41,6766.04,0.245846,1,3


## Feature Selection

In [69]:
X = data.drop(['Result','Home or self care,'],axis=1)
y = data['Result']

In [None]:
ext_model = ExtraTreeClassifier()
ext_model.fit(X,y)
ext_model.feature_importances_

In [None]:
pd.DataFrame(ext_model.feature_importances_,X.columns).sort_values(by=0,ascending=False)

## Balancing The Dataset

In [70]:
data = data.drop(['Surg_Description','Emergency dept_yes/No'],axis=1)

In [71]:
data = data.drop('Home or self care,',axis=1)

In [72]:
data.columns

Index(['Hospital Id', 'Age', 'Gender', 'Cultural_group', 'Days_spend_hsptl',
       'ccs_diagnosis_code', 'ccs_procedure_code', 'apr_drg_description',
       'Code_illness', 'Mortality risk', 'Weight_baby', 'Tot_charg',
       'Tot_cost', 'ratio_of_total_costs_to_total_charges', 'Result',
       'Payment_Typology'],
      dtype='object')

In [73]:
X = data.drop('Result',axis=1)
y = data['Result']

In [74]:
from imblearn.over_sampling import SMOTE,RandomOverSampler

In [75]:
balancer = RandomOverSampler()

In [76]:
X,y = balancer.fit_resample(X,y)

In [77]:
y.value_counts()

1    776907
0    776907
Name: Result, dtype: int64

## Normalizing The Dataset

In [78]:
scaler = Normalizer()
X = scaler.fit_transform(X)

In [79]:
np.round(X,4)

array([[6.000e-04, 4.000e-04, 0.000e+00, ..., 7.112e-01, 1.000e-04,
        1.000e-04],
       [7.000e-04, 7.000e-04, 0.000e+00, ..., 7.333e-01, 2.000e-04,
        1.000e-04],
       [9.000e-04, 5.000e-04, 0.000e+00, ..., 7.264e-01, 2.000e-04,
        2.000e-04],
       ...,
       [1.610e-02, 1.000e-04, 0.000e+00, ..., 3.348e-01, 0.000e+00,
        4.000e-04],
       [4.900e-03, 7.000e-04, 2.000e-04, ..., 4.938e-01, 1.000e-04,
        3.000e-04],
       [5.500e-03, 0.000e+00, 0.000e+00, ..., 3.520e-01, 0.000e+00,
        0.000e+00]])

In [80]:
len(X)

1553814

## Logistic Regression Model

In [89]:
X_samp = X[:1000]
y_samp = y[:1000]

In [88]:
len(X_samp)

1000

In [55]:
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42,stratify=y)

In [315]:
lr_model = LogisticRegression()

In [316]:
lr_model.fit(X_train,y_train)

LogisticRegression()

In [317]:
lr_preds = lr_model.predict(X_test)
confusion_matrix(lr_preds,y_test)

array([[ 93650,  92644],
       [159079, 160085]])

In [318]:
print(classification_report(lr_preds,y_test))

              precision    recall  f1-score   support

           0       0.37      0.50      0.43    186294
           1       0.63      0.50      0.56    319164

    accuracy                           0.50    505458
   macro avg       0.50      0.50      0.49    505458
weighted avg       0.54      0.50      0.51    505458



## Emsamble Models :  Random Forest & Decision Trees

In [91]:
rf_model = RandomForestClassifier()

In [92]:
rf_model.fit(X_samp,y_samp)

RandomForestClassifier()

In [94]:
X[:1]

array([[6.36971627e-04, 3.82182976e-04, 0.00000000e+00, 2.54788651e-04,
        5.09577301e-04, 1.55421077e-02, 0.00000000e+00, 3.01924551e-02,
        1.27394325e-04, 1.27394325e-04, 0.00000000e+00, 7.02191151e-01,
        7.11177547e-01, 1.29024668e-04, 1.27394325e-04]])

In [95]:
lists = [6.36971627e-04, 3.82182976e-04, 0.00000000e+00, 2.54788651e-04,
        5.09577301e-04, 1.55421077e-02, 0.00000000e+00, 3.01924551e-02,
        1.27394325e-04, 1.27394325e-04, 0.00000000e+00, 7.02191151e-01,
        7.11177547e-01, 1.29024668e-04, 1.27394325e-04]

In [99]:
lists = np.array(lists)

In [100]:
rf_model.predict(lists.reshape(1, -1))

array([1])

In [93]:
preds = rf_model.predict(X)
confusion_matrix(preds,y)

KeyboardInterrupt: 

In [64]:
print(classification_report(preds,y))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    778754
           1       1.00      1.00      1.00    775060

    accuracy                           1.00   1553814
   macro avg       1.00      1.00      1.00   1553814
weighted avg       1.00      1.00      1.00   1553814



In [220]:
dt_model = DecisionTreeClassifier()

In [None]:
dt_model.fit(X_train,y_train)

In [None]:
preds = dt_model.predict(X_test)
confusion_matrix(preds,y_test)

In [None]:
print(classification_report(preds,y_test))

## HyperParameter Tuning

In [93]:
dt_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [48]:
params = {'max_depth':[3,4,8,20,50,80,100,110,120,130],
         'ccp_alpha':[0.00002,0.0001,0.0002,0.00001],
         'min_samples_leaf': [1,2,4,6,8,12,20],
         'min_samples_split': [1,2,4,6,8,12,16,22]}

In [94]:
param_2 = [0.00000000000121,0.0000000012,0.000000000012] 

In [107]:
import matplotlib.pyplot as plt
import sklearn.tree as tree

In [95]:
tot_param2 = []
tot_score = []
def testing(model,y1):
        for y in y1:
                model = DecisionTreeClassifier(ccp_alpha=0.0000000012,)
                model.fit(X_train,y_train)
                predis = model.predict(X_test)
                score = accuracy_score(predis,y_test)
                tot_score.append(score)
                tot_param2.append(y)                                    

In [96]:
testing(dt_model,param_2)

In [97]:
print(tot_param2)
print('/n')
print(tot_score) #0.0000000001 best yet

[1.21e-12, 1.2e-09, 1.2e-11]
/n
[0.7901606797735389, 0.7905273237524841, 0.78995590521083]


In [52]:
RSCV = RandomizedSearchCV(estimator=dt_model,param_distributions=params,verbose=10)

In [53]:
len(X_train)

1041055

In [None]:
RSCV.best_params_

In [None]:
RSCV.best_score_

In [65]:
import pickle

In [67]:
pickle.dump(rf_model,open('dep_mod','wb'))