In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import auc
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state = 42)
from pprint import pprint

In [2]:
df = pd.read_csv('Mb_Aprl_May_DataforBN.csv')

In [3]:
df.head()

Unnamed: 0,SOC5B,SOC5C,SOC5D,PHYS3H,AGE4,SOC5A,SOC5E,PHYS7_4,PHYS2_18
0,(1) Not at all or less than 1 day,(1) Not at all or less than 1 day,(2) 1-2 days,(1) Yes,(2) 30-44,(1) Not at all or less than 1 day,(2) 1-2 days,(0) No,(1) Yes
1,(1) Not at all or less than 1 day,(1) Not at all or less than 1 day,(1) Not at all or less than 1 day,(2) No,(4) 60+,(1) Not at all or less than 1 day,(1) Not at all or less than 1 day,(1) Yes,(0) No
2,(1) Not at all or less than 1 day,(1) Not at all or less than 1 day,(2) 1-2 days,(2) No,(4) 60+,(1) Not at all or less than 1 day,(1) Not at all or less than 1 day,(1) Yes,(0) No
3,(1) Not at all or less than 1 day,(2) 1-2 days,(1) Not at all or less than 1 day,(1) Yes,(4) 60+,(1) Not at all or less than 1 day,(1) Not at all or less than 1 day,(1) Yes,(0) No
4,(1) Not at all or less than 1 day,(1) Not at all or less than 1 day,(1) Not at all or less than 1 day,(2) No,(4) 60+,(1) Not at all or less than 1 day,(1) Not at all or less than 1 day,(1) Yes,(0) No


In [4]:
df.columns

Index(['SOC5B', 'SOC5C', 'SOC5D', 'PHYS3H', 'AGE4', 'SOC5A', 'SOC5E',
       'PHYS7_4', 'PHYS2_18'],
      dtype='object')

In [5]:
df = df[['SOC5A','AGE4','PHYS7_4','PHYS2_18','PHYS3H']]

In [6]:
from collections import Counter

In [7]:
Counter(df['SOC5A'])

Counter({'(1) Not at all or less than 1 day': 11067,
         '(2) 1-2 days': 3893,
         '(3) 3-4 days': 1600,
         '(4) 5-7 days': 1131,
         "(77) DON'T KNOW": 15,
         '(98) SKIPPED ON WEB': 56,
         '(99) REFUSED': 2})

In [8]:
df.shape

(17764, 5)

In [9]:
df = df[~df['SOC5A'].isin(["(77) DON'T KNOW",'(98) SKIPPED ON WEB','(99) REFUSED','(2) 1-2 days','(3) 3-4 days'])]


In [10]:
df.shape

(12198, 5)

In [11]:
df['SOC5A']=df['SOC5A'].replace('(1) Not at all or less than 1 day', 1)


In [12]:
df.head()

Unnamed: 0,SOC5A,AGE4,PHYS7_4,PHYS2_18,PHYS3H
0,1,(2) 30-44,(0) No,(1) Yes,(1) Yes
1,1,(4) 60+,(1) Yes,(0) No,(2) No
2,1,(4) 60+,(1) Yes,(0) No,(2) No
3,1,(4) 60+,(1) Yes,(0) No,(1) Yes
4,1,(4) 60+,(1) Yes,(0) No,(2) No


In [13]:
df.loc[df['SOC5A'] != 1, 'SOC5A'] = 0


In [14]:
Counter(df['SOC5A'])

Counter({1: 11067, 0: 1131})

In [15]:
from sklearn.preprocessing import LabelEncoder 
import numpy as np
le = LabelEncoder() 
df['AGE4']= le.fit_transform(df['AGE4']) 
df['PHYS7_4']= le.fit_transform(df['PHYS7_4']) 
df['PHYS2_18']= le.fit_transform(df['PHYS2_18']) 
df['PHYS3H']= le.fit_transform(df['PHYS3H']) 



In [16]:
df.head()

Unnamed: 0,SOC5A,AGE4,PHYS7_4,PHYS2_18,PHYS3H
0,1,1,0,1,0
1,1,3,1,0,1
2,1,3,1,0,1
3,1,3,1,0,0
4,1,3,1,0,1


In [17]:
X = df[['AGE4','PHYS7_4','PHYS2_18','PHYS3H']]
Y = df[['SOC5A']]




In [18]:
Y=Y.astype('int')

In [19]:
X

Unnamed: 0,AGE4,PHYS7_4,PHYS2_18,PHYS3H
0,1,0,1,0
1,3,1,0,1
2,3,1,0,1
3,3,1,0,0
4,3,1,0,1
...,...,...,...,...
17756,1,1,0,1
17757,3,1,0,1
17760,1,1,0,1
17761,1,1,0,1


In [20]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.20)
print(x_train.shape)
print(x_test.shape)

(9758, 4)
(2440, 4)


In [21]:
Counter(y_train['SOC5A'])

Counter({1: 8830, 0: 928})

In [22]:
#pip install imblearn -U

In [23]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

ros = RandomOverSampler(random_state=123)
rus = RandomUnderSampler(random_state=0)
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state = 2) 


In [24]:
sx_train, sy_train = sm.fit_sample(x_train, y_train)


In [25]:
Counter(sy_train['SOC5A'])

Counter({1: 8830, 0: 8830})

# Grid Search 

In [26]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [27]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 10 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 50, cv = 5, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(sx_train, sy_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:  8.6min finished
  self.best_estimator_.fit(X, y, **fit_params)


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=50,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [28]:
rf_random.best_params_


{'n_estimators': 1600,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

In [29]:
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(n_estimators=1600,min_samples_split=2,min_samples_leaf=4,max_features='sqrt'
                            ,max_depth=10,
                            bootstrap=True)
RF.fit(sx_train,sy_train)
RF_predictions = RF.predict(x_test)

  


In [30]:
RF_Accuracy = accuracy_score(y_test,RF_predictions)
print(RF_Accuracy)

0.8020491803278689


In [31]:
S_cm = confusion_matrix(y_test, RF_predictions)
print(S_cm)

[[1820  387]
 [  96  137]]


In [32]:
sensitivity = S_cm[0,0]/(S_cm[0,0]+S_cm[0,1])
print('Sensitivity : ', sensitivity)

specificity = S_cm[1,1]/(S_cm[1,0]+S_cm[1,1])
print('Specificity : ', specificity)

Sensitivity :  0.8246488445854101
Specificity :  0.5879828326180258


In [33]:
auroc = roc_auc_score(y_test, RF_predictions)
print(auroc)

precision, recall, thresholds = precision_recall_curve(y_test, RF_predictions)
auprc = auc(recall, precision)
print(auprc)

0.706315838601718
0.44438873829624853


# SVM 

In [34]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(sx_train, sy_train)
SVM_predictions = svm.predict(x_test)

  return f(**kwargs)


In [35]:
SVM_Accuracy = accuracy_score(y_test,SVM_predictions)
print(SVM_Accuracy)

0.8008196721311476


In [36]:
S_cm = confusion_matrix(y_test, SVM_predictions)
print(S_cm)

[[1815  392]
 [  94  139]]


In [37]:
sensitivity = S_cm[0,0]/(S_cm[0,0]+S_cm[0,1])
print('Sensitivity : ', sensitivity)

specificity = S_cm[1,1]/(S_cm[1,0]+S_cm[1,1])
print('Specificity : ', specificity)

Sensitivity :  0.822383325781604
Specificity :  0.5965665236051502


In [38]:
auroc = roc_auc_score(y_test, SVM_predictions)
print(auroc)

precision, recall, thresholds = precision_recall_curve(y_test, SVM_predictions)
auprc = auc(recall, precision)
print(auprc)

0.709474924693377
0.44843067929508845


# Naive Bayes 

In [39]:
from sklearn.naive_bayes import GaussianNB
NB = GaussianNB()
NB.fit(sx_train, sy_train)
NB_predictions = NB.predict(x_test)

  return f(**kwargs)


In [40]:
NB_Accuracy = accuracy_score(y_test,NB_predictions)
print(NB_Accuracy)

0.7795081967213114


In [41]:
S_cm = confusion_matrix(y_test, NB_predictions)
print(S_cm)

[[1762  445]
 [  93  140]]


In [42]:
sensitivity = S_cm[0,0]/(S_cm[0,0]+S_cm[0,1])
print('Sensitivity : ', sensitivity)

specificity = S_cm[1,1]/(S_cm[1,0]+S_cm[1,1])
print('Specificity : ', specificity)

Sensitivity :  0.7983688264612596
Specificity :  0.6008583690987125


In [43]:
auroc = roc_auc_score(y_test, NB_predictions)
print(auroc)

precision, recall, thresholds = precision_recall_curve(y_test, NB_predictions)
auprc = auc(recall, precision)
print(auprc)

0.6996135977799861
0.43914468125665623


# Logistic Regression 

In [44]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
LR.fit(sx_train, sy_train)
LR_predictions = LR.predict(x_test)

  return f(**kwargs)


In [45]:
LR_Accuracy = accuracy_score(y_test,LR_predictions)
print(LR_Accuracy)

0.7680327868852459


In [46]:
S_cm = confusion_matrix(y_test, LR_predictions)
print(S_cm)

[[1734  473]
 [  93  140]]


In [47]:
sensitivity = S_cm[0,0]/(S_cm[0,0]+S_cm[0,1])
print('Sensitivity : ', sensitivity)

specificity = S_cm[1,1]/(S_cm[1,0]+S_cm[1,1])
print('Specificity : ', specificity)

Sensitivity :  0.7856819211599456
Specificity :  0.6008583690987125


In [48]:
auroc = roc_auc_score(y_test, LR_predictions)
print(auroc)

precision, recall, thresholds = precision_recall_curve(y_test, LR_predictions)
auprc = auc(recall, precision)
print(auprc)

0.693270145129329
0.4336790575202331
