# vLife Virtusa
## Behavioural Assessment of Patient Health
### Usecase Description
_Solution to build a predictive model that assesses patient health based on a behavioural assessment. It is powered by a Logistic Regression binary classification model that predicts whether an individual is in good or poor health. The problem statement we solve is therefore a binary classification task whose goal is to predict an individual's health_.
### Data Source
- [Click Here for Data Source](https://www.kaggle.com/cdc/behavioral-risk-factor-surveillance-system)
- [Click Here to view BRFSS Handbook](https://www.cdc.gov/brfss/annual_data/2015/pdf/codebook15_llcp.pdf)

### Dataset Description
- This dataset was collected by the Centers for Disease Control and Prevention.
- Each year contains a few hundred columns. Please see one of the annual code books for complete details.
- CSV files were converted from a SAS data format using pandas; there may be some data artifacts as a result.





In [None]:
import numpy as np # linear algebra
import pandas as pd 

import datetime
import pickle
from time import strftime

from sklearn.metrics import accuracy_score,precision_score,roc_curve,roc_auc_score,classification_report
from sklearn.model_selection import GridSearchCV,KFold,train_test_split,learning_curve
from sklearn.preprocessing import LabelEncoder,StandardScaler,MinMaxScaler
import seaborn as se
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import keras
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping


import matplotlib.pyplot as plt
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.linear_model import RidgeClassifier

In [None]:
df = pd.read_csv('../input/behavioral-risk-factor-surveillance-system/2015.csv')
df.head()

In [None]:
print('Shape of Dataset {}'.format(df.shape))

In [None]:
for col in df.columns: 
    print(col) 

In [None]:
display(df.describe())

## Exploratory Data Analysis
### Data Preprocessing & Generating Target variable

- As per the handbook, **_RFHLTH** is our target column, denoting adults with good or better health.

In [None]:
df.head()

In [None]:
df['_RFHLTH'].value_counts()
df['_RFHLTH'] = df['_RFHLTH'].replace({2: 0})


In [None]:
df['_RFHLTH'].value_counts()
df = df.loc[df['_RFHLTH'].isin([0, 1])].copy()

In [None]:
df = df.rename(columns = {'_RFHLTH': 'Health'})
df['Health'] = df['Health'].astype('int')
df['Health'].value_counts()

In [None]:
M = df[(df['Health'] == 0)]
B = df[(df['Health'] == 1)]
trace = go.Bar(x = (len(M), len(B)), y = ['0','1'], orientation = 'h', opacity = 0.8, marker=dict(
        color=['blue','grey'],
        line=dict(color='#000000',width=1.5)))

layout = dict(title =  'Count of Health variable')
                    
fig = dict(data = [trace], layout=layout)
py.iplot(fig)

In [None]:
percent_missing = (df.isnull().sum() / len(df)).sort_values(ascending = False)
percent_missing.head()

In [None]:
null_feat = pd.DataFrame(len(df['SEQNO']) - df.isnull().sum(), columns = ['Count'])

trace = go.Bar(x = null_feat.index, y = null_feat['Count'] ,opacity = 0.8, marker=dict(color = 'lightblue',
        line=dict(color='#000000',width=1.5)))

layout = dict(title =  "Missing Values")
                    
fig = dict(data = [trace], layout=layout)
py.iplot(fig)

> **As per the BRFSS Handbook there are 330 features in the dataset; the features used below were filtered based on domain knowledge and Kaggle kernels.**

In [None]:
# Keep only the columns used for modelling plus the 'Health' target.
# `.copy()` makes this an independent frame, so the value recoding done on it
# afterwards modifies this frame itself rather than a temporary selection of `df`.
filtered_df = df[[
    'MENTHLTH', '_AGEG5YR', 'SEX', 'EDUCA', 'EMPLOY1', 'INCOME2', '_RACE',
    'NUMADULT', 'MARITAL', 'VETERAN3', 'PREGNANT',
    'ADPLEASR', 'ADDOWN', 'ADSLEEP', 'ADENERGY', 'ADEAT1', 'ADFAIL',
    'ADTHINK', 'ADMOVE',
    'Health',
]].copy()

In [None]:
filtered_df.head()

In [None]:
for column in filtered_df.columns.values:
    print(f" Unique values of {column} : {filtered_df[column].nunique()}")

In [None]:
def missing_data(data):
    """Summarise missing values per column of ``data``.

    Returns a transposed frame with one column per input column and three
    rows: 'Total' (number of nulls), 'Percent' (nulls as a percentage of the
    row count) and 'Types' (the column dtype as a string).
    """
    null_mask = data.isnull()
    null_counts = null_mask.sum()
    summary = pd.concat(
        [null_counts, null_counts / null_mask.count() * 100],
        axis=1,
        keys=['Total', 'Percent'],
    )
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    return summary.T

missing_data(filtered_df)

In [None]:
def impute_df(filtered_df):
    """Recode sentinel answer codes in ``filtered_df`` and return it.

    The frame is modified in place (callers rely on this) and the same object
    is returned. Recoding performed:

    * AD* columns: 77 and 99 -> NaN, 88 -> 0
    * EDUCA, EMPLOY1, _RACE, MARITAL: 9 -> NaN
    * VETERAN3, PREGNANT: 7 and 9 -> NaN
    * _AGEG5YR: 14 -> NaN
    * INCOME2: 77 and 99 -> NaN
    * MENTHLTH: 88 -> 0

    The codes turned into NaN are presumably the survey's "don't know" /
    "refused" answers and 88 presumably means "none" -- confirm against the
    BRFSS codebook.

    Each column is reassigned (``frame[col] = frame[col].replace(...)``)
    instead of calling ``replace(..., inplace=True)`` on a column selection:
    the chained in-place form acts on a temporary object and is not guaranteed
    to write back to the frame. ``np.nan`` is used because the ``np.NaN``
    alias no longer exists in NumPy 2.
    """
    recoding = {
        ('ADPLEASR', 'ADDOWN', 'ADSLEEP', 'ADENERGY',
         'ADEAT1', 'ADFAIL', 'ADTHINK', 'ADMOVE'): {77: np.nan, 99: np.nan, 88: 0},
        ('EDUCA', 'EMPLOY1', '_RACE', 'MARITAL'): {9: np.nan},
        ('VETERAN3', 'PREGNANT'): {9: np.nan, 7: np.nan},
        ('_AGEG5YR',): {14: np.nan},
        ('INCOME2',): {77: np.nan, 99: np.nan},
        # NOTE(review): unlike the other columns, 77/99 are left untouched for
        # MENTHLTH -- confirm whether they should become NaN as well.
        ('MENTHLTH',): {88: 0},
    }

    for columns, mapping in recoding.items():
        for column in columns:
            filtered_df[column] = filtered_df[column].replace(mapping)

    return filtered_df


In [None]:
imputed_df = impute_df(filtered_df)

In [None]:
imputed_df.head()

In [None]:
imputed_df.isnull().sum()

In [None]:
df_ = imputed_df.copy()

In [None]:
# Drop respondents who answered none of the eight AD* questions, i.e. rows
# where all of those columns are NaN after recoding.
# The imputed frame is read explicitly instead of relying on `filtered_df`
# having been modified in place by `impute_df`.
ad_columns = ['ADPLEASR', 'ADDOWN', 'ADSLEEP', 'ADENERGY',
              'ADEAT1', 'ADFAIL', 'ADTHINK', 'ADMOVE']
clean_data = imputed_df.dropna(subset=ad_columns, how='all')
print(clean_data.shape)

In [None]:
missing_data(clean_data)

In [None]:
def napreg(x):
    """Return the PREGNANT value to use for one row ``x``.

    Rows with ``SEX == 1`` or with ``_AGEG5YR`` between 6 and 13 (inclusive)
    get the fixed value 2; every other row keeps its own ``PREGNANT`` value.
    (Presumably SEX 1 is male and 2 means "not pregnant" -- confirm against
    the codebook.)
    """
    sex_is_one = x['SEX'] == 1
    if sex_is_one or 6 <= x['_AGEG5YR'] <= 13:
        return 2
    return x['PREGNANT']

# Apply the rule to the filtered, recoded rows themselves. Applying it to the
# raw `df` would copy the un-recoded PREGNANT codes (7/9) back over the NaNs
# produced by the imputation step, and would walk every raw column row by row.
# The explicit copy makes the column assignment act on an independent frame.
clean_data = clean_data.copy()
clean_data['PREGNANT'] = clean_data.apply(napreg, axis=1)

In [None]:
cleaned_df = clean_data.dropna(subset=['_AGEG5YR', 'SEX','EDUCA','EMPLOY1','INCOME2','_RACE','NUMADULT','MARITAL','VETERAN3','PREGNANT'])
print(cleaned_df.shape)

In [None]:
cleaned_df = cleaned_df.dropna(subset=['ADPLEASR','ADDOWN','ADSLEEP','ADENERGY','ADEAT1','ADFAIL','ADTHINK','ADMOVE'],how='all')
cleaned_df = cleaned_df.reset_index(drop=True)
print(cleaned_df.shape)

In [None]:
missing_data(cleaned_df)

In [None]:
train_data = cleaned_df.dropna().reset_index(drop=True)
print(train_data.head(5))
train_data.columns

In [None]:
train_data

In [None]:
missing_data(train_data)

In [None]:
# Separate the predictors from the binary 'Health' target.
# `columns=` is used because DataFrame.drop no longer accepts `axis` as a
# positional argument in pandas 2.x (the old `drop('Health', 1)` form fails).
X = train_data.drop(columns='Health')
y = train_data['Health']

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

## Predictive Models
### Model Selection

In [None]:
# Compare several classifiers with 10-fold cross-validated accuracy on the
# training split. Every estimator sits behind the same StandardScaler step so
# that scaling is fitted inside each fold.
seed = 3
scoring = 'accuracy'
n_folds = 10

# (display name, pipeline step name, estimator)
# NOTE(review): the entry labelled "ExtraTreesClassifier" wraps
# sklearn.tree.ExtraTreeClassifier (a single randomised tree), not the
# ensemble with the plural name -- confirm which one was intended.
candidates = [
    ("LogReg", "LogReg", LogisticRegression()),
    ("XGBClassifier", "XGB", XGBClassifier()),
    ("KNN", "KNN", KNeighborsClassifier()),
    ("DecisionTreeClassifier", "DecisionTrees", DecisionTreeClassifier()),
    ("RandomForestClassifier", "RandomForest", RandomForestClassifier()),
    ("GradientBoostingClassifier", "GradientBoosting",
     GradientBoostingClassifier(max_features=15, n_estimators=600)),
    ("RidgeClassifier", "RidgeClassifier", RidgeClassifier()),
    ("ExtraTreesClassifier", "ExtraTrees", ExtraTreeClassifier()),
]

clfs = [
    (label, Pipeline([("Scaler", StandardScaler()), (step, estimator)]))
    for label, step, estimator in candidates
]

# One splitter shared by all models so they are scored on identical folds.
# `shuffle=True` is required for `random_state` to have any effect; current
# scikit-learn raises a ValueError when a seed is given without shuffling.
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=seed)

msgs = []
results, names = [], []

for name, model in clfs:
    cv_results = cross_val_score(model, X_train, y_train,
                                 cv=kfold, scoring=scoring, n_jobs=-1)
    names.append(name)
    results.append(cv_results)
    msg = "%s: %f (+/- %f)" % (name, cv_results.mean(),
                               cv_results.std())
    msgs.append(msg)
    print(msg)
    

> _Logistic Regression outperforms the other ML models_.

In [None]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Baseline: logistic regression with default hyper-parameters, fitted on the
# standardised training split.
model = LogisticRegression()
model.fit(x_train_scaled, y_train)

# Hard class predictions for the held-out split.
y_pred = model.predict(X_test_scaled)

# Mean accuracy on both splits.
print("Training accuracy :", model.score(x_train_scaled, y_train))
print("Testing accuarcy :", model.score(X_test_scaled, y_test))

# Per-class precision / recall / F1.
cr = classification_report(y_test, y_pred)
print(cr)

# Confusion matrix drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
plt.rcParams['figure.figsize'] = (5, 5)
# fmt='d' prints the cell counts as plain integers; the default annotation
# format abbreviates large counts into scientific notation.
se.heatmap(cm, annot = True, fmt = 'd', cmap = 'winter')
plt.title('Confusion Matrix', fontsize = 20)
plt.show()

In [None]:
# Exhaustive search over the penalty type and the inverse regularisation
# strength C, scored by cross-validated accuracy on the scaled training data.
# NOTE(review): the estimator is created with its default solver. In
# scikit-learn releases where that default is 'lbfgs', the 'l1' candidates
# cannot be fitted and only the 'l2' half of the grid is really compared --
# confirm, and if L1 should be searched, choose a solver that supports it
# both here and wherever `best_parameters` is reused.
param_grid = {
    'penalty': ['l2', 'l1'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
}
log_clf = LogisticRegression(random_state=42)

CV_log_clf = GridSearchCV(
    estimator=log_clf,
    param_grid=param_grid,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1,
)
CV_log_clf.fit(x_train_scaled, y_train)

# Winning parameter combination, reused by later cells.
best_parameters = CV_log_clf.best_params_
print('The best parameters for using this model is', best_parameters)

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize = False,
                          title = 'Confusion matrix',
                          cmap = plt.cm.Blues) :
    """Draw a confusion matrix as an annotated image on the current figure.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix; rows are true labels, columns are predicted labels
        (matching the axis labels set below).
    classes : sequence
        Tick labels, one per class, used on both axes.
    normalize : bool, default False
        When True each row is divided by its total so cells show fractions
        (printed with two decimals). Rows whose total is zero stay at 0.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap passed to ``imshow``.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # `where=` skips empty rows so they do not produce 0/0.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation = 0)
    plt.yticks(tick_marks, classes)

    # Cells darker than the midpoint get white text so the number stays legible.
    thresh = cm.max() / 2.
    # np.ndindex walks the (row, column) pairs in row-major order; it replaces
    # itertools.product, which this file never imports.
    for i, j in np.ndindex(cm.shape[0], cm.shape[1]) :
        label = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, label,
                 horizontalalignment = 'center',
                 color = 'white' if cm[i, j] > thresh else 'black')

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
# Show metrics 
def show_metrics(confusion=None):
    """Print accuracy, precision, recall and F1 for a 2x2 confusion matrix.

    Parameters
    ----------
    confusion : 2-D array-like, optional
        Matrix laid out as ``[[tn, fp], [fn, tp]]``. When omitted, the
        notebook-level variable ``cm`` is used, which keeps the original
        zero-argument call working.

    A metric whose denominator is zero is printed as ``nan``.
    """
    if confusion is None:
        confusion = cm  # fall back to the notebook-global confusion matrix
    confusion = np.asarray(confusion)

    tn, fp = confusion[0, 0], confusion[0, 1]
    fn, tp = confusion[1, 0], confusion[1, 1]

    undefined = float('nan')
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total else undefined
    precision = tp / (tp + fp) if (tp + fp) else undefined
    recall = tp / (tp + fn) if (tp + fn) else undefined

    # NaN is truthy, so an undefined precision or recall propagates to F1.
    pr_sum = precision + recall
    f1_score = 2 * ((precision * recall) / pr_sum) if pr_sum else undefined

    print('Accuracy  =     {:.3f}'.format(accuracy))
    print('Precision =     {:.3f}'.format(precision))
    print('Recall    =     {:.3f}'.format(recall))
    print('F1_score  =     {:.3f}'.format(f1_score))

In [None]:
def cross_val_metrics(model, features=None, target=None, cv=5) :
    """Print cross-validated accuracy, precision and recall for ``model``.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``cross_val_score``.
    features, target : array-like, optional
        Data to evaluate on. When omitted, the notebook-level ``X`` and ``y``
        are used, exactly as in the original one-argument call.
        NOTE(review): those globals look like the full, unscaled data set --
        confirm that this is the intended evaluation data.
    cv : int or splitter, default 5
        Cross-validation strategy passed through to ``cross_val_score``.
    """
    if features is None:
        features = X
    if target is None:
        target = y

    for metric in ('accuracy', 'precision', 'recall'):
        # A separate name is used for the result so the metric list being
        # iterated is not rebound inside the loop.
        fold_scores = cross_val_score(model, features, target, cv = cv, scoring = metric)
        print('[%s] : %0.5f (+/- %0.5f)'%(metric, fold_scores.mean(), fold_scores.std()))

In [None]:
def plot_roc():
    """Plot the ROC curve held in the notebook-level ``fpr`` / ``tpr`` arrays.

    A dashed black diagonal from (0, 0) to (1, 1) is drawn as a reference
    line, then the figure is shown.
    """
    line_width = 2
    plt.plot(fpr, tpr, label='ROC curve', linewidth=line_width)
    plt.plot([0, 1], [0, 1], 'k--', linewidth=line_width)

    plt.title('ROC Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.show()

In [None]:
CV_log2_clf = LogisticRegression(C = best_parameters['C'], 
                                 penalty = best_parameters['penalty'], 
                                 random_state = 42)


CV_log2_clf.fit(x_train_scaled, y_train)

y_pred = CV_log2_clf.predict(X_test_scaled)
y_score = CV_log2_clf.decision_function(X_test_scaled)
# Confusion maxtrix & metrics
cm = confusion_matrix(y_test, y_pred)
class_names = [0,1]

In [None]:
# Save the fitted model to disk. The `with` block closes the file handle even
# if pickling fails, so the file is fully flushed before it is read back.
filename = 'pickled_model.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(CV_log2_clf, model_file)

# Load the model back from disk as a round-trip check.
# NOTE: unpickling executes code stored in the file -- only load pickle files
# that come from a trusted source.
with open(filename, 'rb') as model_file:
    pickle_model = pickle.load(model_file)
print(pickle_model)

In [None]:
show_metrics()

In [None]:
cross_val_metrics(CV_log2_clf)

In [None]:
fpr, tpr, t = roc_curve(y_test, y_score)
plot_roc()

### Deep learning Model with Keras

In [None]:
# Fully connected binary classifier: 64 -> 32 -> 16 -> 1 (sigmoid).
# The input width is taken from the scaled training matrix instead of being
# hard-coded, so the network keeps matching the data if the feature list changes.
n_features = x_train_scaled.shape[1]

model = Sequential()
model.add(Dense(64, input_dim = n_features, activation='relu'))
# `kernel_initializer` is the current name of the weight-initialisation
# argument; the old Keras 1 spelling `init=` is not accepted by recent releases.
model.add(Dense(32, activation='relu', kernel_initializer = 'random_uniform'))
model.add(Dense(16, activation='relu', kernel_initializer = 'random_uniform'))
model.add(Dense(1, activation = 'sigmoid'))
model.summary()

In [None]:
model.compile(loss = 'binary_crossentropy',optimizer='adam',metrics=['accuracy'])

# Stop once the validation loss has not improved for 2 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,patience=2)

# The callback must be passed to `fit`; otherwise it is never consulted and
# all 100 epochs are always run.
# NOTE(review): the test split doubles as validation data here, so the early
# stopping point is tuned on the same rows later used for the final score.
history=model.fit(x_train_scaled,y_train ,epochs=100,batch_size=128,
                  validation_data=(X_test_scaled,y_test), callbacks=[es])

In [None]:
# `evaluate` returns the loss followed by the compiled metrics; only the
# accuracy is kept.
_, train_accuracy = model.evaluate(x_train_scaled, y_train)
_, test_accuracy = model.evaluate(X_test_scaled, y_test)

# Each label is paired with the split it was actually computed on (the two
# values were previously printed under each other's label).
print('Accuracy Test: %.2f' % (test_accuracy*100))
print('accuracy Train: %.2f' % (train_accuracy*100))

In [None]:
from matplotlib import pyplot
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()

## END