<a href="https://colab.research.google.com/github/Redwoods/Py/blob/master/pdm2020/my-note/py-tensorflow/DL1-FCN/proj01_diabetes_FCN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **DL: FCN of Pima-Indians-Diabetes-Data** 

> https://medium.com/@soumen.atta/analyzing-pima-indians-diabetes-data-using-python-89a021b5f4eb

### Diabetes Prediction using Neural Networks 
- [Deep learning approach for diabetes prediction using PIMA Indian dataset](https://link.springer.com/article/10.1007/s40200-020-00520-5)

In [None]:
# Load the required packages 
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from pandas import read_csv
import matplotlib.pyplot as plt
# %matplotlib inline

tf.random.set_seed(0)

print(tf.__version__)

**Load CSV file using Pandas**

In [None]:
# Location of the Pima Indians Diabetes CSV (remote copy on GitHub).
url = "https://github.com/Redwoods/Py/raw/master/pdm2020/my-note/py-pandas/data/diabetes.csv"
# To read a local copy instead, set filename = 'diabetes.csv'
filename = url

# Load the CSV into a DataFrame.
df = pd.read_csv(filename)

# Cell output: (number of rows, number of columns).
df.shape

In [None]:
# Print the first 5 rows 
df.head()

## create X and y datasets for training from df

In [None]:
from sklearn import model_selection

In [None]:
# Build the feature matrix X (every column except the label) and the label vector y.
# `columns=` is used because passing the axis positionally (`df.drop([...], 1)`)
# is rejected by pandas >= 2.0.
X = np.array(df.drop(columns=['Outcome']))
y = np.array(df['Outcome'])

# Seed NumPy and TensorFlow before splitting. No random_state is passed to
# train_test_split, so the shuffle presumably relies on the NumPy global seed
# set here for repeatability.
np.random.seed(1234)
tf.random.set_seed(1234)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.2)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
print(X_train[0])
y_train[:10]

In [None]:
np.unique(y_train, return_counts=True),np.unique(y_test, return_counts=True)  #[1]

In [None]:
X_train[:3]

In [None]:
# import TF2 submodules
from tensorflow.keras import layers, models, callbacks
from keras.models import Sequential
from keras.layers import Dense,Flatten
from keras.callbacks import ModelCheckpoint, EarlyStopping

## Deep learning of Diabetes dataset
- noDM(0), DM(1)

### Prediction by sigmoid function in the output layer
- 0 if output < 0.5
- 1 if output >= 0.5

## Simple FCN
- 3 layers FCN: 2 hidden + output

In [None]:
# Fix the NumPy and TensorFlow seeds before building the network.
np.random.seed(1234)
tf.random.set_seed(1234)

# Simple fully connected network: 8 inputs -> 12 (relu) -> 8 (relu) -> 1 (sigmoid).
model = Sequential([
    Dense(12, input_dim=8, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary classification: cross-entropy loss, Adam optimizer, accuracy as the metric.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


In [None]:
# Fit the keras model on the training split for 150 epochs (batch size 10) and keep the
# per-epoch history in `hist` for the plots that follow.
# NOTE(review): the test split is also passed as validation data, so the validation
# curves are not an independent estimate - consider a separate validation split.
hist = model.fit(X_train, y_train, epochs=150, batch_size=10, verbose=1, 
                 validation_data=(X_test, y_test))


In [None]:
# Plot training vs. validation loss per epoch and mark the epoch with the lowest val_loss.
epochs = range(1, len(hist.history['loss']) + 1)

loss_list = hist.history['loss']
vloss_list = hist.history['val_loss']

# Index of the minimum validation loss, computed once and reused below.
best_idx = int(np.argmin(vloss_list))

plt.plot(epochs, loss_list)
plt.plot(epochs, vloss_list)
plt.plot(best_idx + 1, vloss_list[best_idx], 'r*')  # +1: epochs are numbered from 1
plt.title('Diabetes: FCN model - val_loss, min:' + str(np.round(vloss_list[best_idx], 2)))
# Both curves are raw loss values (not scaled by 100), so the axis carries no "%" unit.
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['loss','val_loss','best'], loc='best')
plt.show()

In [None]:
# Plot training vs. validation accuracy (in %) and mark the best validation epoch.
acc_list = [100 * value for value in hist.history['accuracy']]
vacc_list = [100 * value for value in hist.history['val_accuracy']]
epochs = range(1, len(acc_list) + 1)

# Position and value of the highest validation accuracy.
best_idx = np.argmax(np.array(vacc_list))
best_vacc = vacc_list[best_idx]

plt.plot(epochs, acc_list)
plt.plot(epochs, vacc_list)
plt.plot(best_idx + 1, best_vacc, 'r*')  # +1: epochs are numbered from 1
plt.title('Diabetes: FCN model - val_accuracy, max:' + str(np.round(best_vacc, 2)))
plt.ylabel('val-Accuracy (%)')
plt.xlabel('Epoch')
plt.legend(['accuracy','val_accuracy','best'], loc='best')
plt.show()

In [None]:
# model performance
model.evaluate(X_test, y_test)

## Prediction of the train data

In [None]:
y_train[:5],y_train.shape

In [None]:
# Predicted probabilities for the first 5 training samples, together with the class
# labels obtained by thresholding at 0.5 (0 if output < 0.5, 1 if output >= 0.5).
# Only the displayed rows are predicted, and only once. `>= 0.5` is used instead of
# int(np.round(prob)): np.round(0.5) yields 0, and int() on a 1-element array is
# deprecated in NumPy >= 1.25.
train_probs = model.predict(X_train[:5])
train_probs, (train_probs >= 0.5).astype(int).ravel().tolist()

## Prediction of the test data

In [None]:
# Prediction of the test data: print the true labels of the first 10 test samples, then
# show their predicted probabilities and the class labels thresholded at 0.5.
# Only the displayed rows are predicted, and only once. `>= 0.5` replaces
# int(np.round(prob)), which maps exactly 0.5 to 0 and relies on the array-to-scalar
# conversion deprecated in NumPy >= 1.25.
print(y_test[:10])
test_probs = model.predict(X_test[:10])
test_probs, (test_probs >= 0.5).astype(int).ravel().tolist()

In [None]:
# y_test[:10]==[int(np.round(prob)) for prob in model.predict(X_test)[0:10]]

In [None]:
# Make class predictions for the whole test set: 1 if the sigmoid output is >= 0.5, else 0.
# A vectorized threshold replaces int(np.round(prob)) per row, which maps exactly 0.5 to 0
# and relies on the array-to-scalar conversion deprecated in NumPy >= 1.25.
# The result is kept as a plain list of ints, as before.
predictions = (model.predict(X_test) >= 0.5).astype(int).ravel().tolist()
predictions[:10]

In [None]:
# Print features, predicted class and expected class for the first 10 test samples.
for features, predicted, expected in zip(X_test[:10], predictions[:10], y_test[:10]):
   print('%s => %d (expected %d)' % (features, predicted, expected))

### Confusion matrix

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [None]:
cm = confusion_matrix(y_test, predictions)

In [None]:
class_names = ['noDM','DM']

In [None]:
# Draw the confusion matrix as an annotated heatmap (x axis: predicted, y axis: actual).
plt.figure(figsize=(5, 5))
heatmap_options = dict(annot=True, fmt='d', cbar=False, cmap=plt.cm.coolwarm,
                       xticklabels=class_names, yticklabels=class_names)
sns.heatmap(cm, **heatmap_options)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

## [DIY-1] Save the best model
- hdf5 file

In [None]:
# Fix the NumPy and TensorFlow seeds before building the network.
np.random.seed(1234)
tf.random.set_seed(1234)

# Same architecture as the simple FCN: 8 inputs -> 12 (relu) -> 8 (relu) -> 1 (sigmoid).
model2 = Sequential()
model2.add(Dense(12, input_dim=8, activation='relu'))  # 1st hidden layer
model2.add(Dense(8, activation='relu'))                # 2nd hidden layer
model2.add(Dense(1, activation='sigmoid'))             # output layer

# Binary classification: cross-entropy loss, Adam optimizer, accuracy as the metric.
model2.compile(optimizer='adam',
               loss='binary_crossentropy',
               metrics=['accuracy'])


### callbacks

In [None]:
# Checkpoint callback: monitors val_accuracy and, with save_best_only=True, writes a file
# whose name embeds the epoch number and the val_accuracy of that epoch.
# NOTE(review): newer Keras releases appear to require a `.keras` or `.h5` suffix for
# checkpoint paths - confirm that `.hdf5` is accepted by the installed version.
cp_callback = callbacks.ModelCheckpoint(filepath="./diabetes_DL_best_weights.{epoch:03d}-{val_accuracy:.3f}.hdf5", 
                              monitor='val_accuracy', verbose=0, save_best_only=True)
# Early-stopping callback: monitors val_accuracy (mode='max') with a patience of 50 epochs.
es_callback = callbacks.EarlyStopping(monitor='val_accuracy', 
                            mode='max', verbose=1, patience=50)    # patience=10, 20, 50

In [None]:
# Fit model2 for up to 500 epochs (batch size 10) with the checkpoint and early-stopping
# callbacks attached; the per-epoch history is kept in `hist`.
# NOTE(review): the test split is used as validation data, so the checkpoint chosen on
# val_accuracy is selected on the same data it is later evaluated on.
hist = model2.fit(X_train, y_train, epochs=500, batch_size=10, verbose=1, 
                  callbacks=[cp_callback, es_callback], 
                  validation_data=(X_test, y_test))


In [None]:
# Plot training vs. validation accuracy (in %) and mark the best validation epoch.
acc_list = [100 * value for value in hist.history['accuracy']]
vacc_list = [100 * value for value in hist.history['val_accuracy']]
epochs = range(1, len(acc_list) + 1)

# Position and value of the highest validation accuracy.
best_idx = np.argmax(np.array(vacc_list))
best_vacc = vacc_list[best_idx]

plt.plot(epochs, acc_list)
plt.plot(epochs, vacc_list)
plt.plot(best_idx + 1, best_vacc, 'r*')  # +1: epochs are numbered from 1
plt.title('Diabetes: FCN model - val_accuracy, max:' + str(np.round(best_vacc, 2)))
plt.ylabel('val-Accuracy (%)')
plt.xlabel('Epoch')
plt.legend(['accuracy','val_accuracy','best'], loc='best')
plt.show()

### best model
> diabetes_DL_best_weights.088-0.786.hdf5

In [None]:
# model2 performance
model2.evaluate(X_test, y_test)

In [None]:
import glob
import os

from keras.models import load_model

# Load the best checkpoint written while training model2 and evaluate it on the test split.
# The checkpoint file name embeds the epoch and val_accuracy, which differ from run to run,
# so a hard-coded name such as 'diabetes_DL_best_weights.088-0.786.hdf5' fails on a re-run.
# The checkpoints are saved with save_best_only=True, so the most recently written match
# is taken as the best model of the latest run.
checkpoint_files = glob.glob('diabetes_DL_best_weights.*.hdf5')
if not checkpoint_files:
    raise FileNotFoundError(
        "No 'diabetes_DL_best_weights.*.hdf5' checkpoint found - run the training cell first.")
best_checkpoint = max(checkpoint_files, key=os.path.getmtime)
print('Loading best checkpoint:', best_checkpoint)

model2_best = load_model(best_checkpoint)
model2_best.evaluate(X_test, y_test)

### Confusion matrix of the best model

In [None]:
# Class predictions of the best model2 checkpoint on the test set, and their confusion matrix.
# (confusion_matrix is imported in the earlier "Confusion matrix" section.)
# A vectorized `>= 0.5` threshold replaces int(np.round(prob)) per row, which maps exactly
# 0.5 to 0 and relies on the array-to-scalar conversion deprecated in NumPy >= 1.25.
predictions = (model2_best.predict(X_test) >= 0.5).astype(int).ravel().tolist()

cm = confusion_matrix(y_test, predictions)

In [None]:
class_names = ['noDM','DM']

In [None]:
# Draw the confusion matrix as an annotated heatmap (x axis: predicted, y axis: actual).
plt.figure(figsize=(5, 5))
heatmap_options = dict(annot=True, fmt='d', cbar=False, cmap=plt.cm.coolwarm,
                       xticklabels=class_names, yticklabels=class_names)
sns.heatmap(cm, **heatmap_options)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

---

## [DIY-2] Save the best model after preprocessing data
- StandardScaler

In [None]:
# StandardScaler: fit the scaler on the training split only, then apply the same
# transform to both the training and the test split.
# NOTE(review): X_train / X_test are overwritten in place, so every cell executed after
# this one sees the scaled features, and re-running earlier sections without re-creating
# the split would silently use scaled data - consider separate names for the scaled arrays.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test  = scaler.transform(X_test)

In [None]:
# Fix the NumPy and TensorFlow seeds before building the network.
np.random.seed(1234)
tf.random.set_seed(1234)

# Same architecture as before, to be trained on the standardized features:
# 8 inputs -> 12 (relu) -> 8 (relu) -> 1 (sigmoid).
model2s = Sequential()
model2s.add(Dense(12, input_dim=8, activation='relu'))  # 1st hidden layer
model2s.add(Dense(8, activation='relu'))                # 2nd hidden layer
model2s.add(Dense(1, activation='sigmoid'))             # output layer

# Binary classification: cross-entropy loss, Adam optimizer, accuracy as the metric.
model2s.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])


### callbacks

In [None]:
# Checkpoint callback for the standardized-data run: monitors val_accuracy and, with
# save_best_only=True, writes a file whose name embeds the epoch number and val_accuracy.
# NOTE(review): newer Keras releases appear to require a `.keras` or `.h5` suffix for
# checkpoint paths - confirm that `.hdf5` is accepted by the installed version.
cp_callback = callbacks.ModelCheckpoint(filepath="./diabetes_DL_SS_best_weights.{epoch:03d}-{val_accuracy:.3f}.hdf5", 
                              monitor='val_accuracy', verbose=0, save_best_only=True)
# Early-stopping callback: monitors val_accuracy (mode='max') with a patience of 50 epochs.
es_callback = callbacks.EarlyStopping(monitor='val_accuracy', 
                            mode='max', verbose=1, patience=50)    # patience=10, 20, 50

In [None]:
# Fit model2s for up to 500 epochs (batch size 10) with the checkpoint and early-stopping
# callbacks attached; the per-epoch history is kept in `hist`.
# NOTE(review): the test split is used as validation data, so the checkpoint chosen on
# val_accuracy is selected on the same data it is later evaluated on.
hist = model2s.fit(X_train, y_train, epochs=500, batch_size=10, verbose=1, 
                  callbacks=[cp_callback, es_callback], 
                  validation_data=(X_test, y_test))


In [None]:
# Plot training vs. validation accuracy (in %) and mark the best validation epoch.
acc_list = [100 * value for value in hist.history['accuracy']]
vacc_list = [100 * value for value in hist.history['val_accuracy']]
epochs = range(1, len(acc_list) + 1)

# Position and value of the highest validation accuracy.
best_idx = np.argmax(np.array(vacc_list))
best_vacc = vacc_list[best_idx]

plt.plot(epochs, acc_list)
plt.plot(epochs, vacc_list)
plt.plot(best_idx + 1, best_vacc, 'r*')  # +1: epochs are numbered from 1
plt.title('Diabetes: FCN model - val_accuracy, max:' + str(np.round(best_vacc, 2)))
plt.ylabel('val-Accuracy (%)')
plt.xlabel('Epoch')
plt.legend(['accuracy','val_accuracy','best'], loc='best')
plt.show()

### best model
> diabetes_DL_SS_best_weights.004-0.766.hdf5

In [None]:
# model2 performance
model2s.evaluate(X_test, y_test)

In [None]:
import glob
import os

from keras.models import load_model

# Load the best checkpoint written while training model2s and evaluate it on the test split.
# The checkpoint file name embeds the epoch and val_accuracy, which differ from run to run,
# so a hard-coded name such as 'diabetes_DL_SS_best_weights.004-0.766.hdf5' fails on a re-run.
# The checkpoints are saved with save_best_only=True, so the most recently written match
# is taken as the best model of the latest run.
checkpoint_files = glob.glob('diabetes_DL_SS_best_weights.*.hdf5')
if not checkpoint_files:
    raise FileNotFoundError(
        "No 'diabetes_DL_SS_best_weights.*.hdf5' checkpoint found - run the training cell first.")
best_checkpoint = max(checkpoint_files, key=os.path.getmtime)
print('Loading best checkpoint:', best_checkpoint)

model2s_best = load_model(best_checkpoint)
model2s_best.evaluate(X_test, y_test)

### Confusion matrix of the best model

In [None]:
# Class predictions of the best model2s checkpoint on the test set, and their confusion matrix.
# (confusion_matrix is imported in the earlier "Confusion matrix" section.)
# A vectorized `>= 0.5` threshold replaces int(np.round(prob)) per row, which maps exactly
# 0.5 to 0 and relies on the array-to-scalar conversion deprecated in NumPy >= 1.25.
predictions = (model2s_best.predict(X_test) >= 0.5).astype(int).ravel().tolist()

cm = confusion_matrix(y_test, predictions)

In [None]:
class_names = ['noDM','DM']

In [None]:
# Draw the confusion matrix as an annotated heatmap (x axis: predicted, y axis: actual).
plt.figure(figsize=(5, 5))
heatmap_options = dict(annot=True, fmt='d', cbar=False, cmap=plt.cm.coolwarm,
                       xticklabels=class_names, yticklabels=class_names)
sns.heatmap(cm, **heatmap_options)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

***

## [DIY-2] 데이터 전처리 후 딥러닝 모델 만들어서 평가
- 결과가 좋아지는지, 아니면 나빠지는지를 확인하시오.

## Check data
- null
- NaN

In [None]:
# Check NaN
# data.isna().sum()
df.isnull().sum()

### 과연 diabetes 데이터는 문제가 없는가?

- 각 항목에서 0이 허용되지 않는 경우에 0이 있다면 => 문제 있는 데이터!
- Pregnancies, Outcome은 0이 가능한 값이므로 제외하고 0이 있는 항목(column) 조사

In [None]:
df.head(10)

In [None]:
# Pregnancies and Outcome can legitimately be 0, so exclude them by name and list the
# remaining columns that contain at least one zero. Excluding by name does not depend on
# those two being the first and last zero-containing columns, as a [1:-1] slice would.
columns_with_zero = df.columns[(df == 0).any()].drop(['Pregnancies', 'Outcome'], errors='ignore')
columns_with_zero

In [None]:
# Clean the data : zero2median()
# 1. Find the features that contain zeros, excluding those where 0 is a valid value
#    (Pregnancies, Outcome).
# 2. Replace zero with NaN
# 3. Replace NaN with the median of the corresponding feature
def zero2median(df, exclude=('Pregnancies', 'Outcome')):
    """Return a cleaned copy of ``df`` with implausible zeros replaced by the column median.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; the cleaning is applied to a copy.
    exclude : collection of str, optional
        Column names in which 0 is a valid value and must be left untouched.
        Names that are not present in ``df`` are ignored.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which, for every non-excluded column containing at least one
        zero, the zeros are replaced by the median of that column's non-zero values.
        A column made up entirely of zeros ends up as NaN because it has no median.
    """
    cleaned = df.copy()
    # Select columns by name rather than by position ([1:-1]), so the result does not
    # depend on where the excluded columns sit in the frame.
    has_zero = cleaned.columns[(cleaned == 0).any()]
    columns_with_zero = [col for col in has_zero if col not in exclude]
    for feature in columns_with_zero:
        # The median is taken after the zeros are masked, so they do not bias it.
        masked = cleaned[feature].replace(0, np.nan)
        # Assign the result back explicitly: chained `df[col].fillna(..., inplace=True)`
        # does not update the frame under pandas Copy-on-Write.
        cleaned[feature] = masked.fillna(masked.median())

    return cleaned

# Make clean dataframe, df2 from df
df2 = zero2median(df)

In [None]:
df2.head(10)

## [DIY] 전처리된 데이터프레임 df2를 이용해서 FCN으로 best model을 찾으시오.
- best model: hdf5 file
- Accuracy graph
- Confusion matrix graph

In [None]:
# Your code here


