In [1]:
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import learning_curve
from sklearn.model_selection import train_test_split

from numpy import loadtxt
from keras.layers import Input, Dense, Dropout, Flatten, Embedding
from keras.layers import Conv1D, Conv2D, MaxPooling1D, MaxPooling2D
from keras.models import Sequential
from keras.metrics import *

Using TensorFlow backend.


## Load Data

In [2]:
dataset = loadtxt('pima-indians-diabetes.data.csv', delimiter=',')

In [3]:
x = dataset[:,0:8]
y = dataset[:,8]

## Data Pre-processing

In [4]:
def process_data(x, y, is_scale=False):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
    
    if is_scale:
        scaler = preprocessing.MinMaxScaler().fit(x_train)
        x_train_scaled = scaler.transform(x_train)
        x_test_scaled  = scaler.transform(x_test)
        x_train = x_train_scaled
        x_test = x_test_scaled
        #x_scaled = preprocessing.scale(x)
        #x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size=0.33, random_state=1)
        
    return (x_train, x_test, y_train, y_test)

In [5]:
x_train, x_test, y_train, y_test = process_data(x, y, is_scale=True)
print(x_train.mean(axis=0))
print(x_train.std(axis=0))

[0.22247654 0.60952623 0.57364292 0.32579828 0.09366118 0.47645074
 0.16869748 0.20317769]
[0.19258807 0.1624474  0.14746915 0.24581615 0.13656028 0.11388596
 0.14588237 0.19532655]


## Build Traditional ML Model

### Logistic Regression

In [None]:
lm = LogisticRegression()
lm.fit(x_train, y_train)
y_pred = lm.predict(x_test)

In [None]:
def print_classify_result(y_test, y_pred):
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

def print_learning_curve(model, x, y):
    train_size, train_scores, test_scores = learning_curve(model, x, y, cv=10, scoring='accuracy', train_sizes=np.linspace(0.1, 1.0, 10))

    train_mean = np.mean(train_scores, axis=1)
    train_std  = np.std(train_scores, axis=1)
    test_mean  = np.mean(test_scores, axis=1)
    test_std   = np.std(test_scores, axis=1)

    plt.plot(train_size, train_mean, '--', color="#111111", label='train score')
    plt.ylim([0.5, 0.95])
    plt.fill_between(train_size, train_mean - train_std, train_mean + train_std, color="#11DDDD")

    plt.plot(train_size, test_mean, label='test score')
    plt.fill_between(train_size, test_mean - test_std, test_mean + test_std, color="#DDDDDD")
    plt.legend(loc="lower right")

In [None]:
print_classify_result(y_test, y_pred)

In [None]:
print_learning_curve(LogisticRegression(), x, y)

### SVM model

In [None]:
model = LinearSVC(C=1.0)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [None]:
print_classify_result(y_test, y_pred)

In [None]:
print_learning_curve(SVC(gamma='scale'), x, y)

In [None]:
model = SVC(C=1.0, kernel='linear')

model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [None]:
print_classify_result(y_test, y_pred)

### Random Forest Model

In [None]:
model = RandomForestClassifier(n_estimators=200, max_depth=5, random_state=0)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [None]:
print_classify_result(y_test, y_pred)

### Xgboost Model Model

In [None]:
model = xgb.XGBClassifier(max_depth=3, n_estimators=100)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

print_classify_result(y_test, y_pred)

## Build Deep Learning ML Model

### Model: Multiple Layer Perceptron

In [None]:
input_layer  = Input(shape=(8,))
dense_layer1 = Dense(12, input_dim=8, activation='relu')
dense_layer2 = Dense(8, activation='relu')
output_layer = Dense(1, activation='sigmoid')

In [None]:
model = Sequential()
#model.add(input_layer)
model.add(dense_layer1)
model.add(Dropout(0.5))
model.add(dense_layer2)
model.add(Dropout(0.5))
model.add(output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
history = model.fit(x_train, y_train, validation_split=0.3, epochs=300, batch_size=10, verbose=1)

In [None]:
#print(history.history.keys())
#print(history.history['loss'])
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.ylim([0.5, 0.9])
plt.show()

In [None]:
y_pred_prob = model.predict(x_test)
y_pred = np.array([1.0 if prob[0]>0.5 else 0.0 for prob in y_pred_prob])

print_classify_result(y_test, y_pred)

In [None]:
#print(accuracy(y_test, y_pred))
#print(binary_accuracy(y_test, y_pred))
#print(categorical_accuracy(y_test, y_pred))

### CNN Model

In [6]:
print(x_train.shape)

(514, 8)


In [32]:
cnn_layer1    = Conv1D(filters=10, kernel_size=1, activation='relu', input_shape=(1, 10))
pool_layer1   = MaxPooling1D(pool_size=1)
flatten_layer = Flatten()
dense_layer   = Dense(8, activation='relu')
output_layer  = Dense(1, activation='sigmoid')

In [36]:
model = Sequential()
model.add(cnn_layer1)
model.add(pool_layer1)
model.add(flatten_layer)
model.add(dense_layer)
model.add(output_layer)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_12 (Conv1D)           (None, 1, 10)             110       
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 1, 10)             0         
_________________________________________________________________
flatten_12 (Flatten)         (None, 10)                0         
_________________________________________________________________
dense_23 (Dense)             multiple                  88        
_________________________________________________________________
dense_24 (Dense)             multiple                  9         
Total params: 207
Trainable params: 207
Non-trainable params: 0
_________________________________________________________________


In [34]:
history = model.fit(x_train, y_train, validation_split=0.3, epochs=300, batch_size=10, verbose=1)

ValueError: Error when checking input: expected conv1d_12_input to have 3 dimensions, but got array with shape (514, 8)