# Load the data

## Import Libraries

In [1]:
import time

import pandas
import numpy
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix
# `metrics` is referenced as metrics.accuracy_score(...) etc. in every classifier cell.
from sklearn import metrics
from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
# Default matplotlib figure size used by the plots in this notebook.
plt.rcParams['figure.figsize'] = (15, 6)

## Load Dataset

In [None]:
# Seed NumPy's global random number generator so repeated runs behave the same.
seed = 7
numpy.random.seed(seed)

# Read the Iris CSV from the UCI repository; column names are supplied
# explicitly through `names`.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = [
    'sepal-length',
    'sepal-width',
    'petal-length',
    'petal-width',
    'class',
]
data = pandas.read_csv(url, names=names)

# Quick look at the first ten rows.
peek = data.head(n=10)
print(peek)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
shape = data.shape
print(shape)

In [None]:
# Data type pandas assigned to each column.
types = data.dtypes
print(types)

In [None]:
# Console display settings: line width of 100 characters, three decimal places.
# The fully-qualified key 'display.precision' is used on purpose: the bare
# 'precision' pattern matches more than one option on newer pandas releases
# (which also define 'styler.format.precision') and then raises OptionError.
pandas.set_option('display.width', 100)
pandas.set_option('display.precision', 3)

# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
description = data.describe()
print(description)

In [None]:
# Number of rows per value of the 'class' column (a check on class balance).
class_counts = data.groupby('class').size()
print(class_counts)

In [None]:
# Pairwise Pearson correlation between the numeric feature columns.
# The string-valued 'class' column is excluded explicitly: older pandas dropped
# non-numeric columns silently, while newer releases raise an error instead.
correlations = data.select_dtypes(include=[numpy.number]).corr(method='pearson')
print(correlations)

In [None]:
# Skewness of each numeric feature column.
# Restricting to numeric columns keeps the string-valued 'class' column out of
# the computation on pandas versions that no longer drop it automatically.
skew = data.select_dtypes(include=[numpy.number]).skew()
print(skew)

# Visualize the data

In [None]:
# Histogram of every numeric column, one panel per column.
data.hist()
plt.show()

In [None]:
# Kernel density estimate of each numeric feature, one panel per feature.
# The frame has four numeric columns, so a 2x2 grid holds them exactly; the
# previous 3x3 grid left five empty panels.
data.plot(kind='density', subplots=True, layout=(2,2), sharex=False)
plt.show()

In [None]:
# Box-and-whisker plot of each numeric feature, each panel with its own axes.
# The frame has four numeric columns, so a 2x2 grid holds them exactly; the
# previous 3x3 grid left five empty panels.
data.plot(kind='box', subplots=True, layout=(2,2), sharex=False, sharey=False)
plt.show()

In [None]:
# Correlation matrix of the numeric features only (the string-valued 'class'
# column cannot be correlated and is excluded explicitly).
correlations = data.select_dtypes(include=[numpy.number]).corr()

# plot correlation matrix as a heat-map on a fixed [-1, 1] colour scale
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)

# Take the tick labels from the matrix itself so the number of labels always
# equals the number of ticks. Passing the full `names` list supplied five
# labels (including 'class') for four tick positions.
feature_names = list(correlations.columns)
ticks = numpy.arange(0, len(feature_names), 1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(feature_names)
ax.set_yticklabels(feature_names)
plt.show()

In [None]:
# Pairwise scatter plots of the columns of `data`.
scatter_matrix(data)
plt.show()

# Prepare the data for classification

In [None]:
# Load the copy of Iris bundled with scikit-learn. This is a separate object
# from the CSV-backed `data` frame explored above; the classifiers below are
# trained on `features` / `target`.
from sklearn import datasets
iris = datasets.load_iris()
features, target = iris.data, iris.target

In [None]:
from sklearn.cross_validation import train_test_split

# Hold out 20% of all samples as the final test set (X_deploy / y_deploy).
X_train_feature, X_deploy, y_train_feature, y_deploy = train_test_split(
    features, target, test_size=0.20, random_state=17)

# Split the remaining samples again into a training set and a validation set
# used for model building and hyperparameter selection.
# Naming note: X_test / y_test hold the *validation* split in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_feature, y_train_feature, test_size=0.20, random_state=17)

# Single-argument print(...) calls run under both Python 2 and Python 3 and
# match the print(...) style already used elsewhere in the notebook.
print('Number of training examples %s' % len(X_train))
print('Number of validation examples %s' % len(X_test))
print('Number of testing examples %s' % len(X_deploy))


# Building classifiers

## KNN classifier
It turns out that k=5 is the best choice of k on the validation set.

In [None]:
from sklearn.neighbors import KNeighborsClassifier
clf_neigh = KNeighborsClassifier(n_neighbors=5)
# Training, timed with wall-clock time
start_time = time.time()
clf_neigh.fit(X_train, y_train)
print("The time for training KNN is  %s seconds " % (time.time() - start_time))

# Make a prediction on the validation split (named X_test / y_test here)
y_pred = clf_neigh.predict(X_test)

# The target is multiclass (three Iris species), so precision / recall / F1
# need an explicit averaging mode; the default average='binary' is rejected
# for multiclass targets. 'weighted' averages the per-class scores weighted by
# class support.
print("===================================================================")
print("The accuracy on validation dataset of KNN: \t%s" % metrics.accuracy_score(y_test, y_pred))
print("Precision on validation dataset of KNN:    \t%s" % metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall on validation dataset of KNN :      \t%s" % metrics.recall_score(y_test, y_pred, average='weighted'))
print("F1 score on validation dataset of KNN:     \t%s" % metrics.f1_score(y_test, y_pred, average='weighted'))

## SVM classifier
### Linear SVM "LinearSVC"
It turns out that C=20 is the best on the validation set

In [None]:
from sklearn.svm import LinearSVC
clf_svm_linear = LinearSVC(C=20.0)
# Training, timed with wall-clock time
start_time = time.time()
clf_svm_linear.fit(X_train, y_train)
print("The time for training SVM is  %s seconds " % (time.time() - start_time))

# Make a prediction on the validation split (named X_test / y_test here)
y_pred = clf_svm_linear.predict(X_test)

# The target is multiclass (three Iris species), so precision / recall / F1
# need an explicit averaging mode; the default average='binary' is rejected
# for multiclass targets. 'weighted' averages the per-class scores weighted by
# class support.
print("===================================================================")
print("The accuracy on validation dataset of Linear SVM: \t%s" % metrics.accuracy_score(y_test, y_pred))
print("Precision on validation dataset of Linear SVM:    \t%s" % metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall on validation dataset of Linear SVM :      \t%s" % metrics.recall_score(y_test, y_pred, average='weighted'))
print("F1 score on validation dataset of Linear SVM:     \t%s" % metrics.f1_score(y_test, y_pred, average='weighted'))

###  SVM "SVC" with kernel='rbf' 
It turns out that C=5 is the best on the validation set


In [None]:
from sklearn.svm import SVC
clf_svm = SVC(C=5.0, kernel='rbf')
# Training, timed with wall-clock time
start_time = time.time()
clf_svm.fit(X_train, y_train)
print("The time for training SVM is  %s seconds " % (time.time() - start_time))

# Make a prediction on the validation split (named X_test / y_test here)
y_pred = clf_svm.predict(X_test)

# The target is multiclass (three Iris species), so precision / recall / F1
# need an explicit averaging mode; the default average='binary' is rejected
# for multiclass targets. 'weighted' averages the per-class scores weighted by
# class support.
print("===================================================================")
print("The accuracy on validation dataset of SVM: \t%s" % metrics.accuracy_score(y_test, y_pred))
print("Precision on validation dataset of SVM:    \t%s" % metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall on validation dataset of SVM :      \t%s" % metrics.recall_score(y_test, y_pred, average='weighted'))
print("F1 score on validation dataset of SVM:     \t%s" % metrics.f1_score(y_test, y_pred, average='weighted'))

## Decision Tree classifier
### DT with Gini impurity "gini", CART (Classification and Regression Trees)
It turns out that min_samples_split=2 is the best on the validation set

In [None]:
# CART tree
from sklearn.tree import DecisionTreeClassifier
# Default split criterion (Gini impurity)
clf_dt = DecisionTreeClassifier(min_samples_split=2)
# Training, timed with wall-clock time
start_time = time.time()
clf_dt.fit(X_train, y_train)
print("The time for training Decision Tree is  %s seconds " % (time.time() - start_time))

# Make a prediction on the validation split (named X_test / y_test here)
y_pred = clf_dt.predict(X_test)

# The target is multiclass (three Iris species), so precision / recall / F1
# need an explicit averaging mode; the default average='binary' is rejected
# for multiclass targets. 'weighted' averages the per-class scores weighted by
# class support.
print("===================================================================")
print("The accuracy on validation dataset of Decision Tree: \t%s" % metrics.accuracy_score(y_test, y_pred))
print("Precision on validation dataset of Decision Tree:    \t%s" % metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall on validation dataset of Decision Tree :      \t%s" % metrics.recall_score(y_test, y_pred, average='weighted'))
print("F1 score on validation dataset of Decision Tree:     \t%s" % metrics.f1_score(y_test, y_pred, average='weighted'))

### DT with the "entropy" criterion (information gain, the split measure used by C4.5 — J48 in the open-source Java tool Weka — and C5.0)
It turns out that min_samples_split=2 is the best on the validation set

In [None]:
# Here I use C5.0 tree as its more accurate that C4.5 and more recent.
# Same DecisionTreeClassifier as in the Gini cell, but splitting on entropy
# (information gain), the measure used by C4.5 / C5.0. Only the split
# criterion changes; this is still scikit-learn's own tree implementation,
# not a C4.5 or C5.0 implementation.
from sklearn.tree import DecisionTreeClassifier
clf_dt_IG = DecisionTreeClassifier(criterion='entropy', min_samples_split=2)
# Training, timed with wall-clock time
start_time = time.time()
clf_dt_IG.fit(X_train, y_train)
print("The time for training Decision Tree is  %s seconds " % (time.time() - start_time))

# Make a prediction on the validation split (named X_test / y_test here)
y_pred = clf_dt_IG.predict(X_test)

# The target is multiclass (three Iris species), so precision / recall / F1
# need an explicit averaging mode; the default average='binary' is rejected
# for multiclass targets. 'weighted' averages the per-class scores weighted by
# class support.
print("===================================================================")
print("The accuracy on validation dataset of Decision Tree: \t%s" % metrics.accuracy_score(y_test, y_pred))
print("Precision on validation dataset of Decision Tree:    \t%s" % metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall on validation dataset of Decision Tree :      \t%s" % metrics.recall_score(y_test, y_pred, average='weighted'))
print("F1 score on validation dataset of Decision Tree:     \t%s" % metrics.f1_score(y_test, y_pred, average='weighted'))

# Ensemble methods

## Bagging Method


### Bagging Method with Knn


Uses the best KNN classifier (k=5) as the base estimator; each ensemble member is built on a random subset of 50% of the samples and 50% of the features.

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
# Bagging ensemble of k=5 KNN classifiers; each member is fitted on a random
# 50% of the samples and 50% of the features.
clf_dt_BGKN = BaggingClassifier(KNeighborsClassifier(n_neighbors=5),
                             max_samples=0.5, max_features=0.5)
# Training, timed with wall-clock time
start_time = time.time()
clf_dt_BGKN.fit(X_train, y_train)
print("The time for training Bagging Knn is  %s seconds " % (time.time() - start_time))

# Make a prediction on the validation split (named X_test / y_test here)
y_pred = clf_dt_BGKN.predict(X_test)

# The target is multiclass (three Iris species), so precision / recall / F1
# need an explicit averaging mode; the default average='binary' is rejected
# for multiclass targets. 'weighted' averages the per-class scores weighted by
# class support.
print("===================================================================")
print("The accuracy on validation dataset of Bagging Knn: \t%s" % metrics.accuracy_score(y_test, y_pred))
print("Precision on validation dataset of Bagging Knn:    \t%s" % metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall on validation dataset of Bagging Knn :      \t%s" % metrics.recall_score(y_test, y_pred, average='weighted'))
print("F1 score on validation dataset of Bagging Knn:     \t%s" % metrics.f1_score(y_test, y_pred, average='weighted'))

### Bagging Method with DT


Uses the best decision tree ('entropy' criterion) as the base estimator; each ensemble member is built on a random subset of 50% of the samples and 50% of the features.

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Bagging ensemble of entropy-criterion decision trees; each member is fitted
# on a random 50% of the samples and 50% of the features.
clf_dt_BGDT = BaggingClassifier(DecisionTreeClassifier(criterion='entropy', min_samples_split=2),
                             max_samples=0.5, max_features=0.5)
# Training, timed with wall-clock time
start_time = time.time()
clf_dt_BGDT.fit(X_train, y_train)
print("The time for training Bagging DT is  %s seconds " % (time.time() - start_time))

# Make a prediction on the validation split (named X_test / y_test here)
y_pred = clf_dt_BGDT.predict(X_test)

# The target is multiclass (three Iris species), so precision / recall / F1
# need an explicit averaging mode; the default average='binary' is rejected
# for multiclass targets. 'weighted' averages the per-class scores weighted by
# class support.
print("===================================================================")
print("The accuracy on validation dataset of Bagging DT: \t%s" % metrics.accuracy_score(y_test, y_pred))
print("Precision on validation dataset of Bagging DT:    \t%s" % metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall on validation dataset of Bagging DT :      \t%s" % metrics.recall_score(y_test, y_pred, average='weighted'))
print("F1 score on validation dataset of Bagging DT:     \t%s" % metrics.f1_score(y_test, y_pred, average='weighted'))

## Random Forest
DT with 'gini'

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with the library's default hyperparameters
clf_dt_RF = RandomForestClassifier()
# Training, timed with wall-clock time
start_time = time.time()
clf_dt_RF.fit(X_train, y_train)
# The message now names the model actually being timed (it previously said
# "Decision Tree", copied from an earlier cell).
print("The time for training Random Forest is  %s seconds " % (time.time() - start_time))

# Make a prediction on the validation split (named X_test / y_test here)
y_pred = clf_dt_RF.predict(X_test)

# The target is multiclass (three Iris species), so precision / recall / F1
# need an explicit averaging mode; the default average='binary' is rejected
# for multiclass targets. 'weighted' averages the per-class scores weighted by
# class support.
print("===================================================================")
print("The accuracy on validation dataset of Random Forest: \t%s" % metrics.accuracy_score(y_test, y_pred))
print("Precision on validation dataset of Random Forest:    \t%s" % metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall on validation dataset of Random Forest :      \t%s" % metrics.recall_score(y_test, y_pred, average='weighted'))
print("F1 score on validation dataset of Random Forest:     \t%s" % metrics.f1_score(y_test, y_pred, average='weighted'))

## Ada Boost
DT with 'gini'

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with the library's default base estimator and hyperparameters
clf_dt_AD = AdaBoostClassifier()
# Training, timed with wall-clock time
start_time = time.time()
clf_dt_AD.fit(X_train, y_train)
# The message now names the model actually being timed (it previously said
# "Decision Tree", copied from an earlier cell).
print("The time for training Ada Boost is  %s seconds " % (time.time() - start_time))

# Make a prediction on the validation split (named X_test / y_test here)
y_pred = clf_dt_AD.predict(X_test)

# The target is multiclass (three Iris species), so precision / recall / F1
# need an explicit averaging mode; the default average='binary' is rejected
# for multiclass targets. 'weighted' averages the per-class scores weighted by
# class support.
print("===================================================================")
print("The accuracy on validation dataset of Ada Boost: \t%s" % metrics.accuracy_score(y_test, y_pred))
print("Precision on validation dataset of Ada Boost:    \t%s" % metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall on validation dataset of Ada Boost :      \t%s" % metrics.recall_score(y_test, y_pred, average='weighted'))
print("F1 score on validation dataset of Ada Boost:     \t%s" % metrics.f1_score(y_test, y_pred, average='weighted'))

## Naive Bayes methods

In [None]:
from sklearn.naive_bayes import GaussianNB
clf_NB = GaussianNB()
# Training, timed with wall-clock time
start_time = time.time()
clf_NB.fit(X_train, y_train)
print("The time for training Naive Bayes is  %s seconds " % (time.time() - start_time))

# Make a prediction on the validation split (named X_test / y_test here)
y_pred = clf_NB.predict(X_test)

# The target is multiclass (three Iris species), so precision / recall / F1
# need an explicit averaging mode; the default average='binary' is rejected
# for multiclass targets. 'weighted' averages the per-class scores weighted by
# class support.
print("===================================================================")
print("The accuracy on validation dataset of Naive Bayes: \t%s" % metrics.accuracy_score(y_test, y_pred))
print("Precision on validation dataset of Naive Bayes:    \t%s" % metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall on validation dataset of Naive Bayes :      \t%s" % metrics.recall_score(y_test, y_pred, average='weighted'))
print("F1 score on validation dataset of Naive Bayes:     \t%s" % metrics.f1_score(y_test, y_pred, average='weighted'))

## Linear Discriminant Analysis classifier

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
clf_dt_LDA = LinearDiscriminantAnalysis()
# Training, timed with wall-clock time
start_time = time.time()
clf_dt_LDA.fit(X_train, y_train)
# The message now names the model actually being timed (it previously said
# "Decision Tree", copied from an earlier cell).
print("The time for training Linear Discriminant Analysis is  %s seconds " % (time.time() - start_time))

# Make a prediction on the validation split (named X_test / y_test here)
y_pred = clf_dt_LDA.predict(X_test)

# The target is multiclass (three Iris species), so precision / recall / F1
# need an explicit averaging mode; the default average='binary' is rejected
# for multiclass targets. 'weighted' averages the per-class scores weighted by
# class support.
print("===================================================================")
print("The accuracy on validation dataset of Linear Discriminant Analysis: \t%s" % metrics.accuracy_score(y_test, y_pred))
print("Precision on validation dataset of Linear Discriminant Analysis:    \t%s" % metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall on validation dataset of Linear Discriminant Analysis :      \t%s" % metrics.recall_score(y_test, y_pred, average='weighted'))
print("F1 score on validation dataset of Linear Discriminant Analysis:     \t%s" % metrics.f1_score(y_test, y_pred, average='weighted'))

## Quadratic Discriminant Analysis classifier

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
clf_dt_QDA = QuadraticDiscriminantAnalysis()
# Training, timed with wall-clock time
start_time = time.time()
clf_dt_QDA.fit(X_train, y_train)
# The message now names the model actually being timed (it previously said
# "Decision Tree", copied from an earlier cell).
print("The time for training Quadratic Discriminant Analysis is  %s seconds " % (time.time() - start_time))

# Make a prediction on the validation split (named X_test / y_test here)
y_pred = clf_dt_QDA.predict(X_test)

# The target is multiclass (three Iris species), so precision / recall / F1
# need an explicit averaging mode; the default average='binary' is rejected
# for multiclass targets. 'weighted' averages the per-class scores weighted by
# class support.
print("===================================================================")
print("The accuracy on validation dataset of Quadratic Discriminant Analysis: \t%s" % metrics.accuracy_score(y_test, y_pred))
print("Precision on validation dataset of Quadratic Discriminant Analysis:    \t%s" % metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall on validation dataset of Quadratic Discriminant Analysis :      \t%s" % metrics.recall_score(y_test, y_pred, average='weighted'))
print("F1 score on validation dataset of Quadratic Discriminant Analysis:     \t%s" % metrics.f1_score(y_test, y_pred, average='weighted'))

## Neural Networks  classifier

In [None]:
# Feed-forward classifier built with Keras. The Keras 1.x argument names used
# by this notebook (init=, nb_epoch=) are kept as they were.
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.regularizers import l2, activity_l2
import numpy

# Size the network from the data instead of hard-coding it. The previous
# input_dim=57 did not match the feature matrix, and a single sigmoid output
# trained with binary cross-entropy cannot represent more than two classes.
n_features = X_train.shape[1]
n_classes = int(max(y_train.max(), y_test.max())) + 1

# One-hot encode the integer class labels so they match the softmax output
# layer and the categorical cross-entropy loss.
y_train_onehot = numpy.eye(n_classes)[y_train]
y_test_onehot = numpy.eye(n_classes)[y_test]

# create model: one hidden layer with dropout, softmax over the classes
model = Sequential()
model.add(Dense(40, input_dim=n_features, init='uniform', activation='relu')) # sigmoid, relu, tanh, W_regularizer=l2(0.01)
model.add(Dropout(0.25))
#model.add(Dense(40, init='uniform', activation='relu'))
#model.add(Dropout(0.25))
model.add(Dense(n_classes, init='uniform', activation='softmax'))

# Compile model
# Alternative configurations kept for reference (not executed):
# sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
# model.compile(loss='mean_squared_error', optimizer=sgd)
# model.add(Dense(64, input_dim=64, W_regularizer=l2(0.01), activity_regularizer=activity_l2(0.01)))
#
# keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-08)
# keras.optimizers.Adagrad(lr=0.01, epsilon=1e-08)
# keras.optimizers.Adadelta(lr=1.0, rho=0.95, epsilon=1e-08)
# keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
# keras.optimizers.Adamax(lr=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy']) # rmsprop, adam

# Fit the model, timed with wall-clock time
start_time = time.time()
model.fit(X_train, y_train_onehot, nb_epoch=100, batch_size=10, verbose=0)
print("The time for training NN is  %s seconds " % (time.time() - start_time))

# evaluate the model on the validation split (named X_test / y_test here)
print('The evaluation :')
scores = model.evaluate(X_test, y_test_onehot)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

In [None]:
# Predicted class index for each validation sample. The result is flattened to
# 1-D because the metric functions expect a flat label vector and the model
# may return predictions as a column vector.
y_pred = numpy.ravel(model.predict_classes(X_test))

# The target is multiclass (three Iris species), so precision / recall / F1
# need an explicit averaging mode; the default average='binary' is rejected
# for multiclass targets. 'weighted' averages the per-class scores weighted by
# class support.
print("===================================================================")
print("The accuracy on validation dataset of Neural Network: \t%s" % metrics.accuracy_score(y_test, y_pred))
print("Precision on validation dataset of Neural Network:    \t%s" % metrics.precision_score(y_test, y_pred, average='weighted'))
print("Recall on validation dataset of Neural Network :      \t%s" % metrics.recall_score(y_test, y_pred, average='weighted'))
print("F1 score on validation dataset of Neural Network:     \t%s" % metrics.f1_score(y_test, y_pred, average='weighted'))

In [None]:
# Optional: persist the trained network weights (currently disabled).
# NOTE(review): the file name mentions "pima", which looks like a leftover from
# another notebook; confirm the intended name before enabling.
#model.save_weights('pimaModelbest.hdf5',overwrite=True)

In [None]:
# Optional: persist the data splits to .npy files (currently disabled).
# NOTE(review): four of these calls share the file name 'Training', so if they
# were enabled as written each later call would overwrite the earlier one.
# Give every array its own file name before uncommenting.
#np.save('Training', X_train)
#np.save('Training', y_train)
#np.save('validation', X_test)
#np.save('Training', y_test)
#np.save('Testing', X_deploy)
#np.save('Training', y_deploy)