In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np
import tensorflow as tf



# function for building the decision tree
# Return: trained decision tree
def decision_tree_build(train_data, label_data, criterion, min_samples_split, max_depth):
  """Fit a decision tree on the given training set and return it.

  Args:
    train_data: Feature matrix; must expose ``.shape`` (it is printed).
    label_data: Target labels passed to ``fit`` alongside ``train_data``.
    criterion: Forwarded to ``DecisionTreeClassifier(criterion=...)``.
    min_samples_split: Forwarded to
      ``DecisionTreeClassifier(min_samples_split=...)``.
    max_depth: Forwarded to ``DecisionTreeClassifier(max_depth=...)``.

  Returns:
    The fitted ``DecisionTreeClassifier``.
  """
  print("BUILDING DECISION TREE ... ")
  print(f"Training Data Shape: {train_data.shape!s}")
  tree_params = {
    "criterion": criterion,
    "min_samples_split": min_samples_split,
    "max_depth": max_depth,
  }
  # fit() returns the estimator itself, so the fitted tree can be returned directly.
  return DecisionTreeClassifier(**tree_params).fit(train_data, label_data)

# function for building the random forest
# Return: trained random forest classifier
def random_forest_build(train_data, label_data, criterion, n_estimators):
  """Fit a random forest on the given training set and return it.

  Args:
    train_data: Feature matrix; must expose ``.shape`` (it is printed).
    label_data: Target labels passed to ``fit`` alongside ``train_data``.
    criterion: Forwarded to ``RandomForestClassifier(criterion=...)``.
    n_estimators: Forwarded to ``RandomForestClassifier(n_estimators=...)``.

  Returns:
    The fitted ``RandomForestClassifier``.
  """
  print("BUILDING RANDOM FOREST ... ")
  print(f"Training Data Shape: {train_data.shape!s}")
  forest_params = {
    "n_estimators": n_estimators,
    "criterion": criterion,
  }
  # fit() returns the estimator itself, so the fitted forest can be returned directly.
  return RandomForestClassifier(**forest_params).fit(train_data, label_data)

# prediction function for the given data and classifier
# Return: output of the prediction
def classifier_prediction(data, trained_classifier):
  """Run ``trained_classifier.predict`` on ``data`` and return the result.

  The shapes of the input and of the prediction are printed for inspection.

  Args:
    data: Samples to predict on; must expose ``.shape``.
    trained_classifier: Any object with a ``predict(data)`` method whose
      result exposes ``.shape``.

  Returns:
    Whatever ``trained_classifier.predict(data)`` returns.
  """
  print(f"Feeded Data Shape: {data.shape!s}")
  predictions = trained_classifier.predict(data)
  print(f"Output Prediction Shape: {predictions.shape!s}")
  return predictions

# Report how well a set of predictions matches the ground-truth labels.
# Prints the accuracy score and the confusion matrix.
def output_validation(pred_data, labels):
  """Print and return the accuracy of ``pred_data`` against ``labels``.

  The confusion matrix is printed as well. Earlier versions returned
  ``None``, which made ``accuracy = output_validation(...)`` useless to
  callers; the accuracy is now returned so it can be stored or compared.

  Args:
    pred_data: Predicted labels.
    labels: Ground-truth labels, aligned with ``pred_data``.

  Returns:
    The value computed by ``sklearn.metrics.accuracy_score(labels, pred_data)``.
  """
  # can be replaced by self developed accuracy function
  accuracy = accuracy_score(labels, pred_data)
  matrix = confusion_matrix(labels, pred_data)
  print("Acuracy: "+ str(accuracy))
  print("Confusion Matrix: \n" + str(matrix))
  return accuracy

# Traditional FNN network building method
# set the model
def fnn_build(train_data, label_data, optimizer, loss, epoch):
  """Build, compile and train a fully connected feed-forward network.

  Architecture: ``Flatten`` -> seven ReLU ``Dense`` layers with
  12, 24, 48, 48, 24, 12 and 6 units -> one single-unit ReLU ``Dense`` layer.
  The hidden layers' kernels are drawn from ``RandomNormal(mean=0, stddev=1)``;
  the output layer uses the Keras default initializer.

  Args:
    train_data: Training features passed to ``model.fit``.
    label_data: Training targets passed to ``model.fit``.
    optimizer: Forwarded to ``model.compile(optimizer=...)``.
    loss: Forwarded to ``model.compile(loss=...)``.
    epoch: Number of training epochs (``model.fit(epochs=...)``).

  Returns:
    The trained ``tf.keras`` ``Sequential`` model.
  """
  hidden_units = (12, 24, 48, 48, 24, 12, 6)

  def _hidden_layer(units):
    # Build a fresh initializer for every layer instead of sharing one
    # instance: Keras warns against calling a single unseeded initializer
    # object several times, because repeated calls may yield identical
    # values rather than independent draws.
    # Other initializers that were tried here: Constant(0.), GlorotNormal(),
    # Orthogonal().
    kernel_init = tf.keras.initializers.RandomNormal(mean=0., stddev=1.)
    return tf.keras.layers.Dense(units,
                                 activation=tf.nn.relu,
                                 kernel_initializer=kernel_init)

  layers = [tf.keras.layers.Flatten()]
  layers.extend(_hidden_layer(units) for units in hidden_units)
  # Output layer: one ReLU unit, default kernel initializer.
  layers.append(tf.keras.layers.Dense(1, activation=tf.nn.relu))

  model = tf.keras.models.Sequential(layers)
  model.compile(optimizer=optimizer,
                loss=loss,
                metrics=['accuracy'])
  print("BUILDING NEURAL NETWORK ... ")
  # feeding the data into the model and repeat <epoch> times
  model.fit(train_data, label_data, epochs=epoch)
  return model

# The FNN model is scored through its own evaluate() method rather than accuracy_score,
# so it gets a separate validation helper.
def fnn_validate(test_data, test_labe, classifier):
  """Evaluate ``classifier`` on held-out data.

  Args:
    test_data: Features passed as the first argument to ``classifier.evaluate``.
    test_labe: Targets passed as the second argument to ``classifier.evaluate``.
    classifier: Object exposing ``evaluate(data, labels)``.

  Returns:
    Whatever ``classifier.evaluate(test_data, test_labe)`` returns.
  """
  evaluation = classifier.evaluate(test_data, test_labe)
  return evaluation




##### testing ######

# Preprocessing
vacc = pd.read_csv("/content/vacc_prog_clean.csv",  header = None)
# "?" is treated as a missing-value marker: turn it into NaN and drop those rows.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
vacc = vacc.replace("?", np.nan)
vacc.dropna(axis=0,inplace=True)
vacc.info()

# Integer-encode every object-typed column. The single encoder is re-fitted
# for each column, so after the loop `le` only holds the last column's classes.
le = LabelEncoder()
for column in vacc.columns:
  if vacc[column].dtype == "object":
    vacc[column] = le.fit_transform(vacc[column])
# vacc = vacc.drop([11, 13], axis=1)
ca_arr = vacc.values
# print(str(ca_arr[ca_arr[:,2] != 0,1:11].shape))
# ca_arr = ca_arr[ca_arr[:,2] != 0, :]

# Segregate features and labels into separate variables:
# drop the first column, then use column 7 of what remains as the label
# and every other column as a feature.
ca_arr = np.delete(ca_arr, 0, axis=1)
X = np.delete(ca_arr, 7, axis=1)
y = ca_arr[:,7]
# X,y = ca_arr[:,1:11] , ca_arr[:,12]


# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33)

# Fit the scaler on the training split only, then apply it to both splits,
# so no statistics from the test split leak into training.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


# Example of using decision tree model
# decision_model = decision_tree_build(X_train, y_train, criterion="entropy",min_samples_split=4,max_depth=3)
# dt_prediction = classifier_prediction(X_test, decision_model)
# output_validation(dt_prediction, y_test)

# Example of using random forest model
# forest_model = random_forest_build(X_train, y_train, n_estimators=20, criterion="entropy")
# rf_prediction = classifier_prediction(X_test, forest_model)
# output_validation(rf_prediction, y_test)

# Example of using fnn model
fnn_model = fnn_build(X_train, y_train, optimizer="adam", loss="mean_squared_error", epoch=10)
# NOTE: despite its name, `accuracy` holds the full result of model.evaluate();
# with metrics=['accuracy'] compiled in, that is expected to be [loss, accuracy].
accuracy = fnn_validate(X_test, y_test, fnn_model)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36580 entries, 1 to 36580
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       36580 non-null  float64
 1   1       36580 non-null  object 
 2   2       36580 non-null  object 
 3   3       36580 non-null  object 
 4   4       36580 non-null  object 
 5   5       36580 non-null  object 
 6   6       36580 non-null  object 
 7   7       36580 non-null  object 
 8   8       36580 non-null  object 
 9   9       36580 non-null  object 
 10  10      36580 non-null  object 
 11  11      36580 non-null  object 
 12  12      36580 non-null  object 
dtypes: float64(1), object(12)
memory usage: 3.9+ MB
BUILDING NEURAL NETWORK ... 
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
