In [1]:
import numpy as np # library that will convert data taken from pickle to numpy array
import pickle # library to import processed data from pickles

# Load the preprocessed image features and labels from their pickle files and
# hold them as NumPy arrays.
# Each file is opened in binary read mode ('rb') inside a `with` block so the
# file handle is closed as soon as the data has been read.
# np.asarray() converts the unpickled object to a NumPy array (without copying
# when it already is one).
# NOTE: pickle.load() can execute arbitrary code; only load files you trust.
with open("x_small.pickle", "rb") as features_file:
    image_features = np.asarray(pickle.load(features_file))
with open("y_small.pickle", "rb") as labels_file:
    labels = np.asarray(pickle.load(labels_file))

In [2]:
# import different modules from machine-learning library -> sklearn 
from sklearn.preprocessing import MinMaxScaler # used to apply feature scaling to data
from sklearn.model_selection import train_test_split # method used to split data into training and test data
from sklearn.model_selection import cross_val_score # used to compute the score from training and testing data
from sklearn.model_selection import KFold #used to run a k-fold run of the algorithms during training and testing

# the following classes are imported as the machine-learning algorithms to compare
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC # from the SVM algorihtm, use the SVC implementation

# python library to ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
seed = 9  # random seed for the train/test split and the seeded estimators
test_size = 0.05  # portion of the whole data that will be used for testing
scoring = "accuracy"  # performance measure reported by cross-validation
num_trees = 100  # number of trees in the Random Forest classifier

# Scale every feature into the range 0-1.
# NOTE(review): the scaler is fit on the full data set before the split below,
# so the held-out rows contribute to the min/max used for scaling. Consider
# fitting it on the training portion only.
scaler = MinMaxScaler(feature_range=(0, 1))
rescaled_features = scaler.fit_transform(image_features)

# From here on image_features refers to the rescaled data (kept as a separate
# copy of rescaled_features, as before).
image_features = np.array(rescaled_features)

# Split the data into a training and a test batch, each with its features and
# labels:
# 1. image features
# 2. labels
# 3. test_size: fraction of the whole data allocated for testing
# 4. random_state: seed that makes the random selection of test rows repeatable
# Both inputs are already NumPy arrays, so no extra copies are made here.
(train_image_features, test_image_features, train_labels, test_labels) = train_test_split(
    image_features,
    labels,
    test_size=test_size,
    random_state=seed,
)

# (name, estimator) pairs for every machine-learning algorithm to compare
models = [
    ('LR', LogisticRegression(random_state=seed)),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('DT', DecisionTreeClassifier(random_state=seed)),
    ('RF', RandomForestClassifier(n_estimators=num_trees, random_state=seed)),
    ('NB', GaussianNB()),
    ('SVC', SVC(random_state=seed)),
]

# 10-fold splitter shared by all models so every algorithm is scored on the
# same folds. No random_state is passed: without shuffle=True it has no effect,
# and recent scikit-learn versions raise a ValueError for that combination.
# The rows were already shuffled by train_test_split above.
kfold = KFold(n_splits=10)

# evaluate each model with k-fold cross-validation on the training batch
for name, model in models:
    # cross_val_score returns one score per fold
    cv_results = cross_val_score(model, train_image_features, train_labels, cv=kfold, scoring=scoring)
    # report the mean score and its standard deviation across the folds
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.303732 (0.025940)
LDA: 0.300049 (0.024929)
KNN: 0.322462 (0.029459)
DT: 0.268719 (0.036310)
RF: 0.364449 (0.021542)
NB: 0.233253 (0.021786)
SVC: 0.292202 (0.015929)
