In [1]:
import tensorflow as tf
import numpy as np
import random

# Set seeds
np.random.seed(42)
random.seed(42)
tf.random.set_seed(42)

In [None]:
print(python --version)

In [2]:
import pandas as pd

# load the dataset
dataset = pd.read_csv('./multiclass-4190-linear-input.csv')

In [3]:
print(dataset)

       linesCount  assignmentsCount  selectionStatementsCount  \
0              10                 1                         0   
1              10                 1                         0   
2              10                 1                         0   
3              10                 1                         0   
4               7                 0                         0   
...           ...               ...                       ...   
25135          34                 0                         0   
25136          22                 1                         0   
25137           9                 0                         0   
25138          18                 1                         1   
25139          49                 3                         1   

       iterationStatementsCount  synchronizedStatementsCount  \
0                             1                            0   
1                             1                            0   
2                          

In [4]:
# split dataset into X and Y
# columns are 0-based index

# x = dataset.iloc[:, 0:28].values # 0:28 means 0th-indexed column to 27th-indexed column - wildcardTypesCount
# y = dataset.iloc[:, -1].values # -1 refers to the last column - cloneType

x = dataset.iloc[:, 0:-1].values
y = dataset.iloc[:, -1].values # -1 refers to the last column - cloneType

print(x)
print(y)

print(type(x))
print(type(y))

print(x.shape) # (25140, 48) = 25140 number of vectors in it and 48 number of scalars in each vector.
print(y.shape) # (25140,) = 25140 number of scalers. Vector. So one-dimensional data.

print(x.ndim)
print(y.ndim)

[[10  1  0 ... 10 11  3]
 [10  1  0 ... 10 11  3]
 [10  1  0 ... 10 11  3]
 ...
 [ 9  0  0 ... 11 14  3]
 [18  1  1 ... 11 14  3]
 [49  3  1 ... 11 14  3]]
['t1' 't1' 't1' ... 't0' 't0' 't0']
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(25140, 96)
(25140,)
2
1


In [5]:
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, label_binarize, LabelEncoder
import numpy as np

# define One-Hot Encoding
# encoder = OneHotEncoder(sparse = False)
# transform data
# y = encoder.fit_transform(y.reshape(-1,1))

# lb = LabelBinarizer()
# lb.fit(y) # target classes are arranged in alphabetically and not in our wish. So, for a class occurance 1 will be added, others will be 0.
# print(lb.classes_) # ['mt3' 'st3' 't0' 't1' 'vst3' 'wt3']
# y = lb.transform(y) # according to classes_, 1 will be added. Example, [0 0 0 1 0 0] or 3 represents 't1' class.
# print(y)

# This function makes it possible to compute this transformation for a fixed set of class labels known ahead of time.
# label_binarize function preserves class order according to our wish!
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.label_binarize.html#sklearn.preprocessing.label_binarize
y = label_binarize(y, classes=['t1', 'vst3', 'st3', 'mt3', 'wt3', 't0']) # Example, [0 0 0 1 0 0] or 3 represents 'mt3' class.
print(y)
print(y.shape)
print(y.ndim)

# define LabelEncoder
# le = LabelEncoder()
# le.fit(y)
# print(list(le.classes_))
# y = le.transform(y)
# print(y)

[[1 0 0 0 0 0]
 [1 0 0 0 0 0]
 [1 0 0 0 0 0]
 ...
 [0 0 0 0 0 1]
 [0 0 0 0 0 1]
 [0 0 0 0 0 1]]
(25140, 6)
2


In [6]:
from tensorflow import keras
import time
from datetime import timedelta

# # Train-Test Split Method Begins!

In [7]:
# Split the x and y dataset into the Training set and Test set while performing the data shuffling
# https://www.kaggle.com/questions-and-answers/189700 - don't use train-test split, we'll use CV
from sklearn.model_selection import train_test_split
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

print(x_train.shape)
print(y_train.shape)

(17598, 96)
(17598, 6)


In [8]:
# in Input object, shape = (24,) indicates that the expected input will be batches of 24-dimensional vectors.
input_shape = (48,1)
print(input_shape)


(48, 1)


In [9]:
# # function that creates and returns the subnetwork
# def create_subnetwork(input_shape, initializer):
#     input = keras.Input(shape = input_shape, name = 'subnetwork_input')
#     x = keras.layers.Dense(units = 200, kernel_initializer = initializer, activation = 'relu')(input)
#     x = keras.layers.Dropout(0.2)(x)
#     x = keras.layers.Dense(units = 200, kernel_initializer = initializer, activation = 'relu')(x)
#     x = keras.layers.Dense(units = 200, kernel_initializer = initializer, activation = 'relu')(x)
#     x = keras.layers.Dropout(0.2)(x)
#     x = keras.layers.Dense(units = 200, kernel_initializer = initializer, activation = 'relu')(x)
#     return keras.Model(name = 'subnetwork', inputs = input, outputs = x)

In [10]:
# function that creates and returns the subnetwork
def create_subnetwork(input_shape, initializer):
    input = keras.Input(shape = input_shape, name = 'subnetwork_input')
    
    x = keras.layers.Conv1D(filters = 6, strides = 1, kernel_size = 5, activation = 'relu')(input)
    x = keras.layers.AveragePooling1D(pool_size = 2, strides = 2)(x)
    
    x = keras.layers.Conv1D(filters = 16, strides = 1, kernel_size = 5, activation = 'relu')(x)
    x = keras.layers.AveragePooling1D(pool_size=2, strides = 2)(x)
    
    x = keras.layers.Conv1D(filters = 120, strides = 1, kernel_size = 5, activation = 'relu')(x)
    
    x = keras.layers.Flatten()(x)
    
    x = keras.layers.Dense(units = 420, activation = 'relu')(x)
    
#     x = keras.layers.Dense(units = y.shape[1], activation = 'softmax')(x)
    
    return keras.Model(name = 'subnetwork', inputs = input, outputs = x)

In [11]:
# function that creates and returns the comparatornetwork
def create_comparatornetwork(input_vect_dim, initializer):
    input = keras.Input(shape = (input_vect_dim,), name = 'comparatornetwork_input')
    x = keras.layers.Dense(units = 200, kernel_initializer = initializer, activation = 'relu')(input)
    x = keras.layers.Dropout(0.2)(x)
    x = keras.layers.Dense(units = 100, kernel_initializer = initializer, activation = 'relu')(x)
    x = keras.layers.Dense(units = 50, kernel_initializer = initializer, activation = 'relu')(x)
    x = keras.layers.Dropout(0.2)(x)
    x = keras.layers.Dense(units = 25, kernel_initializer = initializer, activation = 'relu')(x)
    return keras.Model(name = 'comparatornetwork', inputs = input, outputs = x)

In [12]:
# defining 'HeNormal' initializer
initializer = keras.initializers.HeNormal(42)
print(initializer)

<keras.initializers.initializers_v2.HeNormal object at 0x00000181EBD6FBB0>


In [13]:
# Building Siamese Network With Two Identical Subnetworks
left_input = keras.Input(shape = input_shape, name = 'left_input')
right_input = keras.Input(shape = input_shape, name = 'right_input')

subnetwork = create_subnetwork(input_shape, initializer)

encoded_left = subnetwork(left_input) # chain layers
encoded_right = subnetwork(right_input)

# concatenate outputs of the two subnetworks
concatted = keras.layers.Concatenate()([encoded_left, encoded_right])
print(concatted.shape)

comparatornetwork = create_comparatornetwork(concatted.shape[1], initializer)

comparator = comparatornetwork(concatted)

classificationUnit = keras.layers.Dense(name = 'classificationUnit', units = y.shape[1], kernel_initializer = initializer, activation = 'softmax')(comparator)

leonet = keras.Model(name = 'LeONet', inputs = [left_input, right_input], outputs = classificationUnit)

leonet.compile(loss = 'categorical_crossentropy', optimizer = 'Adam', metrics = ['accuracy'])

leonet.summary()

(None, 840)
Model: "LeONet"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 left_input (InputLayer)        [(None, 48, 1)]      0           []                               
                                                                                                  
 right_input (InputLayer)       [(None, 48, 1)]      0           []                               
                                                                                                  
 subnetwork (Functional)        (None, 420)          262672      ['left_input[0][0]',             
                                                                  'right_input[0][0]']            
                                                                                                  
 concatenate (Concatenate)      (None, 840)          0           ['subnetwork[0][

In [14]:
comparatornetwork.summary()
keras.utils.plot_model(comparatornetwork, show_shapes = True, show_layer_names = True)

Model: "comparatornetwork"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 comparatornetwork_input (In  [(None, 840)]            0         
 putLayer)                                                       
                                                                 
 dense_1 (Dense)             (None, 200)               168200    
                                                                 
 dropout (Dropout)           (None, 200)               0         
                                                                 
 dense_2 (Dense)             (None, 100)               20100     
                                                                 
 dense_3 (Dense)             (None, 50)                5050      
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                 

In [15]:
print(x_train.shape)
print(y_train.shape)

# https://www.earthdatascience.org/courses/intro-to-earth-data-science/scientific-data-structures-python/numpy-arrays/indexing-slicing-numpy-arrays/

x_train_left_input = x_train[:, 0:48] # select all rows and columns from 0th indexed to 47th indexed columns
x_train_right_input = x_train[:, 48:96]

# x_train_left_input = x_train[:, 0:28] # Existing 28 features
# x_train_right_input = x_train[:, 48:76] # Existing 28 features

# x_train_left_input = x_train[:, 28:48] # Novel 20 features
# x_train_right_input = x_train[:, 76:96] # Novel 20 features

print(x_train_left_input.shape)
print(x_train_right_input.shape)

(17598, 96)
(17598, 6)
(17598, 48)
(17598, 48)


In [16]:
import time
from datetime import timedelta

# start timer
start_time = time.time()

from keras.callbacks import CSVLogger

csv_logger = CSVLogger('leonet_epoch_500_training.log', separator=',', append=False)

# Fit the ANN to the Training set
historyObject = leonet.fit(x = [x_train_left_input, x_train_right_input], y = y_train, batch_size = 32, epochs = 500, callbacks=[csv_logger])

# Stop timer clock
elapsed = time.time() - start_time

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [17]:
print(elapsed)

# Calculate K-fold execution duration
print("Training duration: " + str(timedelta(seconds=elapsed)))

22874.31223464012
Training duration: 6:21:14.312235


In [18]:
import os.path
if os.path.isfile('models/leonet_epoch_500.h5') is False:
    leonet.save('models/leonet_epoch_500.h5')

In [19]:
print(x_test.shape)
print(y_test.shape)

# https://www.earthdatascience.org/courses/intro-to-earth-data-science/scientific-data-structures-python/numpy-arrays/indexing-slicing-numpy-arrays/

x_test_left_input = x_test[:, 0:48] # select all rows and columns from 0th indexed to 47th indexed columns
x_test_right_input = x_test[:, 48:96]

# x_test_left_input = x_test[:, 0:28] # Existing 28 features
# x_test_right_input = x_test[:, 48:76] # Existing 28 features

# x_test_left_input = x_test[:, 28:48] # Novel 20 features
# x_test_right_input = x_test[:, 76:96] # Novel 20 features

print(x_test_left_input.shape)
print(x_test_right_input.shape)

(7542, 96)
(7542, 6)
(7542, 48)
(7542, 48)


In [20]:
# 4. Predict the Test Set Results
import numpy as np
y_pred = leonet.predict(x = [x_test_left_input, x_test_right_input])



In [21]:
# 5. Converting y_test, and y_pred from One-Hot encoding to integer encoding.

# https://stackoverflow.com/questions/47564495/what-does-numpy-ndarray-shape-do
# For a 1D array, the shape would be (n,) where n is the number of elements in your array.
# For a 2D array, the shape would be (n,m) where n is the number of rows and m is the number of columns in your array.

# Converting y_pred from One-Hot encoding to integer encoding.
# y_pred = <class 'numpy.ndarray'>

# shape is a tuple that always gives dimensions of the array.
print(y_pred.shape) # (7542, 6) means 7542 rows and 6 columns
y_pred = np.argmax(y_pred, axis = -1) # (axis = -1) represents the last axis. In this case, 6.
print(y_pred)

# collect y_test predicted class values
# https://stackoverflow.com/questions/47435526/what-is-the-meaning-of-axis-1-in-keras-argmax
print(y_test)
print(y_test.shape) # (7542, 6) means 7542 rows and 6 columns
y_test = np.argmax(y_test, axis = -1) # Returns the indices of the maximum values along an axis. (axis = -1) represents the last axis. In this case, values from 0 to 5 (predicted target classes).
print(y_test)
print(y_test.shape) # (7542,) means 1D array where 7542 elements available.

(7542, 6)
[3 5 4 ... 2 3 0]
[[0 0 0 1 0 0]
 [0 0 0 0 0 1]
 [0 0 0 0 1 0]
 ...
 [0 0 1 0 0 0]
 [0 0 0 1 0 0]
 [1 0 0 0 0 0]]
(7542, 6)
[3 5 4 ... 2 3 0]
(7542,)


In [22]:
# Classification report - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report

target_names = ['Type 1', 'VST3', 'ST3', 'MT3', 'WT3/4', 'False']

print(classification_report(y_test, y_pred, target_names=target_names))

# macro average (averaging the unweighted mean per label)
# weighted average (averaging the support-weighted mean per label)
# Support is the number of actual occurrences of the class in the specified dataset

              precision    recall  f1-score   support

      Type 1       1.00      0.97      0.98      1273
        VST3       0.94      0.96      0.95      1267
         ST3       0.95      0.95      0.95      1262
         MT3       0.98      0.99      0.98      1261
       WT3/4       1.00      1.00      1.00      1243
       False       1.00      1.00      1.00      1236

    accuracy                           0.98      7542
   macro avg       0.98      0.98      0.98      7542
weighted avg       0.98      0.98      0.98      7542



In [23]:
# accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Final accuracy score is: {:0.2f}%".format(accuracy * 100))

Final accuracy score is: 97.83%


In [24]:
# precision weighted average - This parameter is required for multiclass/multilabel targets.
precisionweighted = precision_score(y_test, y_pred, average='weighted')
print("Final weighted precision score is: {:0.2f}%".format(precisionweighted * 100))

Final weighted precision score is: 97.85%


In [25]:
# recall weighted average - This parameter is required for multiclass/multilabel targets.
recallweighted = recall_score(y_test, y_pred, average='weighted')
print("Final weighted recall score is: {:0.2f}%".format(recallweighted * 100))

Final weighted recall score is: 97.83%


In [26]:
# f1 weighted average - This parameter is required for multiclass/multilabel targets.
f1weighted = f1_score(y_test, y_pred, average='weighted')
print("Final weighted f1 score is: {:0.2f}%".format(f1weighted * 100))

Final weighted f1 score is: 97.83%


In [27]:
t1_recall = []
vst3_recall = []
st3_recall = []
mt3_recall = []
wt3_recall = []
t0_recall = []

def appendRecallForEachClass(arr):
    t1_recall.append(arr[0])
    vst3_recall.append(arr[1])
    st3_recall.append(arr[2])
    mt3_recall.append(arr[3])
    wt3_recall.append(arr[4])
    t0_recall.append(arr[5])
    # printRecallForEachClass(arr)

def printRecallForEachClass(arr):
    print(arr)
    print(t1_recall)
    print(vst3_recall)
    print(st3_recall)
    print(mt3_recall)
    print(wt3_recall)
    print(t0_recall)

In [28]:
t1_precision = []
vst3_precision = []
st3_precision = []
mt3_precision = []
wt3_precision = []
t0_precision = []

def appendPrecisionForEachClass(arr):
    t1_precision.append(arr[0])
    vst3_precision.append(arr[1])
    st3_precision.append(arr[2])
    mt3_precision.append(arr[3])
    wt3_precision.append(arr[4])
    t0_precision.append(arr[5])
    # printPrecisionForEachClass(arr)

def printPrecisionForEachClass(arr):
    print(arr)
    print(t1_precision)
    print(vst3_precision)
    print(st3_precision)
    print(mt3_precision)
    print(wt3_precision)
    print(t0_precision)

In [29]:
t1_f1score = []
vst3_f1score = []
st3_f1score = []
mt3_f1score = []
wt3_f1score = []
t0_f1score = []

def appendF1ScoreForEachClass(arr):
    t1_f1score.append(arr[0])
    vst3_f1score.append(arr[1])
    st3_f1score.append(arr[2])
    mt3_f1score.append(arr[3])
    wt3_f1score.append(arr[4])
    t0_f1score.append(arr[5])
    # printF1ScoreForEachClass(arr)

def printF1ScoreForEachClass(arr):
    print(arr)
    print(t1_f1score)
    print(vst3_f1score)
    print(st3_f1score)
    print(mt3_f1score)
    print(wt3_f1score)
    print(t0_f1score)

In [30]:
# recall

# recall macro average - This parameter is required for multiclass/multilabel targets.
recallmacro = recall_score(y_test, y_pred, average='macro')
print("Final macro recall score is: {:0.2f}%".format(recallmacro * 100))

# recall micro average - This parameter is required for multiclass/multilabel targets.
recallmicro = recall_score(y_test, y_pred, average='micro')
print("Final micro recall score is: {:0.2f}%".format(recallmicro * 100))

# recall weighted average - This parameter is required for multiclass/multilabel targets.
recallweighted = recall_score(y_test, y_pred, average='weighted')
print("Final weighted recall score is: {:0.2f}%".format(recallweighted * 100))

# precision

# precision macro average - This parameter is required for multiclass/multilabel targets.
precisionmacro = precision_score(y_test, y_pred, average='macro')
print("Final macro precision score is: {:0.2f}%".format(precisionmacro * 100))

# precision micro average - This parameter is required for multiclass/multilabel targets.
precisionmicro = precision_score(y_test, y_pred, average='micro')
print("Final micro precision score is: {:0.2f}%".format(precisionmicro * 100))

# precision weighted average - This parameter is required for multiclass/multilabel targets.
precisionweighted = precision_score(y_test, y_pred, average='weighted')
print("Final weighted precision score is: {:0.2f}%".format(precisionweighted * 100))

# f1 score

# f1 macro average - This parameter is required for multiclass/multilabel targets.
f1macro = f1_score(y_test, y_pred, average='macro')
print("Final macro f1 score is: {:0.2f}%".format(f1macro * 100))

# f1 micro average - This parameter is required for multiclass/multilabel targets.
f1micro = f1_score(y_test, y_pred, average='micro')
print("Final micro f1 score is: {:0.2f}%".format(f1micro * 100))

# f1 weighted average - This parameter is required for multiclass/multilabel targets.
f1weighted = f1_score(y_test, y_pred, average='weighted')
print("Final weighted f1 score is: {:0.2f}%".format(f1weighted * 100))

# collect details class-vice
appendRecallForEachClass(recall_score(y_test, y_pred, average=None))
appendPrecisionForEachClass(precision_score(y_test, y_pred, average=None))
appendF1ScoreForEachClass(f1_score(y_test, y_pred, average=None))

Final macro recall score is: 97.84%
Final micro recall score is: 97.83%
Final weighted recall score is: 97.83%
Final macro precision score is: 97.86%
Final micro precision score is: 97.83%
Final weighted precision score is: 97.85%
Final macro f1 score is: 97.84%
Final micro f1 score is: 97.83%
Final weighted f1 score is: 97.83%


In [31]:
from statistics import mean

# Recall by Class
print(len(t1_recall))
print("Type 1 Recall: {:0.2f}%".format(mean(t1_recall) * 100))
print("VST3 Recall: {:0.2f}%".format(mean(vst3_recall) * 100))
print("ST3 Recall: {:0.2f}%".format(mean(st3_recall) * 100))
print("MT3 Recall: {:0.2f}%".format(mean(mt3_recall) * 100))
print("WT3/4 Recall: {:0.2f}%".format(mean(wt3_recall) * 100))
print("False Recall: {:0.2f}%".format(mean(t0_recall) * 100))

1
Type 1 Recall: 96.78%
VST3 Recall: 95.90%
ST3 Recall: 95.40%
MT3 Recall: 98.97%
WT3/4 Recall: 100.00%
False Recall: 100.00%


In [32]:
# Precision by Class
print(len(wt3_precision))
print("Type 1 Precision: {:0.2f}%".format(mean(t1_precision) * 100))
print("VST3 Precision: {:0.2f}%".format(mean(vst3_precision) * 100))
print("ST3 Precision: {:0.2f}%".format(mean(st3_precision) * 100))
print("MT3 Precision: {:0.2f}%".format(mean(mt3_precision) * 100))
print("WT3/4 Precision: {:0.2f}%".format(mean(wt3_precision) * 100))
print("False Precision: {:0.2f}%".format(mean(t0_precision) * 100))

1
Type 1 Precision: 99.92%
VST3 Precision: 94.11%
ST3 Precision: 95.40%
MT3 Precision: 97.73%
WT3/4 Precision: 100.00%
False Precision: 100.00%


In [33]:
# F1 Score by Class
print(len(mt3_f1score))
print("Type 1 F1 Score: {:0.2f}%".format(mean(t1_f1score) * 100))
print("VST3 F1 Score: {:0.2f}%".format(mean(vst3_f1score) * 100))
print("ST3 F1 Score: {:0.2f}%".format(mean(st3_f1score) * 100))
print("MT3 F1 Score: {:0.2f}%".format(mean(mt3_f1score) * 100))
print("WT3/4 F1 Score: {:0.2f}%".format(mean(wt3_f1score) * 100))
print("False F1 Score: {:0.2f}%".format(mean(t0_f1score) * 100))

1
Type 1 F1 Score: 98.32%
VST3 F1 Score: 95.00%
ST3 F1 Score: 95.40%
MT3 F1 Score: 98.35%
WT3/4 F1 Score: 100.00%
False F1 Score: 100.00%


# # Train-Test Method Ends!