#!/usr/bin/env python
## one_neuron_classifier_optimizer.py
"""
A one-neuron model is characterized by a single expression that you see in the value
supplied for the constructor parameter "expressions". In the expression supplied, the
names that being with 'x' are the input variables and the names that begin with the
other letters of the alphabet are the learnable parameters.
"""
import random
import operator
import numpy as np
import matplotlib.pyplot as plt

seed = 0
random.seed(seed)
np.random.seed(seed)

#Set a uniform learning rate shared by all three training runs
learn_rate = 1e-2

from ComputationalGraphPrimer import *
#Subclass the original class so that the loss record is returned and can be plotted on a single graph
class ComputationalGraphPrimer_mod(ComputationalGraphPrimer):
def run_training_loop_one_neuron_model(self, training_data):
"""
The training loop must first initialize the learnable parameters. Remember, these are the
symbolic names in your input expressions for the neural layer that do not begin with the
letter 'x'. In this case, we are initializing with random numbers from a uniform distribution
over the interval (0,1).
"""
self.vals_for_learnable_params = {param: random.uniform(0,1) for param in self.learnable_params}
self.bias = random.uniform(0,1) ## Adding the bias improves class discrimination.
## We initialize it to a random number.
class DataLoader:
"""
            To understand the logic of the dataloader, it helps to first understand how
            the training dataset is created. Search for the following method in this file:
gen_training_data(self)
As you will see in the implementation code for this method, the training dataset
consists of a Python dict with two keys, 0 and 1, the former points to a list of
all Class 0 samples and the latter to a list of all Class 1 samples. In each list,
the data samples are drawn from a multi-dimensional Gaussian distribution. The two
classes have different means and variances. The dimensionality of each data sample
is set by the number of nodes in the input layer of the neural network.
            The data loader's job is to construct a batch of samples drawn randomly from the two
            lists mentioned above. It must also associate the correct class label with each
            sample.
"""
def __init__(self, training_data, batch_size):
self.training_data = training_data
self.batch_size = batch_size
self.class_0_samples = [(item, 0) for item in self.training_data[0]] ## Associate label 0 with each sample
self.class_1_samples = [(item, 1) for item in self.training_data[1]] ## Associate label 1 with each sample
def __len__(self):
return len(self.training_data[0]) + len(self.training_data[1])
def _getitem(self):
cointoss = random.choice([0,1]) ## When a batch is created by getbatch(), we want the
## samples to be chosen randomly from the two lists
if cointoss == 0:
return random.choice(self.class_0_samples)
else:
return random.choice(self.class_1_samples)
def getbatch(self):
batch_data,batch_labels = [],[] ## First list for samples, the second for labels
maxval = 0.0 ## For approximate batch data normalization
for _ in range(self.batch_size):
item = self._getitem()
if np.max(item[0]) > maxval:
maxval = np.max(item[0])
batch_data.append(item[0])
batch_labels.append(item[1])
batch_data = [item/maxval for item in batch_data] ## Normalize batch data
batch = [batch_data, batch_labels]
return batch
data_loader = DataLoader(training_data, batch_size=self.batch_size)
loss_running_record = []
i = 0
avg_loss_over_iterations = 0.0 ## Average the loss over iterations for printing out
## every N iterations during the training loop.
for i in range(self.training_iterations):
data = data_loader.getbatch()
data_tuples = data[0]
class_labels = data[1]
y_preds, deriv_sigmoids = self.forward_prop_one_neuron_model(data_tuples) ## FORWARD PROP of data
loss = sum([(abs(class_labels[i] - y_preds[i]))**2 for i in range(len(class_labels))]) ## Find loss
loss_avg = loss / float(len(class_labels)) ## Average the loss over batch
avg_loss_over_iterations += loss_avg
if i%(self.display_loss_how_often) == 0:
avg_loss_over_iterations /= self.display_loss_how_often
loss_running_record.append(avg_loss_over_iterations)
print("[iter=%d] loss = %.4f" % (i+1, avg_loss_over_iterations)) ## Display average loss
avg_loss_over_iterations = 0.0 ## Re-initialize avg loss
y_errors = list(map(operator.sub, class_labels, y_preds))
y_error_avg = sum(y_errors) / float(len(class_labels))
deriv_sigmoid_avg = sum(deriv_sigmoids) / float(len(class_labels))
            data_tuple_avg = [sum(x) for x in zip(*data_tuples)]
            data_tuple_avg = list(map(operator.truediv, data_tuple_avg,
                                      [float(len(class_labels))] * len(data_tuple_avg)))
self.backprop_and_update_params_one_neuron_model(y_error_avg, data_tuple_avg, deriv_sigmoid_avg) ## BACKPROP loss
return loss_running_record
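
## A note on what the backprop methods below receive: for a single sigmoid neuron trained on the
## squared error, the descent direction for a learnable parameter p_i is, up to a constant factor,
## (a sketch of the assumed math, not copied from ComputationalGraphPrimer)
##     g_i = y_error * deriv_sigmoid * x_i
## where x_i is the input variable that p_i multiplies. Each of the three factors is averaged over
## the batch before being handed to backprop as y_error, vals_for_input_vars and deriv_sigmoid,
## which is why the parameter updates below add their steps rather than subtract them.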
#Create an SGD+ (SGD with momentum) subclass that inherits from ComputationalGraphPrimer_mod
class ComputationalGraphPrimer_SGDplus(ComputationalGraphPrimer_mod):
    #Specify the momentum coefficient and initialize the momentum (velocity) terms
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.momentum = 0.99
        self.bias_step = 0
        #Dictionary holding the previous update step (velocity) for each learnable parameter;
        #the keys must match the parameter names used in the expression supplied further down
        self.learnable_gradients = {"ab":0, "bc":0, "cd":0, "ac":0}
def backprop_and_update_params_one_neuron_model(self, y_error, vals_for_input_vars, deriv_sigmoid):
"""
        As should be evident from the syntax used in the following call to the backprop function,
self.backprop_and_update_params_one_neuron_model( y_error_avg, data_tuple_avg, deriv_sigmoid_avg)
^^^ ^^^ ^^^
the values fed to the backprop function for its three arguments are averaged over the training
        samples in the batch. This is in keeping with the spirit of SGD, which calls for averaging the
information retained in the forward propagation over the samples in a batch.
See Slide 59 of my Week 3 slides for the math of back propagation for the One-Neuron network.
"""
input_vars = self.independent_vars
input_vars_to_param_map = self.var_to_var_param[self.output_vars[0]]
param_to_vars_map = {param : var for var, param in input_vars_to_param_map.items()}
vals_for_input_vars_dict = dict(zip(input_vars, list(vals_for_input_vars)))
vals_for_learnable_params = self.vals_for_learnable_params
        #Loop over the learnable parameters and take a momentum-accelerated step for each
        for param in self.vals_for_learnable_params:
            # Combine the current gradient-based step with the momentum-scaled previous step
            step = (self.learning_rate * y_error * vals_for_input_vars_dict[param_to_vars_map[param]] * deriv_sigmoid) \
                   + self.momentum * self.learnable_gradients[param]
            ## Update the learnable parameter and remember its step (velocity) for the next iteration
            self.vals_for_learnable_params[param] += step
            self.learnable_gradients[param] = step
        #Update the bias with its own momentum-accelerated step
        self.bias_step = self.bias_step * self.momentum + self.learning_rate * y_error * deriv_sigmoid
        self.bias += self.bias_step
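
        ## For reference, the standard SGD-with-momentum (heavy-ball) form this method follows,
        ## in notation that is an assumption rather than taken from ComputationalGraphPrimer:
        ##     v_t     = momentum * v_{t-1} + learning_rate * g_t
        ##     theta_t = theta_{t-1} + v_t
        ## where g_t is the per-parameter descent term described before this class and v_t is
        ## what self.learnable_gradients (and self.bias_step for the bias) stores between calls.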
######################################################################################################
#Adam subclass for ComputationalGraphPrimer that inherits from ComputationalGraphPrimer_mod
class ComputationalGraphPrimer_adam(ComputationalGraphPrimer_mod):
    #Specify the Adam hyperparameters and initialize the per-parameter moment estimates
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.beta1 = 0.9
        self.beta2 = 0.99
        self.epsilon = 1e-8
        self.t = 0                                  ## Adam time-step counter
        #Running first- and second-moment estimates, one entry per learnable parameter;
        #the keys must match the parameter names used in the expression supplied further down
        self.m = {"ab":0, "bc":0, "cd":0, "ac":0}
        self.v = {"ab":0, "bc":0, "cd":0, "ac":0}
def backprop_and_update_params_one_neuron_model(self, y_error, vals_for_input_vars, deriv_sigmoid):
"""
        As should be evident from the syntax used in the following call to the backprop function,
self.backprop_and_update_params_one_neuron_model( y_error_avg, data_tuple_avg, deriv_sigmoid_avg)
^^^ ^^^ ^^^
the values fed to the backprop function for its three arguments are averaged over the training
        samples in the batch. This is in keeping with the spirit of SGD, which calls for averaging the
information retained in the forward propagation over the samples in a batch.
See Slide 59 of my Week 3 slides for the math of back propagation for the One-Neuron network.
"""
input_vars = self.independent_vars
input_vars_to_param_map = self.var_to_var_param[self.output_vars[0]]
param_to_vars_map = {param : var for var, param in input_vars_to_param_map.items()}
vals_for_input_vars_dict = dict(zip(input_vars, list(vals_for_input_vars)))
vals_for_learnable_params = self.vals_for_learnable_params
        #Advance the Adam time step once per call; it is shared by all the learnable parameters
        self.t += 1
        #Loop over the learnable parameters and take an Adam step for each
        for param in self.vals_for_learnable_params:
            # Current gradient term for this parameter (the same quantity the SGD variants use)
            gradient = y_error * vals_for_input_vars_dict[param_to_vars_map[param]] * deriv_sigmoid
            # Update the first & second moment estimates on a running-average basis
            self.m[param] = self.beta1 * self.m[param] + (1 - self.beta1) * gradient
            self.v[param] = self.beta2 * self.v[param] + (1 - self.beta2) * (gradient ** 2)
            #Compute the bias-corrected estimates of m & v
            m_hat = self.m[param] / (1 - self.beta1 ** self.t)
            v_hat = self.v[param] / (1 - self.beta2 ** self.t)
            step = self.learning_rate * m_hat / (v_hat ** 0.5 + self.epsilon)
            ## Update the learnable parameter
            self.vals_for_learnable_params[param] += step
        self.bias += self.learning_rate * y_error * deriv_sigmoid     ## Update the bias with a plain SGD step
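
        ## For reference, the standard Adam update this method follows, in notation that is an
        ## assumption rather than taken from ComputationalGraphPrimer:
        ##     m_t   = beta1 * m_{t-1} + (1 - beta1) * g_t
        ##     v_t   = beta2 * v_{t-1} + (1 - beta2) * g_t**2
        ##     m_hat = m_t / (1 - beta1**t),    v_hat = v_t / (1 - beta2**t)
        ##     theta_t = theta_{t-1} + learning_rate * m_hat / (sqrt(v_hat) + epsilon)
        ## The "+" in the last line reflects that g_t already points along the descent direction
        ## for the squared-error loss (see the note preceding the optimizer classes).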
######################################################################################################
#For plain SGD without a step-size optimizer
cgp = ComputationalGraphPrimer_mod(
one_neuron_model = True,
expressions = ['xw=ab*xa+bc*xb+cd*xc+ac*xd'],
output_vars = ['xw'],
dataset_size = 5000,
learning_rate = learn_rate,
# learning_rate = 5 * 1e-2,
training_iterations = 40000,
batch_size = 8,
display_loss_how_often = 100,
debug = True,
)
#For SGD with momentum (SGD+)
cgp_plus = ComputationalGraphPrimer_SGDplus(
one_neuron_model = True,
expressions = ['xw=ab*xa+bc*xb+cd*xc+ac*xd'],
output_vars = ['xw'],
dataset_size = 5000,
learning_rate = learn_rate,
# learning_rate = 5 * 1e-2,
training_iterations = 40000,
batch_size = 8,
display_loss_how_often = 100,
debug = True,
)
#For SGD with the Adam optimizer
cgp_adam = ComputationalGraphPrimer_adam(
one_neuron_model = True,
expressions = ['xw=ab*xa+bc*xb+cd*xc+ac*xd'],
output_vars = ['xw'],
dataset_size = 5000,
learning_rate = learn_rate,
# learning_rate = 5 * 1e-2,
training_iterations = 40000,
batch_size = 8,
display_loss_how_often = 100,
debug = True,
)
cgp.parse_expressions()
cgp_plus.parse_expressions()
cgp_adam.parse_expressions()
#cgp.display_network1()
#cgp.display_network2()
# cgp.display_one_neuron_network()
training_data_original = cgp.gen_training_data()
training_data_plus = cgp_plus.gen_training_data()
training_data_adam = cgp_adam.gen_training_data()
#Run the three training loops and collect their loss records for plotting
loss_running_record = cgp.run_training_loop_one_neuron_model( training_data_original)
loss_running_record_plus = cgp_plus.run_training_loop_one_neuron_model( training_data_plus )
loss_running_record_adam = cgp_adam.run_training_loop_one_neuron_model( training_data_adam )
#Plot the three loss records on the same figure
plt.figure()
plt.plot(loss_running_record, label = 'SGD')
plt.plot(loss_running_record_plus, label = 'SGD+')
plt.plot(loss_running_record_adam, label = 'Adam')
plt.legend()
plt.show()
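
#Optionally the comparison plot can also be written to disk; 'loss_comparison.png' below is just an
#illustrative filename (matplotlib's savefig should be called before plt.show(), since the figure
#may no longer be available for saving after the display window is closed):
#    plt.savefig('loss_comparison.png')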