# Unsupervised Convolutional MNIST Classification

This code places our information maximizing network on top of of a convolutional network. The backpropogation of the gradients into the convolutional network will make it such that the convolutional network captures as much meaningful information about the digits as possible. This in turn groups the images of numbers by value without any ground truth data. Although the results do not compare to unsupervised benchmarks, two images of the same number value are grouped into the same group with >50% probability

In [0]:
import tensorflow as tf
import math
import itertools
import numpy as np
import random
mnist = tf.keras.datasets.mnist

tf.enable_eager_execution()

(x_train, y_train),(x_test, y_test) = mnist.load_data()
print(len(x_train))
data_size = 1000
x_train = x_train[0:data_size]
y_train = y_train[0:data_size]
USING_MNIST = True
# x_train = []

# for i in range(300):
#   x = random.randint(0,7)
# #   x_train.append([(x//8)%2,(x//4)%2,(x//2)%2,x%2])
#   x_train.append([0 if x != i else 1 for i in range(8)])
# for x in range(8):
#   x_train.append([0 if x != i else 1 for i in range(8)])

#   r = random.random()   
#   if r < .3333:
#     x_train.append([1,1,1])
#   elif r < .666:
#     x_train.append([1,1,0])
#   else:
#     x_train.append([0,1,1])
# x_train = np.array(x_train)

s = tf.dtypes.cast(tf.constant(x_train > 125),dtype=tf.float32)
s = tf.reshape(s,[data_size,-1])


In [0]:
N = 3
K = .05
L = .4
MU = 1

s_size = s.shape[1]

#creating an array of the permutations to aid in later computation of H(Y^0...Y^N)
def get_options():
  bin_options = list(itertools.product([0,1], repeat=N+1))
  options = []
  for row_index, row in enumerate(bin_options):
    row_building = []
    for index,val in enumerate(row):
      row_building.append(index*2+val)
    options.append(row_building)
  return options

options = get_options()

#weights
# x = tf.Variable(tf.random_normal(shape=[int(s.shape[1]),N+1], stddev=0.1))
# B = tf.Variable(tf.random_normal(shape=[N+1], stddev=0.1))

In [0]:

# Create the neural network
def conv_net(x, n_classes, dropout, reuse, is_training):

    # TF Estimator input is a dict, in case of multiple inputs


    # MNIST data input is a 1-D vector of 784 features (28*28 pixels)
    # Reshape to match picture format [Height x Width x Channel]
    # Tensor input become 4-D: [Batch Size, Height, Width, Channel]
    x = tf.reshape(x, shape=[-1, 28, 28, 1])

    # Convolution Layer with 32 filters and a kernel size of 5
    conv1 = tf.layers.conv2d(x, 32, 5, activation=tf.nn.relu)
    # Max Pooling (down-sampling) with strides of 2 and kernel size of 2
    conv1 = tf.layers.max_pooling2d(conv1, 2, 2)

    # Convolution Layer with 64 filters and a kernel size of 3
    conv2 = tf.layers.conv2d(conv1, 64, 3, activation=tf.nn.relu)
    # Max Pooling (down-sampling) with strides of 2 and kernel size of 2
    conv2 = tf.layers.max_pooling2d(conv2, 2, 2)

    # Flatten the data to a 1-D vector for the fully connected layer
    fc1 = tf.contrib.layers.flatten(conv2)

    # Fully connected layer (in tf contrib folder for now)
    fc1 = tf.layers.dense(fc1, 1024)
    # Apply Dropout (if is_training is False, dropout is not applied)
    fc1 = tf.layers.dropout(fc1, rate=dropout, training=is_training)

    # Output layer, class prediction
    out = tf.layers.dense(fc1, n_classes)

    return out


#s: list of flattened images
#returns: num_images by N vector of node probabilities for each image
def compute_nodes(s):
  output = conv_net(s,N+1,.25,False,True)
#   layer = tf.matmul(s,x)+B
  
  output = tf.clip_by_value(tf.nn.sigmoid(output),-4,4)
  return tf.clip_by_value(tf.nn.sigmoid(output),.01,.99)



def compute_Hs(computed_nodes):
  one_over_num_samples = 1/tf.dtypes.cast(computed_nodes.shape[0],dtype=tf.float32)
  return -tf.math.log(one_over_num_samples)/math.log(2)
  

#p(y^k)
def compute_pyk(computed_nodes):
   return tf.math.reduce_mean(computed_nodes,axis=0)#CHECKED
  
def compute_Hs_y(computed_nodes):
  num_samples = tf.dtypes.cast(computed_nodes.shape[0],dtype=tf.float32)
  prob_y_and_s = (1./num_samples)*computed_nodes
  prob_y_and_not_s = (1./num_samples)*(1-computed_nodes)
  logedvals = prob_y_and_s*tf.math.log(prob_y_and_s)/math.log(2) + prob_y_and_not_s*tf.math.log(prob_y_and_not_s)/math.log(2)
  return -tf.math.reduce_sum(logedvals,axis=0)

#H(Y^k)
#kth elemenent of returned value = H(Y^k)
def compute_Hy_k(computed_nodes):
  pyk = compute_pyk(computed_nodes)
  logednodes_1 = tf.math.log(pyk)/math.log(2)
  logednodes_2 = tf.math.log(1-pyk)/math.log(2)
  
  return -logednodes_1*pyk - logednodes_2*(1-pyk) #CHECKED




#H(Y^0,...,Y^N)
def compute_Hy(computed_nodes):
  tf_options = tf.constant(options, dtype=tf.float32)
  computed_nodes_both = [computed_nodes,1-computed_nodes]
  
  def process_image(row):
    odds_both = [row,1-row]
    def fnc(y):
      def inner_func(x):
        x = tf.dtypes.cast(x,dtype=tf.int32)
        return tf.gather(tf.gather(odds_both,x%2),x//2)
      return tf.map_fn(inner_func, y)
    '''
      vals is 2^(N+1) by N, where the [i][j] entry is probability that
      in the ith permutation, say 0101,
      (y^0=0, y^1=1, y^2=0, y^3=1)
    '''
    vals = tf.map_fn(fnc,tf_options)

    reduced = tf.math.reduce_prod(vals,axis=1)

    return reduced
  
  #p(y^0,y^1,...) length 2^n+1
  pys = tf.math.reduce_mean(tf.map_fn(process_image,computed_nodes),axis=0)
  return - tf.math.reduce_sum(pys*tf.math.log(pys)/math.log(2)) #Checked
  
  
  
  


#I(S:Y^k)
#output[s][k] is mutual information between image s and node k
def compute_Is_yk(computed_nodes):
  pyk = compute_pyk(computed_nodes)
  num_samples = tf.dtypes.cast(computed_nodes.shape[0],dtype=tf.float32)
  
  #yk=1
  a = computed_nodes/pyk
  output = computed_nodes/num_samples * tf.math.log(a)/math.log(2)
  #yk=2
  b = (1-computed_nodes)/(1-pyk)
  output += (1-computed_nodes)/num_samples * tf.math.log(b)/math.log(2)
  return output  #CHECKED



#computer H(Y^i,Y^j)
def compute_H_2(computed_nodes):

  

  def sub_compute(computed_nodes_1, computed_nodes_2):
    def top(x):
      return tf.reshape(tf.tile(x,[x.shape[0]]),[x.shape[0],x.shape[0]])
    def bot(x):
      return tf.transpose(tf.reshape(tf.tile(x,[x.shape[0]]),[x.shape[0],x.shape[0]]))
    pairwise1 = tf.map_fn(bot,computed_nodes_1)
    pairwise2 = tf.map_fn(top,computed_nodes_2)
    prob_a_b = tf.math.reduce_mean(pairwise1*pairwise2,axis=0) #reduce mean across sample size
#     print(prob_a_b[0][1]*math.log(prob_a_b[0][1])/math.log(2))
#     print("\nprob a b:\n{}\n".format(prob_a_b))
    return prob_a_b*tf.math.log(prob_a_b)/math.log(2)


  info = sub_compute(computed_nodes, computed_nodes)+\
          sub_compute(1-computed_nodes, computed_nodes)+\
          sub_compute(computed_nodes, 1-computed_nodes)+\
          sub_compute(1-computed_nodes, 1-computed_nodes)
  return -info*(1-tf.eye(N+1,N+1))

#create NXN matrix  where [i][j]
#is the mutual information between Y^i and Y^j
def compute_I(computed_nodes):

  

  def sub_compute(computed_nodes_1, computed_nodes_2):
    def top(x):
      return tf.reshape(tf.tile(x,[x.shape[0]]),[x.shape[0],x.shape[0]])
    def bot(x):
      return tf.transpose(tf.reshape(tf.tile(x,[x.shape[0]]),[x.shape[0],x.shape[0]]))
    pairwise1 = tf.map_fn(bot,computed_nodes_1)
    pairwise2 = tf.map_fn(top,computed_nodes_2)
    prob_a_b = tf.math.reduce_mean(pairwise1*pairwise2,axis=0) #reduce mean across sample size

    
    prob_a = bot(tf.math.reduce_mean(computed_nodes_1,0))
    prob_b = top(tf.math.reduce_mean(computed_nodes_2,0))

  
    
#     print(prob_a*prob_b)
    return prob_a_b*tf.math.log(prob_a_b/(prob_a*prob_b))/math.log(2)


  info = sub_compute(computed_nodes, computed_nodes)+\
          sub_compute(1-computed_nodes, computed_nodes)+\
          sub_compute(computed_nodes, 1-computed_nodes)+\
          sub_compute(1-computed_nodes, 1-computed_nodes)
#   info = sub_compute(1-computed_nodes, computed_nodes)
  return info*(1-tf.eye(N+1,N+1))



  
#returns num_images by N array, where
#[i][j] is the utility of node j with respect to image i
def compute_utility(computed_nodes,image_outcome_info):
  
  first_term = MU*image_outcome_info #num_images by N vectir
  tmp = compute_I(computed_nodes)

  second_term = -(K-L)/N*(tf.reduce_sum(tmp,axis=0))
  third_term = -(MU-K)*compute_Hy_k(computed_nodes) #vector length N
  fourth_term = -L/N*tf.math.reduce_sum(compute_Hy(computed_nodes)) #number
  
#   print("\nfirst term:\n{}\n".format(first_term))
#   print("\nsecond term:\n{}\n".format(second_term))
#   print("\nthird term:\n{}\n".format(third_term))
#   print("\nfourth term:\n{}\n".format(fourth_term))
  
  return first_term+second_term+third_term+fourth_term

# def compute_total_info(computed_nodes, image_outcome


def compute_information(computed_nodes):
  term1 = (N+1)*compute_Hs(computed_nodes)
  print(term1)
#   print("term1:\n{}".format(term1))
  term2 = -tf.reduce_sum(compute_Hs_y(computed_nodes))
#   print("term2:\n{}".format(compute_Hs_y(computed_nodes)))
  term3 = compute_Hy(computed_nodes)
#   print("term3:\n{}".format(term3))
  return term1+term2+term3

def compute_utility_2(computed_nodes, mu, lamb, kappa):
  term1 = mu*(compute_Hs(computed_nodes)-compute_Hs_y(computed_nodes))
  hyk = compute_Hy_k(computed_nodes)
  term2 = lamb*N*hyk
  term3 = -kappa*(tf.reduce_sum(hyk))-hyk
  term4 = (lamb-kappa)*tf.reduce_sum(compute_H_2(computed_nodes),axis=0)
#   print("reduced sum:")
#   print(compute_H_2(computed_nodes))

                 
#   print("trem4")
#   print(term4)
  
  return term1+term2+term3+term4
  
 

In [0]:
def get_x_numb(x):
  return int(8*float(x[0])+4*float(x[1])+2*float(x[2])+float(x[3]))
def get_output(x):
#   print(type(x))
  output2 = map(lambda y:str(int(round(float(y)))),x)
  return str("".join(output2))
def get_counts(key_list):
    countdict = {}
    for outcome in key_list:
      if outcome not in countdict:
        countdict[outcome] = 1
      else:
        countdict[outcome] += 1
    return countdict

def get_info(output):

  if USING_MNIST:
    x_train_reshaped = x_train.reshape(data_size,784)
  outcomes_as_keys = list(map( get_output,output))
  inputs_as_keys = list(map(get_output,x_train_reshaped))
  
  count_dict_outcome = get_counts(outcomes_as_keys)
  count_dict_input = get_counts(inputs_as_keys)
  
  input_output = {}
  for i in range(len(inputs_as_keys)):
    input_output[inputs_as_keys[i]] = outcomes_as_keys[i]
  
#   print(odds_image)


  
  mutual_info = 0
  number_samples = float(len(output))
  #a
  for input_str, input_odds in count_dict_input.items():
    output_str = input_output[input_str]
    output_odds= count_dict_outcome[output_str]
    mutual_info += input_odds/number_samples * math.log(1/(output_odds/number_samples))/math.log(2)
    
  
  return mutual_info

In [0]:

def get_information_no_tf(computed_nodes, mu, lamb, kappa):
  computed_nodes_tf = computed_nodes
  
  computed_nodes = list(computed_nodes)
  computed_nodes_np = np.array(computed_nodes)
  

  num_images = float(len(computed_nodes))
  
  answer = [0]*len(computed_nodes[0])

  for row in computed_nodes:
    for node_index, val in enumerate(row):

      answer[node_index] += float(mu*(1./num_images)*val*math.log(val)/math.log(2))
      val2 = 1-val
      answer[node_index] += float(mu*(1./num_images)*val2*math.log(val2)/math.log(2))


 
  def get_conditionals(computed_nodes1,computed_nodes2):
    pyk = compute_pyk(computed_nodes2)
    conditional_probabilities = [[0 for i in range(N+1)] for k in range(N+1)]
    for i in range(N+1):
        for k in range(N+1):
          if i == k:
            continue
          sumer = 0.
          count = 0.
          for im in range(len(computed_nodes)):
            sumer += computed_nodes1[im][i]*computed_nodes2[im][k]
            count += 1.
          conditional_probabilities[i][k] = float((sumer/count)/pyk[k])
    return np.array(conditional_probabilities)
  
  

  conditional_probabilities = [get_conditionals(computed_nodes_np,computed_nodes_np), get_conditionals(1-computed_nodes_np,computed_nodes_np), 
                               get_conditionals(computed_nodes_np,1-computed_nodes_np), get_conditionals(1-computed_nodes_np,1-computed_nodes_np)]

  
  
  print_dictionairy = {0:"p(s{},y^{}=1,y^{}=1)",1:"p(s{},y^{}=2,y^{}=1)",2:"p(s{},y^{}=1,y^{}=2)",3:"p(s{},y^{}=2,y^{}=2)"}
  print_dictionairy_conditional = {0:"p(y^{}=1 | y^{}=1)",1:"p(y^{}=2 | y^{}=1)",2:"p(y^{}=1 | y^{}=2)",3:"p(y^{}=2 | y^{}=2)"}

  for i_index, image in enumerate(computed_nodes):
    for j, val_j_original in enumerate(image):
      for k, val_k_original in enumerate(image):
        if j == k:
          continue
        for value_index in range(4):
          val_j = val_j_original
          if value_index == 1 or value_index == 3:
            val_j = 1-val_j_original
          val_k = val_k_original
          if value_index == 2 or value_index == 3:
            val_k = 1-val_k_original

          
          addme =  float(lamb*(1./num_images)*val_j*val_k)*math.log(conditional_probabilities[value_index][j][k])/math.log(2)
#           if k == 0:
#             print(print_dictionairy[value_index].format(i_index,j,k) + "* log(" + print_dictionairy_conditional[value_index].format(j,k) + \
#                   ") = " + str(float((1./num_images)*val_j*val_k)) + " * log(" + str(conditional_probabilities[value_index][j][k]) + ") = " + str(addme))
          
          answer[k] += addme
 

  for i_index, image in enumerate(computed_nodes):
    for j, val_j_original in enumerate(image):
      for k, val_k_original in enumerate(image):
        if j == k:
          continue
        for value_index in range(4):
          val_j = val_j_original
          if value_index == 1 or value_index == 3:
            val_j = 1-val_j_original
          val_k = val_k_original
          if value_index == 2 or value_index == 3:
            val_k = 1-val_k_original
          
          answer[k] -= float(kappa*(1./num_images)*val_j*val_k)*math.log(conditional_probabilities[value_index][k][j])/math.log(2)
        
  return answer


# print(compute_Hs(computed_nodes))
# print(compute_Hs_y(computed_nodes))
# print(compute_Hy_k(computed_nodes))
# print(compute_H_2(computed_nodes))
# print(compute_Hy(computed_nodes))

# print("\nH(s):\n{}".format(compute_Hs(computed_nodes)))
# print("H(s,y^k):\n{}\n".format(compute_Hs_y(computed_nodes)))
# print("Real")
# print(get_information_no_tf(computed_nodes, 1, 2, 1))
# print("tf")
# computed_nodes = tf.constant([[0.75, 0.25, 0.5],[0.1, 0.9, 0.5]])
# computed_nodes2 = tf.constant([[0.99, 0.01, 0.5],[0.01, 0.99, 0.5]])
# computed_nodes3 = tf.constant([[1, 0, 0.01],[0.01, 0.99, 0.99]])

# print(compute_utility_2(computed_nodes,3,1,2))
# print(compute_utility_2(computed_nodes2,3,1,2))
# print(compute_utility_2(computed_nodes3,3,1,2))
# print(N*compute_Hy_k(computed_nodes))


In [0]:
iterations = 4000

output = None
output_checkpoint = None

computed_nodes = compute_nodes(s)
image_outcome_info = compute_Is_yk(computed_nodes)

utility = compute_utility_2(computed_nodes,1,.4,.05)
information = compute_information(computed_nodes)

train_ops = []
for i in range(N+1):
  train_ops.append(tf.train.AdamOptimizer(1e-3).minimize(-utility[i]))
train_op = tf.train.AdamOptimizer(1e-2).minimize(-tf.reduce_sum(utility))
init_op = tf.initialize_all_variables()

last_information = 0
with tf.Session() as sess:
    sess.run(init_op)

#     print("\nresults:\n{}".format(sess.run(computed_nodes)))
    for i in range(iterations):

      if i%2==0:
        output = sess.run(computed_nodes)
        print("trial {}: utility = {}, information={}".format(i,sess.run(utility),get_info(output)))
        print(sess.run(computed_nodes))
        if np.isfinite(output).all():
          output_checkpoint = output
          
        information = get_info(output)
#         if information == last_information:
#           x = random.randint(0,2)
#           print("entering fixit mode: Node {} selected".format(x+1))
#           for k in range(100):
#              sess.run(train_ops[x])
#           information = get_info(output)
#           print("ending, new info = {}".format(information))
          
        last_information = information
        
        
      for node in range(N+1):
        sess.run(train_ops[node])
#       sess.run(train_op)
      if i % 50 == 0 and not np.isfinite(output).all():
        break
    print("\nimages:\n{}".format(sess.run(s)))
#     print("\nweights:\n{}".format(sess.run(x)))
#     print("\nbiases:\n{}".format(sess.run(B)))
    print("\nresults:\n{}".format(sess.run(computed_nodes)))
    output = sess.run(computed_nodes)
    


Instructions for updating:
Use keras.layers.conv2d instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.max_pooling2d instead.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use keras.layers.dropout instead.
tf.Tensor(39.863136, shape=(), dtype=float32)


RuntimeError: ignored

In [0]:
0print(output)

In [0]:
def dotheprint():
  output = output_checkpoint
  get_info(output)

  numbers_to_output = {}
  for i in range(len(x_train)):
    if USING_MNIST:
      num = y_train[i]
    else:
      num = x_train[i][0]*8+x_train[i][1]*4+x_train[i][2]*2+x_train[i][3]
  #     print(num)

    o = list(map(lambda x: int(x),list(get_output(output[i]))))
    
    o = 8*o[0]+4*o[1]+2*o[2]+o[3]
#     print(o)
#     print(o)
    num,o=o,num
  #   print(o)
  #   print("{}->{}".format(num,output))
    s = numbers_to_output.get(num,[])
    s.append(o)
    numbers_to_output[num] = s
  for i in range(16):
    if i in numbers_to_output:
      print("{} -> {}".format(i,set(numbers_to_output[i])))
  #     print(i)
  #     print(set(numbers_to_output[i]))

#   if USING_MNIST:
#     string_to_output = {'000':0,'001':1,'010':2,'011':3,'100':4,'101':5,'110':6,'111':7}
#     def get_output_count(num):
#       counts = [0]*8
#       for val in numbers_to_output[num]:
#         counts[string_to_output[val]] += 1
#       return counts
#     for n in range(10):
#       print(", ".join(map(lambda t: str(t),get_output_count(n))))
dotheprint()

0 -> {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}


In [0]:
# output = output_checkpoint
print(output)
# print(get_info(output)) 



numbs = {}
for a in range(2):
  for b in range(2):
    for c in range(2):
      numbs[str(a)+str(b)+str(c)] = set()
for i in range(len(output)):
  key = get_output(output[i])

  value = np.argmax(x_train[i])
#   if value == 15:
#     print()
#     print(output[i])
#     print(x_train[i])
  numbs[key].add(value)

    
print(numbs)

In [0]:
# |