In [1]:
import tensorflow as tf
import numpy as np
from sklearn.metrics import roc_auc_score
import time



In [2]:
bin_vec_dim = 88
embedding_dim = 6
dim = 128
keep_prob = 0.75

batch_size = 256
test_size = 256

In [3]:
def init_weights(shape, name):
    return tf.get_variable(name=name, shape=shape, dtype=tf.float32,
                           initializer=tf.contrib.layers.variance_scaling_initializer(
                               factor=1.0, mode='FAN_AVG', uniform=True))

def init_bias(shape, name):
    if len(shape) > 1:
        raise Exception('Bias should be a vector.')
    return tf.get_variable(name=name, shape=shape, dtype=tf.float32,
                           initializer=tf.constant_initializer(
                               0.01))
def batch_act(h, act, phase, scope):
    with tf.variable_scope(scope):
        return act(h)
def from_sparse_arr(sparse_arr):
    mat = np.zeros((dim, dim, bin_vec_dim), dtype=np.float32)
    for (i, j, k) in sparse_arr:
        mat[i, j, k] = 1
    return mat

def from_sparse_arrs(sparse_arrs):
    mats = []
    for sparse_arr in sparse_arrs:
        mats.append(from_sparse_arr(sparse_arr))
    mats = np.array(mats, dtype=np.float32)
    return mats


In [5]:
def variable_summaries(var):
    """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
    with tf.name_scope('summaries'):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean', mean)
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))
        tf.summary.histogram('histogram', var)


In [6]:
def model(X, dropout, phase):
    global reg_term
    num = tf.shape(X)[0]
    with tf.name_scope('emb_layer'):
        wf = init_weights([bin_vec_dim, embedding_dim], 'wf')
        reg_term = tf.nn.l2_loss(wf)
        variable_summaries(wf)
        bf = init_bias([embedding_dim], 'bf')
        variable_summaries(bf)
        X = tf.reshape(X, [num * dim * dim, bin_vec_dim])
        h0 = tf.nn.bias_add(tf.matmul(X, wf), bf)
        h0 = batch_act(h0, phase=phase, act=tf.nn.elu, scope='emb_layer_bn')
        h0 = tf.reshape(h0, [num * dim, dim * embedding_dim])
        h0 = tf.nn.dropout(h0, dropout)
    with tf.name_scope('row_fc_layer1'):
        wr1 = init_weights([embedding_dim * dim, 256], 'wr1')  # 128
        reg_term += tf.nn.l2_loss(wr1)
        br1 = init_bias([256], 'br1')
        h1 = tf.nn.bias_add(tf.matmul(h0, wr1), br1)
        h1 = batch_act(h1, phase=phase, act=tf.nn.elu, scope='row_fc_layer1_bn')
        h1 = tf.nn.dropout(h1, dropout)
    with tf.name_scope('row_fc_layer2'):
        wr2 = init_weights([256, 64], 'wr2')  # 32
        reg_term += tf.nn.l2_loss(wr2)
        br2 = init_bias([64], 'br2')
        h2 = tf.nn.bias_add(tf.matmul(h1, wr2), br2)
        h2 = batch_act(h2, phase=phase, act=tf.nn.elu, scope='row_fc_layer2_bn')
        h2 = tf.reshape(h2, [num, dim, 64])  # 32
    with tf.name_scope('avg_pooling'):
        h3 = tf.reduce_mean(h2, 1)
    return h3


In [7]:
def classification_predict(hl, hr, dropout, phase):
    h41 = tf.concat(values=[hl, hr], axis=1)
    with tf.name_scope('fc_layer1_1'):
        w5 = init_weights([128, 32], 'w5')  # 64 16
        b5 = init_bias([32], 'b5')
        h5_1 = tf.nn.bias_add(tf.matmul(h41, w5), b5)
        h5_1 = batch_act(h5_1, phase=phase, act=tf.nn.elu,
                         scope='fc_layer1_1_bn')
    h42 = tf.concat(values=[hr, hl], axis=1)
    with tf.name_scope('fc_layer1_2'):
        h5_2 = tf.nn.bias_add(tf.matmul(h42, w5), b5)
        h5_2 = batch_act(h5_2, phase=phase, act=tf.nn.elu,
                         scope='fc_layer1_2_bn')
    h5 = (h5_1 + h5_2) / 2.
    with tf.name_scope('sm_layer'):
        w7 = init_weights([32, 2], 'w7')
        variable_summaries(w7)
        o = tf.matmul(h5, w7)
    return o


In [8]:
def stat(Y, predicted_Y, fout=None):
    real_positive_count = 0
    predict_positive_count = 0
    recall = 0
    precision = 0
    for i in xrange(Y.shape[0]):
        if Y[i] == 1:
            real_positive_count += 1
            if predicted_Y[i] == 1:
                recall += 1
        if predicted_Y[i] == 1:
            predict_positive_count += 1
            if Y[i] == 1:
                precision += 1
    retrieved_positive_count = recall
    recall /= real_positive_count * 1.0
    precision /= max(predict_positive_count * 1.0, 1.0)
    f1_score = 2 * recall * precision / max(
    recall + precision, 0.00001)
    print "Clone pairs: %d, non-clone pairs: %d " % (
    real_positive_count, Y.shape[0] - real_positive_count)
    print "Recall: %f, precision: %f, f1 score: %f" % (
    recall, precision, f1_score)
    print "Predicted_positive_count: %d, recall truly positive: %d, false positive: %d, missed true positive: %d" \
          % (predict_positive_count, retrieved_positive_count,
             predict_positive_count - retrieved_positive_count,
             real_positive_count - retrieved_positive_count)
    if fout is not None:
        fout.write("Clone pairs: %d, non-clone pairs: %d\n" % (
    real_positive_count, Y.shape[0] - real_positive_count))
        fout.write("Recall: %.4f, precision: %.4f, f1 score: %.4f\n" % (
    recall, precision, f1_score))
        fout.write("Predicted_positive_count: %d, recall truly positive: %d, "
                   "false positive: %d, missed true positive: %d\n" \
          % (predict_positive_count, retrieved_positive_count,
             predict_positive_count - retrieved_positive_count,
             real_positive_count - retrieved_positive_count))
    return recall, precision, f1_score



In [9]:
def predict_on_full_dataset(fold):
    tf.reset_default_graph()
    with tf.name_scope('input'):
        X_left = tf.placeholder(tf.float32, [None, dim, dim, bin_vec_dim])
        X_right = tf.placeholder(tf.float32, [None, dim, dim, bin_vec_dim])
        Y = tf.placeholder(tf.float32, [None, 2])
    dropout = tf.placeholder(tf.float32)
    phase = tf.placeholder(tf.bool, name='phase')

    with tf.variable_scope('encoding'):
        h_op = model(X_left, dropout, phase)

    h_left = tf.placeholder(tf.float32, [None, 64])
    h_right = tf.placeholder(tf.float32, [None, 64])
    py_x = classification_predict(h_left, h_right, dropout, phase)
    predict_op = tf.argmax(py_x, 1)

    file_path = "./dataset/g4_128.npy"
    dataset = np.load(file_path, allow_pickle=True)
    X, y = np.array(dataset['X']), np.array(dataset['y'], dtype=np.int)
    
    t_beg = time.clock()
    saver = tf.train.Saver()
    sess = tf.InteractiveSession()
    saver.restore(sess, '10_fold_balanced/'+fold+'/mode.ckpt')

    iter = 0
    X_reps = []
    for start, end in zip(range(0, np.shape(X)[0], batch_size), \
                     range(batch_size, np.shape(X)[0] + 1, batch_size)):
        dense_X = from_sparse_arrs(X[start:end])
        h_val = sess.run(h_op, feed_dict={X_left: dense_X, dropout: 1.0,
                                          phase:0})
        X_reps.extend(h_val.tolist())
    dense_X = from_sparse_arrs(X[end:])
    h_val = sess.run(h_op, feed_dict={X_left: dense_X, dropout: 1.0, phase:0})
    X_reps.extend(h_val.tolist())
    test_X_left = []
    test_X_right = []
    test_Y = []
    for i in xrange(y.shape[0]):
        for j in xrange(i+1, y.shape[0]):
            if y[i] == y[j]:
                test_X_left.append(X_reps[i])
                test_X_right.append(X_reps[j])
                test_Y.append([0, 1])
            else:
                test_X_left.append(X_reps[i])
                test_X_right.append(X_reps[j])
                test_Y.append([1, 0])
    test_X_left = np.array(test_X_left)
    test_X_right = np.array(test_X_right)
    test_Y = np.array(test_Y, dtype=np.float32)
    

    overall_predict_Y = []
    for start, end in zip(range(0, np.shape(test_X_left)[0], batch_size),
                          range(batch_size, np.shape(test_X_left)[0] + 1,
                                batch_size)):
        predict_Y = sess.run(predict_op,
                             feed_dict={h_left: test_X_left[start:end],
                                        h_right: test_X_right[start:end],
                                        dropout: 1.0, phase: 0})  # no dropout
        overall_predict_Y.extend(predict_Y.tolist())
        iter += 1

    stat(np.argmax(test_Y[:end], axis=1),
         np.array(overall_predict_Y, dtype=np.int32))

In [10]:
# fold 1
predict_on_full_dataset('1')

INFO:tensorflow:Restoring parameters from 10_fold_balanced/1/mode.ckpt
Clone pairs: 275496, non-clone pairs: 1116376 
Recall: 0.967571, precision: 0.972457, f1 score: 0.970008
Predicted_positive_count: 274112, recall truly positive: 266562, false positive: 7550, missed true positive: 8934


In [10]:
# fold 0
predict_on_full_dataset('0')


INFO:tensorflow:Restoring parameters from 10_fold_balanced/0/mode.ckpt




Clone pairs: 275496, non-clone pairs: 1116376 
Recall: 0.966983, precision: 0.956179, f1 score: 0.961551
Predicted_positive_count: 278609, recall truly positive: 266400, false positive: 12209, missed true positive: 9096


In [11]:
def predict_on_full_dataset_probs(fold):
    tf.reset_default_graph()
    with tf.name_scope('input'):
        X_left = tf.placeholder(tf.float32, [None, dim, dim, bin_vec_dim])
        X_right = tf.placeholder(tf.float32, [None, dim, dim, bin_vec_dim])
        Y = tf.placeholder(tf.float32, [None, 2])
    dropout = tf.placeholder(tf.float32)
    phase = tf.placeholder(tf.bool, name='phase')

    with tf.variable_scope('encoding'):
        h_op = model(X_left, dropout, phase)

    h_left = tf.placeholder(tf.float32, [None, 64])
    h_right = tf.placeholder(tf.float32, [None, 64])
    py_x = classification_predict(h_left, h_right, dropout, phase)
    # predict_op = tf.argmax(py_x, 1) changed
    probabilities = tf.nn.softmax(py_x)

    file_path = "./dataset/g4_128.npy"
    dataset = np.load(file_path, allow_pickle=True)
    X, y = np.array(dataset['X']), np.array(dataset['y'], dtype=np.int)
    
    t_beg = time.clock()
    saver = tf.train.Saver()
    sess = tf.InteractiveSession()
    saver.restore(sess, '10_fold_balanced/'+fold+'/mode.ckpt')

    iter = 0
    X_reps = []
    for start, end in zip(range(0, np.shape(X)[0], batch_size), \
                     range(batch_size, np.shape(X)[0] + 1, batch_size)):
        dense_X = from_sparse_arrs(X[start:end])
        h_val = sess.run(h_op, feed_dict={X_left: dense_X, dropout: 1.0,
                                          phase:0})
        X_reps.extend(h_val.tolist())
    dense_X = from_sparse_arrs(X[end:])
    h_val = sess.run(h_op, feed_dict={X_left: dense_X, dropout: 1.0, phase:0})
    X_reps.extend(h_val.tolist())
    test_X_left = []
    test_X_right = []
    test_Y = []
    for i in xrange(y.shape[0]):
        for j in xrange(i+1, y.shape[0]):
            if y[i] == y[j]:
                test_X_left.append(X_reps[i])
                test_X_right.append(X_reps[j])
                test_Y.append([0, 1])
            else:
                test_X_left.append(X_reps[i])
                test_X_right.append(X_reps[j])
                test_Y.append([1, 0])
    test_X_left = np.array(test_X_left)
    test_X_right = np.array(test_X_right)
    test_Y = np.array(test_Y, dtype=np.float32)
    

    overall_predict_probs = []
    for start, end in zip(range(0, np.shape(test_X_left)[0], batch_size),
                          range(batch_size, np.shape(test_X_left)[0] + 1,
                                batch_size)):
#         predict_Y = sess.run(predict_op,
#                              feed_dict={h_left: test_X_left[start:end],
#                                         h_right: test_X_right[start:end],
#                                         dropout: 1.0, phase: 0})  # no dropout
#         overall_predict_Y.extend(predict_Y.tolist())
        
        prob_vals = sess.run(probabilities,
                             feed_dict={h_left: test_X_left[start:end],
                                        h_right: test_X_right[start:end],
                                        dropout: 1.0, phase: 0})  # no dropout
        probs_class_1 = prob_vals[:, 1]
        overall_predict_probs.extend(probs_class_1.tolist())
        iter += 1

#     stat(np.argmax(test_Y[:end], axis=1),
#          np.array(overall_predict_Y, dtype=np.int32))
    true_labels = np.argmax(test_Y[:len(overall_predict_probs)], axis=1)
    
    # Option A: Compute AUC-ROC
    from sklearn.metrics import roc_auc_score
    auc = roc_auc_score(true_labels, overall_predict_probs)
    print("AUC-ROC:", auc)

    # Option B: Apply threshold to get predicted labels
    threshold = 0.5  # You can adjust this threshold
    predicted_labels = (np.array(overall_predict_probs) >= threshold).astype(int)

    # Use your existing stat function
    stat(true_labels, predicted_labels)

In [12]:
# fold 1
predict_on_full_dataset_probs('1')


INFO:tensorflow:Restoring parameters from 10_fold_balanced/1/mode.ckpt




('AUC-ROC:', 0.9968918857146205)
Clone pairs: 275496, non-clone pairs: 1116376 
Recall: 0.967571, precision: 0.972457, f1 score: 0.970008
Predicted_positive_count: 274112, recall truly positive: 266562, false positive: 7550, missed true positive: 8934


In [13]:
def predict_pair_probability(code1, code2, fold):
    tf.reset_default_graph()
    with tf.name_scope('input'):
        X_input = tf.placeholder(tf.float32, [None, dim, dim, bin_vec_dim])
        Y = tf.placeholder(tf.float32, [None, 2])
    dropout = tf.placeholder(tf.float32)
    phase = tf.placeholder(tf.bool, name='phase')

    with tf.variable_scope('encoding'):
        h_op = model(X_input, dropout, phase)

    h_left = tf.placeholder(tf.float32, [None, 64])
    h_right = tf.placeholder(tf.float32, [None, 64])
    py_x = classification_predict(h_left, h_right, dropout, phase)
    # predict_op = tf.argmax(py_x, 1) changed
    probabilities = tf.nn.softmax(py_x)

    saver = tf.train.Saver()
    sess = tf.InteractiveSession()
    saver.restore(sess, '10_fold_balanced/'+fold+'/mode.ckpt')
    
    code1_dense = np.expand_dims(from_sparse_arr(code1), axis=0)  # Shape: [1, dim, dim, bin_vec_dim]
    code2_dense = np.expand_dims(from_sparse_arr(code2), axis=0)
    
    
    h_left_val = sess.run(h_op, feed_dict={X_input: code1_dense, dropout: 1.0, phase:0})
    h_right_val = sess.run(h_op, feed_dict={X_input: code2_dense, dropout: 1.0, phase:0})

    
    prob_vals = sess.run(probabilities,
                         feed_dict={h_left: h_left_val,
                                    h_right: h_right_val,
                                    dropout: 1.0, phase: 0})  # no dropout
    print(prob_vals)
    prob_class_1 = prob_vals[:, 1]
    sess.close()
    return prob_class_1

In [14]:
file_path = "./dataset/g4_128.npy"
dataset = np.load(file_path, allow_pickle=True)

In [16]:
print(dataset['y'][0], dataset['y'][20])

(1, 1)


In [17]:
predict_pair_probability(dataset['X'][0], dataset['X'][20], '1')

INFO:tensorflow:Restoring parameters from 10_fold_balanced/1/mode.ckpt
[[0.00366638 0.9963336 ]]


array([0.9963336], dtype=float32)

In [31]:
print(dataset['y'][0], dataset['y'][1005])

(1, 9)


In [22]:
predict_pair_probability(dataset['X'][0], dataset['X'][1005], '1')

INFO:tensorflow:Restoring parameters from 10_fold_balanced/1/mode.ckpt
[[9.9966753e-01 3.3253073e-04]]


array([0.00033253], dtype=float32)

In [26]:
BallonsCor1 = read_encoding("./data/BallonsCor1/main(String).txt")

In [27]:
BallonsCor2 = read_encoding("./data/BallonsCor2/main(String).txt")

In [28]:
BallonsInc = read_encoding("./data/BallonsInc/main(String).txt")

In [29]:
predict_pair_probability(BallonsCor1, BallonsCor2, '1')

INFO:tensorflow:Restoring parameters from 10_fold_balanced/1/mode.ckpt


ValueError: too many values to unpack

## Load The model one time and compare two codes


In [30]:
import tensorflow as tf
import numpy as np

In [31]:
class ModelPredictor:
    def __init__(self, fold):
        # Store parameters
        self.fold = fold

        # Build the graph
        self.graph = tf.Graph()
        with self.graph.as_default():
            # Define placeholders and variables
            with tf.name_scope('input'):
                self.X_input = tf.placeholder(tf.float32, [None, dim, dim, bin_vec_dim])
                self.Y = tf.placeholder(tf.float32, [None, 2])
            self.dropout = tf.placeholder(tf.float32)
            self.phase = tf.placeholder(tf.bool, name='phase')

            # Encoding network
            with tf.variable_scope('encoding'):
                self.h_op = model(self.X_input, self.dropout, self.phase)

            # Classification network
            self.h_left = tf.placeholder(tf.float32, [None, 64])
            self.h_right = tf.placeholder(tf.float32, [None, 64])
            self.py_x = classification_predict(self.h_left, self.h_right, self.dropout, self.phase)
            self.probabilities = tf.nn.softmax(self.py_x)

            # Initialize the session
            self.sess = tf.Session(graph=self.graph)
            # Create a saver
            self.saver = tf.train.Saver()
            # Restore the model
            self.saver.restore(self.sess, '10_fold_balanced/' + self.fold + '/mode.ckpt')

    def predict_pair_probability(self, code1, code2):
        # code1 and code2 should be of shape [dim, dim, bin_vec_dim]
        # Prepare the samples
        # Convert sparse arrays to dense arrays using from_sparse_arr
        sample1_dense = np.expand_dims(code1, axis=0)  # Shape: [1, dim, dim, bin_vec_dim]
        sample2_dense = np.expand_dims(code2, axis=0)  # Shape: [1, dim, dim, bin_vec_dim]

        # Get the representations
        h_left_val = self.sess.run(self.h_op, feed_dict={
            self.X_input: sample1_dense,
            self.dropout: 1.0,
            self.phase: False
        })
        h_right_val = self.sess.run(self.h_op, feed_dict={
            self.X_input: sample2_dense,
            self.dropout: 1.0,
            self.phase: False
        })

        # Get the probability
        prob_vals = self.sess.run(self.probabilities, feed_dict={
            self.h_left: h_left_val,
            self.h_right: h_right_val,
            self.dropout: 1.0,
            self.phase: False
        })
        print(prob_vals)
        prob_class_1 = prob_vals[:, 1]  # Probability of label '1'

        return prob_class_1

    def close(self):
        # Close the session when done
        self.sess.close()


In [32]:
pred = ModelPredictor('1')

INFO:tensorflow:Restoring parameters from 10_fold_balanced/1/mode.ckpt


In [41]:
pred.predict_pair_probability(from_sparse_arr(dataset['X'][0]), from_sparse_arr(dataset['X'][20]))

[[0.00366638 0.9963336 ]]


array([0.9963336], dtype=float32)

In [42]:
pred.predict_pair_probability(from_sparse_arr(dataset['X'][0]), from_sparse_arr(dataset['X'][1005]))

[[9.9966753e-01 3.3253073e-04]]


array([0.00033253], dtype=float32)

In [62]:
dataset['X'][0][55]

(3, 4, 68)

In [59]:
len(dataset['X'][15])

1105

In [54]:
np.shape(from_sparse_arr(dataset['X'][0]))

(128, 128, 88)

In [43]:
def encoding_sparse_to_dense(indices, size=88):  
    vector = np.zeros(size, dtype=int)  
    vector[indices] = 1  
    return vector 
def read_encoding(txt_file):
    shape = (dim, dim, bin_vec_dim) 
    result_array = np.zeros(shape, dtype=int)  
    
    with open(txt_file, 'r') as file:  
        for i, line in enumerate(file):  
            # Split the line into sparse vector representations, strip '{}' and split by ','  
            vectors = [vec[1:-1].split(',') if vec != '{}' else [] for vec in line.strip().split('\t')]  

            # Convert each vector of indices to a full binary vector  
            for j, indices in enumerate(vectors):  
                if indices:  
                    indices = list(map(int, indices))  # Convert indices to integers  
                    result_array[i, j, :] = encoding_sparse_to_dense(indices)  
    return result_array


In [58]:
checkInc1 = read_encoding("./data/checkInc1-main(String).txt")
checkInc1.shape

(128, 128, 88)

In [71]:
checkInc3 = read_encoding("./data/checkInc3-solve().txt")
checkInc3.shape

(128, 128, 88)

In [76]:
pred.predict_pair_probability(checkInc1, checkInc3)

array([0.00306521], dtype=float32)

In [44]:
BallonsCor1 = read_encoding("./data/BallonsCor1/main(String).txt")

In [45]:
BallonsCor2 = read_encoding("./data/BallonsCor2/main(String).txt")

In [46]:
BallonsInc = read_encoding("./data/BallonsInc/main(String).txt")

In [47]:
pred.predict_pair_probability(BallonsCor1, BallonsCor2)

[[0.9908573  0.00914265]]


array([0.00914265], dtype=float32)

In [48]:
pred.predict_pair_probability(BallonsCor1, BallonsInc)

[[0.9973916 0.0026084]]


array([0.0026084], dtype=float32)

###  I will do method 2 to read data same as g4_128.npy"

#### for that, we save the indicies that have 1 in a tuple

In [65]:
# every tuple in an index tha is on
dataset['X'][2]

[(2, 2, 0),
 (2, 2, 12),
 (2, 2, 18),
 (2, 2, 30),
 (2, 2, 72),
 (2, 3, 0),
 (2, 3, 11),
 (2, 3, 18),
 (2, 3, 30),
 (2, 3, 68),
 (2, 21, 0),
 (2, 21, 11),
 (2, 21, 18),
 (2, 21, 30),
 (2, 21, 68),
 (2, 40, 0),
 (2, 40, 11),
 (2, 40, 18),
 (2, 40, 30),
 (2, 40, 68),
 (2, 45, 0),
 (2, 45, 11),
 (2, 45, 18),
 (2, 45, 30),
 (2, 45, 68),
 (2, 103, 0),
 (2, 103, 2),
 (2, 103, 11),
 (2, 103, 18),
 (2, 103, 30),
 (2, 103, 75),
 (2, 104, 0),
 (2, 104, 2),
 (2, 104, 11),
 (2, 104, 18),
 (2, 104, 30),
 (2, 104, 75),
 (2, 106, 81),
 (2, 107, 81),
 (2, 112, 81),
 (2, 115, 81),
 (2, 120, 81),
 (2, 123, 81),
 (2, 124, 81),
 (3, 107, 81),
 (4, 4, 0),
 (4, 4, 11),
 (4, 4, 18),
 (4, 4, 29),
 (4, 4, 72),
 (4, 5, 0),
 (4, 5, 11),
 (4, 5, 18),
 (4, 5, 29),
 (4, 5, 68),
 (4, 8, 0),
 (4, 8, 11),
 (4, 8, 18),
 (4, 8, 29),
 (4, 8, 68),
 (4, 108, 81),
 (4, 111, 81),
 (5, 4, 0),
 (5, 4, 11),
 (5, 4, 18),
 (5, 4, 29),
 (5, 4, 68),
 (5, 5, 0),
 (5, 5, 11),
 (5, 5, 18),
 (5, 5, 29),
 (5, 5, 72),
 (5, 6, 0),
 (5, 6,

In [71]:
for i in range(len(dataset['X'])):
    if(dataset['X'][i][0][0]==0):
        print(i)
        break

27


In [72]:
#0 indexed
dataset['X'][27]

[(0, 3, 0),
 (0, 3, 11),
 (0, 3, 18),
 (0, 3, 19),
 (0, 3, 29),
 (0, 3, 40),
 (0, 112, 81),
 (2, 3, 0),
 (2, 3, 11),
 (2, 3, 18),
 (2, 3, 19),
 (2, 3, 24),
 (2, 3, 40),
 (2, 51, 0),
 (2, 51, 6),
 (2, 51, 18),
 (2, 51, 19),
 (2, 51, 24),
 (2, 51, 60),
 (2, 60, 0),
 (2, 60, 6),
 (2, 60, 18),
 (2, 60, 19),
 (2, 60, 24),
 (2, 60, 73),
 (2, 61, 0),
 (2, 61, 6),
 (2, 61, 18),
 (2, 61, 19),
 (2, 61, 24),
 (2, 61, 62),
 (2, 61, 73),
 (2, 67, 0),
 (2, 67, 6),
 (2, 67, 18),
 (2, 67, 19),
 (2, 67, 24),
 (2, 67, 60),
 (2, 82, 0),
 (2, 82, 6),
 (2, 82, 18),
 (2, 82, 19),
 (2, 82, 24),
 (2, 82, 73),
 (2, 83, 0),
 (2, 83, 6),
 (2, 83, 18),
 (2, 83, 19),
 (2, 83, 24),
 (2, 83, 73),
 (2, 106, 0),
 (2, 106, 6),
 (2, 106, 18),
 (2, 106, 19),
 (2, 106, 24),
 (2, 106, 73),
 (2, 112, 81),
 (3, 7, 0),
 (3, 7, 11),
 (3, 7, 18),
 (3, 7, 29),
 (3, 7, 68),
 (3, 8, 0),
 (3, 8, 11),
 (3, 8, 18),
 (3, 8, 29),
 (3, 8, 68),
 (3, 14, 0),
 (3, 14, 11),
 (3, 14, 18),
 (3, 14, 29),
 (3, 14, 68),
 (3, 112, 81),
 (3, 117, 

In [82]:
def read_encoding_2(txt_file):
    result_array = []
    
    with open(txt_file, 'r') as file:  
        for i, line in enumerate(file):  
            # Split the line into sparse vector representations, strip '{}' and split by ','  
            vectors = [vec[1:-1].split(',') if vec != '{}' else [] for vec in line.strip().split('\t')]  
            # Convert each vector of indices to a full binary vector  
            for j, indices in enumerate(vectors):  
                if indices:  
                    indices = list(map(int, indices))  # Convert indices to integers
                    for k in indices:
                        result_array.append((i, j, k))
    return result_array


In [90]:
read_code_bal_cor_1 = read_encoding_2("./data/BallonsCor1/main(String).txt")

In [91]:
read_code_bal_cor_1

[(2, 2, 0),
 (2, 2, 11),
 (2, 2, 18),
 (2, 2, 29),
 (2, 2, 72),
 (2, 3, 0),
 (2, 3, 11),
 (2, 3, 18),
 (2, 3, 29),
 (2, 3, 68),
 (2, 4, 0),
 (2, 4, 11),
 (2, 4, 18),
 (2, 4, 29),
 (2, 4, 68),
 (2, 6, 0),
 (2, 6, 6),
 (2, 6, 18),
 (2, 6, 29),
 (2, 6, 68),
 (2, 10, 0),
 (2, 10, 6),
 (2, 10, 18),
 (2, 10, 29),
 (2, 10, 68),
 (2, 81, 81),
 (2, 82, 81),
 (2, 83, 81),
 (2, 87, 83),
 (3, 2, 0),
 (3, 2, 11),
 (3, 2, 18),
 (3, 2, 29),
 (3, 2, 68),
 (3, 4, 0),
 (3, 4, 11),
 (3, 4, 18),
 (3, 4, 29),
 (3, 4, 68),
 (3, 82, 81),
 (4, 82, 81),
 (5, 83, 81),
 (6, 5, 0),
 (6, 5, 11),
 (6, 5, 18),
 (6, 5, 24),
 (6, 5, 68),
 (6, 7, 0),
 (6, 7, 1),
 (6, 7, 6),
 (6, 7, 18),
 (6, 7, 24),
 (6, 7, 72),
 (6, 13, 0),
 (6, 13, 6),
 (6, 13, 18),
 (6, 13, 24),
 (6, 13, 60),
 (6, 17, 0),
 (6, 17, 1),
 (6, 17, 11),
 (6, 17, 18),
 (6, 17, 24),
 (6, 17, 72),
 (6, 21, 0),
 (6, 21, 6),
 (6, 21, 18),
 (6, 21, 24),
 (6, 21, 60),
 (6, 48, 0),
 (6, 48, 6),
 (6, 48, 18),
 (6, 48, 24),
 (6, 48, 60),
 (6, 83, 81),
 (6, 84, 81)

In [87]:
read_code_bal_cor_2 = read_encoding_2("./data/BallonsCor2/main(String).txt")

In [88]:
len(read_code_bal_cor_2)

1036

In [92]:
# same as my own parsing
predict_pair_probability(read_code_bal_cor_1, read_code_bal_cor_2, '1')

INFO:tensorflow:Restoring parameters from 10_fold_balanced/1/mode.ckpt
[[0.9908573  0.00914265]]


array([0.00914265], dtype=float32)

In [None]:
''' MinOperCor1
import java.util.Scanner;
 
public class HelloWorld {
	public static void main(String[] args) {
		Scanner sc = new Scanner(System.in);
		int t = sc.nextInt();
		while((t--) > 0) {
			long n = sc.nextLong();
			long k = sc.nextLong();
			long cnt = 0;
			if(k == 1) {
				System.out.println(n);
				continue;
			}
			while(n>0) {
				cnt += n % k;
				n /= k;
			}
			System.out.println(cnt);
		}
		sc.close();
		
	}
 
}
'''

In [None]:
'''MinOperCor2

import java.io.*;
import java.util.*;

public class HelloWorld {
    static MyScanner in;
    static OutputWriter out;

    public static void main(String[] args) throws IOException {
        in = new MyScanner(System.in);
        out = new OutputWriter(System.out);

        int t = in.nextInt();
        while (t-- > 0) {
            long n = in.nextLong();
            long k = in.nextLong();

            if (k == 1) {
                out.printLine(n);
                continue;
            }

            int ans = 0;
            while (n > 0) {

                ans += n % k;
                n /= k;
            }

            out.printLine(ans);
        }



        out.flush();
        out.close();
    }

    static class MyScanner {

        StringTokenizer st;
        BufferedReader br;

        public MyScanner(InputStream s) {
            br = new BufferedReader(new InputStreamReader(s));
        }

        public MyScanner(FileReader f) {
            br = new BufferedReader(f);
        }

        public String next() throws IOException {
            while (st == null || !st.hasMoreTokens())
                st = new StringTokenizer(br.readLine());
            return st.nextToken();
        }

        public int nextInt() throws IOException {
            return Integer.parseInt(next());
        }

        public long nextLong() throws IOException {
            return Long.parseLong(next());
        }

        public String nextLine() throws IOException {
            return br.readLine();
        }

        public double nextDouble() throws IOException {
            String x = next();
            StringBuilder sb = new StringBuilder("0");
            double res = 0, f = 1;
            boolean dec = false, neg = false;
            int start = 0;
            if (x.charAt(0) == '-') {
                neg = true;
                start++;
            }
            for (int i = start; i < x.length(); i++)
                if (x.charAt(i) == '.') {
                    res = Long.parseLong(sb.toString());
                    sb = new StringBuilder("0");
                    dec = true;
                } else {
                    sb.append(x.charAt(i));
                    if (dec)
                        f *= 10;
                }
            res += Long.parseLong(sb.toString()) / f;
            return res * (neg ? -1 : 1);
        }

        public boolean ready() throws IOException {
            return br.ready();
        }

        public boolean nextEmpty() throws IOException {
            String s = nextLine();
            st = new StringTokenizer(s);
            return s.isEmpty();
        }
    }

    static class OutputWriter {
        private final PrintWriter writer;

        public OutputWriter(OutputStream outputStream) {
            writer = new PrintWriter(new BufferedWriter(new OutputStreamWriter(outputStream)));
        }

        public OutputWriter(Writer writer) {
            this.writer = new PrintWriter(writer);
        }

        public void print(Object... objects) {
            for (int i = 0; i < objects.length; i++) {
                if (i != 0)
                    writer.print(' ');
                writer.print(objects[i]);
            }
        }


        public void printLine(Object... objects) {
            print(objects);
            writer.println();
        }

        public void close() {
            writer.close();
        }

        public void flush() {
            writer.flush();
        }

    }
}
'''

In [96]:
Min_oper_cor_1= read_encoding_2("./data/MinOperCor1/main(String).txt")

In [97]:
Min_oper_cor_2= read_encoding_2("./data/MinOperCor2/main(String).txt")

In [98]:
Min_oper_cor_1

[(2, 2, 0),
 (2, 2, 11),
 (2, 2, 18),
 (2, 2, 29),
 (2, 2, 72),
 (2, 3, 0),
 (2, 3, 11),
 (2, 3, 18),
 (2, 3, 29),
 (2, 3, 68),
 (2, 4, 0),
 (2, 4, 11),
 (2, 4, 18),
 (2, 4, 29),
 (2, 4, 68),
 (2, 6, 0),
 (2, 6, 6),
 (2, 6, 18),
 (2, 6, 29),
 (2, 6, 68),
 (2, 11, 0),
 (2, 11, 7),
 (2, 11, 18),
 (2, 11, 29),
 (2, 11, 68),
 (2, 13, 0),
 (2, 13, 7),
 (2, 13, 18),
 (2, 13, 29),
 (2, 13, 68),
 (2, 30, 0),
 (2, 30, 11),
 (2, 30, 18),
 (2, 30, 29),
 (2, 30, 68),
 (2, 34, 81),
 (2, 35, 81),
 (2, 36, 81),
 (2, 39, 83),
 (2, 40, 81),
 (2, 50, 81),
 (3, 2, 0),
 (3, 2, 11),
 (3, 2, 18),
 (3, 2, 29),
 (3, 2, 68),
 (3, 4, 0),
 (3, 4, 11),
 (3, 4, 18),
 (3, 4, 29),
 (3, 4, 68),
 (3, 35, 81),
 (4, 35, 81),
 (5, 36, 81),
 (6, 5, 0),
 (6, 5, 11),
 (6, 5, 18),
 (6, 5, 24),
 (6, 5, 68),
 (6, 36, 81),
 (7, 8, 0),
 (7, 8, 6),
 (7, 8, 18),
 (7, 8, 19),
 (7, 8, 24),
 (7, 8, 42),
 (7, 38, 82),
 (8, 19, 0),
 (8, 19, 6),
 (8, 19, 18),
 (8, 19, 24),
 (8, 19, 73),
 (8, 38, 82),
 (9, 16, 0),
 (9, 16, 6),
 (9, 16, 1

In [99]:
Min_oper_cor_2

[(2, 2, 0),
 (2, 2, 11),
 (2, 2, 18),
 (2, 2, 29),
 (2, 2, 72),
 (2, 3, 0),
 (2, 3, 11),
 (2, 3, 18),
 (2, 3, 29),
 (2, 3, 68),
 (2, 4, 0),
 (2, 4, 11),
 (2, 4, 18),
 (2, 4, 29),
 (2, 4, 68),
 (2, 48, 0),
 (2, 48, 2),
 (2, 48, 12),
 (2, 48, 18),
 (2, 48, 29),
 (2, 48, 75),
 (2, 51, 81),
 (2, 52, 81),
 (2, 53, 81),
 (3, 2, 0),
 (3, 2, 11),
 (3, 2, 18),
 (3, 2, 29),
 (3, 2, 68),
 (3, 4, 0),
 (3, 4, 11),
 (3, 4, 18),
 (3, 4, 29),
 (3, 4, 68),
 (3, 52, 81),
 (4, 52, 81),
 (5, 5, 0),
 (5, 5, 11),
 (5, 5, 18),
 (5, 5, 29),
 (5, 5, 72),
 (5, 6, 0),
 (5, 6, 11),
 (5, 6, 18),
 (5, 6, 29),
 (5, 6, 68),
 (5, 7, 0),
 (5, 7, 11),
 (5, 7, 18),
 (5, 7, 29),
 (5, 7, 68),
 (5, 49, 0),
 (5, 49, 2),
 (5, 49, 12),
 (5, 49, 18),
 (5, 49, 29),
 (5, 49, 75),
 (5, 53, 81),
 (5, 54, 81),
 (5, 55, 81),
 (6, 5, 0),
 (6, 5, 11),
 (6, 5, 18),
 (6, 5, 29),
 (6, 5, 68),
 (6, 7, 0),
 (6, 7, 11),
 (6, 7, 18),
 (6, 7, 29),
 (6, 7, 68),
 (6, 54, 81),
 (7, 54, 81),
 (8, 10, 0),
 (8, 10, 6),
 (8, 10, 18),
 (8, 10, 29),
 (

In [100]:
predict_pair_probability(Min_oper_cor_1, Min_oper_cor_2, '1')

INFO:tensorflow:Restoring parameters from 10_fold_balanced/1/mode.ckpt
[[0.797378   0.20262204]]


array([0.20262204], dtype=float32)

In [None]:
'''RevStr1

public class HelloWorld {  
    public static void main(String[] args) {  
        String originalString = "Hello, World!";  
        StringBuilder reversedString = new StringBuilder();  
  
        for (int i = originalString.length() - 1; i >= 0; i--) {  
            reversedString.append(originalString.charAt(i));  
        }  
  
        System.out.println("Reversed using StringBuilder: " + reversedString.toString());  
    }  
}  


'''

In [None]:
'''RevStr2

public class HelloWorld {  
    public static void main(String[] args) {  
        String originalString = "Hello, World!";  
        char[] stringArray = originalString.toCharArray();  
  
        int leftIndex = 0;  
        int rightIndex = stringArray.length - 1;  
        while (leftIndex < rightIndex) {  
            // Swap characters  
            char temp = stringArray[leftIndex];  
            stringArray[leftIndex] = stringArray[rightIndex];  
            stringArray[rightIndex] = temp;  
  
            // Move indices towards the center  
            leftIndex++;  
            rightIndex--;  
        }  
  
        String reversedString = new String(stringArray);  
        System.out.println("Reversed by swapping characters: " + reversedString);  
    }  
}  

'''

In [101]:
RevStr1= read_encoding_2("./data/RevString1/main(String).txt")

In [103]:
RevStr2= read_encoding_2("./data/RevString2/main(String).txt")

In [104]:
predict_pair_probability(RevStr1, RevStr2, '1')

INFO:tensorflow:Restoring parameters from 10_fold_balanced/1/mode.ckpt
[[0.7604767  0.23952328]]


array([0.23952328], dtype=float32)