In [19]:
import javalang
from tqdm import tqdm
import numpy as np
import csv
# import RNNmodel_2 as rnn
from keras import Model
from keras.layers import Layer
import keras.backend as K
from keras.layers import Input, Dense, SimpleRNN
import math
import random
import matplotlib.pyplot as plt
from path_extractor import extracting_path


In [20]:
class attention(Layer):
    def __init__(self, **kwargs):
        super(attention, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name='attention_weight', shape=(input_shape[-1],1), initializer='random_normal', trainable=True)
        self.b = self.add_weight(name='attention_bias', shape=(input_shape[1],1), initializer='zeros', trainable=True)
        super(attention, self).build(input_shape)

    def call(self,x):
        e = K.tanh(K.dot(x, self.W) + self.b)
        e = K.squeeze(e, axis = -1)
        alpha = K.softmax(e)
        alpha = K.expand_dims(alpha, axis = -1)
        context = x * alpha
        context = K.sum(context, axis = 1)
        return context

In [21]:
class Student():
    def __init__(self, ID, score, assignments):
        self.ID = ID
        self.score = score
        self.assignments = assignments

class Assignment(): 
    def __init__(self, ID, problems):
        self.ID = ID
        self.problems = problems

class Problem(): 
    def __init__(self, ID, codeStates):
        self.ID = ID
        self.codeStates = codeStates

class CodeState():
    def __init__(self, ID, code, compileResult, compileMessageType, compileMessageData, score, time):
        self.ID = ID
        self.code = code
        self.compileResult = compileResult
        self.compileMessageType = compileMessageType
        self.compileMessageData = compileMessageData
        self.score = score
        self.time = time

In [22]:
def load_data():

    codeStates = {}
    with open('./Datasets/CodeStates.csv', mode='r', encoding="utf8") as file:
        csv_reader = csv.reader(file)
        next(csv_reader)
        for line in csv_reader:
            codeStates[line[0]] = line[1] 
            
    mainTable = {}
    with open('./Datasets/MainTable.csv', mode='r', encoding="utf8") as file:
        csv_reader = csv.reader(file)
        next(csv_reader)
        for line in csv_reader:
            # Student
            if line[0] not in mainTable:
                mainTable[line[0]] = {}
            # Assignment
            if line[5] not in mainTable[line[0]]:
                mainTable[line[0]][line[5]] = {}
            # Problem
            if line[6] not in mainTable[line[0]][line[5]]:
                mainTable[line[0]][line[5]][line[6]] = {}
            # CodeState
            if line[7] not in mainTable[line[0]][line[5]][line[6]]:
                # mainTable[Student][Assignment][Problem][Codestate]
                mainTable[line[0]][line[5]][line[6]][line[7]] = {} 
            mainTable[line[0]][line[5]][line[6]][line[7]]['time'] = line[2]
            mainTable[line[0]][line[5]][line[6]][line[7]]['code'] = codeStates[line[7]]
            if line[10] != '':
                mainTable[line[0]][line[5]][line[6]][line[7]]['score'] = line[10] 
            if line[11] != '':
                mainTable[line[0]][line[5]][line[6]][line[7]]['compileResult'] = line[11]
            if line[12] != '':
                mainTable[line[0]][line[5]][line[6]][line[7]]['compileMessageType'] = line[12]
                mainTable[line[0]][line[5]][line[6]][line[7]]['compileMessageData'] = line[13]
    
    studentScore = {}
    with open('./Datasets/Subject.csv', mode='r', encoding="utf8") as file:
        csv_reader = csv.reader(file)
        next(csv_reader)
        for line in csv_reader:
            studentScore[line[0]] = line[1] 

    studentDataset = []
    for student in tqdm(mainTable.keys()):
        assignments = []
        for assignment in mainTable[student]:
            problems = []
            for problem in mainTable[student][assignment]:
                codeStates = []
                for codeState in mainTable[student][assignment][problem]:
                    compileMessageType = ''
                    compileMessageData = ''
                    if 'complieMessageType' in mainTable[student][assignment][problem][codeState]:
                        compileMessageType = mainTable[student][assignment][problem][codeState]['compileMessageType']
                        compileMessageData = mainTable[student][assignment][problem][codeState]['compileMessageData']
                    cS = CodeState(
                        codeState, 
                        mainTable[student][assignment][problem][codeState]['code'],
                        mainTable[student][assignment][problem][codeState]['compileResult'],
                        compileMessageType,
                        compileMessageData,
                        mainTable[student][assignment][problem][codeState]['score'],
                        mainTable[student][assignment][problem][codeState]['time']
                    )
                    codeStates.append(cS)
                pb = Problem(problem, codeStates)
                problems.append(pb)
            ass = Assignment(assignment, problems)
            assignments.append(ass)
        stu = Student(student, studentScore[student], assignments)
        studentDataset.append(stu)

    return np.array(studentDataset)

In [23]:
raw_dataset = load_data()

100%|██████████| 506/506 [00:00<00:00, 1136.28it/s]


In [39]:
def code_to_AST(code):
    tokens = javalang.tokenizer.tokenize(code)
    parser = javalang.parser.Parser(tokens)
    tree = parser.parse_member_declaration()
    return tree

def fetch_codeState(students):
    students_output = []
    students_label = []
    students_label_final_grade = []

    for student in tqdm(students):
        student_i = []
        student_label_i = []
        students_label_final_grade_i = []
        final_grade = student.score
        for assignment in student.assignments:
            for problem in assignment.problems:
                for codeState in problem.codeStates:
                    try:
                        parsed = code_to_AST(codeState.code)
                    except:
                        parsed = "error"
                    label = codeState.score
                    student_i.append(parsed)
                    student_label_i.append(label)
                    students_label_final_grade_i.append(final_grade)
        hashing_table = {}
        AST_paths = [extracting_path(java_code, max_length=8, max_width=2, hash_path=True, hashing_table=hashing_table) for java_code in student_i]
        students_output.append(AST_paths)
        students_label.append(student_label_i)
        students_label_final_grade.append(students_label_final_grade_i)
    
    return students_output, students_label, students_label_final_grade

In [40]:
students, lables, lables_final_grade = fetch_codeState(raw_dataset)

100%|██████████| 506/506 [17:26<00:00,  2.07s/it]


In [43]:
def create_word_index_table(vocab):
    """
    Creating word to index table
    Input:
    vocab: list. The list of the node vocabulary

    """
    ixtoword = {}
    # period at the end of the sentence. make first dimension be end token
    ixtoword[0] = 'END'
    ixtoword[1] = 'UNK'
    wordtoix = {}
    wordtoix['END'] = 0
    wordtoix['UNK'] = 1
    ix = 2
    for w in vocab:
        wordtoix[w] = ix
        ixtoword[ix] = w
        ix += 1
    return wordtoix, ixtoword

def convert_to_idx(sample, node_word_index, path_word_index):
    """
    Converting to the index 
    Input:
    sample: list. One single training sample, which is a code, represented as a list of neighborhoods.
    node_word_index: dict. The node to word index dictionary.
    path_word_index: dict. The path to word index dictionary.

    """
    sample_index = []
    for line in sample:
        components = line.split(",")
        if components[0] in node_word_index:
            starting_node = node_word_index[components[0]]
        else:
            starting_node = node_word_index['UNK']
        if components[1] in path_word_index:
            path = path_word_index[components[1]]
        else:
            path = path_word_index['UNK']
        if components[2] in node_word_index:
            ending_node = node_word_index[components[2]]
        else:
            ending_node = node_word_index['UNK']
        
        sample_index.append([starting_node,path,ending_node])
    return sample_index

In [44]:
def get_data(students):
    for student in students:
        node_hist = {}
        path_hist = {}
        for paths in student:
            starting_nodes = [p.split(",")[0] for p in paths]
            path = [p.split(",")[1] for p in paths]
            ending_nodes = [p.split(",")[2] for p in paths]
            nodes = starting_nodes + ending_nodes
            for n in nodes:
                if not n in node_hist:
                    node_hist[n] = 1
                else:
                    node_hist[n] += 1
            for p in path:
                if not p in path_hist:
                    path_hist[p] = 1
                else:
                    path_hist[p] += 1
        
        # small frequency then abandon, for node and path
        valid_node = [node for node, count in node_hist.items()]
        valid_path = [path for path, count in path_hist.items()]

        # create ixtoword and wordtoix lists
        node_word_index, node_index_word = create_word_index_table(valid_node)
        path_word_index, path_index_word = create_word_index_table(valid_path)

    output = []
    for student in students:
        output_i = []
        for paths in student:
            raw_features = convert_to_idx(paths, node_word_index, path_word_index)
            try:
                features = np.array(raw_features).reshape(-1, len(raw_features*3))
            except:
                features = np.zeros(100)
            output_i.append(features)
        output.append(output_i)

    return output


In [45]:
students = get_data(students)

In [46]:
def get_input(students, lables):
    output = []
    j = 0
    for student in tqdm(students):
        output_i = []
        k = 0
        for code in student:
            code = np.resize(code, 300)
            # code = code / np.linalg.norm(code)
            toStore = {
                "codestate_array": code,
                "codestate_label": float(lables[j][k])
            }
            output_i.append(toStore)
            k += 1
        output.append(output_i)
        j += 1
    return output

In [83]:
input = get_input(students, lables)

100%|██████████| 506/506 [00:00<00:00, 720.07it/s]


In [102]:
def forward_propagation(layer_num, neuron_list, weights_list, instance):
    pred = []
    a_list = []
    if weights_list is None:
        weights_list = []
        for i in range(layer_num + 1):
            curweight = []
            if i == layer_num:
                weights = []
                pre_layer_neuron_num = neuron_list[i - 1] + 1
                for n in range(pre_layer_neuron_num):
                    weights.append(random.uniform(-1.0, 1.0))
                curweight.append(weights)
            else:
                for m in range(neuron_list[i]):
                    weights = []
                    pre_layer_neuron_num = 0
                    if i == 0:
                        pre_layer_neuron_num = len(instance[0]) + 1
                    else:
                        pre_layer_neuron_num = neuron_list[i-1] + 1
                    for n in range(pre_layer_neuron_num):
                        weights.append(random.uniform(-1.0, 1.0))
                    curweight.append(weights)
            weights_list.append(curweight)
    #print(weights_list)
    a1 = []
    a1.append([1])
    for i in instance[0]:
        a1.append([i])
    a = a1.copy()
    n = 1
    #print("a" + str(n) + ": " + str(a1))
    a_list.append(a)
    for layer in range(layer_num):
        n += 1
        z = np.dot(np.array(weights_list[layer]), np.array(a))
        #print("z" + str(n) + ": " + str(z))
        a = []
        a.append([1])
        for i in z:
            try:
                a.append([1 / (1 + math.exp(-i))])
            except:
                a.append([1 / float("inf")])
        a_list.append(a)
        #print("a" + str(n) + ": " + str(a))
    n += 1
    z = np.dot(weights_list[-1], a)
    #print("z" + str(n) + ": " + str(z))
    a = []
    for i in z:
            try:
                a.append([1 / (1 + math.exp(-i))])
            except:
                a.append([1 / float("inf")])
    pred = a
    a_list.append(a)
    #print("a" + str(n) + ": " + str(a))
    #print("Predicted output for instance: " + str(pred))
    #print("Expected output for instance: " + str(instance[1]))
    return pred, a_list, weights_list

def cost_function(layer_num, neuron_list, weights_list, ins, lambda_value):
    J = 0
    y = ins[1]
    output = forward_propagation(layer_num, neuron_list, weights_list, ins)
    weights_list = output[2]
    J = abs(y[0] - output[0][0][0])
    S = 0
    for layer in weights_list:
        for row in layer:
            new_row = row[1:]
            for r in new_row:
                S += r*r
    S *= (lambda_value / (2))
    return J+S

def backpropagation(layer_num, neuron_list, weights_list, instance, lambda_value):
    ini_J = cost_function(layer_num, neuron_list, weights_list, instance, lambda_value)
    D_list = []
    for i in range(layer_num + 1):
        D_list.append([])
    output, a_list, wl = forward_propagation(layer_num, neuron_list, weights_list, instance)
    weights_list = wl
    y = []
    delta_list = []
    y.append([instance[1]])
    delta = np.array(output[0]) - np.array(y[0])
    delta_list.insert(0, delta)
    #print("delta: " + str(delta))
    for num in range(layer_num):
        curweight = weights_list[-num-1]
        tweight = np.transpose(np.array(curweight))
        delta = np.dot(tweight, delta)
        for i in range(len(delta)):
            delta[i][0] = delta[i][0] * a_list[-num-2][i][0] * (1-a_list[-num-2][i][0])
        delta = delta[1:]
        delta_list.insert(0, delta)
        #print("delta: " + str(delta))
    for num in range(layer_num + 1):
        cur = np.array(D_list[-num-1])
        if len(cur) == 0:
            D_list[-num - 1] = np.dot(delta_list[-num-1], np.transpose(a_list[-num-2]))
        else:
            D_list[-num-1] = cur + np.dot(delta_list[-num-1], np.transpose(a_list[-num-2]))
    #for i in range(len(delta_list)):
        #print("Gradients of Theta" + str(len(delta_list) - i) + " based on training instance: " + str(np.dot(delta_list[-i-1], np.transpose(a_list[-i-2]))))
    for layer in range(layer_num+1):
        P = []
        for w in weights_list[-layer-1]:
            lst = []
            for e in w:
                lst.append(lambda_value * e)
            lst[0] = 0
            P.append(lst)
        D_list[-layer-1] = 1 * (np.array(D_list[-layer-1]) + np.array(P))
    #for i in range(len(D_list)):
        #print("Final regularized gradients of Theta" + str(i+1) + ": " + str(D_list[i]))
    for layer in range(layer_num+1):
        weights_list[-layer-1] = np.array(weights_list[-layer-1]) - 2 * np.array(D_list[-layer-1])
    cur_J = cost_function(layer_num, neuron_list, weights_list, instance, lambda_value)
    if ini_J - cur_J < 0.001:
        return weights_list
    else:
        return backpropagation(layer_num, neuron_list, weights_list, instance, lambda_value)

def cal_performance(predictions, testset, classes):
    total_precision = 0
    total_recall = 0
    total_F1 = 0
    total_accuracy = 0
    for curclass in classes:
        tp = 0
        tn = 0
        fp = 0
        fn = 0
        for i in range(len(predictions)):
            actual = testset[i][1]
            pred = predictions[i]
            if pred == actual == curclass:
                tp += 1
            elif pred == curclass and actual != curclass:
                fp += 1
            elif pred != curclass and actual == curclass:
                fn += 1
            else:
                tn += 1
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        if (fp + fn) == 0:
            precision = 1
            recall = 1
            F1 = 1
        elif (tp + fp) == 0:
            precision = tn / (tn + fn)
            recall = tp / (tp + fn)
        else:
            precision = tp / (tp + fp)
            recall = tp / (tp + fn)
            F1 = 2 * ((precision * recall) / (precision + recall))
        total_accuracy += accuracy
        total_recall += recall
        total_precision += precision
        total_F1 += F1
        #print("tp" + str(tp) + "tn" + str(tn) + "fp" + str(fp) + "fn" + str(fn))
    class_num = len(classes)
    total_accuracy = total_accuracy / class_num
    total_precision = total_precision / class_num
    total_recall = total_recall / class_num
    total_F1 = total_F1 / class_num
    return total_accuracy, total_precision, total_recall, total_F1

def run_rnn(trainingset, layer_num, neuron_list, lambda_value, weight_list, testingset, classes):
    for instance in tqdm(trainingset):
        weight_list = backpropagation(layer_num, neuron_list, weight_list, instance,lambda_value)
    all_predictions = []
    for j in tqdm(testingset):
        prediction = forward_propagation(layer_num, neuron_list, weight_list, j)[0]
        all_predictions.append(prediction)
    accuracy, precision, recall, F1 = cal_performance(all_predictions, testingset, classes)
    return accuracy, precision, recall, F1, weight_list

In [105]:
# Testing ONLY

# trainingset = [[[0, 0, 0, 1, 1, 1], [3]] ,    [[1, 1, 0, 1, 0, 1], [4]]]
# testingset = [[[0, 0, 1, 1, 1, 1], [4]] ,    [[1, 0, 0, 1, 0, 1], [3]]]
# print(run_rnn(trainingset, 3, [3, 8, 8], 0.25, None, testingset, [0, 1, 2, 3, 4, 5, 6]))

# trainingset = []
# testingset = []
# for i in input[0]:
#     trainingset.append([i["codestate_array"], [i["codestate_label"]]])
# for i in input[1]:
#     testingset.append([i["codestate_array"], [i["codestate_label"]]])
# print(run_rnn(trainingset, 5, [6, 8, 8, 8, 6], 0.25, None, testingset, [0, 1, 2, 3, 4, 5, 6]))

training = input[:380]
testing = input[-126:]
trainingset = []
testingset = []
for student in training:
    for codestate in student:
        trainingset.append([codestate["codestate_array"], [codestate["codestate_label"]]])

for student in testing:
    for codestate in student:
        testingset.append([codestate["codestate_array"], [codestate["codestate_label"]]])


        

In [106]:
accuracy, precision, recall, F1, weight_list = run_rnn(trainingset, 5, [6, 8, 8, 8, 6], 0.25, None, testingset, [0, 1, 2, 3, 4, 5, 6])

100%|██████████| 95594/95594 [24:17<00:00, 65.58it/s] 
100%|██████████| 29984/29984 [00:05<00:00, 5869.32it/s]


In [107]:
print(accuracy)
print(precision)
print(recall)
print(F1)
print(weight_list)

1.0
1.0
1.0
1.0
[array([[7.61577363e-01, 6.89429702e-48, 3.44714851e-48, ...,
        1.03414455e-47, 3.44714851e-48, 3.37820554e-46],
       [9.50570444e-01, 6.75865150e-48, 3.37932575e-48, ...,
        1.01379772e-47, 3.37932575e-48, 3.31173923e-46],
       [9.57846538e-01, 6.75049893e-48, 3.37524946e-48, ...,
        1.01257484e-47, 3.37524946e-48, 3.30774447e-46],
       [9.12208497e-01, 6.79819079e-48, 3.39909539e-48, ...,
        1.01972862e-47, 3.39909539e-48, 3.33111349e-46],
       [8.40709274e-01, 6.85589157e-48, 3.42794579e-48, ...,
        1.02838374e-47, 3.42794579e-48, 3.35938687e-46],
       [6.34632087e-01, 6.89705654e-48, 3.44852827e-48, ...,
        1.03455848e-47, 3.44852827e-48, 3.37955770e-46]]), array([[-5.24831486e-01,  8.04803588e-29,  8.51476780e-29,
         8.53201096e-29,  8.42294142e-29,  8.24777374e-29,
         7.71561636e-29],
       [-5.25223039e-01,  8.04524762e-29,  8.51181784e-29,
         8.52905503e-29,  8.42002328e-29,  8.24491628e-29,
         7.

In [108]:
input = get_input(students, lables_final_grade)

100%|██████████| 506/506 [00:00<00:00, 585.73it/s]


In [109]:
training = input[:380]
testing = input[-126:]
trainingset = []
testingset = []
for student in training:
    for codestate in student:
        trainingset.append([codestate["codestate_array"], [codestate["codestate_label"]]])

for student in testing:
    for codestate in student:
        testingset.append([codestate["codestate_array"], [codestate["codestate_label"]]])

In [110]:
accuracy, precision, recall, F1, weight_list = run_rnn(trainingset, 5, [6, 8, 8, 8, 6], 0.25, weight_list, testingset, [0, 1, 2, 3, 4, 5, 6])

100%|██████████| 95594/95594 [02:26<00:00, 653.68it/s]
100%|██████████| 29984/29984 [00:04<00:00, 6441.77it/s]


In [111]:
print(accuracy)
print(precision)
print(recall)
print(F1)
print(weight_list)

1.0
1.0
1.0
1.0
[array([[7.61577363e-001, 2.00738425e-103, 1.00369213e-103, ...,
        4.26671681e-102, 1.00369213e-103, 6.48976418e-102],
       [9.50570444e-001, 1.96788890e-103, 9.83944451e-104, ...,
        4.18276902e-102, 9.83944451e-104, 6.36207786e-102],
       [9.57846538e-001, 1.96551515e-103, 9.82757576e-104, ...,
        4.17772358e-102, 9.82757576e-104, 6.35440365e-102],
       [9.12208497e-001, 1.97940140e-103, 9.89700698e-104, ...,
        4.20723894e-102, 9.89700698e-104, 6.39929712e-102],
       [8.40709274e-001, 1.99620190e-103, 9.98100949e-104, ...,
        4.24294859e-102, 9.98100949e-104, 6.45361223e-102],
       [6.34632087e-001, 2.00818773e-103, 1.00409387e-103, ...,
        4.26842461e-102, 1.00409387e-103, 6.49236179e-102]]), array([[-5.24831486e-01,  1.62095041e-52,  1.71495463e-52,
         1.71842757e-52,  1.69645993e-52,  1.66117950e-52,
         1.55399798e-52],
       [-5.25223039e-01,  1.62038883e-52,  1.71436048e-52,
         1.71783221e-52,  1.695872