## Setting Custom Parameters

In [18]:
import pandas as pd


Modify the following parameters according to your folder organization. Example files for each folder can be found in this repository.

In [19]:
# Folder whose files are each read as one instructor reference solution.
INSTRUCTOR_SOLUTIONS = "./instructor_solutions"
# Root folder walked with os.walk; the CSV logs are read from its leaf folders
# (folders that contain no sub-folders).
STUDENT_INTERACTION_DATA = "./student_interaction_data"
# Destination root; one sub-folder per leaf folder receives the tab-separated '*_ont.txt' exports.
OUTPUT_FOLDER = "./datasets/experiment"
# CSV mapping each CodeStateID to its knowledge components (columns: KC, CodeStateID).
CODESTATE_KC = "../java-parser/codestate_kc.csv"

Select the number of students to be sampled from the set. Must not exceed the total number of students in the dataset, because students are sampled without replacement.

In [20]:
STUDENT_SAMPLE_SIZE = 100

List of semesters the data was collected from. Must match the names of the subfolders in STUDENT_INTERACTION_DATA.

In [21]:
LIST_OF_SEMESTERS = ['semester']

## Imports

In [22]:
import os

import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from graphs import *

## Generate ASTs for Solution Space

In [23]:
# Read every instructor solution into `solutions`, keyed by the file name without
# its extension. Each value is a one-element list holding the UTF-8 encoded source
# (a leading BOM, if any, is stripped by the "utf-8-sig" decode).
sol_path = INSTRUCTOR_SOLUTIONS
sol_trees = {}
solutions = {}
for file in tqdm(os.listdir(sol_path)):
    try:
        with open(os.path.join(sol_path, file), 'rb') as f:
            program = f.read().decode("utf-8-sig").encode("utf-8")
    except (OSError, UnicodeDecodeError) as err:
        # Only I/O and decoding problems are expected here (e.g. a sub-folder or a
        # non-UTF-8 file); anything else, including a kernel interrupt, propagates.
        print("Error while reading instructor solution:", file, f"({err})")
        continue
    # splitext instead of file[:-3]: the slice only works for 2-character
    # extensions and would turn "1.java" into "1.j".
    solutions[os.path.splitext(file)[0]] = [program]

  0%|          | 0/1 [00:00<?, ?it/s]

In [24]:
def return_correct_submissions(problemName):
    '''
    Return the distinct 'Input' values of fully correct submissions for one problem.

    Reads the module-level DataFrame `data`; a row counts as correct when its
    'Score' equals 1.0. The result is the array produced by Series.unique().
    '''
    is_correct = data['Score'] == 1.0
    is_problem = data['ProblemName'] == problemName
    return data.loc[is_correct & is_problem, 'Input'].unique()

In [25]:
# Collect, per problem, the distinct correct student submissions found in every
# CSV of the leaf folders below STUDENT_INTERACTION_DATA.
solutions = {}
for root, dirs, files in os.walk(STUDENT_INTERACTION_DATA):
    # Folders that still have sub-folders are skipped; only leaf folders are read.
    if dirs:
        continue
    users = {}
    np.random.seed(42)
    random_set = None
    for file in tqdm(files):
        # `data` is deliberately a module-level name: return_correct_submissions
        # reads it, and the sampling cell further down reuses the last frame loaded.
        data = pd.read_csv(os.path.join(root, file))
        if len(data) == 0:
            continue
        for problemName in data['ProblemName'].unique():
            problemInput = return_correct_submissions(problemName)
            solutions.setdefault(problemName, []).extend(problemInput)


  0%|          | 0/1 [00:00<?, ?it/s]

In [26]:
solutions.keys()


dict_keys([13, 232, 235, 234, 236, 5, 233, 1, 3, 12, 24, 100, 101, 102, 25, 17, 28, 21, 20, 22, 40, 37, 38, 128, 39, 36, 32, 31, 33, 34, 41, 43, 44, 46, 49, 67, 108, 107, 104, 106, 48, 45, 51, 112, 56, 57, 64, 70, 71, 118])

In [27]:
# Rebinds `solutions`: the dict of student submissions built in the previous cells
# is replaced by the problem -> knowledge-component table (columns: KC, ProblemName).
# NOTE(review): the path is hard-coded here rather than set in the parameter cell
# at the top of the notebook.
solutions = pd.read_csv('problem_kc.csv')

In [28]:
# Keep a single row per (ProblemName, KC) pair; the first occurrence wins.
solutions = solutions.drop_duplicates(subset=['ProblemName', 'KC'])

In [29]:
solutions

Unnamed: 0,KC,ProblemName
0,ClassDefinition,13
1,PublicClassSpecifier,13
2,MethodDefinition,13
3,PublicMethodSpecifier,13
4,VoidDataType,13
...,...,...
590499,ActualMethodParameter,118
591074,LessEqualExpression,118
591103,PreIncrementExpression,118
591993,AndExpression,118


In [30]:
# Collapse the KC table into {problem: ["KC1|KC2|..."]}: one single-element list per
# problem, holding its knowledge components joined with '|' in table order.
# A single groupby pass replaces re-filtering the whole frame once per problem;
# sort=False keeps the problems in order of first appearance.
# NOTE: this rebinds `solutions` from a DataFrame to a dict, so the cell can only
# be run once after the table has been loaded.
solutions = {
    problemName: ["|".join(str(kc) for kc in kcs)]
    for problemName, kcs in solutions.groupby('ProblemName', sort=False)['KC']
}


In [31]:
solutions[13]


['ClassDefinition|PublicClassSpecifier|MethodDefinition|PublicMethodSpecifier|VoidDataType|IntDataType|SimpleVariable|BooleanDataType|IfElseStatement|NotExpression|IfElseIfStatement|LessEqualExpression|ReturnStatement|GreaterExpression|AndExpression|IfStatement|EqualExpression|True|SimpleAssignmentExpression|SubtractExpression|MinusAssignmentExpression|False|GreaterEqualExpression|WhileStatement|LessExpression|AddExpression|AddAssignmentExpression|OrExpression|AutoBoxing|java.lang.Integer|ActualMethodParameter|MultiplyExpression|NotEqualExpression']

In [32]:
print_ast(parse_java(solutions[13][0]))

[0, 'Thing']
└── [0, 'Java']
    ├── [0, 'JavaLanguage']
    │   ├── [0, 'ProgramStructure']
    │   │   ├── [0, 'ProgramElementDefinition']
    │   │   │   ├── [0, 'ClassDefinition']
    │   │   │   └── [0, 'ClassMemberDefinition']
    │   │   │       └── [0, 'MethodDefinition']
    │   │   └── [0, 'Modifier']
    │   │       └── [0, 'PublicModifier']
    │   │           ├── [0, 'PublicClassSpecifier']
    │   │           └── [0, 'PublicMethodSpecifier']
    │   ├── [0, 'Data']
    │   │   ├── [0, 'DataType']
    │   │   │   ├── [0, 'VoidDataType']
    │   │   │   └── [0, 'SimpleDataType']
    │   │   │       ├── [0, 'IntDataType']
    │   │   │       └── [0, 'BooleanDataType']
    │   │   ├── [0, 'Variable']
    │   │   │   └── [0, 'SimpleVariable']
    │   │   └── [0, 'Value']
    │   │       └── [0, 'SimpleDataTypeValue']
    │   │           └── [0, 'BooleanValue']
    │   │               ├── [0, 'True']
    │   │               └── [0, 'False']
    │   ├── [0, 'Statement']
    │   

In [33]:
parse_java(solutions[13][0])

Node("/[0, 'Thing']")

In [34]:
def parser_handler(content):
    """Parse one solution with parse_java, falling back to an empty tree.

    A failed parse is reported on stdout and replaced by a bare Node("Module")
    so that one bad solution does not abort the whole embedding pass.
    """
    try:
        return parse_java(content)
    except Exception:
        # `Exception` rather than a bare except, so that interrupting the kernel
        # during this long loop is not mistaken for a parse failure.
        print("Error while parsing AST - returning empty tree...")
        return Node("Module")


# Per problem: TF-IDF vectors of its solutions, their DFS traversals, and the
# (all_nodes, idf) pair needed later to embed unseen submissions.
solutions_embedded = {}
solutions_traversal = {}
tfidf_params = {}
for problem in tqdm(solutions):
    ast_trees = [parser_handler(solution) for solution in solutions[problem]]
    tfidf_vectors, all_nodes, idf = compute_tfidf(ast_trees)
    solutions_embedded[problem] = tfidf_vectors
    solutions_traversal[problem] = [dfs_traversal(tree) for tree in ast_trees]
    tfidf_params[problem] = (all_nodes, idf)
    

  0%|          | 0/50 [00:00<?, ?it/s]

In [35]:
solutions

{13: ['ClassDefinition|PublicClassSpecifier|MethodDefinition|PublicMethodSpecifier|VoidDataType|IntDataType|SimpleVariable|BooleanDataType|IfElseStatement|NotExpression|IfElseIfStatement|LessEqualExpression|ReturnStatement|GreaterExpression|AndExpression|IfStatement|EqualExpression|True|SimpleAssignmentExpression|SubtractExpression|MinusAssignmentExpression|False|GreaterEqualExpression|WhileStatement|LessExpression|AddExpression|AddAssignmentExpression|OrExpression|AutoBoxing|java.lang.Integer|ActualMethodParameter|MultiplyExpression|NotEqualExpression'],
 232: ['ClassDefinition|PublicClassSpecifier|MethodDefinition|PublicMethodSpecifier|VoidDataType|StringVariable|StringDataType|SimpleVariable|IntDataType|BooleanDataType|IfElseStatement|EqualExpression|True|OrExpression|ReturnStatement|StringLiteral|IfElseIfStatement|GreaterExpression|AndExpression|LessExpression|False|IfStatement|GreaterEqualExpression|NotExpression|WhileStatement|LessEqualExpression|StringInitializationStatement|Sim

## Compare Student Submission to the Solution Space

In [36]:
print_ast(parse_java(solutions[13][0]))

[0, 'Thing']
└── [0, 'Java']
    ├── [0, 'JavaLanguage']
    │   ├── [0, 'ProgramStructure']
    │   │   ├── [0, 'ProgramElementDefinition']
    │   │   │   ├── [0, 'ClassDefinition']
    │   │   │   └── [0, 'ClassMemberDefinition']
    │   │   │       └── [0, 'MethodDefinition']
    │   │   └── [0, 'Modifier']
    │   │       └── [0, 'PublicModifier']
    │   │           ├── [0, 'PublicClassSpecifier']
    │   │           └── [0, 'PublicMethodSpecifier']
    │   ├── [0, 'Data']
    │   │   ├── [0, 'DataType']
    │   │   │   ├── [0, 'VoidDataType']
    │   │   │   └── [0, 'SimpleDataType']
    │   │   │       ├── [0, 'IntDataType']
    │   │   │       └── [0, 'BooleanDataType']
    │   │   ├── [0, 'Variable']
    │   │   │   └── [0, 'SimpleVariable']
    │   │   └── [0, 'Value']
    │   │       └── [0, 'SimpleDataTypeValue']
    │   │           └── [0, 'BooleanValue']
    │   │               ├── [0, 'True']
    │   │               └── [0, 'False']
    │   ├── [0, 'Statement']
    │   

In [37]:
cache = {}
def get_list_of_nodes(programName, dataInput):
    """Score a submission's nodes against the closest reference solution.

    programName: key into `solutions`, `solutions_embedded` and `tfidf_params`.
    dataInput:   text handed to parse_java to build the submission tree.

    Returns a string "<names>:<flags>", where <names> is a comma-separated list of
    node names and <flags> is a comma-separated list of the same length: '1' for
    nodes of the reference that were not reported by the tree edit distance, '0'
    for the reported ones. Returns ":" when the problem is unknown.

    Raises IndexError when none of the problem's reference solutions can be parsed.
    """
    if programName not in solutions:
        print("Not found in solutions")
        return ":"

    try:
        submission = parse_java(dataInput)
    except Exception:
        # Unparseable submission: compare "no tree" against the first reference.
        # `Exception` (not a bare except) so a kernel interrupt still stops the run.
        submission = None
        target = parse_java(solutions[programName][0])
        incorrect_ops = tree_edit_distance_with_operations(target, submission)
    else:
        # Embed the submission in the problem's TF-IDF space and rank the
        # reference solutions by Euclidean distance to it.
        submission_vec = compute_tfidf_ood(submission, *tfidf_params[programName])
        d_tfidf = [
            euclidean_distance(submission_vec, vec)
            for vec in solutions_embedded[programName]
        ]

        # Walk the references from nearest to farthest and keep the first one
        # that parses. A stable sort makes the first candidate equal np.argmin.
        sol_tfidf = None
        for indice in np.argsort(d_tfidf, kind="stable"):
            try:
                candidate = solutions[programName][indice]
                parse_java(candidate)
            except Exception:
                continue
            sol_tfidf = candidate
            break
        if sol_tfidf is None:
            raise IndexError(
                f"no parseable reference solution for problem {programName!r}"
            )

        # Two fresh parses on purpose: the tree given to the edit-distance routine
        # is never reused as `target`, in case that routine modifies its input.
        incorrect_ops = tree_edit_distance_with_operations(parse_java(sol_tfidf), submission)
        target = parse_java(sol_tfidf)

    correct_ops = set_of_children(target).difference(incorrect_ops)
    # '1' * n + '0' * m is a string of flag characters; joining it with ','
    # yields one flag per node, in the same order as the names.
    names = ','.join(list(correct_ops) + list(incorrect_ops))
    flags = ','.join('1' * len(correct_ops) + '0' * len(incorrect_ops))
    return names + ':' + flags

## Generate Transaction Data

In [38]:
# Output column names. populate_import_csv addresses them by position (cols[i]),
# so the order of this list matters.
cols = [
    'Transaction Id',             # [0]  not filled in by populate_import_csv
    'Anon Student Id',            # [1]  pseudonym produced by user_counter
    'Session Id', # [2]  set to 1 for all
    'Time',                       # [3]  'Timestamp' with 'T' replaced by a space
    'Level (Unit)', # [4]  homework no (taken from 'Assessment')
    'Problem Name',               # [5]  taken from 'ProblemName'
    'Problem Start Time',         # [6]  copy of 'Time'
    'Input',                      # [7]  not filled in by populate_import_csv
    'Step Name',                  # [8]  copy of 'KC (Binary-Node)'
    'Outcome',                    # [9]  'CORRECT' / 'INCORRECT'
    'KC (Binary-Node)',           # [10] one node name per row
    'KC Category (Binary-Node)'   # [11] empty string
]

In [39]:
from hashlib import md5

def string_hash(string):
    """Return the hexadecimal MD5 digest of `string` (encoded with str.encode's default, UTF-8)."""
    digest = md5()
    digest.update(string.encode())
    return digest.hexdigest()


In [40]:
from collections import defaultdict
# Per-semester count of students that already received a pseudonym;
# user_counter uses it as the running number of the next pseudonym.
counters = defaultdict(int)

# semester -> {original AnonID: pseudonym}. Only the semesters listed in
# LIST_OF_SEMESTERS get an entry, so user_counter raises KeyError for any other name.
user_counts = {semester:{} for semester in LIST_OF_SEMESTERS}

import string
table = str.maketrans('', '', string.ascii_lowercase)

def user_counter(anonid, semester):
    """Return the pseudonym of `anonid` within `semester`, creating it on first use.

    Pseudonyms look like 'CWO-S00001'; the number is the per-semester running
    count kept in `counters`. The mapping is stored in `user_counts[semester]`,
    so repeated calls with the same id return the same pseudonym.
    """
    assigned = user_counts[semester]
    if anonid in assigned:
        return assigned[anonid]
    counters[semester] += 1
    assigned[anonid] = f"{'CWO'}-S{counters[semester]:05d}"
    return assigned[anonid]

def anonID(student_id, semester='semester'):
    """Reverse lookup: return the original id whose pseudonym is `student_id`.

    Scans `user_counts[semester]` and returns the first matching key, or None
    when no student of that semester carries the pseudonym.
    """
    return next(
        (original
         for original, pseudonym in user_counts[semester].items()
         if pseudonym == student_id),
        None,
    )

In [41]:
state_to_list = pd.read_csv(CODESTATE_KC)

def code_state_to_ontology_lookup(codestateid):
    """Return the knowledge components of one code state as a '|'-joined string.

    Looks up every row of the module-level `state_to_list` table whose
    'CodeStateID' equals `codestateid` and joins their 'KC' values in table
    order. Non-string values (e.g. NaN) are rendered with str(); an unknown id
    yields the empty string.
    """
    matching = state_to_list['CodeStateID'] == codestateid
    kc_values = state_to_list.loc[matching, 'KC'].values.tolist()
    return "|".join(str(kc) for kc in kc_values)

In [42]:
state_to_list[state_to_list['CodeStateID'] == 1717563]

Unnamed: 0,KC,CodeStateID
0,ClassDefinition,1717563
1,PublicClassSpecifier,1717563
2,MethodDefinition,1717563
3,PublicMethodSpecifier,1717563
4,VoidDataType,1717563
5,MethodDefinition,1717563
6,PublicMethodSpecifier,1717563
7,IntDataType,1717563
8,IntDataType,1717563
9,SimpleVariable,1717563


In [43]:
code_state_to_ontology_lookup(1723984)
code_state_to_ontology_lookup(1717563)


'ClassDefinition|PublicClassSpecifier|MethodDefinition|PublicMethodSpecifier|VoidDataType|MethodDefinition|PublicMethodSpecifier|IntDataType|IntDataType|SimpleVariable|IntDataType|IntDataType|SimpleVariable|BooleanDataType|BooleanDataType|SimpleVariable|IfElseStatement|InstanceFieldInvocation|ThisReference|InstanceFieldInvocation|ThisReference|LessExpression|SimpleAssignmentExpression'

In [44]:
solutions_embedded

{13: [array([0.01538462, 0.01538462, 0.01538462, 0.01538462, 0.01538462,
         0.01538462, 0.01538462, 0.01538462, 0.01538462, 0.01538462,
         0.01538462, 0.01538462, 0.01538462, 0.01538462, 0.01538462,
         0.01538462, 0.01538462, 0.01538462, 0.01538462, 0.01538462,
         0.01538462, 0.01538462, 0.01538462, 0.01538462, 0.01538462,
         0.01538462, 0.01538462, 0.01538462, 0.01538462, 0.01538462,
         0.01538462, 0.01538462, 0.01538462, 0.01538462, 0.01538462,
         0.01538462, 0.01538462, 0.01538462, 0.01538462, 0.01538462,
         0.01538462, 0.01538462, 0.01538462, 0.01538462, 0.01538462,
         0.01538462, 0.01538462, 0.01538462, 0.01538462, 0.01538462,
         0.01538462, 0.01538462, 0.01538462, 0.01538462, 0.01538462,
         0.01538462, 0.01538462, 0.01538462, 0.01538462, 0.01538462,
         0.01538462, 0.01538462, 0.01538462, 0.01538462, 0.01538462])],
 232: [array([0.01408451, 0.01408451, 0.01408451, 0.01408451, 0.01408451,
         0.01408451, 0

In [45]:
def populate_import_csv(data, semester, random_set=None):
    """Build the long-format export table: one row per student, problem and node.

    data:       interaction log; the columns read are 'AnonID', 'Timestamp',
                'Assessment', 'ProblemName' and 'CodeStateID'.
    semester:   key into `user_counts`, used to assign pseudonyms.
    random_set: optional collection of pseudonyms to keep; None keeps every student.

    Only the first row of each (student, problem) pair is kept. Its node list is
    then exploded so that every node gets its own row with a 'CORRECT' or
    'INCORRECT' outcome. Returns the resulting DataFrame; its index still carries
    the labels of the source rows (repeated once per node).
    """
    if random_set is not None:
        # Pre-filter on the raw ids so the expensive per-row comparison below only
        # runs for sampled students.
        # NOTE(review): anonID is called with its default semester ('semester'),
        # not with `semester` - confirm this before running several semesters.
        wanted_ids = [anonID(student_id) for student_id in random_set]
        data = data[data['AnonID'].isin(wanted_ids)]

    if len(data) == 0:
        # Nothing to export: return an empty frame with the usual columns instead
        # of failing on the column assignments below.
        return pd.DataFrame(columns=[cols[i] for i in (1, 2, 3, 4, 5, 6, 9, 10, 11, 8)])

    df = pd.DataFrame()
    df[cols[1]] = data['AnonID'].apply(user_counter, args=(semester,))

    if random_set is not None:
        print(len(df))
        df = df[df['Anon Student Id'].isin(random_set)].copy()
        print(len(df))

    df[cols[2]] = 1
    df[cols[3]] = data['Timestamp'].apply(lambda x: x.replace('T', ' '))
    df[cols[4]] = data['Assessment']
    df[cols[5]] = data['ProblemName']
    df[cols[6]] = df[cols[3]] # str apply [:-6]
    tqdm.pandas()

    # One "<names>:<flags>" string per source row; rows are matched to df by index.
    series = data.progress_apply(
        lambda row: get_list_of_nodes(
            row['ProblemName'], code_state_to_ontology_lookup(row['CodeStateID'])
        ),
        axis=1,
    )

    df[cols[9]] = series.apply(lambda x: x.split(':')[1])
    df[cols[10]] = series.apply(lambda x: x.split(':')[0])

    df[cols[11]] = ''
    df = df.drop_duplicates(subset=[cols[1], cols[5]])
    # Split names and flags into parallel lists and explode them together. An empty
    # result (":") stays as [''] on both sides so the list lengths still match.
    df = df.assign(**{
        cols[10]: df[cols[10]].str.split(','),
        cols[9]: df[cols[9]].str.split(',').apply(
            lambda x: x if '' in x else [float(i) for i in x]
        ),
    }).explode([cols[10], cols[9]])
    df[cols[9]] = df[cols[9]].apply(lambda x: 'CORRECT' if x == 1.0 else 'INCORRECT')
    df[cols[8]] = df[cols[10]]

    # NOTE(review): the index is left untouched (source row labels, repeated after
    # explode); use reset_index(drop=True) here if unique row ids are wanted.
    return df

In [46]:
# Re-seed NumPy's global generator right before drawing the sample.
np.random.seed(42)

# NOTE(review): `data` is not defined in this cell - it is whatever frame the last
# executed loop left behind (the last CSV read). Pseudonyms are assigned under the
# hard-coded semester name 'semester'.
series = data['AnonID'].apply(user_counter, args=('semester',))
# Draw STUDENT_SAMPLE_SIZE distinct pseudonyms (replace=False); np.random.choice
# raises ValueError when fewer unique students are available.
random_set = np.random.choice(series.unique(), STUDENT_SAMPLE_SIZE, replace=False)
random_set

array(['CWO-S00395', 'CWO-S00350', 'CWO-S00401', 'CWO-S00354',
       'CWO-S00182', 'CWO-S00409', 'CWO-S00297', 'CWO-S00245',
       'CWO-S00317', 'CWO-S00196', 'CWO-S00292', 'CWO-S00071',
       'CWO-S00080', 'CWO-S00056', 'CWO-S00142', 'CWO-S00174',
       'CWO-S00057', 'CWO-S00291', 'CWO-S00166', 'CWO-S00325',
       'CWO-S00085', 'CWO-S00010', 'CWO-S00402', 'CWO-S00376',
       'CWO-S00371', 'CWO-S00403', 'CWO-S00114', 'CWO-S00295',
       'CWO-S00094', 'CWO-S00117', 'CWO-S00031', 'CWO-S00138',
       'CWO-S00146', 'CWO-S00079', 'CWO-S00034', 'CWO-S00400',
       'CWO-S00286', 'CWO-S00266', 'CWO-S00278', 'CWO-S00361',
       'CWO-S00016', 'CWO-S00410', 'CWO-S00001', 'CWO-S00127',
       'CWO-S00272', 'CWO-S00185', 'CWO-S00337', 'CWO-S00384',
       'CWO-S00074', 'CWO-S00355', 'CWO-S00238', 'CWO-S00091',
       'CWO-S00273', 'CWO-S00359', 'CWO-S00095', 'CWO-S00073',
       'CWO-S00043', 'CWO-S00396', 'CWO-S00077', 'CWO-S00040',
       'CWO-S00363', 'CWO-S00026', 'CWO-S00399', 'CWO-S

In [47]:
[anonID(id) for id in random_set]

[14337,
 14332,
 14470,
 6048,
 14379,
 14372,
 10385,
 14347,
 14334,
 14450,
 14497,
 13426,
 14210,
 14205,
 14368,
 9217,
 14360,
 14371,
 9453,
 14163,
 14235,
 13504,
 14238,
 14478,
 14433,
 14505,
 14300,
 14436,
 14222,
 14411,
 14452,
 14173,
 14395,
 14462,
 14225,
 10358,
 14413,
 13463,
 14302,
 10319,
 14208,
 10309,
 14398,
 10276,
 14279,
 14404,
 14278,
 714,
 14314,
 14335,
 263,
 14288,
 14498,
 10405,
 14458,
 14403,
 14496,
 9948,
 13402,
 14420,
 14471,
 14310,
 13602,
 14144,
 13830,
 14226,
 14194,
 14495,
 14393,
 14399,
 14438,
 14192,
 14501,
 14325,
 14178,
 4419,
 14389,
 14232,
 14277,
 6493,
 14441,
 14195,
 14463,
 13513,
 6986,
 10155,
 14354,
 14417,
 10115,
 10098,
 14340,
 14381,
 14365,
 14212,
 14483,
 14419,
 14492,
 14447,
 13658,
 14207]

In [48]:
student_dict = {}

In [49]:
# Convert every CSV found in the leaf folders of STUDENT_INTERACTION_DATA and write
# the result to OUTPUT_FOLDER/<leaf folder name>/<file name>_ont.txt (tab-separated).
for root, dirs, files in os.walk(STUDENT_INTERACTION_DATA):
    # Folders that still contain sub-folders are skipped; only leaf folders are read.
    if len(dirs) > 0:
        continue
    users = {}
    student_list = []
    # os.path.basename instead of root.split('/')[-1]: os.walk joins paths with the
    # platform separator, so splitting on '/' returns the wrong name on Windows.
    semester_name = os.path.basename(root)
    semester_out = os.path.join(OUTPUT_FOLDER, semester_name)
    for file in tqdm(files):
        data = pd.read_csv(os.path.join(root, file))
        if len(data) <= 0:
            continue
        df = populate_import_csv(data, semester_name, random_set=random_set)
        student_list.extend(df['Anon Student Id'].unique())
        os.makedirs(semester_out, exist_ok=True)
        # splitext strips the extension whatever its length (same result as
        # file[:-4] for '.csv' inputs).
        out_name = os.path.splitext(file)[0] + '_ont.txt'
        df.to_csv(os.path.join(semester_out, out_name), sep='\t')


  0%|          | 0/1 [00:00<?, ?it/s]

17484
17484


  0%|          | 0/17484 [00:00<?, ?it/s]