## Setting Custom Parameters

In [24]:
import pandas as pd


Modify the following parameters according to your folder organization. Example files for each folder can be found in this repository.

In [25]:
INSTRUCTOR_SOLUTIONS = "./instructor_solutions"
STUDENT_INTERACTION_DATA = "./student_interaction_data"
OUTPUT_FOLDER = "./datasets/experiment"

Select the number of students to be sampled from the set. Must be less than the total number of students in the dataset.

In [26]:
STUDENT_SAMPLE_SIZE = 20

List of semesters the data was collected from. Must match the names of the subfolders in STUDENT_INTERACTION_DATA.

In [27]:
LIST_OF_SEMESTERS = ['semester']

## Imports

In [28]:
import os

import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from graphs import *

## Generate ASTs for Solution Space

In [29]:
sol_path = INSTRUCTOR_SOLUTIONS
sol_trees = {}
solutions = {}
for file in tqdm(os.listdir(sol_path)):
    try:
        with open(os.path.join(sol_path, file), 'rb') as f:
            program = f.read().decode("utf-8-sig").encode("utf-8")
        solutions[file[:-3]] = [program]
    except:
        print("Error while reading instructor solution:", file)

  0%|          | 0/1 [00:00<?, ?it/s]

In [30]:
def return_correct_submissions(problemName):
    '''
    Helper function for filtering a DataFrame for correct student submissions.
    '''
    return data[(data['Score'] == 1.0) & (data['ProblemName'] == problemName)]['Input'].unique()

In [31]:
solutions = {}
for root, dirs, files in os.walk(STUDENT_INTERACTION_DATA):
    if len(dirs) > 0:
        continue
    users = {} 
    np.random.seed(42)
    random_set = None
    for file in tqdm(files):
        data = pd.read_csv(os.path.join(root, file))
        if len(data) <= 0:
            continue
        for problemName in data['ProblemName'].unique():
            problemInput = return_correct_submissions(problemName)
            if problemName not in solutions:
                solutions[problemName] = []
            solutions[problemName].extend(problemInput)


  0%|          | 0/1 [00:00<?, ?it/s]

In [32]:
solutions.keys()


dict_keys([13, 232, 235, 234, 236, 5, 233, 1, 3, 12, 24, 100, 101, 102, 25, 17, 28, 21, 20, 22, 40, 37, 38, 128, 39, 36, 32, 31, 33, 34, 41, 43, 44, 46, 49, 67, 108, 107, 104, 106, 48, 45, 51, 112, 56, 57, 64, 70, 71, 118])

In [33]:
solutions = pd.read_csv('problem_kc.csv')

In [34]:
solutions = solutions.drop_duplicates(subset=['ProblemName', 'KC'])

In [35]:
solutions

Unnamed: 0,KC,ProblemName
0,ClassDefinition,13
1,PublicClassSpecifier,13
2,MethodDefinition,13
3,PublicMethodSpecifier,13
4,VoidDataType,13
...,...,...
590499,ActualMethodParameter,118
591074,LessEqualExpression,118
591103,PreIncrementExpression,118
591993,AndExpression,118


In [36]:
solutions = {problemName: ["|".join([str(x) for x in solutions[solutions['ProblemName'] == problemName]['KC'].tolist()])] for problemName in solutions['ProblemName'].unique()}
# solutions = {problemName: ",".join([str(x) for x in solutions.loc[solutions['ProblemName'] == problemName].index[0]['KC'].tolist()]) for problemName in solutions['ProblemName'].unique()}


In [37]:
solutions[13]


['ClassDefinition|PublicClassSpecifier|MethodDefinition|PublicMethodSpecifier|VoidDataType|IntDataType|SimpleVariable|BooleanDataType|IfElseStatement|NotExpression|IfElseIfStatement|LessEqualExpression|ReturnStatement|GreaterExpression|AndExpression|IfStatement|EqualExpression|True|SimpleAssignmentExpression|SubtractExpression|MinusAssignmentExpression|False|GreaterEqualExpression|WhileStatement|LessExpression|AddExpression|AddAssignmentExpression|OrExpression|AutoBoxing|java.lang.Integer|ActualMethodParameter|MultiplyExpression|NotEqualExpression']

In [38]:
print_ast(parse_java(solutions[13][0]))

[0, 'Thing']
└── [0, 'Java']
    ├── [0, 'JavaLanguage']
    │   ├── [0, 'ProgramStructure']
    │   │   ├── [0, 'ProgramElementDefinition']
    │   │   │   ├── [0, 'ClassDefinition']
    │   │   │   └── [0, 'ClassMemberDefinition']
    │   │   │       └── [0, 'MethodDefinition']
    │   │   └── [0, 'Modifier']
    │   │       └── [0, 'PublicModifier']
    │   │           ├── [0, 'PublicClassSpecifier']
    │   │           └── [0, 'PublicMethodSpecifier']
    │   ├── [0, 'Data']
    │   │   ├── [0, 'DataType']
    │   │   │   ├── [0, 'VoidDataType']
    │   │   │   └── [0, 'SimpleDataType']
    │   │   │       ├── [0, 'IntDataType']
    │   │   │       └── [0, 'BooleanDataType']
    │   │   ├── [0, 'Variable']
    │   │   │   └── [0, 'SimpleVariable']
    │   │   └── [0, 'Value']
    │   │       └── [0, 'SimpleDataTypeValue']
    │   │           └── [0, 'BooleanValue']
    │   │               ├── [0, 'True']
    │   │               └── [0, 'False']
    │   ├── [0, 'Statement']
    │   

In [39]:
parse_java(solutions[13][0])

Node("/[0, 'Thing']")

In [40]:
solutions_embedded = {}
solutions_traversal = {}
tfidf_params = {}
for problem in tqdm(solutions):
    def parser_handler(content):
        try:
            return parse_java(content)
        except:
            print("Error while parsing AST - returning empty tree...")
            return Node("Module")
    ast_trees = [parser_handler(solution) for solution in solutions[problem]]
    tfidf_vectors, all_nodes, idf = compute_tfidf(ast_trees)
    solutions_embedded[problem] = tfidf_vectors
    solutions_traversal[problem] = [dfs_traversal(tree) for tree in ast_trees]
    tfidf_params[problem] = (all_nodes, idf) 
    

  0%|          | 0/50 [00:00<?, ?it/s]

In [41]:
solutions

{13: ['ClassDefinition|PublicClassSpecifier|MethodDefinition|PublicMethodSpecifier|VoidDataType|IntDataType|SimpleVariable|BooleanDataType|IfElseStatement|NotExpression|IfElseIfStatement|LessEqualExpression|ReturnStatement|GreaterExpression|AndExpression|IfStatement|EqualExpression|True|SimpleAssignmentExpression|SubtractExpression|MinusAssignmentExpression|False|GreaterEqualExpression|WhileStatement|LessExpression|AddExpression|AddAssignmentExpression|OrExpression|AutoBoxing|java.lang.Integer|ActualMethodParameter|MultiplyExpression|NotEqualExpression'],
 232: ['ClassDefinition|PublicClassSpecifier|MethodDefinition|PublicMethodSpecifier|VoidDataType|StringVariable|StringDataType|SimpleVariable|IntDataType|BooleanDataType|IfElseStatement|EqualExpression|True|OrExpression|ReturnStatement|StringLiteral|IfElseIfStatement|GreaterExpression|AndExpression|LessExpression|False|IfStatement|GreaterEqualExpression|NotExpression|WhileStatement|LessEqualExpression|StringInitializationStatement|Sim

## Compare Student Submission to the Solution Space

In [42]:
print_ast(parse_java(solutions[13][0]))

[0, 'Thing']
└── [0, 'Java']
    ├── [0, 'JavaLanguage']
    │   ├── [0, 'ProgramStructure']
    │   │   ├── [0, 'ProgramElementDefinition']
    │   │   │   ├── [0, 'ClassDefinition']
    │   │   │   └── [0, 'ClassMemberDefinition']
    │   │   │       └── [0, 'MethodDefinition']
    │   │   └── [0, 'Modifier']
    │   │       └── [0, 'PublicModifier']
    │   │           ├── [0, 'PublicClassSpecifier']
    │   │           └── [0, 'PublicMethodSpecifier']
    │   ├── [0, 'Data']
    │   │   ├── [0, 'DataType']
    │   │   │   ├── [0, 'VoidDataType']
    │   │   │   └── [0, 'SimpleDataType']
    │   │   │       ├── [0, 'IntDataType']
    │   │   │       └── [0, 'BooleanDataType']
    │   │   ├── [0, 'Variable']
    │   │   │   └── [0, 'SimpleVariable']
    │   │   └── [0, 'Value']
    │   │       └── [0, 'SimpleDataTypeValue']
    │   │           └── [0, 'BooleanValue']
    │   │               ├── [0, 'True']
    │   │               └── [0, 'False']
    │   ├── [0, 'Statement']
    │   

In [None]:
cache = {}
def get_list_of_nodes(programName, dataInput):
    if programName not in solutions:
        print("Not found in solutions")
        return ":"

    min_op = 50
    try:
        submission = parse_java(dataInput)
    except:
        submission = None
        target = parse_java(solutions[programName][0])
        incorrect_ops = tree_edit_distance_with_operations(target, submission)
    else:
        # find two closest candidates for comparing the trees
        submission_vec = compute_tfidf_ood(submission, *tfidf_params[programName])
        # dfs_tree = dfs_traversal(submission)
        d_tfidf = []
        for vec in solutions_embedded[programName]:
            d_tfidf.append(euclidean_distance(submission_vec, vec))

        sort_counter = 0
        indice = np.argmin(d_tfidf)
        while True:
            sort_counter += 1
            try:
                sol_tfidf = solutions[programName][indice]
                _ = parse_java(sol_tfidf)
                break
            except:
                indice = np.argsort(d_tfidf)[sort_counter:][0]


        # d_align = []
        # for tree in (solutions_traversal[programName]):
        #     d_align.append(calculate_dissimilarity(dfs_tree, tree))
        # sol_align = solutions[programName][np.argmin(d_align)]
        ops_tfidf = tree_edit_distance_with_operations(parse_java(sol_tfidf), submission)

        incorrect_ops = ops_tfidf
        target = parse_java(sol_tfidf)
        

    correct_ops = set_of_children(target).difference(incorrect_ops)
    l = ','.join(list(correct_ops) + list(incorrect_ops)) + ':' + ','.join('1'*len(correct_ops)+'0'*len(incorrect_ops))
    
    return l

## Generate Transaction Data

In [None]:
cols = [
    'Transaction Id',
    'Anon Student Id',
    'Session Id', # set to 1 for all
    'Time',
    'Level (Unit)', # homework no
    'Problem Name',
    'Problem Start Time',
    'Input',
    'Step Name',
    'Outcome',
    'KC (Binary-Node)',
    'KC Category (Binary-Node)'
]

In [43]:
from hashlib import md5

def string_hash(string):
    return md5(string.encode()).hexdigest()


In [None]:
from collections import defaultdict
counters = defaultdict(int)

user_counts = {semester:{} for semester in LIST_OF_SEMESTERS}

import string
table = str.maketrans('', '', string.ascii_lowercase)

def user_counter(anonid, semester):
    if anonid not in user_counts[semester]:
        counters[semester] += 1
        user_counts[semester][anonid] = f"{semester.translate(table)}-S{counters[semester]:05d}"
    return user_counts[semester][anonid]

In [44]:
state_to_list = pd.read_csv("codestate_kc.csv")

def code_state_to_ontology_lookup(codestateid):
    x = state_to_list[state_to_list['CodeStateID'] == codestateid]['KC'].values.tolist()
    x = "|".join(x)
    return x

In [None]:
state_to_list[state_to_list['CodeStateID'] == 1717563]

Unnamed: 0,KC,CodeStateID


In [45]:
code_state_to_ontology_lookup(1723984)
code_state_to_ontology_lookup(1717563)


''

In [None]:
solutions_embedded

{13: [array([0.33333333, 0.33333333, 0.33333333])],
 232: [array([0.33333333, 0.33333333, 0.33333333])],
 235: [array([0.33333333, 0.33333333, 0.33333333])],
 234: [array([0.33333333, 0.33333333, 0.33333333])],
 236: [array([0.33333333, 0.33333333, 0.33333333])],
 5: [array([0.33333333, 0.33333333, 0.33333333])],
 233: [array([0.33333333, 0.33333333, 0.33333333])],
 1: [array([0.33333333, 0.33333333, 0.33333333])],
 3: [array([0.33333333, 0.33333333, 0.33333333])],
 12: [array([0.33333333, 0.33333333, 0.33333333])],
 24: [array([0.33333333, 0.33333333, 0.33333333])],
 100: [array([0.33333333, 0.33333333, 0.33333333])],
 101: [array([0.33333333, 0.33333333, 0.33333333])],
 102: [array([0.33333333, 0.33333333, 0.33333333])],
 25: [array([0.33333333, 0.33333333, 0.33333333])],
 17: [array([0.33333333, 0.33333333, 0.33333333])],
 28: [array([0.33333333, 0.33333333, 0.33333333])],
 21: [array([0.33333333, 0.33333333, 0.33333333])],
 20: [array([0.33333333, 0.33333333, 0.33333333])],
 22: [a

In [46]:
def populate_import_csv(data, semester, random_set=None):
    df = pd.DataFrame()
    df[cols[1]] = data['AnonID'].apply(user_counter, args=(semester,))
    
    if random_set is not None:
        df = df[df['Anon Student Id'].isin(random_set)]
    
    df[cols[2]] = 1
    df[cols[3]] = data['Timestamp'].apply(lambda x: x.replace('T', ' '))
    df[cols[4]] = data['Assessment']
    df[cols[5]] = data['ProblemName']
    df[cols[6]] = df[cols[3]] # str apply [:-6]
    tqdm.pandas()

    series = data.progress_apply(lambda x: get_list_of_nodes(x['ProblemName'], code_state_to_ontology_lookup(x['CodeStateID'])), axis=1)

    df[cols[9]] = series.apply(lambda x: x.split(':')[1])
    df[cols[10]] = series.apply(lambda x: x.split(':')[0])

    df[cols[11]] = ''
    df = df.drop_duplicates(subset=[cols[1], cols[5]])
    df = df.assign(**{cols[10]: df[cols[10]].str.split(','),
                      cols[9]: df[cols[9]].str.split(',').apply(lambda x: x if '' in x else [float(i) for i in x])}).explode(
        [cols[10], cols[9]])
    df[cols[9]] = df[cols[9]].apply(lambda x: 'CORRECT' if x == 1.0 else 'INCORRECT')
    df[cols[8]] = df[cols[10]]

    df.reset_index()
    return df

In [None]:
populate_import_csv(pd.read_csv('/Users/mehmetarifdemirtas/SummerSchoolCMU24/ast-kc-step-generation/student_interaction_data/semester/FilteredMainTable.csv'), 'semester', ['-S00174'])

In [None]:
student_dict = {}

In [None]:
print("x")

In [30]:
populate_import_csv(pd.read_csv('/Users/mehmetarifdemirtas/SummerSchoolCMU24/ast-kc-step-generation/student_interaction_data/semester/FilteredMainTable.csv'), 'semester', ['-S00174'])

  0%|          | 0/69627 [00:00<?, ?it/s]

ProgramElementDefinition
├── [0, 'ClassDefinition']
└── [0, 'ClassMemberDefinition']
ProgramElementDefinition
├── [0, 'ClassDefinition']
└── [0, 'ClassMemberDefinition']


In [39]:
for root, dirs, files in os.walk(STUDENT_INTERACTION_DATA):
    if len(dirs) > 0:
        continue
    users = {} 
    np.random.seed(42)
    random_set = None
    student_list = []
    for file in tqdm(files):
        data = pd.read_csv(os.path.join(root, file))
        data=data[data['AnonID'].isin(
            [14205,
            14360,
            13426,
            14210,
            14368,
            9453,
            9217,
            14379,
            14450,
            14347,
            14371,
            14497,
            10385,
            14334,
            14163,
            14332,
            6048,
            14337,
            14470,
            14372])]
        if len(data) <= 0:
            continue
        random_set = ['-S00395', '-S00350', '-S00401', '-S00354', '-S00182', '-S00409',
       '-S00297', '-S00245', '-S00317', '-S00196', '-S00292', '-S00071',
       '-S00080', '-S00056', '-S00142', '-S00174', '-S00057', '-S00291',
       '-S00166', '-S00325']

        df = populate_import_csv(data, root.split('/')[-1], random_set=None)
        #df = populate_import_csv(data, root.split('\\')[-1], random_set)
        #if random_set is None:
            #df = df[df['Anon Student Id'].isin(random_set)]
        student_list.extend(df['Anon Student Id'].unique())
        os.makedirs(os.path.join(OUTPUT_FOLDER, root.split('/')[-1]), exist_ok=True)
        df.to_csv(os.path.join(OUTPUT_FOLDER, root.split('/')[-1], file[:-4] + '.txt'), sep='\t')
        #os.makedirs(os.path.join(OUTPUT_FOLDER, root.split('\\')[-1]), exist_ok=True)
        #df.to_csv(os.path.join(OUTPUT_FOLDER, root.split('\\')[-1], file[:-4] + '.txt'), sep='\t')


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3739 [00:00<?, ?it/s]

In [37]:
data[data['AnonID'].isin(
[14205,
14360,
13426,
14210,
14368,
9453,
9217,
14379,
14450,
14347,
14371,
14497,
10385,
14334,
14163,
14332,
6048,
14337,
14470,
14372])]

Unnamed: 0,AnonID,Timestamp,Assessment,ProblemName,TransactionID,Score,CodeStateID,SessionID,MaxScore,Input,SubmissionNumber
1852,14205,2019-02-22T19:42:21,439.0,13,13-52143,0.0,1723984,0,1,"public int caughtSpeeding(int speed, boolean i...",1
1853,14205,2019-02-22T19:42:33,439.0,13,13-52146,1.0,1723986,0,1,"public int caughtSpeeding(int speed, boolean i...",2
1855,14205,2019-02-22T19:45:04,439.0,232,232-49762,1.0,1723997,0,1,"public String alarmClock(int day, boolean vaca...",1
1858,14205,2019-02-22T19:47:39,439.0,235,235-22685,1.0,1724002,0,1,"public int dateFashion(int you, int date)\n{\n...",1
1861,14205,2019-02-22T19:49:51,439.0,234,234-21338,1.0,1724012,0,1,"public boolean cigarParty(int cigars, boolean ...",1
...,...,...,...,...,...,...,...,...,...,...,...
68738,14332,2019-05-06T02:38:59,502.0,64,64-15590,1.0,1927539,0,1,"public String[] fizzBuzz(int start, int end)\n...",1
68746,14332,2019-05-06T02:39:46,502.0,70,70-6143,1.0,1927547,0,1,public boolean twoTwo(int[] nums)\n{\n if(n...,1
68755,14332,2019-05-06T02:40:43,502.0,71,71-3262,1.0,1927559,0,1,public boolean canBalance(int[] nums)\n{\n ...,1
68760,14332,2019-05-06T02:41:16,502.0,112,112-2787,1.0,1927564,0,1,public int[] seriesUp(int n)\n{\n int[] arr...,1


In [30]:
user_counter


<function __main__.user_counter(anonid, semester)>

In [31]:
cols

['Transaction Id',
 'Anon Student Id',
 'Session Id',
 'Time',
 'Level (Unit)',
 'Problem Name',
 'Problem Start Time',
 'Input',
 'Step Name',
 'Outcome',
 'KC (Binary-Node)',
 'KC Category (Binary-Node)']

In [32]:
random_set

['-S00395',
 '-S00350',
 '-S00401',
 '-S00354',
 '-S00182',
 '-S00409',
 '-S00297',
 '-S00245',
 '-S00317',
 '-S00196',
 '-S00292',
 '-S00071',
 '-S00080',
 '-S00056',
 '-S00142',
 '-S00174',
 '-S00057',
 '-S00291',
 '-S00166',
 '-S00325']

In [33]:
student_list

['-S00056',
 '-S00057',
 '-S00071',
 '-S00080',
 '-S00142',
 '-S00166',
 '-S00174',
 '-S00182',
 '-S00196',
 '-S00245',
 '-S00291',
 '-S00292',
 '-S00297',
 '-S00317',
 '-S00325',
 '-S00350',
 '-S00354',
 '-S00395',
 '-S00401',
 '-S00409']

In [34]:
d = {v:k for k, v in user_counts['semester'].items()}
for s in student_list:
    print(d[s])

14205
14360
13426
14210
14368
9453
9217
14379
14450
14347
14371
14497
10385
14334
14163
14332
6048
14337
14470
14372
