In [1]:

import pandas as pd
import matplotlib.pyplot as plt

# Paths - adjust if your notebook is in a different location
DATA_DIR = "dataset/CodeWorkout"

In [2]:

main_table = pd.read_csv(f"{DATA_DIR}/MainTable.csv")
print(f"Main table: {len(main_table):,} rows")

codestate_table = pd.read_csv(f"{DATA_DIR}/LinkTables/CodeStates.csv")
print(f"CodeState table: {len(codestate_table):,} rows")

subject_table = pd.read_csv(f"{DATA_DIR}/LinkTables/Subject.csv")
print(f"Subject table: {len(subject_table):,} rows")

Main table: 201,570 rows
CodeState table: 69,627 rows
Subject table: 381 rows


In [3]:

df = main_table.merge(codestate_table, on="CodeStateID")
df = df.merge(subject_table, on="SubjectID")
print(f"Joined dataset: {len(df):,} rows")
print(f"Columns: {df.columns.tolist()}")

Joined dataset: 191,584 rows
Columns: ['Order', 'SubjectID', 'ToolInstances', 'ServerTimestamp', 'ServerTimezone', 'CourseID', 'CourseSectionID', 'TermID', 'AssignmentID', 'ProblemID', 'Attempt', 'CodeStateID', 'IsEventOrderingConsistent', 'EventType', 'Score', 'Compile.Result', 'CompileMessageType', 'CompileMessageData', 'EventID', 'ParentEventID', 'SourceLocation', 'Code', 'X-Grade']


In [4]:
df.head()

Unnamed: 0,Order,SubjectID,ToolInstances,ServerTimestamp,ServerTimezone,CourseID,CourseSectionID,TermID,AssignmentID,ProblemID,...,EventType,Score,Compile.Result,CompileMessageType,CompileMessageData,EventID,ParentEventID,SourceLocation,Code,X-Grade
0,0,14502,Java 8; CodeWorkout,2019-03-20T19:34:25,0,CS 1114,410.0,spring-2019,492.0,32,...,Run.Program,1.0,,,,32-84779,,,"public String plusOut(String str, String word)...",0.88
1,1,14502,Java 8; CodeWorkout,2019-03-20T19:34:25,0,CS 1114,410.0,spring-2019,492.0,32,...,Compile,,Success,,,32-84780,32-84779,,"public String plusOut(String str, String word)...",0.88
2,4,13499,Java 8; CodeWorkout,2019-03-20T21:13:09,0,CS 1114,410.0,spring-2019,492.0,32,...,Run.Program,0.0,,,,32-84783,,,"public String plusOut(String str, String word)...",0.92
3,5,13499,Java 8; CodeWorkout,2019-03-20T21:13:09,0,CS 1114,410.0,spring-2019,492.0,32,...,Compile,,Error,,,32-84784,32-84783,,"public String plusOut(String str, String word)...",0.92
4,6,13499,Java 8; CodeWorkout,2019-03-20T21:13:09,0,CS 1114,410.0,spring-2019,492.0,32,...,Compile.Error,,,SyntaxError,line 3: error: ';' expected,32-84785,32-84784,Text:3,"public String plusOut(String str, String word)...",0.92


In [5]:
print("Submissions per Problem:")
submissions_per_problem = df.groupby('ProblemID').size().sort_values(ascending=False)
print(submissions_per_problem.describe())
print(f"\nUnique problems: {df['ProblemID'].nunique()}")

Submissions per Problem:
count       50.00000
mean      3831.68000
std       2203.66066
min       1277.00000
25%       2467.50000
50%       3201.50000
75%       4095.00000
max      13388.00000
dtype: float64

Unique problems: 50


In [6]:
df['code_length'] = df['Code'].str.len()
df['estimated_tokens'] = df['code_length'] / 4  # rough estimate: 4 chars ≈ 1 token

print("Code length statistics:")
print(df['code_length'].describe())

print("\nEstimated tokens per submission:")
print(df['estimated_tokens'].describe())

Code length statistics:
count    191584.000000
mean        346.031052
std         169.383148
min          36.000000
25%         232.000000
50%         311.000000
75%         428.000000
max        2606.000000
Name: code_length, dtype: float64

Estimated tokens per submission:
count    191584.000000
mean         86.507763
std          42.345787
min           9.000000
25%          58.000000
50%          77.750000
75%         107.000000
max         651.500000
Name: estimated_tokens, dtype: float64


In [11]:
tokens_per_problem = df.groupby('ProblemID')['estimated_tokens'].sum()
print("Estimated total tokens per problem:")
print(tokens_per_problem.sort_index())

Estimated total tokens per problem:
ProblemID
1        94498.75
3       233605.00
5       222654.00
12      121131.25
13     1433729.50
17      112680.75
20      163734.50
21      108127.75
22      250964.75
24      444037.75
25      484812.25
28      238268.25
31      156493.50
32      684948.50
33      519293.75
34      478283.50
36      288322.25
37      356586.25
38      592020.25
39      333557.75
40      711218.75
41       62340.00
43      418748.25
44       93970.50
45      301765.75
46      246725.75
48      259983.50
49      228475.25
51      225623.00
56      237132.00
57       58235.50
64      349421.00
67      290354.00
70      369635.50
71      201724.50
100     275718.50
101     631779.00
102     616828.00
104     328096.25
106     267079.50
107     345635.50
108     477330.00
112     146479.50
118     151955.25
128     329013.25
232     844123.50
233     162500.00
234     234333.50
235     172979.50
236     216548.50
Name: estimated_tokens, dtype: float64


In [8]:
attempts_per_student_problem = df.groupby(['SubjectID', 'ProblemID']).size()
print("Attempts per student per problem:")
print(attempts_per_student_problem.describe())

Attempts per student per problem:
count    15375.000000
mean        12.460748
std         20.796445
min          2.000000
25%          2.000000
50%          5.000000
75%         14.000000
max        556.000000
dtype: float64


In [None]:
specific_problem = submissions_per_problem.idxmax()
problem_df = df[df['ProblemID'] == specific_problem]
plt.figure(figsize=(10, 6))
plt.hist(problem_df['estimated_tokens'], bins=30, color='skyblue', edgecolor='black')
plt.title(f'Estimated Tokens Distribution for Problem {specific_problem}')
plt.xlabel('Estimated Tokens')
plt.ylabel('Number of Submissions')
plt.grid(axis='y', alpha=0.75)

AttributeError: 'DataFrame' object has no attribute 'OrderID'