In [1]:
import os
import pandas as pd
import numpy as np
from itertools import chain
from sklearn.model_selection import train_test_split, StratifiedKFold
pd.set_option("mode.chained_assignment", None) # ingore warning

1. Data Loading and Processing

In [2]:
rawdatadir = '../../rawdata/'
middatadir = '../../middata/'

# Read data
df = pd.read_csv(os.path.join(rawdatadir, 'assist-0910/skill_builder_data_corrected_collapsed.csv'), encoding='latin1', low_memory=False)
df.head(3)

Unnamed: 0.1,Unnamed: 0,order_id,assignment_id,user_id,assistment_id,problem_id,original,correct,attempt_count,ms_first_response,...,hint_count,hint_total,overlap_time,template_id,answer_id,answer_text,first_action,bottom_hint,opportunity,opportunity_original
0,1,33022537,277618,64525,33139,51424,1,1,1,32454,...,0,3,32454,30799,,26,0,,1,1.0
1,2,33022709,277618,64525,33150,51435,1,1,1,4922,...,0,3,4922,30799,,55,0,,2,2.0
2,3,35450204,220674,70363,33159,51444,1,0,2,25390,...,0,3,42000,30799,,88,0,,1,1.0


In [3]:
# Data cols select
data = df.copy()
use_cols = ['user_id', 'problem_id', 'skill_id', 'correct']
data = data[use_cols]

# Data Processing
## step1: Remove records of exercises that do not correspond to knowledge concepts.
data = data.dropna(subset=['skill_id'])
## step2: Remove repetitive exercises and keep only the first time you answer an exercise.
data = data.groupby('user_id').apply(lambda x: x.drop_duplicates(subset='problem_id', keep='first')).reset_index(drop=True)
## step3: Rename operate
origin_data = data.rename(columns={'user_id':'stu_id', 
                                    'problem_id':'exer_id', 
                                    'skill_id': 'cpt_seq',
                                    'correct': 'label'})
## step4: Convert knowledge into list
origin_data['cpt_seq'] = origin_data['cpt_seq'].apply(lambda c: [int(x) for x in c.split('_')])
## step5: Filter interactions with less than 15 for each student.
origin_data = origin_data.groupby('stu_id').filter(lambda q: len(q) >= 15).copy()
## step6: Recode operation
def recodeEK(input):
    stumapdic = {}
    exermapdic = {}
    knowmapdic = {}
    # Student mapping
    stus = input['stu_id'].unique().tolist()
    for index, s in enumerate(stus): stumapdic[s] = index
    input['stu_id'] = input['stu_id'].apply(lambda x: stumapdic[x])
    # Exercise Mapping
    exers = input['exer_id'].unique().tolist()
    for index, e in enumerate(exers): exermapdic[e] = index
    input['exer_id'] = input['exer_id'].apply(lambda x: exermapdic[x])
    # Knowledge Mapping
    kcInlogs = list(chain.from_iterable(input['cpt_seq']))
    unique_kcInlogs = list(set(kcInlogs))
    for index, k in enumerate(unique_kcInlogs): knowmapdic[k] = index
    input["cpt_seq"] = input["cpt_seq"].apply(lambda x: [knowmapdic[i] for i in x])
    return input
origin_data = recodeEK(origin_data)

origin_data.head(3)

  data = data.groupby('user_id').apply(lambda x: x.drop_duplicates(subset='problem_id', keep='first')).reset_index(drop=True)


Unnamed: 0,stu_id,exer_id,cpt_seq,label
0,0,0,"[1, 24, 44]",0
1,0,1,"[1, 24, 44]",1
2,0,2,"[1, 24, 44]",0


In [4]:
# Statistics data information
## Student number
print(len(origin_data['stu_id'].unique()))
## Exercise number
print(len(origin_data['exer_id'].unique()))
## Knowledge number
from itertools import chain
merged_list = list(chain.from_iterable(origin_data['cpt_seq']))
unique_list = list(set(merged_list))
print(len(unique_list))
# Interactions
print(len(origin_data))
## Average logs per student
print(len(origin_data)/len(origin_data['stu_id'].unique()))

2493
17676
123
267423
107.26955475330927


2. Data Split Operation

In [5]:
# Q Matrix Generation
data = origin_data.copy()

data = data.drop_duplicates("exer_id")
def transform_Qdata(input_dict):
    output_dict = {}
    output_dict["exer_id:token"] = input_dict["exer_id"]
    output_dict["cpt_seq:token_seq"] = [','.join(map(str, seq)) for seq in input_dict["cpt_seq"]]
    return output_dict
dataQ_ = transform_Qdata(data[['exer_id', 'cpt_seq']])
dataQ = pd.DataFrame(dataQ_)

dataQ.to_csv(os.path.join(middatadir, 'assist-0910/assist-0910-Q.csv'), index=False, encoding='utf-8')

In [6]:
# Split data into train/test with ratio p.
data = origin_data.copy()
def split_train_test(originD, p):
    train = []
    test = []
    for _, stu_df in originD.groupby('stu_id'):
        stu_df = stu_df.sample(frac=1, random_state=2024)  # seed=2024 for reproducibility
        train_stu, test_stu = None, None
        train_test_threshold = int(len(stu_df) * p)
        test_stu = stu_df.iloc[:train_test_threshold]
        train_stu = stu_df.iloc[train_test_threshold:]
        if train_stu is not None:
            train.append(train_stu)
        if test_stu is not None:
            test.append(test_stu)

    return pd.concat(train), pd.concat(test)
assist0910train, assist0910test = split_train_test(data, p=0.2)  # train/test ratio p=0.2

3. Split Data to Save

In [7]:
# assis0910train to save
assis0910train_ = assist0910train.copy()
assis0910train_ = assis0910train_[['stu_id', 'exer_id', 'label', 'cpt_seq']].rename(columns={'stu_id':'stu_id:token', 
                                                                                  'exer_id':'exer_id:token',
                                                                                  'label':'label:float',})
assis0910train_["cpt_seq"] = [','.join(map(str, seq)) for seq in assis0910train_["cpt_seq"]]
assis0910train_.to_csv(os.path.join(middatadir, 'assist-0910/assist-0910-train.inter.csv'), index=False, encoding='utf-8')

In [8]:
# assis0910test to save
assis0910test_ = assist0910test.copy()
assis0910test_ = assis0910test_[['stu_id', 'exer_id', 'label', 'cpt_seq']].rename(columns={'stu_id':'stu_id:token', 
                                                                                'exer_id':'exer_id:token',
                                                                                'label':'label:float',})
assis0910test_["cpt_seq"] = [','.join(map(str, seq)) for seq in assis0910test_["cpt_seq"]]
assis0910test_.to_csv(os.path.join(middatadir, 'assist-0910/assist-0910-test.inter.csv'), index=False, encoding='utf-8')

In [9]:
print(len(assist0910train))
print(len(assist0910test))

214874
52549
