In [1]:
import random
import pandas as pd
import tqdm
import numpy as np

# Read the five listed columns from the CSV, then discard every row whose
# skill_id is missing.
csv_path = '../dataset.csv'
kept_columns = ['order_id', 'user_id', 'sequence_id', 'skill_id', 'correct']
data = pd.read_csv(csv_path, usecols=kept_columns, encoding='utf-8')
data = data.dropna(subset=['skill_id'])


In [2]:
# Distinct skill ids, in the order returned by Series.unique().
raw_question = data['skill_id'].unique().tolist()
num_skill = len(raw_question)

# Map every raw skill id to a dense index in the range 0 .. num_skill - 1.
questions = dict(zip(raw_question, range(num_skill)))

print("number of skills: %d" % num_skill)

number of skills: 9


In [3]:
from loader import parse_all_seq

# Build the per-sequence question/answer lists from the filtered frame.
# parse_all_seq lives in the project-local `loader` module, which is not visible here.
# Per the original author's note, the result has the form:
# [(question_sequence_0, answer_sequence_0), ..., (question_sequence_n, answer_sequence_n)]
# NOTE(review): how rows are grouped into sequences is decided inside parse_all_seq — confirm there.
sequences = parse_all_seq(data)

In [4]:
def train_test_split(data, train_size=.7, shuffle=True, seed=None):
    """Split a sequence of samples into a train part and a test part.

    The caller's ``data`` is never modified: when shuffling is requested the
    shuffle is applied to a private list copy, so re-running the split does
    not keep re-ordering the original container.

    Args:
        data: Sliceable sequence of samples (e.g. a list).
        train_size: Fraction of the samples assigned to the train part. The
            cut index is ``round(len(data) * train_size)``.
        shuffle: When true, samples are randomly permuted before cutting.
        seed: Optional seed for a dedicated random generator, making the
            permutation reproducible. When ``None`` the module-level
            ``random`` state is used, so a prior ``random.seed(...)`` call
            still controls the result.

    Returns:
        A ``(train, test)`` tuple: the first ``boundary`` samples and the
        remaining ones.
    """
    if shuffle:
        # Copy first: random.shuffle works in place and would otherwise
        # scramble the list owned by the caller.
        data = list(data)
        rng = random if seed is None else random.Random(seed)
        rng.shuffle(data)
    boundary = round(len(data) * train_size)
    return data[:boundary], data[boundary:]


# Seed Python's global RNG right before splitting so the shuffle performed by
# train_test_split, and therefore the train/test membership, is the same on
# every Restart & Run All.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)
train_sequences, test_sequences = train_test_split(sequences)

In [5]:
def sequences2tl(sequences, trgpath, append=False):
    """Write sequences to a text file in triple-line format.

    Every sequence produces three lines:
      1. the number of questions in the sequence,
      2. the questions joined by commas,
      3. the answers joined by commas.

    Args:
        sequences: Iterable of ``(questions, answers)`` pairs.
        trgpath: Path of the output file.
        append: When false (default) an existing file is overwritten, so
            re-running the cell does not duplicate records. Pass ``True`` to
            add to the end of an existing file instead.
    """
    mode = 'a' if append else 'w'
    with open(trgpath, mode, encoding='utf8') as f:
        for seq_questions, seq_answers in tqdm.tqdm(sequences, 'write into file: '):
            f.write(str(len(seq_questions)) + '\n')
            f.write(','.join(map(str, seq_questions)) + '\n')
            f.write(','.join(map(str, seq_answers)) + '\n')


# Persist both splits in triple-line format for use by other tasks.
for split_sequences, split_path in (
    (train_sequences, 'train.txt'),
    (test_sequences, 'test.txt'),
):
    sequences2tl(split_sequences, split_path)

write into file: 100%|██████████| 6300/6300 [00:00<00:00, 16747.43it/s]
write into file: 100%|██████████| 2700/2700 [00:00<00:00, 22555.46it/s]


In [6]:
from loader import encode_onehot

def _onehot(split_sequences):
    """Encode one split with encode_onehot, passing num_skill as the second argument."""
    return encode_onehot(split_sequences, num_skill)


train_data = _onehot(train_sequences)
test_data = _onehot(test_sequences)

In [7]:
# Report the shape of each encoded split.
for caption, array in (
    ("train data shape: ", train_data),
    ("test data shape: ", test_data),
):
    print(caption, array.shape)

train data shape:  (56736, 50, 18)
test data shape:  (23723, 50, 18)


In [8]:
# Write the one-hot encoded splits to .npy files in the working directory.
for npy_path, array in (
    ('train_data.npy', train_data),
    ('test_data.npy', test_data),
):
    np.save(npy_path, array)