In [1]:
import random
import pandas as pd
import tqdm
import numpy as np

data = pd.read_csv(
    'dataset.csv',
    usecols=['order_id', 'user_id', 'sequence_id', 'skill_id', 'correct'],
    encoding='utf-8'
).dropna(subset=['skill_id'])

In [2]:
raw_question = data.skill_id.unique().tolist()
num_skill = len(raw_question)

# question id from 0 to (num_skill - 1)
questions = { p: i for i, p in enumerate(raw_question) }

print("number of skills: %d" % num_skill)

number of skills: 9


In [3]:
def parse_all_seq(students):
    all_sequences = []
    for student_id in tqdm.tqdm(students, 'parse student sequence:\t'):
        student_sequence = parse_student_seq(data[data.user_id == student_id])
        all_sequences.extend([student_sequence])
    return all_sequences


def parse_student_seq(student):
    seq = student.sort_values('order_id')
    q = [questions[q] for q in seq.skill_id.tolist()]
    a = seq.correct.tolist()
    return q, a


# [(question_sequence_0, answer_sequence_0), ..., (question_sequence_n, answer_sequence_n)]
sequences = parse_all_seq(data.user_id.unique())

parse student sequence:	: 100%|██████████| 10000/10000 [00:21<00:00, 469.23it/s]


In [4]:
def train_test_split(data, train_size=.7, shuffle=True):
    if shuffle:
        random.shuffle(data)
    boundary = round(len(data) * train_size)
    return data[: boundary], data[boundary:]


train_sequences, test_sequences = train_test_split(sequences)

In [5]:
def sequences2tl(sequences, trgpath):
    with open(trgpath, 'a', encoding='utf8') as f:
        for seq in tqdm.tqdm(sequences, 'write into file: '):
            questions, answers = seq
            seq_len = len(questions)
            f.write(str(seq_len) + '\n')
            f.write(','.join([str(q) for q in questions]) + '\n')
            f.write(','.join([str(a) for a in answers]) + '\n')


# save triple line format for other tasks
sequences2tl(train_sequences, 'train.txt')
sequences2tl(test_sequences, 'test.txt')

write into file: 100%|██████████| 7000/7000 [00:00<00:00, 18646.18it/s]
write into file: 100%|██████████| 3000/3000 [00:00<00:00, 19408.55it/s]


In [6]:
MAX_STEP = 50
NUM_QUESTIONS = num_skill


def encode_onehot(sequences, max_step, num_questions):
    result = []

    for q, a in tqdm.tqdm(sequences, 'convert to one-hot format: '):
        length = len(q)
        # append questions' and answers' length to an integer multiple of max_step
        mod = 0 if length % max_step == 0 else (max_step - length % max_step)
        onehot = np.zeros(shape=[length + mod, 2 * num_questions])
        for i, q_id in enumerate(q):
            index = int(q_id if a[i] > 0 else q_id + num_questions)
            onehot[i][index] = 1
        result = np.append(result, onehot)
    
    return result.reshape(-1, max_step, 2 * num_questions)


train_data = encode_onehot(train_sequences, MAX_STEP, NUM_QUESTIONS)
test_data = encode_onehot(test_sequences, MAX_STEP, NUM_QUESTIONS)

convert to one-hot format: 100%|██████████| 7000/7000 [04:02<00:00, 28.92it/s] 
convert to one-hot format: 100%|██████████| 3000/3000 [00:46<00:00, 63.87it/s] 


In [7]:
# save onehot data
np.save('train_data.npy', train_data)
np.save('test_data.npy', test_data)