In [None]:
import random
import pandas as pd
import tqdm
import numpy as np

# Load only the columns used below and discard rows that carry no skill id.
data = (
    pd.read_csv(
        'dataset.csv',
        encoding='utf-8',
        usecols=['order_id', 'user_id', 'sequence_id', 'skill_id', 'correct'],
    )
    .dropna(subset=['skill_id'])
)

In [None]:
# Distinct skill ids, in the order they first occur in `data`.
raw_question = data.skill_id.unique().tolist()
num_skill = len(raw_question)

# Map every raw skill id to a dense question id in [0, num_skill - 1].
questions = dict(zip(raw_question, range(num_skill)))

print("number of skills: %d" % num_skill)

In [None]:
def parse_all_seq(students):
    """Build the (question_ids, answers) sequence of every requested student.

    The module-level ``data`` frame is grouped by ``user_id`` once and each
    student's rows are then fetched by position, instead of re-scanning the
    whole frame with a boolean mask for every student.

    Args:
        students: iterable of user ids to extract.

    Returns:
        A list with one ``parse_student_seq`` result per entry of
        ``students``, in the same order. An id that has no rows in ``data``
        is parsed from an empty frame.
    """
    # Row positions per user id, computed in a single pass over `data`.
    rows_by_student = data.groupby('user_id', sort=False).indices

    all_sequences = []
    for student_id in tqdm.tqdm(students, 'parse student sequence:\t'):
        # Unknown ids fall back to an empty selection rather than raising.
        positions = rows_by_student.get(student_id, [])
        all_sequences.append(parse_student_seq(data.iloc[positions]))
    return all_sequences


def parse_student_seq(student):
    """Turn one student's rows into parallel question-id and answer lists.

    Rows are ordered by ``order_id``; each ``skill_id`` is translated through
    the module-level ``questions`` mapping, and ``correct`` is taken as is.

    Returns:
        ``(question_ids, answers)`` for the ordered rows.
    """
    ordered = student.sort_values('order_id')

    question_ids = []
    for skill in ordered.skill_id.tolist():
        question_ids.append(questions[skill])

    answers = ordered.correct.tolist()
    return question_ids, answers


# Parse every student found in `data`, one entry per distinct user_id.
# Resulting layout:
# [(question_sequence_0, answer_sequence_0), ..., (question_sequence_n, answer_sequence_n)]
sequences = parse_all_seq(data.user_id.unique())

In [None]:
def train_test_split(data, train_size=.7, shuffle=True, seed=None):
    """Split a sequence of samples into a leading train part and a trailing test part.

    Args:
        data: sliceable sequence of samples (e.g. a list). It is never modified.
        train_size: fraction of the samples placed in the train part; the cut
            index is ``round(len(data) * train_size)``.
        shuffle: when true, the split is taken from a shuffled copy of ``data``.
        seed: optional seed for a private ``random.Random`` used for the
            shuffle. ``None`` (the default) shuffles with the global
            ``random`` module state.

    Returns:
        ``(train, test)``. With ``shuffle=True`` both are new lists; with
        ``shuffle=False`` they are plain slices of ``data``.
    """
    samples = data
    if shuffle:
        # Shuffle a copy: random.shuffle works in place and would otherwise
        # reorder the caller's list (and fail on immutable sequences).
        samples = list(data)
        shuffler = random if seed is None else random.Random(seed)
        shuffler.shuffle(samples)
    boundary = round(len(samples) * train_size)
    return samples[:boundary], samples[boundary:]


# Seed the global `random` generator before splitting: train_test_split shuffles
# through the `random` module, so without a fixed seed every run of the notebook
# produces a different train/test partition.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)
train_sequences, test_sequences = train_test_split(sequences)

In [None]:
def sequences2tl(sequences, trgpath, append=False):
    """Write sequences to ``trgpath`` in triple-line format.

    Every sequence produces three lines: the number of questions, the
    comma-separated question ids, and the comma-separated answers.

    Args:
        sequences: iterable of ``(question_ids, answers)`` pairs.
        trgpath: path of the UTF-8 text file to write.
        append: ``False`` (the default) truncates the file first, so calling
            the function again with the same path does not duplicate
            records. Pass ``True`` to add to the end of an existing file.
    """
    mode = 'a' if append else 'w'
    with open(trgpath, mode, encoding='utf8') as f:
        for question_ids, answers in tqdm.tqdm(sequences, 'write into file: '):
            record = [
                str(len(question_ids)),
                ','.join(str(q) for q in question_ids),
                ','.join(str(a) for a in answers),
            ]
            f.write('\n'.join(record) + '\n')


# save triple line format for other tasks
# Each sequence is stored as three lines: its length, the comma-separated
# question ids, and the comma-separated answers.
sequences2tl(train_sequences, 'train.txt')
sequences2tl(test_sequences, 'test.txt')

In [None]:
# Number of time steps per encoded chunk (passed to encode_onehot as max_step).
MAX_STEP = 50
# One question id per distinct skill; the one-hot vectors are 2 * NUM_QUESTIONS wide.
NUM_QUESTIONS = num_skill

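# Legacy implementation, kept for reference only. It is superseded by the
# encode_onehot defined below, which pre-allocates its output instead of
# growing it with np.append for every sequence.
# def encode_onehot(sequences, max_step, num_questions):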
#     result = []

#     for q, a in tqdm.tqdm(sequences, 'convert to one-hot format: '):
#         length = len(q)
#         # append questions' and answers' length to an integer multiple of max_step
#         mod = 0 if length % max_step == 0 else (max_step - length % max_step)
#         onehot = np.zeros(shape=[length + mod, 2 * num_questions])
#         for i, q_id in enumerate(q):
#             index = int(q_id if a[i] > 0 else q_id + num_questions)
#             onehot[i][index] = 1
#         result = np.append(result, onehot)
    
#     return result.reshape(-1, max_step, 2 * num_questions)


def encode_onehot(sequences, max_step, num_questions):
    """One-hot encode (questions, answers) sequences into fixed-length chunks.

    Every sequence is given a whole number of ``max_step``-row chunks (at
    least one, even when it is empty); rows beyond its length stay all-zero.
    A step with question id ``q`` sets column ``q`` when its answer is
    ``> 0`` and column ``q + num_questions`` otherwise.

    Returns:
        Array of shape ``(total_chunks, max_step, 2 * num_questions)``.
    """
    width = 2 * num_questions

    # Rows reserved for each sequence: its length rounded up to a multiple of
    # max_step, never fewer than one chunk.
    rows_per_seq = []
    for q, _ in sequences:
        n_chunks = max(1, (len(q) + max_step - 1) // max_step)
        rows_per_seq.append(n_chunks * max_step)

    # Allocate the whole output up front and fill it in place.
    encoded = np.zeros((sum(rows_per_seq), width))

    first_row = 0
    progress = tqdm.tqdm(sequences, 'convert to one-hot format: ')
    for (q, a), n_rows in zip(progress, rows_per_seq):
        for step, q_id in enumerate(q):
            if a[step] > 0:
                column = int(q_id)
            else:
                column = int(q_id + num_questions)
            encoded[first_row + step, column] = 1
        # The next sequence starts on a fresh chunk boundary.
        first_row += n_rows

    return encoded.reshape(-1, max_step, width)

# Encode both splits; each result has shape (num_chunks, MAX_STEP, 2 * NUM_QUESTIONS).
train_data = encode_onehot(train_sequences, MAX_STEP, NUM_QUESTIONS)
test_data = encode_onehot(test_sequences, MAX_STEP, NUM_QUESTIONS)

In [None]:
# Sanity check: both shapes should read (num_chunks, MAX_STEP, 2 * NUM_QUESTIONS).
print("train data shape: ", train_data.shape)
print("test data shape: ", test_data.shape)

In [None]:
# save onehot data
# np.save stores each array in NumPy's binary .npy format at the given path.
np.save('train_data.npy', train_data)
np.save('test_data.npy', test_data)