In [6]:
import random
import pandas as pd
import tqdm
import numpy as np

# Load only the four columns used by the rest of the notebook, then discard
# rows whose skill_id is missing.
data = pd.read_csv(
    '../simulation_dataset.csv',
    encoding="ISO-8859-1",
    usecols=['order_id', 'user_id', 'skill_id', 'correct'],
)
data = data.dropna(subset=['skill_id'])

# Show the first rows as this cell's output.
data.head()

Unnamed: 0,order_id,user_id,skill_id,correct
0,1,1,0,0
1,2,1,1,0
2,3,1,2,0
3,4,1,3,0
4,5,1,4,1


In [7]:
# Distinct skill ids found in the loaded frame.
raw_question = data['skill_id'].unique().tolist()
num_skill = len(raw_question)

# Map every raw skill id to a dense question id in 0 .. (num_skill - 1).
questions = dict(zip(raw_question, range(num_skill)))

print("number of skills: %d" % num_skill)

number of skills: 9


In [8]:
def parse_all_seq(students):
    """Build one parsed sequence per student id.

    Parameters
    ----------
    students : iterable
        User ids to look up in the ``user_id`` column of the module-level
        ``data`` frame.

    Returns
    -------
    list
        One entry per id in ``students``, in the same order. Each entry is
        whatever ``parse_student_seq`` returns for that student's rows; ids
        with no rows are parsed from an empty frame.
    """
    # Group once up front: filtering the whole frame with a boolean mask for
    # every student costs (rows x students) comparisons.
    row_positions = data.groupby('user_id').indices
    # Unknown ids get an empty frame with the same columns, which is what the
    # mask-based lookup yields for them.
    no_rows = data.iloc[0:0]

    all_sequences = []
    for student_id in tqdm.tqdm(students, 'parse student sequence:\t'):
        positions = row_positions.get(student_id)
        # Positions are ascending, so rows keep their original relative order.
        student_rows = no_rows if positions is None else data.iloc[positions]
        all_sequences.append(parse_student_seq(student_rows))
    return all_sequences


def parse_student_seq(student):
    """Convert one student's rows into parallel question/answer lists.

    The rows are ordered by ``order_id``. Each ``skill_id`` is translated
    through the module-level ``questions`` mapping, and the ``correct``
    column is returned unchanged.

    Returns
    -------
    tuple
        ``(question_ids, answers)`` -- two lists with one element per row.
    """
    ordered = student.sort_values('order_id')
    skill_ids = ordered.skill_id.tolist()
    question_ids = [questions[skill] for skill in skill_ids]
    answers = ordered.correct.tolist()
    return question_ids, answers


# Parse every distinct user_id found in `data`. The result has the shape
# [(question_sequence_0, answer_sequence_0), ..., (question_sequence_n, answer_sequence_n)]
sequences = parse_all_seq(data.user_id.unique())

parse student sequence:	: 100%|██████████| 5000/5000 [00:07<00:00, 684.51it/s]


In [9]:
def train_test_split(data, train_size=.7, shuffle=True, seed=None):
    """Split a sequence into a leading train part and a trailing test part.

    Parameters
    ----------
    data : sequence
        Items to split. It is never reordered in place.
    train_size : float, default 0.7
        Fraction of the items placed in the train part; the cut index is
        ``round(len(data) * train_size)``.
    shuffle : bool, default True
        Shuffle a copy of ``data`` before cutting. When False, ``data`` is
        sliced as-is.
    seed : optional
        When given, shuffling uses a private ``random.Random(seed)`` so the
        split is reproducible. When None, the module-level ``random`` state
        is used, so an earlier ``random.seed(...)`` call still applies.

    Returns
    -------
    tuple
        ``(train, test)``. Both are lists when ``shuffle`` is True, otherwise
        slices of ``data``.
    """
    if shuffle:
        # Shuffle a copy: random.shuffle works in place and would otherwise
        # scramble the caller's sequence as a hidden side effect.
        data = list(data)
        rng = random if seed is None else random.Random(seed)
        rng.shuffle(data)
    boundary = round(len(data) * train_size)
    return data[:boundary], data[boundary:]


# Seed the shuffle so the train/test partition is the same on every
# Restart & Run All; change the value to draw a different split.
random.seed(42)
train_sequences, test_sequences = train_test_split(sequences)

In [10]:
def sequences2tl(sequences, trgpath, mode='w'):
    """Write sequences to ``trgpath`` in triple-line format.

    Every sequence produces three lines: the number of questions, the
    comma-separated question ids, and the comma-separated answers.

    Parameters
    ----------
    sequences : iterable
        ``(questions, answers)`` pairs.
    trgpath : str or path-like
        Destination file, written as UTF-8.
    mode : str, default 'w'
        File mode handed to ``open``. The default truncates the file, so
        re-running a cell replaces its output instead of stacking duplicate
        records onto the previous run. Pass ``'a'`` to append.
    """
    with open(trgpath, mode, encoding='utf8') as f:
        for question_ids, answers in tqdm.tqdm(sequences, 'write into file: '):
            record = (
                str(len(question_ids)),
                ','.join(map(str, question_ids)),
                ','.join(map(str, answers)),
            )
            f.write('\n'.join(record) + '\n')


# Save the full train and test splits in triple-line format for other tasks.
sequences2tl(train_sequences, 'train.txt')
sequences2tl(test_sequences, 'test.txt')


# Keep only the leading `percentage` share of each split so the example runs
# faster, and save those subsets under the example_* file names.
# NOTE(review): train_sequences / test_sequences are rebound to the reduced
# lists here, so re-running this cell on its own shrinks them again. Re-run
# the split cell first to get the full splits back.
percentage = 0.05
train_sequences = train_sequences[:int(len(train_sequences)*percentage)]
test_sequences = test_sequences[:int(len(test_sequences)*percentage)]
sequences2tl(train_sequences, 'example_train.txt')
sequences2tl(test_sequences, 'example_test.txt')

write into file: 100%|██████████| 3500/3500 [00:00<00:00, 18735.52it/s]
write into file: 100%|██████████| 1500/1500 [00:00<00:00, 20607.18it/s]
write into file: 100%|██████████| 175/175 [00:00<00:00, 18686.91it/s]
write into file: 100%|██████████| 75/75 [00:00<00:00, 17726.41it/s]
