In [1]:
import os
import pandas as pd
import numpy as np
from itertools import chain
pd.set_option("mode.chained_assignment", None) # ingore warning

数据读取

In [2]:
rawdatadir = '../../rawdata/'
middatadir = '../../middata/'

# 读取数据
df = pd.read_csv(os.path.join(rawdatadir, 'assist-0910/skill_builder_data_corrected_collapsed.csv'), encoding='latin1', low_memory=False)
df.head(3)

Unnamed: 0.1,Unnamed: 0,order_id,assignment_id,user_id,assistment_id,problem_id,original,correct,attempt_count,ms_first_response,...,hint_count,hint_total,overlap_time,template_id,answer_id,answer_text,first_action,bottom_hint,opportunity,opportunity_original
0,1,33022537,277618,64525,33139,51424,1,1,1,32454,...,0,3,32454,30799,,26,0,,1,1.0
1,2,33022709,277618,64525,33150,51435,1,1,1,4922,...,0,3,4922,30799,,55,0,,2,2.0
2,3,35450204,220674,70363,33159,51444,1,0,2,25390,...,0,3,42000,30799,,88,0,,1,1.0


In [3]:
# data select
data = df.copy()
use_cols = ['order_id', 'user_id', 'problem_id', 'skill_id', 'correct']
data = data[use_cols]

# process data
## step1.0: sort data according to order_id
data = data.sort_values('order_id')

## step1.1: Remove questions without skill
data = data.dropna(subset=['skill_id'])

# step1.2: 去除重复习题，仅保留第一次做答习题
data = data.groupby('user_id').apply(lambda x: x.drop_duplicates(subset='problem_id', keep='first')).reset_index(drop=True)
data = data.sort_values(by='order_id')

## step1.3: Remove users with answer nums < 5 | 用户交互的不同知识点需要大于5
data = data.groupby('user_id').filter(lambda q: len(q) >= 15).copy()
origin_data = data.groupby('user_id').filter(lambda q: len(set(list(chain.from_iterable(q['skill_id'])))) >= 5).copy()

## 验证部分：辅助长尾现象的认知诊断
# origin_data = origin_data.groupby('user_id').filter(lambda q: len(q) < 55).copy()

## step1.4: rename operate
origin_data = origin_data.rename(columns={'user_id':'stu_id', 
                                          'problem_id':'exer_id', 
                                          'skill_id': 'cpt_seq',
                                          'correct': 'label'})

## step1.4: list cpt_seq generate
origin_data['cpt_seq'] = origin_data['cpt_seq'].apply(lambda c: [int(x) for x in c.split('_')])

## step1.5: 重编码操作
def recodeEK(input):
    stumapdic = {}
    exermapdic = {}
    knowmapdic = {}

    # user map
    stus = input['stu_id'].unique().tolist()
    for index, s in enumerate(stus): stumapdic[s] = index
    input['stu_id'] = input['stu_id'].apply(lambda x: stumapdic[x])
    
    # exercise map
    exers = input['exer_id'].unique().tolist()
    for index, e in enumerate(exers): exermapdic[e] = index
    input['exer_id'] = input['exer_id'].apply(lambda x: exermapdic[x])

    # knowledge map
    kcInlogs = list(chain.from_iterable(input['cpt_seq']))
    unique_kcInlogs = list(set(kcInlogs))
    for index, k in enumerate(unique_kcInlogs): knowmapdic[k] = index
    input["cpt_seq"] = input["cpt_seq"].apply(lambda x: [knowmapdic[i] for i in x])

    return input

origin_data = recodeEK(origin_data)

origin_data.head(3)

Unnamed: 0,order_id,stu_id,exer_id,cpt_seq,label
20401,20224085,0,0,[89],0
20402,20224095,0,1,[89],1
20403,20224113,0,2,[89],1


In [4]:
origin_data_ = origin_data.copy()
origin_data_["cpt_seq"] = [','.join(map(str, seq)) for seq in origin_data_["cpt_seq"]]
origin_data_.to_csv(os.path.join(middatadir, 'assist-0910/assist-0910-all.csv'), index=False, encoding='utf-8')

In [5]:
# 统计学生数量
print(len(origin_data['stu_id'].unique()))
# 统计习题数量
print(len(origin_data['exer_id'].unique()))
# 统计知识点数量
from itertools import chain
merged_list = list(chain.from_iterable(origin_data['cpt_seq']))
unique_list = list(set(merged_list))
print(len(unique_list))
# 统计总logs
print(len(origin_data))

# 统计：每个学生的平均logs
print(len(origin_data)/len(origin_data['stu_id'].unique()))
# 统计：每个习题的平均概念
kcInlogs = list(chain.from_iterable(origin_data['cpt_seq']))
print(len(kcInlogs) / len(origin_data))
# 统计：每个知识点的平均logs
molecule = 0
denominator = 0
def algPKC(stu):
    global molecule, denominator
    stu_u = stu[['exer_id', 'cpt_seq']]
    stu_u = stu_u.explode('cpt_seq')
    molecule = molecule + len(stu_u)
    denominator = denominator + len(stu_u['cpt_seq'].unique())
    
origin_data.groupby('stu_id').apply(algPKC)
print(molecule / denominator)


2088
17567
123
256971
123.07040229885058
1.200458417486798
8.199537504651534


进行Q矩阵生成

In [6]:
data = origin_data.copy()

# step 1: 去除相同的习题
data = data.drop_duplicates("exer_id")

# step 2: 进行Q矩阵生成
def transform_Qdata(input_dict):
    output_dict = {}

    output_dict["exer_id:token"] = input_dict["exer_id"]
    # for cpt_seq, convert it to str
    output_dict["cpt_seq:token_seq"] = [','.join(map(str, seq)) for seq in input_dict["cpt_seq"]]

    return output_dict

dataQ_ = transform_Qdata(data[['exer_id', 'cpt_seq']])
dataQ = pd.DataFrame(dataQ_)
dataQ.head(3)

dataQ.to_csv(os.path.join(middatadir, 'assist-0910/assist-0910-Q.csv'), index=False, encoding='utf-8')

划分方式一：长尾分布场景

In [7]:
from collections import defaultdict
import random

random.seed(1234)
np.random.seed(1234)

data = origin_data.copy()

def split_train_test(originD, ratio=0.2):
    train = []
    test = []
    for _, stu_df in originD.groupby('stu_id'):
        train_stu = list(stu_df.iterrows())
        test_stu = []

        testNums = int(len(train_stu) * ratio)  # 测试集数量，用户角度
        dataIndices = random.sample(range(len(train_stu)), testNums)  # 获取测试集数据索引Indexs
        for idx in sorted(dataIndices, reverse=True):    # 逆序方式从train中pop数据，保证train数据相对索引不会发生变化
            test_stu.append(train_stu.pop(idx))

        train += [x[1] for x in train_stu]
        test += [x[1] for x in test_stu]

    return pd.DataFrame(train), pd.DataFrame(test)

assis0910train, assis0910test = split_train_test(data)

划分方式二：弱覆盖场景 | 方式(4)划分：用户角度，非完成cpt | 后面不会继续做

In [8]:
# from collections import defaultdict
# import random

# random.seed(1234)
# np.random.seed(1234)

# data = origin_data.copy()

# # split train:test  |  保证weak concept covery
# def split_train_test(oDAta, t_ratio=0.2):
#     train = []
#     test = []
#     num = 0

#     # 从学生角度, 划分数据为train set和test set
#     for _, stu_df in oDAta.groupby('stu_id'):
        
#         train_stu = list(stu_df.iterrows())  
#         test_stu = []
#         drop_exers = []  # 划分到测试集中习题对应的idx

#         # 统计知识点对应的习题数量，其中知识点作为key，而value是习题的编号  |  对于一题多知识点情况，只统计第一次习题
#         cpt_dict = defaultdict(list)
#         for e_idx, (_, records) in enumerate(train_stu):
#             cpt_dict[records['cpt_seq'][0]].append(e_idx)

#         # 将知识点对应的20%习题丢进测试集中
#         for _, e_idxs in  cpt_dict.items():
#             tmp_exers =  random.sample(e_idxs, int(len(e_idxs) * t_ratio))
#             drop_exers.extend(tmp_exers)
#         gap_num = int(len(train_stu) * t_ratio) - len(drop_exers)
#         if gap_num:
#             trian_exers = [e_idx for e_idx in range(len(train_stu)) if e_idx not in drop_exers]
#             drop_exers.extend(random.sample(trian_exers, gap_num))
        
#         # 将exer_id存在于drop_exers中的数据提取到test中 | 原理：索引号
#         for idx in  sorted(drop_exers, reverse=True):
#             test_stu.append(train_stu.pop(idx))

#         train += [x[1] for x in train_stu]
#         test += [x[1] for x in test_stu]

#     return pd.DataFrame(train), pd.DataFrame(test)

# assis0910train, assis0910test = split_train_test(data)

划分方式二：弱覆盖场景 | 方式(3)划分：用户角度，完成cpt划分，排序 | 后面不会继续做

In [9]:
# from collections import defaultdict
# import random

# random.seed(1234)
# np.random.seed(1234)

# data = origin_data.copy()

# # split train:test  |  保证weak concept covery
# def split_train_test(oDAta, t_ratio=0.2):
#     train = []
#     test = []

#     # 从学生角度, 划分数据为train set和test set
#     for _, stu_df in oDAta.groupby('stu_id'):
        
#         train_stu = list(stu_df.iterrows())  
#         test_stu = []

#         while len(test_stu) < len(stu_df) * t_ratio:
#             # 统计知识点对应的习题数量，其中知识点作为key，而value是习题的编号
#             cpt_dict = defaultdict(list)
#             for e_idx, (_, records) in enumerate(train_stu):
#                 for cpt in records['cpt_seq']:
#                     cpt_dict[cpt].append(e_idx)
#             # 依据知识点对应的习题数量对知识点进行排序
#             cpt_sorted = sorted(cpt_dict.items(), key=lambda x: len(x[1]))
#             # 将最少数量的知识点对应的所有习题划分测试集
#             cpt, indices = cpt_sorted[0]
#             if (len(indices) + len(test_stu)) > len(stu_df) * t_ratio:  # 跳出条件
#                 break
#             for idx in sorted(indices, reverse=True):  # 将exer_id存在于drop_exers中的数据提取到test中 | 原理：索引号
#                 test_stu.append(train_stu.pop(idx))
#         train += [x[1] for x in train_stu]
#         test += [x[1] for x in test_stu]

#     return pd.DataFrame(train), pd.DataFrame(test)

# assis0910train, assis0910test = split_train_test(data)

数据分析模块

In [8]:
from collections import defaultdict
# data = origin_data.copy() 
# data = assis0910train.copy() 
data = assis0910test.copy() 


## 统计每个学生交互的知识点数量和习题数量、每个习题交互的学生数量
data = data[['stu_id', 'exer_id', 'cpt_seq']]
data = pd.DataFrame({
    'stu_id': data['stu_id'].repeat(data['cpt_seq'].apply(len)),
    'exer_id': data['exer_id'].repeat(data['cpt_seq'].apply(len)),
    'cpt_id': data['cpt_seq'].explode()
})
# data = data.drop_duplicates(['stu_id', 'cpt_id']).reset_index(drop=True)

# # 统计习题所交互的学生数量
# exer_id = []
# inter_stuNum = []

# for _, stu_df in data.groupby('exer_id'):
#     exer_id.extend(stu_df['exer_id'].unique())
#     inter_stuNum.append(len(stu_df['stu_id'].unique()))
# sorted_data = sorted(zip(exer_id, inter_stuNum), key=lambda x:x[1], reverse=True)
# St_exer, St_stuNums = zip(*sorted_data)

# 统计学生所交互的习题数量和知识点数量
stu_id = []
inter_eNum = []
inter_cptNum = []

for _, stu_df in data.groupby('stu_id'):
    stu_id.extend(stu_df['stu_id'].unique())
    inter_eNum.append(len(stu_df['exer_id']))  # 用户交互的习题是独一无二的 | 不会出现重复的情景
    inter_cptNum.append(len(stu_df['cpt_id'].unique()))  # 用户交互的知识点存在重复的情况 | 不同的习题考察同一知识点

# 习题数量(知识点数量:[2])从大到小进行排序，学生编号对应变化
sorted_data = sorted(zip(stu_id, inter_eNum, inter_cptNum), key=lambda x:x[1], reverse=True)
St_Stu, St_eNums, St_cptNums = zip(*sorted_data)

# 统计：每个学生的平均交互知识点
print(sum(inter_cptNum) / len(origin_data['stu_id'].unique()))

11.296455938697317


In [11]:
eNumGroup = []  # 存放每组用户交互的习题数量 | 交互习题排序，前700学生作为Head学生

for i in range(35):
    start = i * 100
    end = (i + 1) * 100
    eNumGroup.append(sum(St_eNums[start : end]) / sum(St_eNums))
    if end == 3500:
        eNumGroup.append(sum(St_eNums[end:]) / sum(St_eNums))
        print(len(St_eNums[end:]))

0


In [None]:
cNumGroup = []

sorted_cptNum = sorted(St_cptNums, reverse=True)
for i in range(35):
    start = i * 100
    end = (i + 1) * 100
    cNumGroup.append(sum(sorted_cptNum[start : end]) / sum(sorted_cptNum))
    if end == 3500:
        cNumGroup.append(sum(sorted_cptNum[end:]) / sum(sorted_cptNum))
        print(len(sorted_cptNum[end:]))

In [None]:
exerK = 495  # 对应交互习题数量为50,即小于50的都是尾部用户
s
print(sum(St_eNums[:exerK]) / sum(St_eNums))
print(exerK / 2088)
print("-" * 100)
print(St_eNums[:exerK])

print("**" * 100)


0.6883065578087965
0.23706896551724138
----------------------------------------------------------------------------------------------------
********************************************************************************************************************************************************************************************************


In [None]:
min(St_eNums)

In [None]:
cptK = 737  # 区分头部学生和尾部学生

print(sum(sorted_cptNum[:cptK]) / sum(sorted_cptNum))
print(cptK / 3644)
print("-" * 100)
print()

In [None]:
headCptNum, tailCptNum = St_cptNums[:702], St_cptNums[702:]
headExerNum, tailExerNum = St_eNums[:702], St_eNums[702:]

In [None]:
HHIdx, HTIdx, THIdx, TTIdx = [], [], [], []

for stu, eNum, cNum in zip(St_Stu, St_eNums, St_cptNums):
    if eNum > 98 and cNum > 13: HHIdx.append(stu)
    elif eNum > 98 and cNum <= 13:  HTIdx.append(stu)
    elif eNum <= 98 and cNum > 13:  THIdx.append(stu)
    else: TTIdx.append(stu)

print(f"{len(HHIdx)}  {len(HTIdx)}  {len(THIdx)}  {len(TTIdx)}")

In [None]:
len(HHIdx) + len(HTIdx) + len(THIdx) + len(TTIdx)

In [None]:
import torch


In [None]:
a = [torch.tensor(0), torch.tensor(1), torch.tensor(3)]
b = {}

for idx, item in enumerate(a):
    b[item.item()] = idx

In [None]:
b

In [None]:
htIdx = [idx for idx, i in enumerate(headCptNum) if i > 13]

selected_elements = [headExerNum[i] for i in htIdx]
len(selected_elements)

In [None]:
import matplotlib.pyplot as plt

# 统计学生关联知识点情况
plt.figure(figsize=(32, 12))
# 绘制习题
plt.subplot(121)
bars = plt.bar(np.arange(len(eNumGroup)), eNumGroup, width=0.8, label='ExerNums')  # tick_label是设置横坐标标签
## 绘制知识点
plt.subplot(122)
bars = plt.bar(np.arange(len(cNumGroup)), cNumGroup, width=0.8, label='CptNums')  # tick_label是设置横坐标标签

# # 绘制折线图
# plots = plt.plot(np.arange(len(St_cptNums)), [number * 10 for number in St_cptNums], color="red", label='CptNums')

# plt.xlabel('Students')
# plt.ylabel('Nums')

# plt.axhline(y=55, color='red', linestyle='--', label='Threshold')
# plt.text(x=3400, y=59, s=r'$y=55$', fontsize=15, color='blue')

# 显示条形图
plt.legend()
plt.show()

训练集和测试集存储

In [None]:
## assis0910train to save
assis0910train_ = assis0910train.copy()
# step 1.1: sort train data according to order_id
assis0910train_ = assis0910train_.sort_values('order_id')
# step 1.2: save user info
assis0910train_ = assis0910train_[['stu_id', 'exer_id', 'label', 'cpt_seq']].rename(columns={'stu_id':'stu_id:token', 
                                                                                  'exer_id':'exer_id:token',
                                                                                  'label':'label:float',})

assis0910train_["cpt_seq"] = [','.join(map(str, seq)) for seq in assis0910train_["cpt_seq"]]
assis0910train_.to_csv(os.path.join(middatadir, 'assist-0910/assist-0910-train.inter.csv'), index=False, encoding='utf-8')

In [None]:
## assis0910test to save
assis0910test_ = assis0910test.copy()
# step 1.1: sort train data according to order_id
assis0910test_ = assis0910test_.sort_values('order_id')
# step 1.2: save user info
assis0910test_ = assis0910test_[['stu_id', 'exer_id', 'label', 'cpt_seq']].rename(columns={'stu_id':'stu_id:token', 
                                                                                'exer_id':'exer_id:token',
                                                                                'label':'label:float',})

assis0910test_["cpt_seq"] = [','.join(map(str, seq)) for seq in assis0910test_["cpt_seq"]]
assis0910test_.to_csv(os.path.join(middatadir, 'assist-0910/assist-0910-test.inter.csv'), index=False, encoding='utf-8')

In [None]:
print(len(assis0910train))
print(len(assis0910test))
print(len(assis0910train) / (len(assis0910test) + len(assis0910train)))
print(len(assis0910test) / (len(assis0910test) + len(assis0910train)))