In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
class AssismentData():
    def __init__(self):
        self.data = pd.read_csv("/content/drive/My Drive/DKT/2015_100_skill_builders_main_problems.csv")

        self.data.dropna()

        self.data["user_id"],_= pd.factorize(self.data["user_id"])
        self.data["sequence_id"], _ = pd.factorize(self.data["sequence_id"])

        self.data = self.data.drop(columns="log_id",axis=1)

        self.data = self.data.groupby("user_id").filter(lambda q:len(q)>1).copy()

        self.seq = self.data.groupby('user_id').apply(
            lambda r: (
                r['user_id'].values[:],
                r['sequence_id'].values[:],
                r['correct'].values[:],
            )
        )

    def datasetReturn(self,shuffle=None,batch_size=32,val_data=None):

        dataset = tf.data.Dataset.from_generator(lambda :self.seq,output_types=(tf.float32,tf.float32,tf.float32))

        if shuffle:
            dataset = dataset.shuffle(buffer_size=shuffle)

        # user_depth = self.data['user_id'].max() 
        # skill_depth =  self.data['sequence_id'].max() 

        # dataset = dataset.map(
        #     # 一个预处理
        #     lambda userid, skill, label: (
        #         tf.one_hot(userid, depth=user_depth),
        #         tf.one_hot(skill, depth=skill_depth),
        #         label
        #         )
        # )
        MASK_VALUE = -1.0
        dataset = dataset.padded_batch(
            batch_size=32,
            padding_values=(MASK_VALUE , MASK_VALUE, MASK_VALUE),
            padded_shapes=([None],[None],[None]),
            drop_remainder=True
        )
        i = 0
        for l in dataset.as_numpy_iterator():
          i += 1

        test_size = int(np.ceil(i*0.2))
        train_size = i - test_size
        val_size = int(np.ceil(i*0.2))
        train_size = train_size - val_size
   
        test_data = dataset.take(test_size)
        dataset = dataset.skip(test_size)

        val_data = dataset.take(val_size)
        dataset = dataset.skip(val_size)

        # train_data = dataset.take(train_size)
        

        return dataset,test_data,val_data


In [2]:

ass = AssismentData()
train_data,test_data,val_data = ass.datasetReturn()
val_log = 'log/val'
train_loss_log = 'log/train'
summary_writer = tf.summary.create_file_writer(val_log)

In [3]:
class DKT(tf.keras.models.Model):
    def __init__(self,total_user,total_skill,embedding_size):
        super(DKT, self).__init__(name="DKTModel")

        self.mask = tf.keras.layers.Masking(mask_value=-1.0)

        # 两个嵌入层
        self.user_embedding = tf.keras.layers.Embedding(total_user,embedding_size)
        self.skill_embedding = tf.keras.layers.Embedding(total_skill,embedding_size)
        # RNN
        self.rnn = tf.keras.layers.LSTM(units=64,return_sequences=True,dropout=0.3)

        #dense
        self.dense = tf.keras.layers.Dense(2, activation='sigmoid')

        self.distribute = tf.keras.layers.TimeDistributed(self.dense)

        self.softmax = tf.keras.layers.Softmax()
        

    def call(self,userid,skillid):
        userid,skillid = tf.expand_dims(userid,axis=-1),tf.expand_dims(skillid,axis=-1)
        userid = self.mask(userid)
        skillid = self.mask(skillid)
  
        user_vector = self.user_embedding(userid)
       
        skill_vector = self.skill_embedding(skillid)

        x = tf.concat([user_vector,skill_vector],axis=-1)

        x = tf.squeeze(x,axis=-2)
        x = self.rnn(x)

        x = self.distribute(x)

        y = self.softmax(x)

        return y
    


In [4]:

dkt=DKT(ass.data["user_id"].max()+1,ass.data["sequence_id"].max()+1,64)
AUC = tf.keras.metrics.AUC()
VAUC = tf.keras.metrics.AUC()
SCC = tf.keras.metrics.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)
def test_one_step(userid,skillid,label):
  loss = dkt(userid,skillid)

  label = tf.expand_dims(label,axis=-1)
  
  mask = 1. - tf.cast(tf.equal(label, -1.),label.dtype)
  mask = tf.squeeze(mask)
  label = tf.boolean_mask(label,mask=mask)
  loss = tf.boolean_mask(loss,mask=mask)
  VAUC.update_state(tf.one_hot(tf.squeeze(tf.cast(label,tf.int32),axis=-1),depth=2),loss)
  
def train_one_step(userid,skillid,label):
  with tf.GradientTape() as tape:
    loss = dkt(userid,skillid)
    
    label = tf.expand_dims(label,axis=-1)
    
    mask = 1. - tf.cast(tf.equal(label, -1.),label.dtype)
    mask = tf.squeeze(mask)
    
    #print(mask)
    label = tf.boolean_mask(label,mask=mask)
    loss = tf.boolean_mask(loss,mask=mask)
    #print(tf.one_hot(tf.squeeze(tf.cast(label,tf.int32),axis=-1),depth=2))
    #print(label*mask)
    # print(loss)
    # print(label)
    # print(label*mask)
    AUC.update_state(tf.one_hot(tf.squeeze(tf.cast(label,tf.int32),axis=-1),depth=2),loss)
    SCC.update_state(label,loss)
    # print(loss.shape)
    loss = tf.keras.losses.sparse_categorical_crossentropy(label,loss)
    # print(loss.shape)
    # print(tf.squeeze(mask))
    loss = tf.reduce_sum(loss)

    gradients = tape.gradient(loss,dkt.trainable_variables)
    # 反向传播，自动微分计算
    optimizer.apply_gradients(zip(gradients,dkt.trainable_variables))
  

In [None]:
pred =tf.convert_to_tensor([[0.0,0.0],[0.0,0.0]])
label=tf.convert_to_tensor([0,0])
print(label)
print(pred)
loss=tf.keras.losses.sparse_categorical_crossentropy(label,pred)
# pred = tf.convert_to_tensor([[0.9, 0.05, 0.05], [0.5, 0.89, 0.6], [2.05, 0.01, 0.94]])
# label = tf.convert_to_tensor([2, 1, 2])
# loss = tf.keras.losses.sparse_categorical_crossentropy(label, pred)
print(loss)
pred =tf.convert_to_tensor([[0.0,0.0],[0.0,0.0]])
label=tf.convert_to_tensor([[0,0],[0,0]])
loss = tf.keras.metrics.AUC()(label,pred)
print(loss)

tf.Tensor([0 0], shape=(2,), dtype=int32)
tf.Tensor(
[[0. 0.]
 [0. 0.]], shape=(2, 2), dtype=float32)
tf.Tensor([0.6931472 0.6931472], shape=(2,), dtype=float32)
tf.Tensor(0.0, shape=(), dtype=float32)


In [None]:
i = 0
for u,v,l in test_data.take(1).as_numpy_iterator():
  print(dkt(u,v))
  print(l)
  print(tf.one_hot(l,depth=2))
print(i)
for u,v,l in val_data.as_numpy_iterator():
  i +=1
print(i)

In [5]:
for epoch in range(10):
  # train_data = train_data.shuffle(32)
  AUC.reset_states()
  VAUC.reset_states()
  SCC.reset_states()
  for u,v,l in train_data.as_numpy_iterator():
    train_one_step(u,v,l)

  for u,v,l in val_data.as_numpy_iterator():
    test_one_step(u,v,l)

  with summary_writer.as_default():
    tf.summary.scalar('train_auc',AUC.result(),step=epoch)
    tf.summary.scalar('val_auc',VAUC.result(),step=epoch)

  print(SCC.result(),AUC.result(),VAUC.result())

tf.Tensor(0.597481, shape=(), dtype=float32) tf.Tensor(0.7346457, shape=(), dtype=float32) tf.Tensor(0.7458189, shape=(), dtype=float32)
tf.Tensor(0.57582974, shape=(), dtype=float32) tf.Tensor(0.77768105, shape=(), dtype=float32) tf.Tensor(0.75025725, shape=(), dtype=float32)
tf.Tensor(0.55926365, shape=(), dtype=float32) tf.Tensor(0.7956676, shape=(), dtype=float32) tf.Tensor(0.7369466, shape=(), dtype=float32)
tf.Tensor(0.5514222, shape=(), dtype=float32) tf.Tensor(0.80268157, shape=(), dtype=float32) tf.Tensor(0.7418804, shape=(), dtype=float32)
tf.Tensor(0.54437804, shape=(), dtype=float32) tf.Tensor(0.8093544, shape=(), dtype=float32) tf.Tensor(0.7313023, shape=(), dtype=float32)
tf.Tensor(0.5380892, shape=(), dtype=float32) tf.Tensor(0.8146082, shape=(), dtype=float32) tf.Tensor(0.7367939, shape=(), dtype=float32)
tf.Tensor(0.53364533, shape=(), dtype=float32) tf.Tensor(0.8184457, shape=(), dtype=float32) tf.Tensor(0.73359203, shape=(), dtype=float32)
tf.Tensor(0.529352, shape=(

In [6]:
VAUC.reset_states()
for u,v,l in test_data.as_numpy_iterator():
  test_one_step(u,v,l)
print(VAUC.result())

tf.Tensor(0.73262596, shape=(), dtype=float32)


In [9]:
user_embedding = dkt.user_embedding.get_weights()[0]
f = open("/content/embedding.csv","w")
np.savetxt(f,user_embedding,delimiter='\t')