In [1]:
import pandas as pd
import pickle
import numpy as np
import dgl
from dgl.data import DGLDataset
import torch
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def one_hot_encoding(n, n_to_index):
    one_hot_vector = [0]*(len(n_to_index))
    index = n_to_index[n]
    one_hot_vector[index] = 1
    return one_hot_vector

# learning_sequence에 이미 들어 있는 activity를 제외하고 다음 activity를 추가함
def add_learning_sequence(num_next_learning, learning_sequence):
    for i in range(len(num_next_learning)):
        if num_next_learning[i][0] in learning_sequence:
            pass
        else:
            learning_sequence.append(num_next_learning[i][0])
            break
    return learning_sequence, num_next_learning[i][0]

# list에 있는 activity들의 개수를 count함
def count_activity(first_learning):
    num_activity = {}
    for activity in list(set(first_learning)):
        num_activity[activity] = first_learning.count(activity)
    num_activity = sorted(num_activity.items(),reverse=True, key=lambda x:x[1])
    return num_activity

# next_learning이라는 리스트에 우리의 target_activity 다음에 오는 activity들을 추가함
def find_next_learning(student_list, target_activity, student_learning):
    next_learning = []
    for student in student_list:
        first_activity_index = [i for i in range(len(student_learning[student])) if target_activity == student_learning[student][i]]
        for index in first_activity_index:
            if index+1 >= len(student_learning[student]):
                pass
            else:
                next_learning.append(student_learning[student][index+1])
    return next_learning

# MultiIndex 컬럼을 평탄화 하는 함수
def flat_cols(df):
    df.columns = [' / '.join(x) for x in df.columns.to_flat_index()]
    return df

courses = pd.read_csv('data/archive/courses.csv')
vle = pd.read_csv('data/archive/vle.csv')
studentVle = pd.read_csv('data/archive/studentVle.csv')
studentRegistration = pd.read_csv('data/archive/studentRegistration.csv')
studentAssessment = pd.read_csv('data/archive/studentAssessment.csv')
studentInfo = pd.read_csv('data/archive/studentInfo.csv')
assessments = pd.read_csv('data/archive/assessments.csv')


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Completion == 1, Dropout == 0
studentInfo['completion_status'] = list(map(lambda x: 1 if (x == 'Pass') or (x == 'Distinction') else 0, studentInfo['final_result']))
studentInfo['course_name'] = studentInfo['code_module'] + '_' + studentInfo['code_presentation']
vle['course_name'] = vle['code_module'] + '_' + vle['code_presentation']
studentVle['course_name'] = studentVle['code_module'] + '_' + studentVle['code_presentation']
studentVle['graph_name'] = studentVle['code_module'] + '_' + studentVle['code_presentation'] + '_' + studentVle['id_student'].astype(str)

# 첫번째 인터랙션 날을 기준으로 이후 30일간으로 맞추기 위해 첫번째 인터랙션 날을 기준으로 뺄셈
student_first_date = studentVle[['graph_name', 'date']].groupby('graph_name').agg(['min']).pipe(flat_cols)
studentVle = pd.merge(studentVle, student_first_date, how='left', on='graph_name')
studentVle['new_date'] = studentVle['date'] - studentVle['date / min']

course_learning_sequence = {}
learning_sequence = []

studentVle = studentVle[studentVle['new_date'] < 30]

for course in tqdm(list(set(vle['course_name']))):
    target_course = studentVle[studentVle['course_name'] == course]

    target_course_student = set(target_course['id_student'].to_list())
    target_course_activity = set(target_course['id_site'].to_list())

    student_learning = {}
    first_learning = []

    # 학생들 별로 learning sequence 담기
    for student in target_course_student:
        learing_list = target_course[target_course['id_student'] == student]['id_site'].to_list()
        student_learning[student] = learing_list
        first_learning.append(learing_list[0])

    # 첫번째 시작 activity 계산
    num_activity = count_activity(first_learning)
    target_activity = num_activity[0][0]
    learning_sequence.append(target_activity)

    for i in range(len(target_course_activity)):
        bf_ls_len = len(learning_sequence)
        next_learning = find_next_learning(target_course_student, target_activity, student_learning)
        num_next_learning = count_activity(next_learning)
        learning_sequence, target_activity = add_learning_sequence(num_next_learning, learning_sequence)
        if bf_ls_len == len(learning_sequence):
            break
    
    course_learning_sequence[course] = learning_sequence
    
# Graph build 순서
# course node 만들기 -> activity node 만들기 -> activity node와 course node 사이에 edge 생성
# activity node 사이에 edge만들기
# student node만들기 -> activity node와 edge 생성 -> course node와 edge 생성
studentVle = studentVle[studentVle['date'] >= 0]
group_studentvle = studentVle.groupby(['id_student', 'id_site']).sum().reset_index()

node_set = {}
index = 0
course_node = list(set(vle['course_name']))
activity_node = list(set(vle['id_site']))
student_node = list(set(studentInfo['id_student']))
for i in course_node + activity_node + student_node:
    node_set[i] = index
    index += 1
    
edge_feature = []
# course node, activity node 만들기 -> activity node와 course node 사이에 edge 생성 가중치 1
src_node, dst_node = [], []
for i in range(len(vle)):
    #src_node.append(node_set[vle['course_name'].to_numpy()[i]])
    #dst_node.append(node_set[vle['id_site'].to_numpy()[i]])
    src_node.append(node_set[vle['course_name'][i]])
    dst_node.append(node_set[vle['id_site'][i]])
    edge_feature.append(1)
    
    
# student node만들기 -> activity node와 edge 생성 click_sum z-score
for i in range(len(group_studentvle)):
    src_node.append(node_set[group_studentvle['id_student'][i]])
    dst_node.append(node_set[group_studentvle['id_site'][i]])
    edge_feature.append(group_studentvle['sum_click'][i])
    #edge_feature.append(group_studentvle[(group_studentvle['id_student'] == studentVle['id_student'][i]) & (group_studentvle['id_site'] == studentVle['id_site'][i])]['sum_click'].values[0])
    
# course node와 edge 생성 completion한 사람만 edge 생성 가중치 1
for i in range(len(studentInfo)):
    if studentInfo['completion_status'][i] == 'Completion':
        src_node.append(node_set[studentInfo['id_student'][i]])
        dst_node.append(node_set[studentInfo['course_name'][i]])
        edge_feature.append(1)
        
# activity 사이에 edge 생성
for course in list(set(vle['course_name'])):
    for i in range(len(course_learning_sequence[course])):
        src_node.append(node_set[course])
        dst_node.append(node_set[course_learning_sequence[course][i]])
        edge_feature.append(1)

graph_src_node = src_node + dst_node
graph_dst_node = dst_node + src_node
g = dgl.graph((graph_src_node, graph_dst_node))

g.edata['edge_feature'] = torch.FloatTensor(edge_feature + edge_feature)

node_feature = []

# make one-hot vector using activity_type
activity_onehot_list = list(vle['activity_type'].unique())
activity_zero_list = [0 for i in range(len(activity_onehot_list))]
activity_to_index = {n : index for index, n in enumerate(activity_onehot_list)}
activity_type = []
for activity_id in activity_node:
    activity_type.append(vle[vle['id_site'] == activity_id]['activity_type'].values[0])
    
# activity one-hot
activity_node_feature = []
for activity_type in activity_type:
    activity_node_feature.append(one_hot_encoding(activity_type, activity_to_index))
    
# date one-hot
date_feature = []
for student in student_node:
    base_date_feature = [0]*30
    student_date = list(set(studentVle[studentVle['id_student'] == student]['new_date']))
    for date in student_date:
        base_date_feature[date] = 1
    date_feature.append(base_date_feature)
base_date_feature = [0]*30

for i in range(len(course_node)):
    node_feature.append([0,0,0] + activity_zero_list + base_date_feature)

for i in range(len(activity_node)):
    node_feature.append([1,0,0] + activity_node_feature[i] + base_date_feature)
    
for i in range(len(student_node)):
    node_feature.append([0,1,0] + activity_zero_list + date_feature[i])
    
g.ndata['feature'] = torch.FloatTensor(node_feature)

100%|██████████| 22/22 [00:18<00:00,  1.18it/s]
  group_studentvle = studentVle.groupby(['id_student', 'id_site']).sum().reset_index()


In [3]:


course_student = {}
pos_student = studentInfo[studentInfo['completion_status'] == 1] # Completion 15385
neg_student = studentInfo[studentInfo['completion_status'] == 0] # Dropout 17208
train_pos, train_neg, test_pos, test_neg = {'src':[], 'dst':[]}, {'src':[], 'dst':[]}, {'src':[], 'dst':[]}, {'src':[], 'dst':[]}
for course in course_node:
    pos_src_list = []
    pos_dst_list = []
    
    neg_src_list = []
    neg_dst_list = []
    
    for i in pos_student[pos_student['course_name'] == course]['id_student'].tolist():
        pos_src_list.append(node_set[i])
        pos_dst_list.append(node_set[course])
        
    for i in neg_student[neg_student['course_name'] == course]['id_student'].tolist():
        neg_src_list.append(node_set[i])
        neg_dst_list.append(node_set[course])
    
    train_pos_src, test_pos_src, train_pos_dst, test_pos_dst = train_test_split(pos_src_list, pos_dst_list, test_size=0.2, train_size=0.8, random_state=32)
    train_neg_src, test_neg_src, train_neg_dst, test_neg_dst = train_test_split(neg_src_list, neg_dst_list, test_size=0.2, train_size=0.8, random_state=32)

    train_pos['src'] = train_pos['src'] + train_pos_src
    train_pos['dst'] = train_pos['dst'] + train_pos_dst
    train_neg['src'] = train_neg['src'] + train_neg_src
    train_neg['dst'] = train_neg['dst'] + train_neg_dst
    test_pos['src'] = test_pos['src'] + test_pos_src
    test_pos['dst'] = test_pos['dst'] + test_pos_dst
    test_neg['src'] = test_neg['src'] + test_neg_src
    test_neg['dst'] = test_neg['dst'] + test_neg_dst
    
num_node = len(course_node + activity_node + student_node)
train_pos_g = dgl.graph((train_pos['src'], train_pos['dst']), num_nodes=num_node)
train_neg_g = dgl.graph((train_neg['src'], train_neg['dst']), num_nodes=num_node)
test_pos_g = dgl.graph((test_pos['src'], test_pos['dst']), num_nodes=num_node)
test_neg_g = dgl.graph((test_neg['src'], test_neg['dst']), num_nodes=num_node)

In [21]:
import dgl.function as fn
from dgl.nn import SAGEConv, GraphConv
import torch.nn as nn
from sklearn.metrics import classification_report, roc_auc_score
import torch.nn.functional as F
import itertools
import pandas as pd
import numpy as np
import dgl
from dgl.data import DGLDataset
import torch
from tqdm import tqdm
import os
from dgl import save_graphs, load_graphs
from sklearn.model_selection import train_test_split

# ----------- 2. create model -------------- #
# build a two-layer GraphSAGE model
class GraphSAGE(nn.Module):
    def __init__(self, in_feats, h_feats):
        super(GraphSAGE, self).__init__()
        self.conv1 = SAGEConv(in_feats, h_feats, 'mean')
        self.conv2 = SAGEConv(h_feats, h_feats, 'mean')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

class GCN(nn.Module):
    def __init__(self, num_layers, in_feats, h_feats, num_classes):
        super(GCN, self).__init__()
        self.convlayers = nn.ModuleList()
        for layer in range(num_layers):
            if layer == 0:
                self.convlayers.append(
                    GraphConv(in_feats, h_feats)
                )
            else:
                self.convlayers.append(
                    GraphConv(h_feats, h_feats)
                )
            # Initialize the weights with xavier_uniform
            nn.init.xavier_uniform_(self.convlayers[-1].weight)
        #self.conv_out = GraphConv(h_feats, num_classes)
        
        #self.mlp = MLP(h_feats, 8, num_classes)
        #nn.init.xavier_uniform_(self.mlp.weight)
        

    def forward(self, g):
        h = g.ndata['feature']
        e = g.edata['edge_feature']
    
        for i, layer in enumerate(self.convlayers):
            h = layer(g, h, edge_weight=e)
            h = F.relu(h)
            
        # h = self.conv_out(g, h, edge_weight=e)
        # g.ndata["h"] = h
        # return dgl.mean_nodes(g, "h")
    
        
        #last_node = g.num_nodes() - 1  # index of last node
        #date_node = g.ndata['node_type'].tolist().count([0,0,1])
        #h = self.mlp(h[last_node])
        #return h
    
        # h = self.conv_out(g, h, edge_weight=e)
        # date_node = g.ndata['node_type'].tolist().count([0,0,1])
        # return h[date_node]
        
        #date_node = g.ndata['node_type'].tolist().count([0,0,1])
        #h = self.mlp(h[-date_node:].sum(dim=0)/date_node)
        return h
    
class MLPPredictor(nn.Module):
    def __init__(self, h_feats):
        super().__init__()
        self.W1 = nn.Linear(h_feats * 2, h_feats)
        self.W2 = nn.Linear(h_feats, 1)
        
    def apply_edges(self, edges):
        h = torch.cat([edges.src['h'], edges.dst['h']], 1)
        return {'score': self.W2(F.relu(self.W1(h))).squeeze(1)}
        
    def forward(self, g, h):
        with g.local_scope():
            g.ndata['h'] = h
            g.apply_edges(self.apply_edges)
            return g.edata['score']
        
def compute_loss(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score])
    labels = torch.cat([torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])])
    return F.binary_cross_entropy_with_logits(scores, labels)

def compute_auc(pos_score, neg_score):
    scores = torch.cat([pos_score, neg_score]).numpy()
    labels = torch.cat(
        [torch.ones(pos_score.shape[0]), torch.zeros(neg_score.shape[0])]).numpy()
    return roc_auc_score(labels, scores)

In [35]:
#model = GraphSAGE(g.ndata['feature'].shape[1], 16)

model = GCN(2, g.ndata['feature'].shape[1], 16, 16)

# You can replace DotPredictor with MLPPredictor.
pred = MLPPredictor(16)
#pred = DotPredictor()

# ----------- 3. set up loss and optimizer -------------- #
# in this case, loss will in training loop
optimizer = torch.optim.Adam(itertools.chain(model.parameters(), pred.parameters()), lr=0.001)

# ----------- 4. training -------------------------------- #
all_logits = []
for e in range(500):
    # forward
    h = model(dgl.add_self_loop(g))
    
    pos_score = pred(train_pos_g, h)
    neg_score = pred(train_neg_g, h)
    loss = compute_loss(pos_score, neg_score)
    #print(loss)

    # backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if e % 5 == 0:
        print('In epoch {}, loss: {}'.format(e, loss))

In epoch 0, loss: 3.273742437362671
In epoch 5, loss: 1.6731860637664795
In epoch 10, loss: 0.8603308200836182
In epoch 15, loss: 0.8703266978263855
In epoch 20, loss: 0.7849550247192383
In epoch 25, loss: 0.688713788986206
In epoch 30, loss: 0.6854637265205383
In epoch 35, loss: 0.656919002532959
In epoch 40, loss: 0.6564443707466125
In epoch 45, loss: 0.6490513682365417
In epoch 50, loss: 0.6424835920333862
In epoch 55, loss: 0.6366294622421265
In epoch 60, loss: 0.6315904855728149
In epoch 65, loss: 0.6281233429908752
In epoch 70, loss: 0.6254758834838867
In epoch 75, loss: 0.6237497329711914
In epoch 80, loss: 0.6223864555358887
In epoch 85, loss: 0.6209548115730286
In epoch 90, loss: 0.6199554800987244
In epoch 95, loss: 0.61895352602005
In epoch 100, loss: 0.6179776191711426
In epoch 105, loss: 0.6171786785125732
In epoch 110, loss: 0.6164430379867554
In epoch 115, loss: 0.6157175302505493
In epoch 120, loss: 0.6150180697441101
In epoch 125, loss: 0.6143000721931458
In epoch 130,

In [44]:
# ----------- 5. check results ------------------------ #
from sklearn.metrics import roc_auc_score
with torch.no_grad():
    pos_score = pred(test_pos_g, h)
    neg_score = pred(test_neg_g, h)
    
    pass_accuracy, fail_accuracy = 0, 0
    
    predicted_pass = []
    for i in pos_score.tolist():
        if i > 0.5:
            predicted_pass.append(1)
            pass_accuracy += 1
        else:
            predicted_pass.append(0)
            
    predicted_fail = []
    for i in neg_score.tolist():
        if i > 0.5:
            predicted_fail.append(1)
        else:
            predicted_fail.append(0)
            fail_accuracy += 1
    
    print('AUC', compute_auc(pos_score, neg_score))
    print('AUC', compute_auc(torch.tensor(predicted_pass), torch.tensor(predicted_fail)))
    print('accuracy', (pass_accuracy+fail_accuracy)/(len(pos_score)+len(neg_score)))
    print('pass accuracy', pass_accuracy/len(pos_score))
    print('fail accuracy', fail_accuracy/len(neg_score))

AUC 0.7265529716879391
AUC 0.5927757229328681
accuracy 0.6089684726048362
pass accuracy 0.3021069692058347
fail accuracy 0.8834444766599014
