In [1]:
!git branch

  balance_theory[m
  lab_balance_theory[m
  lab_balance_theory2[m
* [32mlab_rgcn[m
  lab_server[m
  master[m


In [2]:
import matplotlib.pyplot as plt
import copy as cp
%matplotlib inline

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl import DGLGraph
import dgl.function as fn
from functools import partial

class RGCNLayer(nn.Module):
    def __init__(self, in_feat, out_feat, num_rels, num_bases=-1, bias=None,
                 activation=None, is_input_layer=False):
        super(RGCNLayer, self).__init__()
        self.in_feat = in_feat
        self.out_feat = out_feat
        self.num_rels = num_rels
        self.num_bases = num_bases
        self.bias = bias
        self.activation = activation
        self.is_input_layer = is_input_layer

        # sanity check
        if self.num_bases <= 0 or self.num_bases > self.num_rels:
            self.num_bases = self.num_rels

        # weight bases in equation (3)
        self.weight = nn.Parameter(torch.Tensor(self.num_bases, self.in_feat,
                                                self.out_feat))
        if self.num_bases < self.num_rels:
            # linear combination coefficients in equation (3)
            self.w_comp = nn.Parameter(torch.Tensor(self.num_rels, self.num_bases))

        # add bias
        if self.bias:
            self.bias = nn.Parameter(torch.Tensor(out_feat))

        # init trainable parameters
        nn.init.xavier_uniform_(self.weight,
                                gain=nn.init.calculate_gain('relu'))
        if self.num_bases < self.num_rels:
            nn.init.xavier_uniform_(self.w_comp,
                                    gain=nn.init.calculate_gain('relu'))
        if self.bias:
            nn.init.xavier_uniform_(self.bias,
                                    gain=nn.init.calculate_gain('relu'))

    def forward(self, g):
        if self.num_bases < self.num_rels:
            # generate all weights from bases (equation (3))
            weight = self.weight.view(self.in_feat, self.num_bases, self.out_feat)
            weight = torch.matmul(self.w_comp, weight).view(self.num_rels,
                                                        self.in_feat, self.out_feat)
        else:
            weight = self.weight

        if self.is_input_layer:
            def message_func(edges):
                # for input layer, matrix multiply can be converted to be
                # an embedding lookup using source node id
                # embed = weight.view(-1, self.out_feat)
                # index = edges.data['rel_type'] * self.in_feat + edges.src['id']
                # index = edges.data['rel_type'] * self.in_feat + edges.src['id']
                # return {'msg': embed[index] * edges.data['norm']}
                w = weight[edges.data['rel_type']]
                msg = torch.bmm(edges.src['init_h'].unsqueeze(1), w).squeeze()
                msg = msg * edges.data['norm']
                return {'msg': msg}                
        else:
            def message_func(edges):
                w = weight[edges.data['rel_type']]
                msg = torch.bmm(edges.src['h'].unsqueeze(1), w).squeeze()
                msg = msg * edges.data['norm']
                return {'msg': msg}

        def apply_func(nodes):
            h = nodes.data['h']
            if self.bias:
                h = h + self.bias
            if self.activation:
                h = self.activation(h)
            return {'h': h}

        g.update_all(message_func, fn.sum(msg='msg', out='h'), apply_func)

Define full R-GCN model
~~~~~~~~~~~~~~~~~~~~~~~



In [4]:
class Model(nn.Module):
    def __init__(self, num_nodes, h_dim, out_dim, num_rels,node_feature_array,
                 num_bases=-1, num_hidden_layers=1):
        super(Model, self).__init__()
        self.num_nodes = num_nodes
        self.h_dim = h_dim
        self.out_dim = out_dim
        self.num_rels = num_rels
        self.node_feature_array = node_feature_array
        self.num_bases = num_bases
        self.num_hidden_layers = num_hidden_layers
        

        # create rgcn layers
        self.build_model()

        # create initial features
        self.features = self.create_features()

    def build_model(self):
        self.layers = nn.ModuleList()
        # input to hidden
        i2h = self.build_input_layer()
        self.layers.append(i2h)
        # hidden to hidden
        for idx in range(self.num_hidden_layers - 1):
            h2h = self.build_hidden_layer(idx)
            self.layers.append(h2h)
        # hidden to output
        h2o = self.build_output_layer()
        self.layers.append(h2o)

    # initialize feature for each node
    def create_features(self):
        # features = torch.arange(self.num_nodes)
        features = torch.from_numpy(self.node_feature_array)
        return features

    def build_input_layer(self):
        return RGCNLayer(self.node_feature_array.shape[1], self.h_dim[0], self.num_rels, self.num_bases,
                         activation=F.relu, is_input_layer=True)

    def build_hidden_layer(self,idx):
        return RGCNLayer(self.h_dim[idx], self.h_dim[idx+1], self.num_rels, self.num_bases,
                         activation=F.relu)

    def build_output_layer(self):
        return RGCNLayer(self.h_dim[-1], self.out_dim, self.num_rels, self.num_bases,
                         activation=partial(F.softmax, dim=1))

    def forward(self, g):
        if self.features is not None:
            # g.ndata['id'] = self.features
            g.ndata['init_h'] = self.features
        for layer in self.layers:
            layer(g)
        return g.ndata.pop('h')

Handle dataset
~~~~~~~~~~~~~~~~
In this tutorial, we use AIFB dataset from R-GCN paper:



In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [6]:
def get_dist(df,col):
    df_cnt = df.groupby([col]+['etype'])['time'].count().unstack(1,fill_value=0)
    df_dist = pd.DataFrame(df_cnt.values / df_cnt.sum(1).values.reshape(-1,1),
                                               columns=df_cnt.columns,
                                               index=df_cnt.index)
    return df_dist

## Amazon Data Loading

In [None]:
# etype = {1,2,3,4,5}
from sklearn.preprocessing import LabelEncoder

amazon_network_ = pd.read_csv('/home2/kudo/SGCN/raw_data/amazon/amazon_network.csv',header=None)
amazon_network_.columns = ['src_raw','dst_raw','etype','time']
amazon_network_inv = amazon_network_[['dst_raw','src_raw','etype','time']]
amazon_network_inv.columns = ['src_raw','dst_raw','etype','time']

amazon_network = amazon_network_.append(amazon_network_inv)

etype_encoder = LabelEncoder()
amazon_network['etype'] = etype_encoder.fit_transform(amazon_network.etype)
amazon_gt = pd.read_csv('/home2/kudo/SGCN/raw_data/amazon/amazon_gt.csv',header=None)
amazon_gt.columns = ['node_id_raw','label']
amazon_gt = amazon_gt.drop_duplicates('node_id_raw')

# edge_normの計算
amazon_src_cnt = amazon_network.groupby(['src_raw','etype'])['time'].count().unstack(1,fill_value=0)

amazon_src_dist = pd.DataFrame(amazon_src_cnt.values/amazon_src_cnt.sum(1).values.reshape(-1,1),
                                                        index=amazon_src_cnt.index,
                                                        columns=amazon_src_cnt.columns)

merged_network = pd.merge(amazon_network,amazon_src_dist.stack().reset_index(),on=['src_raw','etype'])


label_encoder = LabelEncoder()
label_encoder.fit(np.hstack((merged_network.src_raw,
                                                   merged_network.dst_raw,
                                                   amazon_gt.node_id_raw)))

merged_network['src'] = label_encoder.transform(merged_network.src_raw)

merged_network['dst'] = label_encoder.transform(merged_network.dst_raw)

amazon_gt['node_id'] = label_encoder.transform(amazon_gt.node_id_raw)
amazon_gt['label'] = amazon_gt['label'].map(lambda x:1 if x==-1 else 0)

# padding
amazon_gt_padded = pd.merge(pd.DataFrame(np.arange(label_encoder.classes_.shape[0])),amazon_gt,
                                      left_index=True,right_on='node_id',how='left').fillna(0.5).sort_values('node_id')

num_nodes = label_encoder.classes_.shape[0]
num_rels = merged_network.etype.unique().shape[0]
num_classes = amazon_gt.label.unique().shape[0]
labels = amazon_gt_padded['label'].values.astype(int).reshape(-1,1)
all_idx = amazon_gt['node_id'].values

# edge type and normalization factor
edge_type = torch.from_numpy(merged_network['etype'].values)
edge_norm = torch.from_numpy(merged_network[0].values.astype('float32')).unsqueeze(1)

labels = torch.from_numpy(labels).view(-1)

merged_network_directed = merged_network.copy().iloc[:merged_network.shape[0]//2,:]
node_feature_df = pd.concat([get_dist(merged_network_directed,'src'),get_dist(merged_network_directed,'dst')],1).fillna(0).sort_index()
node_feature_array = node_feature_df.values.astype('float32')

known_labels = amazon_gt['label'].values

In [None]:
# etype = {-1,1}
from sklearn.preprocessing import LabelEncoder

amazon_network_ = pd.read_csv('/home2/kudo/SGCN/input/amazon/amazon_network.csv')

amazon_network_.columns = ['src_raw','dst_raw','etype']
amazon_network_inv = amazon_network_[['dst_raw','src_raw','etype']]
amazon_network_inv.columns = ['src_raw','dst_raw','etype']

amazon_network = amazon_network_.append(amazon_network_inv)
amazon_network['time'] = 1

etype_encoder = LabelEncoder()
amazon_network['etype'] = etype_encoder.fit_transform(amazon_network.etype)

amazon_gt = pd.read_csv('/home2/kudo/SGCN/input/amazon/amazon_gt.csv')
amazon_gt.columns = ['node_id_raw','label']
amazon_gt = amazon_gt.drop_duplicates('node_id_raw')

# edge_normの計算
amazon_src_raw_cnt = amazon_network.groupby(['src_raw','etype'])['dst_raw'].count().unstack(1,fill_value=0)

amazon_src_raw_dist = pd.DataFrame(amazon_src_raw_cnt.values/amazon_src_raw_cnt.sum(1).values.reshape(-1,1),
                                                        index=amazon_src_raw_cnt.index,
                                                        columns=amazon_src_raw_cnt.columns)

merged_network = pd.merge(amazon_network,amazon_src_raw_dist.stack().reset_index(),on=['src_raw','etype'])

label_encoder = LabelEncoder()
label_encoder.fit(np.hstack((merged_network.src_raw,
                                                   merged_network.dst_raw,
                                                   amazon_gt.node_id_raw)))

merged_network['src'] = label_encoder.transform(merged_network.src_raw)

merged_network['dst'] = label_encoder.transform(merged_network.dst_raw)

amazon_gt['node_id'] = label_encoder.transform(amazon_gt.node_id_raw)
amazon_gt['label'] = amazon_gt['label'].map(lambda x:1 if x==-1 else 0)

# padding
amazon_gt_padded = pd.merge(pd.DataFrame(np.arange(label_encoder.classes_.shape[0])),amazon_gt,
                                      left_index=True,right_on='node_id',how='left').fillna(0.5).sort_values('node_id')

num_nodes = label_encoder.classes_.shape[0]
num_rels = merged_network.etype.unique().shape[0]
num_classes = amazon_gt.label.unique().shape[0]
labels = amazon_gt_padded['label'].values.astype(int).reshape(-1,1)
all_idx = amazon_gt['node_id'].values

# edge type and normalization factor
edge_type = torch.from_numpy(merged_network['etype'].values)
edge_norm = torch.from_numpy(merged_network[0].values.astype('float32')).unsqueeze(1)

labels = torch.from_numpy(labels).view(-1)

# merged_network_directed = merged_network.copy().iloc[:merged_network.shape[0]//2,:]
# node_feature_df = pd.concat([get_dist(merged_network_directed,'src'),get_dist(merged_network_directed,'dst')],1).fillna(0).sort_index()
# node_feature_array = node_feature_df.values.astype('float32')
node_feature_df = pd.read_csv('/home2/kudo/SGCN/input/amazon/amazon_node_feature.csv')
node_feature_array = node_feature_df.values.astype('float32')

known_labels = amazon_gt['label'].values

## alpha ,otc and epinions Data Loading

In [None]:
from sklearn.preprocessing import LabelEncoder

epinions_network = pd.read_csv('/home2/kudo/SGCN/raw_data/epinions/epinions_network.csv',header=None)
epinions_network.columns = ['src_raw','dst_raw','etype','time']
etype_encoder = LabelEncoder()
epinions_network['etype'] = etype_encoder.fit_transform(epinions_network.etype)
epinions_gt = pd.read_csv('/home2/kudo/SGCN/raw_data/epinions/epinions_gt.csv',header=None)
epinions_gt.columns = ['node_id_raw','label']
epinions_gt = epinions_gt.drop_duplicates('node_id_raw')
epinions_gt = epinions_gt.loc[epinions_gt.node_id_raw.isin(set(epinions_network.src_raw)|set(epinions_network.dst_raw))]

# edge_normの計算
epinions_src_cnt = epinions_network.groupby(['src_raw','etype'])['time'].count().unstack(1,fill_value=0)

epinions_src_dist = pd.DataFrame(epinions_src_cnt.values/epinions_src_cnt.sum(1).values.reshape(-1,1),
                                                        index=epinions_src_cnt.index,
                                                        columns=epinions_src_cnt.columns)

merged_network = pd.merge(epinions_network,epinions_src_dist.stack().reset_index(),on=['src_raw','etype'])


label_encoder = LabelEncoder()
label_encoder.fit(np.hstack((merged_network.src_raw,
                                                   merged_network.dst_raw,
                                                   epinions_gt.node_id_raw)))

merged_network['src'] = label_encoder.transform(merged_network.src_raw)

merged_network['dst'] = label_encoder.transform(merged_network.dst_raw)

epinions_gt['node_id'] = label_encoder.transform(epinions_gt.node_id_raw)
epinions_gt['label'] = epinions_gt['label'].map(lambda x:1 if x==-1 else 0)

# padding
epinions_gt_padded = pd.merge(pd.DataFrame(np.arange(label_encoder.classes_.shape[0])),epinions_gt,
                                      left_index=True,right_on='node_id',how='left').fillna(0.5).sort_values('node_id')

num_nodes = label_encoder.classes_.shape[0]
num_rels = merged_network.etype.unique().shape[0]
num_classes = epinions_gt.label.unique().shape[0]
labels = epinions_gt_padded['label'].values.astype(int).reshape(-1,1)
all_idx = epinions_gt['node_id'].values

# edge type and normalization factor
edge_type = torch.from_numpy(merged_network['etype'].values)
edge_norm = torch.from_numpy(merged_network[0].values.astype('float32')).unsqueeze(1)

labels = torch.from_numpy(labels).view(-1)

node_feature_df = pd.concat([get_dist(merged_network,'src'),get_dist(merged_network,'dst')],1).fillna(0).sort_index()
node_feature_array = node_feature_df.values.astype('float32')

known_labels = epinions_gt['label'].values

In [7]:
from sklearn.preprocessing import LabelEncoder
#etype = {-1,1}

epinions_network = pd.read_csv('/home2/kudo/SGCN/input/epinions/epinions_network.csv')
epinions_network.columns = ['src_raw','dst_raw','etype']
etype_encoder = LabelEncoder()
epinions_network['etype'] = etype_encoder.fit_transform(epinions_network.etype)
epinions_gt = pd.read_csv('/home2/kudo/SGCN/input/epinions/epinions_gt.csv')
epinions_gt.columns = ['node_id_raw','label']
epinions_gt = epinions_gt.drop_duplicates('node_id_raw')
epinions_network['time'] = 1

# edge_normの計算
epinions_src_cnt = epinions_network.groupby(['src_raw','etype'])['time'].count().unstack(1,fill_value=0)

epinions_src_dist = pd.DataFrame(epinions_src_cnt.values/epinions_src_cnt.sum(1).values.reshape(-1,1),
                                                        index=epinions_src_cnt.index,
                                                        columns=epinions_src_cnt.columns)

merged_network = pd.merge(epinions_network,epinions_src_dist.stack().reset_index(),on=['src_raw','etype'])


label_encoder = LabelEncoder()
label_encoder.fit(np.hstack((merged_network.src_raw,
                                                   merged_network.dst_raw,
                                                   epinions_gt.node_id_raw)))

merged_network['src'] = label_encoder.transform(merged_network.src_raw)

merged_network['dst'] = label_encoder.transform(merged_network.dst_raw)

epinions_gt['node_id'] = label_encoder.transform(epinions_gt.node_id_raw)
epinions_gt['label'] = epinions_gt['label'].map(lambda x:1 if x==-1 else 0)

# padding
epinions_gt_padded = pd.merge(pd.DataFrame(np.arange(label_encoder.classes_.shape[0])),epinions_gt,
                                      left_index=True,right_on='node_id',how='left').fillna(0.5).sort_values('node_id')

num_nodes = label_encoder.classes_.shape[0]
num_rels = merged_network.etype.unique().shape[0]
num_classes = epinions_gt.label.unique().shape[0]
labels = epinions_gt_padded['label'].values.astype(int).reshape(-1,1)
all_idx = epinions_gt['node_id'].values

# edge type and normalization factor
edge_type = torch.from_numpy(merged_network['etype'].values)
edge_norm = torch.from_numpy(merged_network[0].values.astype('float32')).unsqueeze(1)

labels = torch.from_numpy(labels).view(-1)

node_feature_df = pd.read_csv('/home2/kudo/SGCN/input/epinions/epinions_node_feature.csv')
node_feature_array = node_feature_df.values.astype('float32')

known_labels = epinions_gt['label'].values

Create graph and model
~~~~~~~~~~~~~~~~~~~~~~~



In [8]:
# configurations
n_hidden = [32,16] # number of hidden units
n_bases = -1 # -1 # use number of relations as number of bases
n_hidden_layers = 2 # use 1 input layer, 1 output layer, no hidden layer
n_epochs = 100 # epochs to train
lr = 0.025 # learning rate
l2norm = 0.00001 # L2 norm coefficient

## cross validation

In [9]:
from sklearn.model_selection import StratifiedKFold

In [10]:
kf = StratifiedKFold(n_splits=10,shuffle=True,random_state=0)

In [None]:
auc_scores = []

for i, (for_train_val_idx, for_test_idx) in enumerate(kf.split(np.arange(len(all_idx)),y=known_labels)):
    train_val_idx = all_idx[for_train_val_idx]
    train_idx, val_idx = train_test_split(train_val_idx,test_size=0.33,stratify=known_labels[for_train_val_idx])
    test_idx = all_idx[for_test_idx]
    # create graph
    g = DGLGraph()
    g.add_nodes(num_nodes)
    g.add_edges(merged_network['src'].values, merged_network['dst'].values)
    g.edata.update({'rel_type': edge_type, 'norm': edge_norm})
    # create model
    model = Model(len(g),
                  n_hidden,
                  num_classes,
                  num_rels,
                  node_feature_array,
                  num_bases=n_bases,
                  num_hidden_layers=n_hidden_layers)
    # optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2norm)
    print("Start {}-th fold".format(i))
    print("==== Train Phase ====")
    model.train()
    best_auc = 0.0
    best_auc_logits = None
    for epoch in range(n_epochs):
        optimizer.zero_grad()
        logits = model.forward(g)
        loss = F.cross_entropy(logits[train_idx], labels[train_idx])
        loss.backward()

        optimizer.step()

        train_auc = roc_auc_score(y_true=labels[train_idx].detach().numpy(),y_score=logits[train_idx].detach().numpy()[:,1])
        train_loss = F.cross_entropy(logits[train_idx], labels[train_idx])
        val_auc = roc_auc_score(y_true=labels[val_idx].detach().numpy(),y_score=logits[val_idx].detach().numpy()[:,1])
        val_loss = F.cross_entropy(logits[val_idx], labels[val_idx])
        
        if val_auc >= best_auc:
            best_auc = val_auc
            best_auc_logits = logits
            
        print("Epoch {:05d} | ".format(epoch) +
              "Train AUC: {:.4f} | Train Loss: {:.4f} | ".format(
                  train_auc, loss.item()) +
              "Validation AUC: {:.4f} | Validation loss: {:.4f}".format(
                  val_auc, val_loss.item()))
    print("==== Test Phase ====")
    model.eval()
    test_auc = roc_auc_score(y_true=labels[test_idx].detach().numpy(),y_score=best_auc_logits[test_idx].detach().numpy()[:,1])
    auc_scores.append(test_auc)
    print("test auc : {}".format(test_auc))
    print("=================")

In [None]:
np.mean(auc_scores)

結果

- (32,16) , カーネルは独立
    - amazon : 0.749413165 (150 epochs)
    - alpha : 0.926 (150 epochs)
    - otc : 0.9601 (300 epochs)
    
- (32,16) , base = 3
    - amazon : 0.7517083258204111
    - alpha : 0.895
    - otc : 0.95332

結果

- (32,16) , etype = {-1,1}
    - amazon : 0.817841627 (150 epochs)
    - alpha :  0.7928971028971029 (150 epochs)
    - otc :  0.917612942612942 (150 epochs)

結果

- (32,16, 8) , カーネルは独立
    - amazon : (150 epochs)
    - alpha : 0.9206768231768232 (150 epochs)
    - otc :  (300 epochs)
    
- (32,16,8) , base=3
    - amazon : 
    - alpha : 
    - otc : 

## cross-validation for epinions

In [11]:
def subgraph_making(original_network_df,original_gt_df,all_ncount,sample_node_num,train_node_indice_original,neighbor_sampling=True):
    if neighbor_sampling == True:
        first_neighbors = np.unique(original_network_df.loc[(original_network_df.src.isin(train_node_indice_original)) | 
                                                       (original_network_df.dst.isin(train_node_indice_original)),['src','dst']].values)
        sampled_node_indice = np.random.choice(first_neighbors,sample_node_num,replace=False)
    else: 
        sampled_node_indice = set(np.random.choice(np.arange(all_ncount), sample_node_num,replace=False)) | set(train_node_indice_original)

    sub_network_df = \
        original_network_df.loc[(original_network_df.src.isin(sampled_node_indice)) & (original_network_df.dst.isin(sampled_node_indice))]

    sub_gt_df = \
        original_gt_df.copy().loc[(original_gt_df.node_id.isin(sub_network_df.src)) & (original_gt_df.node_id.isin(sub_network_df.dst))]

    from sklearn.preprocessing import LabelEncoder

    subGraph_map_encoder = LabelEncoder()

    subGraph_map_encoder.fit(list(set(sub_network_df.src) | set(sub_network_df.dst) | set(sub_gt_df.node_id)))

    sub_gt_df['node_id'] = subGraph_map_encoder.transform(sub_gt_df.node_id)

    sub_network_df['src'] = subGraph_map_encoder.transform(sub_network_df.src)

    sub_network_df['dst'] = subGraph_map_encoder.transform(sub_network_df.dst)
    
    sub_network_df = add_edge_norm(sub_network_df)

    return sub_network_df, sub_gt_df, subGraph_map_encoder

In [12]:
def subgraph_making_from_edges(original_network_df,original_gt_df,all_ncount,sample_node_num,train_node_indice_original,neighbor_sampling):
    if neighbor_sampling == 'first':
        first_neighbors = np.unique(original_network_df.loc[(original_network_df.src.isin(train_node_indice_original)) | 
                                                       (original_network_df.dst.isin(train_node_indice_original)),['src','dst']].values)
        sampled_node_indice = np.random.choice(first_neighbors,sample_node_num,replace=False)
    elif neighbor_sampling == 'first_second':
        first_neighbors = np.unique(original_network_df.loc[(original_network_df.src.isin(train_node_indice_original)) | 
                                                       (original_network_df.dst.isin(train_node_indice_original)),['src','dst']].values)
        second_neighbors = np.unique(original_network_df.loc[(original_network_df.src.isin(first_neighbors)) | 
                                                       (original_network_df.dst.isin(first_neighbors)),['src','dst']].values)
        sampled_node_indice = np.random.choice(list(set(first_neighbors) | set (second_neighbors)),sample_node_num)
        
    else: 
        sampled_node_indice = set(np.random.choice(np.arange(all_ncount), sample_node_num,replace=False)) | set(train_node_indice_original)

    sub_network_df = \
        original_network_df.loc[(original_network_df.src.isin(sampled_node_indice)) & (original_network_df.dst.isin(sampled_node_indice))]

    sub_gt_df = \
        original_gt_df.copy().loc[(original_gt_df.node_id.isin(sub_network_df.src)) & (original_gt_df.node_id.isin(sub_network_df.dst))]

    from sklearn.preprocessing import LabelEncoder

    subGraph_map_encoder = LabelEncoder()

    subGraph_map_encoder.fit(list(set(sub_network_df.src) | set(sub_network_df.dst) | set(sub_gt_df.node_id)))

    sub_gt_df['node_id'] = subGraph_map_encoder.transform(sub_gt_df.node_id)

    sub_network_df['src'] = subGraph_map_encoder.transform(sub_network_df.src)

    sub_network_df['dst'] = subGraph_map_encoder.transform(sub_network_df.dst)
    
    sub_network_df = add_edge_norm(sub_network_df)

    return sub_network_df, sub_gt_df, subGraph_map_encoder

In [13]:
def add_edge_norm(network_df,src='src'):
    network_df_ = network_df.copy()
    src_cnt = network_df_.groupby([src,'etype'])['time'].count().unstack(1,fill_value=0)
    src_dist = pd.DataFrame(src_cnt.values/src_cnt.sum(1).values.reshape(-1,1),
                                                        index=src_cnt.index,
                                                        columns=src_cnt.columns)
    src_dist_stacked = src_dist.stack()
    src_dist_stacked.name = 'norm'
    merged_network = pd.merge(network_df_,src_dist_stacked.reset_index(),on=['src','etype'])
    return merged_network

In [14]:
from sklearn.model_selection import StratifiedKFold

In [15]:
kf = StratifiedKFold(n_splits=10,shuffle=True,random_state=0)

In [16]:
# create graph
g = DGLGraph()
g.add_nodes(num_nodes)
g.add_edges(merged_network['src'].values, merged_network['dst'].values)
g.edata.update({'rel_type': edge_type, 'norm': edge_norm})

In [17]:
auc_scores = []

for i, (for_train_val_idx, for_test_idx) in enumerate(kf.split(np.arange(len(all_idx)),y=known_labels)):
    # 元々のグラフにおけるインデックス
    train_val_idx = all_idx[for_train_val_idx]
    train_idx, val_idx = train_test_split(train_val_idx,test_size=0.33,stratify=known_labels[for_train_val_idx])
    test_idx = all_idx[for_test_idx]
    
    # making subgrap
    sub_network_df , sub_gt_df, subGraph_map_encoder =  subgraph_making(merged_network,epinions_gt_padded,num_nodes,20000,
                                                                        train_val_idx)
#     sub_network_df , sub_gt_df, subGraph_map_encoder =  subgraph_making_from_edges(merged_network,epinions_gt_padded,num_nodes,20000,
#                                                                         np.random.choice(train_val_idx,10),'first_second')

    sub_num_nodes = subGraph_map_encoder.classes_.shape[0]
    sub_num_rels = num_rels
    sub_node_feature_array =  node_feature_array[subGraph_map_encoder.classes_]
    sub_edge_type = torch.from_numpy(sub_network_df['etype'].values)
    sub_edge_norm = torch.from_numpy(sub_network_df['norm'].values.astype('float32')).unsqueeze(1)

    # サブグラフ上のtrainとvalのインデックス
    sub_train_idx = subGraph_map_encoder.transform([idx for idx in train_idx if idx in subGraph_map_encoder.classes_])
    sub_val_idx = subGraph_map_encoder.transform([idx for idx in val_idx if idx in subGraph_map_encoder.classes_])
    # create graph
    subg = DGLGraph()
    subg.add_nodes(sub_num_nodes)
    subg.add_edges(sub_network_df['src'].values, sub_network_df['dst'].values)
    subg.edata.update({'rel_type': sub_edge_type, 'norm': sub_edge_norm})
    print("{}".format(np.unique(labels[subGraph_map_encoder.inverse_transform(sub_train_idx)].numpy(),return_counts=True)))
    # create model
    model = Model(len(subg),
                  n_hidden,
                  num_classes,
                  sub_num_rels,
                  sub_node_feature_array,
                  num_bases=n_bases,
                  num_hidden_layers=n_hidden_layers)
    # optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=l2norm)
    print("Start {}-th fold".format(i))
    print("==== Train Phase ====")
    model.train()
    best_auc = 0.0
    best_model = None
    for epoch in range(n_epochs):
        optimizer.zero_grad()
        logits = model.forward(subg)
        loss = F.cross_entropy(logits[sub_train_idx], labels[subGraph_map_encoder.inverse_transform(sub_train_idx)])
        loss.backward()

        optimizer.step()

        train_auc = roc_auc_score(y_true=labels[subGraph_map_encoder.inverse_transform(sub_train_idx)].detach().numpy(),
                                  y_score=logits[sub_train_idx].detach().numpy()[:,1])
        train_loss = F.cross_entropy(logits[sub_train_idx], labels[subGraph_map_encoder.inverse_transform(sub_train_idx)])
        val_auc = roc_auc_score(y_true=labels[subGraph_map_encoder.inverse_transform(sub_val_idx)].detach().numpy(),
                                y_score=logits[sub_val_idx].detach().numpy()[:,1])
        val_loss = F.cross_entropy(logits[sub_val_idx], labels[subGraph_map_encoder.inverse_transform(sub_val_idx)])
        
        if val_auc >= best_auc:
            best_auc = val_auc
            best_model = cp.deepcopy(model)
            
        print("Epoch {:05d} | ".format(epoch) +
              "Train AUC: {:.4f} | Train Loss: {:.4f} | ".format(
                  train_auc, loss.item()) +
              "Validation AUC: {:.4f} | Validation loss: {:.4f}".format(
                  val_auc, val_loss.item()))
    del logits
    print("==== Test Phase ====")
    best_model.eval()
    best_model.features = torch.from_numpy(node_feature_array)
    all_logits = best_model.forward(g)
    test_auc = roc_auc_score(y_true=labels[test_idx].detach().numpy(),y_score=all_logits[test_idx].detach().numpy()[:,1])
    auc_scores.append(test_auc)
    print("test auc : {}".format(test_auc))
    del best_model
    del all_logits
    print("=================")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(array([0, 1]), array([630,  63]))
Start 0-th fold
==== Train Phase ====
Epoch 00000 | Train AUC: 0.5198 | Train Loss: 0.4038 | Validation AUC: 0.4936 | Validation loss: 0.4177
Epoch 00001 | Train AUC: 0.5192 | Train Loss: 0.4033 | Validation AUC: 0.4936 | Validation loss: 0.4178
Epoch 00002 | Train AUC: 0.5177 | Train Loss: 0.4032 | Validation AUC: 0.5013 | Validation loss: 0.4180
Epoch 00003 | Train AUC: 0.5199 | Train Loss: 0.4031 | Validation AUC: 0.5077 | Validation loss: 0.4177
Epoch 00004 | Train AUC: 0.5192 | Train Loss: 0.4031 | Validation AUC: 0.5077 | Validation loss: 0.4177
Epoch 00005 | Train AUC: 0.5192 | Train Loss: 0.4031 | Validation AUC: 0.5077 | Validation loss: 0.4177
Epoch 00006 | Train AUC: 0.5192 | Train Loss: 0.4031 | Validation AUC: 0.5077 | Validation loss: 0.4177
Epoch 00007 | Train AUC: 0.5192 | Train Loss: 0.4031 | Validation AUC: 0.5061 | Validation loss: 0.4177
Epoch 00008 | Train AUC: 0.5272 | Train Loss: 0.4017 | Validation AUC: 0.5061 | Validation loss

Epoch 00079 | Train AUC: 0.5207 | Train Loss: 0.4031 | Validation AUC: 0.4936 | Validation loss: 0.4177
Epoch 00080 | Train AUC: 0.5207 | Train Loss: 0.4031 | Validation AUC: 0.4936 | Validation loss: 0.4177
Epoch 00081 | Train AUC: 0.5207 | Train Loss: 0.4031 | Validation AUC: 0.4936 | Validation loss: 0.4177
Epoch 00082 | Train AUC: 0.5207 | Train Loss: 0.4031 | Validation AUC: 0.4936 | Validation loss: 0.4177
Epoch 00083 | Train AUC: 0.5207 | Train Loss: 0.4031 | Validation AUC: 0.4936 | Validation loss: 0.4177
Epoch 00084 | Train AUC: 0.5207 | Train Loss: 0.4031 | Validation AUC: 0.4936 | Validation loss: 0.4177
Epoch 00085 | Train AUC: 0.5207 | Train Loss: 0.4031 | Validation AUC: 0.4936 | Validation loss: 0.4177
Epoch 00086 | Train AUC: 0.5207 | Train Loss: 0.4031 | Validation AUC: 0.4936 | Validation loss: 0.4177
Epoch 00087 | Train AUC: 0.5207 | Train Loss: 0.4031 | Validation AUC: 0.4936 | Validation loss: 0.4177
Epoch 00088 | Train AUC: 0.5207 | Train Loss: 0.4031 | Validatio



test auc : 0.5487349778621126


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(array([0, 1]), array([641,  44]))
Start 1-th fold
==== Train Phase ====
Epoch 00000 | Train AUC: 0.1659 | Train Loss: 1.2750 | Validation AUC: 0.1674 | Validation loss: 1.2661
Epoch 00001 | Train AUC: 0.1556 | Train Loss: 1.2750 | Validation AUC: 0.1296 | Validation loss: 1.2742
Epoch 00002 | Train AUC: 0.5474 | Train Loss: 0.3787 | Validation AUC: 0.5172 | Validation loss: 0.4254
Epoch 00003 | Train AUC: 0.5482 | Train Loss: 0.3787 | Validation AUC: 0.5187 | Validation loss: 0.4254
Epoch 00004 | Train AUC: 0.5482 | Train Loss: 0.3787 | Validation AUC: 0.5187 | Validation loss: 0.4254
Epoch 00005 | Train AUC: 0.5475 | Train Loss: 0.3787 | Validation AUC: 0.5172 | Validation loss: 0.4254
Epoch 00006 | Train AUC: 0.5579 | Train Loss: 0.3787 | Validation AUC: 0.5172 | Validation loss: 0.4254
Epoch 00007 | Train AUC: 0.5572 | Train Loss: 0.3787 | Validation AUC: 0.5157 | Validation loss: 0.4255
Epoch 00008 | Train AUC: 0.5574 | Train Loss: 0.3787 | Validation AUC: 0.5280 | Validation loss

Epoch 00079 | Train AUC: 0.8951 | Train Loss: 0.3495 | Validation AUC: 0.8887 | Validation loss: 0.3533
Epoch 00080 | Train AUC: 0.8323 | Train Loss: 0.3480 | Validation AUC: 0.8316 | Validation loss: 0.3584
Epoch 00081 | Train AUC: 0.8331 | Train Loss: 0.3480 | Validation AUC: 0.8322 | Validation loss: 0.3583
Epoch 00082 | Train AUC: 0.8215 | Train Loss: 0.3495 | Validation AUC: 0.7822 | Validation loss: 0.3695
Epoch 00083 | Train AUC: 0.8226 | Train Loss: 0.3480 | Validation AUC: 0.7822 | Validation loss: 0.3695
Epoch 00084 | Train AUC: 0.8229 | Train Loss: 0.3480 | Validation AUC: 0.7822 | Validation loss: 0.3695
Epoch 00085 | Train AUC: 0.8240 | Train Loss: 0.3465 | Validation AUC: 0.7822 | Validation loss: 0.3695
Epoch 00086 | Train AUC: 0.8240 | Train Loss: 0.3465 | Validation AUC: 0.7822 | Validation loss: 0.3695
Epoch 00087 | Train AUC: 0.8240 | Train Loss: 0.3465 | Validation AUC: 0.7822 | Validation loss: 0.3695
Epoch 00088 | Train AUC: 0.8240 | Train Loss: 0.3465 | Validatio

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(array([0, 1]), array([667,  53]))
Start 2-th fold
==== Train Phase ====
Epoch 00000 | Train AUC: 0.8619 | Train Loss: 0.3767 | Validation AUC: 0.9057 | Validation loss: 0.3720
Epoch 00001 | Train AUC: 0.8094 | Train Loss: 0.3601 | Validation AUC: 0.8678 | Validation loss: 0.3570
Epoch 00002 | Train AUC: 0.7997 | Train Loss: 0.3615 | Validation AUC: 0.8678 | Validation loss: 0.3570
Epoch 00003 | Train AUC: 0.7997 | Train Loss: 0.3615 | Validation AUC: 0.8678 | Validation loss: 0.3570
Epoch 00004 | Train AUC: 0.7997 | Train Loss: 0.3615 | Validation AUC: 0.8678 | Validation loss: 0.3571
Epoch 00005 | Train AUC: 0.8182 | Train Loss: 0.3601 | Validation AUC: 0.8670 | Validation loss: 0.3571
Epoch 00006 | Train AUC: 0.8246 | Train Loss: 0.3628 | Validation AUC: 0.8656 | Validation loss: 0.3601
Epoch 00007 | Train AUC: 0.7816 | Train Loss: 0.3615 | Validation AUC: 0.8146 | Validation loss: 0.3661
Epoch 00008 | Train AUC: 0.7827 | Train Loss: 0.3601 | Validation AUC: 0.8146 | Validation loss

Epoch 00079 | Train AUC: 0.8552 | Train Loss: 0.3740 | Validation AUC: 0.9043 | Validation loss: 0.3745
Epoch 00080 | Train AUC: 0.8552 | Train Loss: 0.3740 | Validation AUC: 0.9043 | Validation loss: 0.3745
Epoch 00081 | Train AUC: 0.8552 | Train Loss: 0.3740 | Validation AUC: 0.9043 | Validation loss: 0.3745
Epoch 00082 | Train AUC: 0.8552 | Train Loss: 0.3740 | Validation AUC: 0.9043 | Validation loss: 0.3745
Epoch 00083 | Train AUC: 0.8552 | Train Loss: 0.3740 | Validation AUC: 0.9043 | Validation loss: 0.3745
Epoch 00084 | Train AUC: 0.8552 | Train Loss: 0.3740 | Validation AUC: 0.9043 | Validation loss: 0.3745
Epoch 00085 | Train AUC: 0.8552 | Train Loss: 0.3740 | Validation AUC: 0.9043 | Validation loss: 0.3745
Epoch 00086 | Train AUC: 0.8552 | Train Loss: 0.3740 | Validation AUC: 0.9043 | Validation loss: 0.3745
Epoch 00087 | Train AUC: 0.8552 | Train Loss: 0.3740 | Validation AUC: 0.9043 | Validation loss: 0.3745
Epoch 00088 | Train AUC: 0.8552 | Train Loss: 0.3740 | Validatio

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(array([0, 1]), array([632,  63]))
Start 3-th fold
==== Train Phase ====
Epoch 00000 | Train AUC: 0.4540 | Train Loss: 1.2241 | Validation AUC: 0.5015 | Validation loss: 1.2183
Epoch 00001 | Train AUC: 0.4461 | Train Loss: 1.2241 | Validation AUC: 0.5015 | Validation loss: 1.2183
Epoch 00002 | Train AUC: 0.4547 | Train Loss: 1.2241 | Validation AUC: 0.5031 | Validation loss: 1.2182
Epoch 00003 | Train AUC: 0.4679 | Train Loss: 1.2123 | Validation AUC: 0.5170 | Validation loss: 1.2055
Epoch 00004 | Train AUC: 0.8446 | Train Loss: 0.3526 | Validation AUC: 0.6979 | Validation loss: 0.3803
Epoch 00005 | Train AUC: 0.8593 | Train Loss: 0.3507 | Validation AUC: 0.6948 | Validation loss: 0.3859
Epoch 00006 | Train AUC: 0.8593 | Train Loss: 0.3507 | Validation AUC: 0.6957 | Validation loss: 0.3859
Epoch 00007 | Train AUC: 0.8593 | Train Loss: 0.3507 | Validation AUC: 0.6957 | Validation loss: 0.3859
Epoch 00008 | Train AUC: 0.8593 | Train Loss: 0.3507 | Validation AUC: 0.6966 | Validation loss

Epoch 00079 | Train AUC: 0.8647 | Train Loss: 0.3536 | Validation AUC: 0.7407 | Validation loss: 0.3775
Epoch 00080 | Train AUC: 0.8647 | Train Loss: 0.3536 | Validation AUC: 0.7407 | Validation loss: 0.3775
Epoch 00081 | Train AUC: 0.8647 | Train Loss: 0.3536 | Validation AUC: 0.7407 | Validation loss: 0.3775
Epoch 00082 | Train AUC: 0.8647 | Train Loss: 0.3536 | Validation AUC: 0.7407 | Validation loss: 0.3775
Epoch 00083 | Train AUC: 0.8647 | Train Loss: 0.3536 | Validation AUC: 0.7407 | Validation loss: 0.3775
Epoch 00084 | Train AUC: 0.8647 | Train Loss: 0.3536 | Validation AUC: 0.7407 | Validation loss: 0.3775
Epoch 00085 | Train AUC: 0.8647 | Train Loss: 0.3536 | Validation AUC: 0.7407 | Validation loss: 0.3775
Epoch 00086 | Train AUC: 0.8647 | Train Loss: 0.3536 | Validation AUC: 0.7407 | Validation loss: 0.3775
Epoch 00087 | Train AUC: 0.8647 | Train Loss: 0.3536 | Validation AUC: 0.7407 | Validation loss: 0.3775
Epoch 00088 | Train AUC: 0.8647 | Train Loss: 0.3536 | Validatio

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(array([0, 1]), array([622,  61]))
Start 4-th fold
==== Train Phase ====
Epoch 00000 | Train AUC: 0.4832 | Train Loss: 1.2193 | Validation AUC: 0.4634 | Validation loss: 1.2358
Epoch 00001 | Train AUC: 0.5100 | Train Loss: 0.4049 | Validation AUC: 0.5175 | Validation loss: 0.3885
Epoch 00002 | Train AUC: 0.5100 | Train Loss: 0.4047 | Validation AUC: 0.5175 | Validation loss: 0.3885
Epoch 00003 | Train AUC: 0.5100 | Train Loss: 0.4047 | Validation AUC: 0.5175 | Validation loss: 0.3885
Epoch 00004 | Train AUC: 0.5100 | Train Loss: 0.4047 | Validation AUC: 0.5175 | Validation loss: 0.3885
Epoch 00005 | Train AUC: 0.5100 | Train Loss: 0.4047 | Validation AUC: 0.5175 | Validation loss: 0.3885
Epoch 00006 | Train AUC: 0.5100 | Train Loss: 0.4047 | Validation AUC: 0.5175 | Validation loss: 0.3885
Epoch 00007 | Train AUC: 0.5100 | Train Loss: 0.4047 | Validation AUC: 0.5175 | Validation loss: 0.3885
Epoch 00008 | Train AUC: 0.5100 | Train Loss: 0.4047 | Validation AUC: 0.5175 | Validation loss

Epoch 00079 | Train AUC: 0.5100 | Train Loss: 0.4047 | Validation AUC: 0.5175 | Validation loss: 0.3885
Epoch 00080 | Train AUC: 0.5100 | Train Loss: 0.4047 | Validation AUC: 0.5175 | Validation loss: 0.3885
Epoch 00081 | Train AUC: 0.5100 | Train Loss: 0.4047 | Validation AUC: 0.5175 | Validation loss: 0.3885
Epoch 00082 | Train AUC: 0.5100 | Train Loss: 0.4047 | Validation AUC: 0.5175 | Validation loss: 0.3885
Epoch 00083 | Train AUC: 0.5100 | Train Loss: 0.4047 | Validation AUC: 0.5175 | Validation loss: 0.3885
Epoch 00084 | Train AUC: 0.5100 | Train Loss: 0.4047 | Validation AUC: 0.5175 | Validation loss: 0.3885
Epoch 00085 | Train AUC: 0.5100 | Train Loss: 0.4047 | Validation AUC: 0.5175 | Validation loss: 0.3885
Epoch 00086 | Train AUC: 0.5100 | Train Loss: 0.4047 | Validation AUC: 0.5175 | Validation loss: 0.3885
Epoch 00087 | Train AUC: 0.5100 | Train Loss: 0.4047 | Validation AUC: 0.5175 | Validation loss: 0.3885
Epoch 00088 | Train AUC: 0.5100 | Train Loss: 0.4047 | Validatio

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(array([0, 1]), array([642,  58]))
Start 5-th fold
==== Train Phase ====
Epoch 00000 | Train AUC: 0.1218 | Train Loss: 1.2718 | Validation AUC: 0.2300 | Validation loss: 1.2508
Epoch 00001 | Train AUC: 0.0603 | Train Loss: 1.2560 | Validation AUC: 0.1329 | Validation loss: 1.2481
Epoch 00002 | Train AUC: 0.1540 | Train Loss: 1.2699 | Validation AUC: 0.2567 | Validation loss: 1.2508
Epoch 00003 | Train AUC: 0.1541 | Train Loss: 1.2688 | Validation AUC: 0.2707 | Validation loss: 1.2481
Epoch 00004 | Train AUC: 0.1628 | Train Loss: 1.2674 | Validation AUC: 0.2847 | Validation loss: 1.2454
Epoch 00005 | Train AUC: 0.1713 | Train Loss: 1.2674 | Validation AUC: 0.2847 | Validation loss: 1.2454
Epoch 00006 | Train AUC: 0.1886 | Train Loss: 1.2645 | Validation AUC: 0.2847 | Validation loss: 1.2477
Epoch 00007 | Train AUC: 0.1886 | Train Loss: 1.2645 | Validation AUC: 0.2847 | Validation loss: 1.2480
Epoch 00008 | Train AUC: 0.1886 | Train Loss: 1.2645 | Validation AUC: 0.2847 | Validation loss

Epoch 00079 | Train AUC: 0.1887 | Train Loss: 1.2660 | Validation AUC: 0.2987 | Validation loss: 1.2427
Epoch 00080 | Train AUC: 0.1887 | Train Loss: 1.2660 | Validation AUC: 0.2987 | Validation loss: 1.2427
Epoch 00081 | Train AUC: 0.1887 | Train Loss: 1.2660 | Validation AUC: 0.2987 | Validation loss: 1.2427
Epoch 00082 | Train AUC: 0.1887 | Train Loss: 1.2660 | Validation AUC: 0.2987 | Validation loss: 1.2427
Epoch 00083 | Train AUC: 0.1887 | Train Loss: 1.2660 | Validation AUC: 0.2987 | Validation loss: 1.2427
Epoch 00084 | Train AUC: 0.1887 | Train Loss: 1.2660 | Validation AUC: 0.2987 | Validation loss: 1.2427
Epoch 00085 | Train AUC: 0.1887 | Train Loss: 1.2660 | Validation AUC: 0.2987 | Validation loss: 1.2427
Epoch 00086 | Train AUC: 0.1887 | Train Loss: 1.2660 | Validation AUC: 0.2987 | Validation loss: 1.2427
Epoch 00087 | Train AUC: 0.1887 | Train Loss: 1.2660 | Validation AUC: 0.2987 | Validation loss: 1.2427
Epoch 00088 | Train AUC: 0.1887 | Train Loss: 1.2660 | Validatio

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(array([0, 1]), array([670,  72]))
Start 6-th fold
==== Train Phase ====
Epoch 00000 | Train AUC: 0.4804 | Train Loss: 1.2099 | Validation AUC: 0.4889 | Validation loss: 1.2139
Epoch 00001 | Train AUC: 0.4811 | Train Loss: 1.2099 | Validation AUC: 0.4889 | Validation loss: 1.2139
Epoch 00002 | Train AUC: 0.2055 | Train Loss: 1.2503 | Validation AUC: 0.2525 | Validation loss: 1.2445
Epoch 00003 | Train AUC: 0.1931 | Train Loss: 1.2503 | Validation AUC: 0.1904 | Validation loss: 1.2537
Epoch 00004 | Train AUC: 0.1875 | Train Loss: 1.2490 | Validation AUC: 0.1742 | Validation loss: 1.2537
Epoch 00005 | Train AUC: 0.1933 | Train Loss: 1.2490 | Validation AUC: 0.1904 | Validation loss: 1.2537
Epoch 00006 | Train AUC: 0.5154 | Train Loss: 0.4121 | Validation AUC: 0.5111 | Validation loss: 0.4097
Epoch 00007 | Train AUC: 0.5168 | Train Loss: 0.4121 | Validation AUC: 0.5111 | Validation loss: 0.4097
Epoch 00008 | Train AUC: 0.5189 | Train Loss: 0.4121 | Validation AUC: 0.5111 | Validation loss

Epoch 00079 | Train AUC: 0.5196 | Train Loss: 0.4121 | Validation AUC: 0.5111 | Validation loss: 0.4097
Epoch 00080 | Train AUC: 0.5196 | Train Loss: 0.4121 | Validation AUC: 0.5111 | Validation loss: 0.4097
Epoch 00081 | Train AUC: 0.5196 | Train Loss: 0.4121 | Validation AUC: 0.5111 | Validation loss: 0.4097
Epoch 00082 | Train AUC: 0.5196 | Train Loss: 0.4121 | Validation AUC: 0.5111 | Validation loss: 0.4097
Epoch 00083 | Train AUC: 0.5196 | Train Loss: 0.4121 | Validation AUC: 0.5111 | Validation loss: 0.4097
Epoch 00084 | Train AUC: 0.5196 | Train Loss: 0.4121 | Validation AUC: 0.5111 | Validation loss: 0.4097
Epoch 00085 | Train AUC: 0.5196 | Train Loss: 0.4121 | Validation AUC: 0.5111 | Validation loss: 0.4097
Epoch 00086 | Train AUC: 0.5196 | Train Loss: 0.4121 | Validation AUC: 0.5111 | Validation loss: 0.4097
Epoch 00087 | Train AUC: 0.5196 | Train Loss: 0.4121 | Validation AUC: 0.5111 | Validation loss: 0.4097
Epoch 00088 | Train AUC: 0.5196 | Train Loss: 0.4121 | Validatio

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(array([0, 1]), array([639,  54]))
Start 7-th fold
==== Train Phase ====




Epoch 00000 | Train AUC: 0.5231 | Train Loss: 0.3912 | Validation AUC: 0.5103 | Validation loss: 0.4110
Epoch 00001 | Train AUC: 0.5224 | Train Loss: 0.3912 | Validation AUC: 0.5103 | Validation loss: 0.4110
Epoch 00002 | Train AUC: 0.5300 | Train Loss: 0.3912 | Validation AUC: 0.5087 | Validation loss: 0.4110
Epoch 00003 | Train AUC: 0.5286 | Train Loss: 0.3912 | Validation AUC: 0.5386 | Validation loss: 0.4110
Epoch 00004 | Train AUC: 0.6672 | Train Loss: 0.3754 | Validation AUC: 0.7047 | Validation loss: 0.3786
Epoch 00005 | Train AUC: 0.1338 | Train Loss: 1.2512 | Validation AUC: 0.1501 | Validation loss: 1.2574
Epoch 00006 | Train AUC: 0.5238 | Train Loss: 0.3945 | Validation AUC: 0.5087 | Validation loss: 0.4120
Epoch 00007 | Train AUC: 0.5308 | Train Loss: 0.3912 | Validation AUC: 0.5103 | Validation loss: 0.4110
Epoch 00008 | Train AUC: 0.5316 | Train Loss: 0.3912 | Validation AUC: 0.5253 | Validation loss: 0.4110
Epoch 00009 | Train AUC: 0.5316 | Train Loss: 0.3912 | Validatio

Epoch 00079 | Train AUC: 0.5239 | Train Loss: 0.3912 | Validation AUC: 0.5119 | Validation loss: 0.4110
Epoch 00080 | Train AUC: 0.5239 | Train Loss: 0.3912 | Validation AUC: 0.5119 | Validation loss: 0.4110
Epoch 00081 | Train AUC: 0.5239 | Train Loss: 0.3912 | Validation AUC: 0.5119 | Validation loss: 0.4110
Epoch 00082 | Train AUC: 0.5239 | Train Loss: 0.3912 | Validation AUC: 0.5119 | Validation loss: 0.4110
Epoch 00083 | Train AUC: 0.5239 | Train Loss: 0.3912 | Validation AUC: 0.5119 | Validation loss: 0.4110
Epoch 00084 | Train AUC: 0.5239 | Train Loss: 0.3912 | Validation AUC: 0.5119 | Validation loss: 0.4110
Epoch 00085 | Train AUC: 0.5239 | Train Loss: 0.3912 | Validation AUC: 0.5119 | Validation loss: 0.4110
Epoch 00086 | Train AUC: 0.5239 | Train Loss: 0.3912 | Validation AUC: 0.5119 | Validation loss: 0.4110
Epoch 00087 | Train AUC: 0.5239 | Train Loss: 0.3912 | Validation AUC: 0.5119 | Validation loss: 0.4110
Epoch 00088 | Train AUC: 0.5239 | Train Loss: 0.3912 | Validatio

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(array([0, 1]), array([647,  53]))
Start 8-th fold
==== Train Phase ====
Epoch 00000 | Train AUC: 0.1133 | Train Loss: 1.2576 | Validation AUC: 0.0991 | Validation loss: 1.2715
Epoch 00001 | Train AUC: 0.5292 | Train Loss: 0.3907 | Validation AUC: 0.5114 | Validation loss: 0.3925
Epoch 00002 | Train AUC: 0.5299 | Train Loss: 0.3907 | Validation AUC: 0.5114 | Validation loss: 0.3926
Epoch 00003 | Train AUC: 0.5207 | Train Loss: 0.3907 | Validation AUC: 0.5114 | Validation loss: 0.3926
Epoch 00004 | Train AUC: 0.5214 | Train Loss: 0.3907 | Validation AUC: 0.5114 | Validation loss: 0.3926
Epoch 00005 | Train AUC: 0.5214 | Train Loss: 0.3907 | Validation AUC: 0.5114 | Validation loss: 0.3926
Epoch 00006 | Train AUC: 0.5214 | Train Loss: 0.3907 | Validation AUC: 0.5114 | Validation loss: 0.3926
Epoch 00007 | Train AUC: 0.5214 | Train Loss: 0.3907 | Validation AUC: 0.5128 | Validation loss: 0.3926
Epoch 00008 | Train AUC: 0.5214 | Train Loss: 0.3907 | Validation AUC: 0.5128 | Validation loss

Epoch 00079 | Train AUC: 0.5214 | Train Loss: 0.3907 | Validation AUC: 0.5128 | Validation loss: 0.3926
Epoch 00080 | Train AUC: 0.5214 | Train Loss: 0.3907 | Validation AUC: 0.5128 | Validation loss: 0.3926
Epoch 00081 | Train AUC: 0.5214 | Train Loss: 0.3907 | Validation AUC: 0.5128 | Validation loss: 0.3926
Epoch 00082 | Train AUC: 0.5214 | Train Loss: 0.3907 | Validation AUC: 0.5128 | Validation loss: 0.3926
Epoch 00083 | Train AUC: 0.5214 | Train Loss: 0.3907 | Validation AUC: 0.5128 | Validation loss: 0.3926
Epoch 00084 | Train AUC: 0.5214 | Train Loss: 0.3907 | Validation AUC: 0.5128 | Validation loss: 0.3926
Epoch 00085 | Train AUC: 0.5214 | Train Loss: 0.3907 | Validation AUC: 0.5128 | Validation loss: 0.3926
Epoch 00086 | Train AUC: 0.5214 | Train Loss: 0.3907 | Validation AUC: 0.5128 | Validation loss: 0.3926
Epoch 00087 | Train AUC: 0.5214 | Train Loss: 0.3907 | Validation AUC: 0.5128 | Validation loss: 0.3926
Epoch 00088 | Train AUC: 0.5214 | Train Loss: 0.3907 | Validatio

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(array([0, 1]), array([666,  57]))
Start 9-th fold
==== Train Phase ====
Epoch 00000 | Train AUC: 0.5249 | Train Loss: 0.3927 | Validation AUC: 0.4937 | Validation loss: 0.3784
Epoch 00001 | Train AUC: 0.5291 | Train Loss: 0.3924 | Validation AUC: 0.4953 | Validation loss: 0.3783
Epoch 00002 | Train AUC: 0.5291 | Train Loss: 0.3924 | Validation AUC: 0.4953 | Validation loss: 0.3782
Epoch 00003 | Train AUC: 0.5291 | Train Loss: 0.3923 | Validation AUC: 0.4953 | Validation loss: 0.3781
Epoch 00004 | Train AUC: 0.5291 | Train Loss: 0.3923 | Validation AUC: 0.4953 | Validation loss: 0.3780
Epoch 00005 | Train AUC: 0.5291 | Train Loss: 0.3923 | Validation AUC: 0.4953 | Validation loss: 0.3779
Epoch 00006 | Train AUC: 0.5291 | Train Loss: 0.3923 | Validation AUC: 0.4953 | Validation loss: 0.3779
Epoch 00007 | Train AUC: 0.5291 | Train Loss: 0.3923 | Validation AUC: 0.4953 | Validation loss: 0.3778
Epoch 00008 | Train AUC: 0.5291 | Train Loss: 0.3923 | Validation AUC: 0.4953 | Validation loss

Epoch 00079 | Train AUC: 0.7890 | Train Loss: 0.3619 | Validation AUC: 0.6844 | Validation loss: 0.3603
Epoch 00080 | Train AUC: 0.7882 | Train Loss: 0.3633 | Validation AUC: 0.6844 | Validation loss: 0.3603
Epoch 00081 | Train AUC: 0.7882 | Train Loss: 0.3633 | Validation AUC: 0.7085 | Validation loss: 0.3573
Epoch 00082 | Train AUC: 0.7882 | Train Loss: 0.3633 | Validation AUC: 0.7085 | Validation loss: 0.3573
Epoch 00083 | Train AUC: 0.7882 | Train Loss: 0.3633 | Validation AUC: 0.7085 | Validation loss: 0.3573
Epoch 00084 | Train AUC: 0.7882 | Train Loss: 0.3633 | Validation AUC: 0.7085 | Validation loss: 0.3574
Epoch 00085 | Train AUC: 0.7882 | Train Loss: 0.3633 | Validation AUC: 0.7085 | Validation loss: 0.3574
Epoch 00086 | Train AUC: 0.7882 | Train Loss: 0.3633 | Validation AUC: 0.7085 | Validation loss: 0.3574
Epoch 00087 | Train AUC: 0.7882 | Train Loss: 0.3633 | Validation AUC: 0.7085 | Validation loss: 0.3574
Epoch 00088 | Train AUC: 0.7882 | Train Loss: 0.3633 | Validatio

In [18]:
np.mean(auc_scores)

0.5996410233578654

In [None]:
def print_varsize():
    import types
    print("{}{: >15}{}{: >10}{}".format('|','Variable Name','|','  Size','|'))
    print(" -------------------------- ")
    for k, v in globals().items():
        if hasattr(v, 'size') and not k.startswith('_') and not isinstance(v,types.ModuleType):
            print("{}{: >15}{}{: >10}{}".format('|',k,'|',str(v.size),'|'))
        elif hasattr(v, '__len__') and not k.startswith('_') and not isinstance(v,types.ModuleType):
            try:
                print("{}{: >15}{}{: >10}{}".format('|',k,'|',str(len(v)),'|'))
            except:
                pass

In [None]:
print_varsize()