In [139]:
import pandas as pd
import numpy as np
import torch
from collections import defaultdict
import random
import networkx as nx
import torch.nn as nn
from tqdm import trange, tqdm
from torch.autograd import Variable
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score


In [125]:
from random_graph import random_graph_gcn
node_df, edge_df = random_graph_gcn(1000,3000,report_rate=0.5, driver_rate=0.8,nums_features=120)

In [3]:
node_df.shape

(1000, 123)

In [4]:
# 数据格式处理
# 数据集被分成num_clusters个子数据集（子图）->其实相当于，被分成了几个batch

In [45]:
num_clusters = 8

In [46]:

# node_lookup: store node index
node_lookup = pd.DataFrame({'node': node_df.index,}, index=node_df.cust_id)

# delete no-edge-node 
diff_node = list(set(node_df['cust_id'])-(set(node_df['cust_id']) - set(edge_df['cust_id']) - set(edge_df['opp_id'])))

node_df = node_df.iloc[node_lookup.iloc[diff_node]['node']].reset_index(drop=True)

#build up graph
graph = nx.from_edgelist([(cust,opp) for cust, opp in zip(edge_df['cust_id'],edge_df['opp_id'])])

# random_clustering
clusters = [cluster for cluster in range(num_clusters)]
cluster_membership = {node : random.choice(clusters) for node in graph.nodes()}

# build-up membership dict
sg_nodes = {}
sg_edges = {}
sg_train_nodes = {}
sg_test_nodes = {}
sg_train_features = {}
sg_test_features = {}
sg_train_targets = {}
sg_test_targets = {}

for cluster in clusters:
    #print(cluster)
    subgraph = graph.subgraph([node for node in sorted(graph.nodes()) if cluster_membership[node] == cluster])
    sg_nodes[cluster] = [node for node in sorted(subgraph.nodes())]

    mapper = {node: i for i, node in enumerate(sorted(sg_nodes[cluster]))}
    sg_edges[cluster] = [[mapper[edge[0]], mapper[edge[1]]] for edge in subgraph.edges()] +  [[mapper[edge[1]], mapper[edge[0]]] for edge in subgraph.edges()]

    sg_train_nodes[cluster] = [node for node in node_df[node_df['is_driver'] == True]['cust_id'] if node in sg_nodes[cluster]]
    sg_test_nodes[cluster] = [node for node in node_df[node_df['is_driver'] == False]['cust_id'] if node in sg_nodes[cluster]]

    sg_test_nodes[cluster] = sorted(sg_test_nodes[cluster])
    sg_train_nodes[cluster] = sorted(sg_train_nodes[cluster])
    
    feats_name = list(set(node_df.columns) - set(['cust_id','is_driver','is_reported']))
    sg_train_features[cluster] = pd.concat([node_df[(node_df['cust_id'] == cust)&(node_df['is_driver'] == True)][feats_name] for cust in sg_nodes[cluster]],axis = 0)
    sg_test_features[cluster] = pd.concat([node_df[(node_df['cust_id'] == cust)&(node_df['is_driver'] == False)][feats_name] for cust in sg_nodes[cluster]],axis = 0)
    sg_train_targets[cluster] = pd.concat([node_df[(node_df['cust_id'] == cust)&(node_df['is_driver'] == True)][['is_reported']] for cust in sg_nodes[cluster]],axis = 0)
    sg_test_targets[cluster] = pd.concat([node_df[(node_df['cust_id'] == cust)&(node_df['is_driver'] == False)][['is_reported']] for cust in sg_nodes[cluster]],axis = 0)

In [205]:
graph,sg_nodes,sg_edges,sg_train_nodes,sg_test_nodes,sg_features,sg_targets = data_format_process(node_df,edge_df,7)

In [7]:
import ClusterGraph_v0

In [8]:
a = ClusterGraph_v0.preprocessing(node_df,edge_df,18)

In [9]:
graph,data_dict = a.run()

In [None]:
torch.nn.functional.relu()

In [131]:
class preprocessing():
    
    def __init__(self, node_df, edge_df, num_clusters):
        
        
        assert all(col in node_df.columns for col in ['cust_id','is_driver','is_reported'])
        assert all(col in edge_df.columns for col in ['cust_id','opp_id'])
        assert type(num_clusters) == int
        
        self.node_df = node_df
        self.edge_df = edge_df
        self.num_clusters = num_clusters
        self.data_dict = {}

    def run(self):
        
        self.delete_nodes()
        graph = self.build_graph(self.edge_df)
        clusters, cluster_membership = self.random_clustering(graph)
        self.data_dict['sg_nodes'], self.data_dict['sg_edges'], self.data_dict['sg_train_nodes'], self.data_dict['sg_test_nodes'], self.data_dict['sg_train_features'], self.data_dict['sg_test_features'], self.data_dict['sg_train_targets'], self.data_dict['sg_test_targets'] = self.build_membership_dict(graph,clusters, cluster_membership)
        
        
        return graph, self.data_dict
    
    
    def delete_nodes(self):
        
        # node_lookup: store node index
        node_lookup = pd.DataFrame({'node': self.node_df.index,}, index=self.node_df.cust_id)

        # delete no-edge-node 
        diff_node = list(set(self.node_df['cust_id'])-(set(self.node_df['cust_id']) - set(self.edge_df['cust_id']) - set(self.edge_df['opp_id'])))

        self.node_df = self.node_df.iloc[node_lookup.iloc[diff_node]['node']].reset_index(drop=True)

        
    def build_graph(self, edge_df):
        
        #build up graph using networkx
        graph = nx.from_edgelist([(cust,opp) for cust, opp in zip(edge_df['cust_id'],edge_df['opp_id'])])
        
        return graph
    
    def random_clustering(self,graph):
        
        # random_clustering
        clusters = [cluster for cluster in range(self.num_clusters)]
        cluster_membership = {node : random.choice(clusters) for node in graph.nodes()}
        
        return clusters, cluster_membership
    
    def build_membership_dict(self,graph,clusters, cluster_membership):
        
        # build-up membership dict
        sg_nodes = {}
        sg_edges = {}
        sg_train_nodes = {}
        sg_test_nodes = {}
        sg_train_features = {}
        sg_test_features = {}
        sg_train_targets = {}
        sg_test_targets = {}

        for cluster in clusters:

            #print(cluster)
            subgraph = graph.subgraph([node for node in sorted(graph.nodes()) if cluster_membership[node] == cluster])
            sg_nodes[cluster] = [node for node in sorted(subgraph.nodes())]

            mapper = {node: i for i, node in enumerate(sorted(sg_nodes[cluster]))}
            sg_edges[cluster] = [[mapper[edge[0]], mapper[edge[1]]] for edge in subgraph.edges()] +  [[mapper[edge[1]], mapper[edge[0]]] for edge in subgraph.edges()]

            sg_train_nodes[cluster] = [node for node in self.node_df[self.node_df['is_driver'] == True]['cust_id'] if node in sg_nodes[cluster]]
            sg_test_nodes[cluster] = [node for node in self.node_df[self.node_df['is_driver'] == False]['cust_id'] if node in sg_nodes[cluster]]

            sg_test_nodes[cluster] = sorted(sg_test_nodes[cluster])
            sg_train_nodes[cluster] = sorted(sg_train_nodes[cluster])
            
            feats_name = list(set(self.node_df.columns) - set(['cust_id','is_driver','is_reported']))
            sg_train_features[cluster] = pd.concat([self.node_df[(self.node_df['cust_id'] == cust)&(self.node_df['is_driver'] == True)][feats_name] for cust in sg_nodes[cluster]],axis = 0)
            sg_test_features[cluster] = pd.concat([self.node_df[(self.node_df['cust_id'] == cust)&(self.node_df['is_driver'] == False)][feats_name] for cust in sg_nodes[cluster]],axis = 0)
            sg_train_targets[cluster] = pd.concat([self.node_df[(self.node_df['cust_id'] == cust)&(self.node_df['is_driver'] == True)][['is_reported']] * 1 for cust in sg_nodes[cluster]],axis = 0)
            sg_test_targets[cluster] = pd.concat([self.node_df[(self.node_df['cust_id'] == cust)&(self.node_df['is_driver'] == False)][['is_reported']] * 1 for cust in sg_nodes[cluster]],axis = 0)
          
        return sg_nodes, sg_edges, sg_train_nodes, sg_test_nodes, sg_train_features, sg_test_features, sg_train_targets, sg_test_targets

In [117]:
## 定义gcn
class GCN(torch.nn.Module):
    
    def __init__(self, input_dim, output_dim, activation = torch.nn.functional.relu):
        
        super(GCN, self).__init__()
        
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.activation = activation
        
        self.weight1 = nn.Parameter(torch.Tensor(self.input_dim, self.output_dim))
        #self.weight2 = nn.Parameter(torch.Tensor(self.input_dim * 2, self.output_dim))
        self.bn1 = nn.BatchNorm1d(self.output_dim)
        #self.bn2 = nn.BatchNorm1d(self.output_dim)
        self.reset_parameters()
    
    def reset_parameters(self):
        
        torch.nn.init.kaiming_uniform_(self.weight1)
        #torch.nn.init.kaiming_uniform_(self.weight2.weight)
        
    def forward(self,features):
        
        output = self.bn1(self.activation(torch.matmul(features,self.weight1)))
        #print(output.shape)
        
        return output

In [118]:
class residul_block(torch.nn.Module):
    
    def __init__(self,input_dim, output_dim,):
        
        super(residul_block, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.gcn = GCN(self.input_dim, self.output_dim)
        self.linear_1 = nn.Linear(output_dim,output_dim * 2)
        self.linear_2 = nn.Linear(output_dim * 2,output_dim)
        self.bn1 = nn.BatchNorm1d(self.output_dim * 2)
        self.bn2 = nn.BatchNorm1d(self.output_dim)
        
        
    def forward(self, features):
        output = self.gcn(features)
        dummy = output
        output = self.linear_1(output)
        output = self.bn1(output)
        output = self.linear_2(output)
        output = self.bn2(output)
        output = torch.add(output,dummy)
        
        return output

In [119]:
class ListModule(torch.nn.Module):
    """
    Abstract list layer class.
    """
    def __init__(self, *args):
        """
        Module initializing.
        """
        super(ListModule, self).__init__()
        idx = 0
        for module in args:
            self.add_module(str(idx), module)
            idx += 1

    def __getitem__(self, idx):
        """
        Getting the indexed layer.
        """
        if idx < 0 or idx >= len(self._modules):
            raise IndexError('index {} is out of range'.format(idx))
        it = iter(self._modules.values())
        for i in range(idx):
            next(it)
        return next(it)

    def __iter__(self):
        """
        Iterating on the layers.
        """
        return iter(self._modules.values())

    def __len__(self):
        """
        Number of layers.
        """
        return len(self._modules)

In [120]:
class StackedGCN(torch.nn.Module):
    """
    Multi-layer GCN model.
    """
    def __init__(self, hidden_dims, input_channels, output_channels):
        """
        :param args: Arguments object.
        :input_channels: Number of features.
        :output_channels: Number of target features. 
        """
        super(StackedGCN, self).__init__()
        self.hidden_dims = hidden_dims
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.setup_layers()

    def setup_layers(self):
        """
        Creating the layes based on the args.
        """
        self.layers = []
        self.all_dims = [self.input_channels] + self.hidden_dims + [self.output_channels]
        for i, _ in enumerate(self.all_dims[:-1]):
            self.layers.append(residul_block(self.all_dims[i],self.all_dims[i+1]))
        self.layers = ListModule(*self.layers)

    def forward(self, features):
        """
        Making a forward pass.
        :param edges: Edge list LongTensor.
        :param features: Feature matrix input FLoatTensor.
        :return predictions: Prediction matrix output FLoatTensor.
        """
        #print(self.layers)
        for i, _ in enumerate(self.all_dims[:-2]):
            features = torch.nn.functional.relu(self.layers[i](features))
            if i>1:
                features = torch.nn.functional.dropout(features,0.3)
        features = self.layers[i+1](features)
        predictions = torch.nn.functional.log_softmax(features, dim=1)
        return predictions

In [133]:
pre_model = preprocessing(node_df, edge_df, 7)
graph, data_dict = pre_model.run()

In [144]:
data_dict['sg_train_features'][0].shape[1]

120

In [221]:
class run_model():
    
    def __init__(self,
                 node_df,
                 edge_df,
                 num_clusters,
                 hidden_dims,
                 epochs = 20,
                 lr = 0.01,
                 device = 'cpu'):
        
        self.node_df = node_df
        self.edge_df = edge_df
        self.num_clusters = num_clusters
        self.clusters = [cluster for cluster in range(self.num_clusters)]
        self.hidden_dims = hidden_dims
        self.epochs = epochs
        self.lr = lr
        self.device = device
        
        print('-----------------------*-----------------------')
        print('data preprocessing ..\n')
        pre_model = preprocessing(self.node_df, self.edge_df, self.num_clusters)
        self.graph, self.data_dict = pre_model.run()
        print('preprocessing completed!')
        print('-----------------------*-----------------------')
        
        self.feature_count = int(self.data_dict['sg_train_features'][0].shape[1])  #input dim
        self.class_count = int(np.max(self.data_dict['sg_train_targets'][0]) + 1)  # output dim
        
        print('-----------------------*-----------------------')
        print('model construction\n')
        self.creat_model()
        self.ToTensor()
        print(self.model)
        print('-----------------------*-----------------------')
        
        
    def creat_model(self):
        
        self.model = StackedGCN(self.hidden_dims,self.feature_count,self.class_count)
        self.model = self.model.to(self.device)
    
    def ToTensor(self):
        
        for cluster in self.clusters:
            self.data_dict['sg_nodes'][cluster] = torch.LongTensor(self.data_dict['sg_nodes'][cluster])
            self.data_dict['sg_edges'][cluster] = torch.LongTensor(self.data_dict['sg_edges'][cluster]).t()
            self.data_dict['sg_train_nodes'][cluster] = torch.LongTensor(self.data_dict['sg_train_nodes'][cluster])
            self.data_dict['sg_test_nodes'][cluster] = torch.LongTensor(self.data_dict['sg_test_nodes'][cluster])
            
            self.data_dict['sg_train_features'][cluster] = torch.FloatTensor(np.array(self.data_dict['sg_train_features'][cluster]))
            self.data_dict['sg_train_targets'][cluster] = torch.LongTensor(np.array(self.data_dict['sg_train_targets'][cluster]))
            self.data_dict['sg_test_features'][cluster] = torch.FloatTensor(np.array(self.data_dict['sg_test_features'][cluster]))
            self.data_dict['sg_test_targets'][cluster] = torch.LongTensor(np.array(self.data_dict['sg_test_targets'][cluster]))
        
    
    def do_forward_pass(self, cluster):
        
        edges = self.data_dict['sg_edges'][cluster].to(self.device)
        macro_nodes = self.data_dict['sg_nodes'][cluster].to(self.device)
        train_nodes = self.data_dict['sg_train_nodes'][cluster].to(self.device)
        train_features = self.data_dict['sg_train_features'][cluster].to(self.device)
        train_target = self.data_dict['sg_train_targets'][cluster].to(self.device).squeeze()
        predictions = self.model(train_features)
#         print('predictions ',predictions.shape)
#         print('train_target ', train_target.shape)
#         print('train_nodes', train_nodes.shape)
        #average_loss = torch.nn.functional.nll_loss(predictions[train_nodes], train_target[train_nodes])
        average_loss = torch.nn.functional.nll_loss(predictions, train_target)
        node_count = train_nodes.shape[0]

        return average_loss, node_count
    
    def do_prediction(self, cluster):
        """
        Scoring a cluster.
        :param cluster: Cluster index.
        :return prediction: Prediction matrix with probabilities.
        :return target: Target vector.
        """
        edges = self.data_dict['sg_edges'][cluster].to(self.device)
        macro_nodes = self.data_dict['sg_nodes'][cluster].to(self.device)
        test_nodes = self.data_dict['sg_test_nodes'][cluster].to(self.device)
        test_features = self.data_dict['sg_test_features'][cluster].to(self.device)
        test_target = self.data_dict['sg_test_targets'][cluster].to(self.device).squeeze()

        prediction = self.model(test_features)
        
        return prediction, test_target
    
    def update_average_loss(self, batch_average_loss, node_count):
        """
        Updating the average loss in the epoch.
        :param batch_average_loss: Loss of the cluster. 
        :param node_count: Number of nodes in currently processed cluster.
        :return average_loss: Average loss in the epoch.
        """
        self.accumulated_training_loss = self.accumulated_training_loss + batch_average_loss.item()*node_count
        self.node_count_seen = self.node_count_seen + node_count
        average_loss = self.accumulated_training_loss/self.node_count_seen
        
        return average_loss
    
    def train(self):
        """
        Training a model.
        """
        print("Training started.\n")
        #epochs = trange(self.epochs, desc = "Train Loss")
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        self.model.train()
        for epoch in range(self.epochs):
            random.shuffle(self.clusters)
            self.node_count_seen = 0
            self.accumulated_training_loss = 0
            for cluster in self.clusters:
                self.optimizer.zero_grad()
                batch_average_loss, node_count = self.do_forward_pass(cluster)
                batch_average_loss.backward()
                self.optimizer.step()
                average_loss = self.update_average_loss(batch_average_loss, node_count)
                print("Epoch {:03d} Cluster {:03d} Loss: {:.4f}".format(epoch, cluster, average_loss))
            #epochs.set_description("Train Loss: %g" % round(average_loss,4))
            self.test()
    
    
    def test(self):
        """
        Scoring the test and printing the F-1 score.
        """
        self.model.eval()
        self.predictions = []
        self.targets = []
        for cluster in self.clusters:
            prediction, target = self.do_prediction(cluster)
            self.predictions.append(prediction.cpu().detach().numpy())
            self.targets.append(target.cpu().detach().numpy())
        self.targets = np.concatenate(self.targets)
        self.predictions = np.concatenate(self.predictions).argmax(1)
        score = f1_score(self.targets, self.predictions, average="micro")
        print('-----------------------*-----------------------')
        print("test accuracy: ", sum(self.predictions == self.targets)/len(self.targets))
        print("\nF-1 score: {:.4f}".format(score))
        print('-----------------------*-----------------------')

In [222]:
a = run_model(node_df,edge_df,num_clusters = 10,hidden_dims = [128,64])

-----------------------*-----------------------
data preprocessing ..

preprocessing completed!
-----------------------*-----------------------
-----------------------*-----------------------
model construction
StackedGCN(
  (layers): ListModule(
    (0): residul_block(
      (gcn): GCN(
        (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (linear_1): Linear(in_features=128, out_features=256, bias=True)
      (linear_2): Linear(in_features=256, out_features=128, bias=True)
      (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): residul_block(
      (gcn): GCN(
        (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (linear_1): Linear(in_features=64, out_features=128, bias=True)
      (linear_2): Linear(in_features=128, out_features=6

In [220]:
a.train()

Training started.

Epoch 000 Cluster 003 Loss: 0.9521
Epoch 000 Cluster 006 Loss: 0.9012
Epoch 000 Cluster 004 Loss: 0.9254
Epoch 000 Cluster 002 Loss: 0.9120
Epoch 000 Cluster 005 Loss: 0.8762
Epoch 000 Cluster 008 Loss: 0.8442
Epoch 000 Cluster 007 Loss: 0.8344
Epoch 000 Cluster 000 Loss: 0.8195
Epoch 000 Cluster 009 Loss: 0.8022
Epoch 000 Cluster 001 Loss: 0.7897
-----------------------*-----------------------
test accuracy:  0.5540540540540541

F-1 score: 0.5541
-----------------------*-----------------------
Epoch 001 Cluster 005 Loss: 1.5781
Epoch 001 Cluster 000 Loss: 1.2038
Epoch 001 Cluster 004 Loss: 1.1202
Epoch 001 Cluster 003 Loss: 1.0155
Epoch 001 Cluster 008 Loss: 0.9470
Epoch 001 Cluster 009 Loss: 0.8949
Epoch 001 Cluster 007 Loss: 0.8645
Epoch 001 Cluster 001 Loss: 0.8374
Epoch 001 Cluster 006 Loss: 0.8132
Epoch 001 Cluster 002 Loss: 0.8004
-----------------------*-----------------------
test accuracy:  0.6238738738738738

F-1 score: 0.6239
-----------------------*-----

Epoch 018 Cluster 007 Loss: 0.6620
Epoch 018 Cluster 005 Loss: 0.6696
Epoch 018 Cluster 009 Loss: 0.6592
Epoch 018 Cluster 008 Loss: 0.6622
Epoch 018 Cluster 002 Loss: 0.6672
Epoch 018 Cluster 000 Loss: 0.6682
Epoch 018 Cluster 006 Loss: 0.6649
Epoch 018 Cluster 001 Loss: 0.6621
-----------------------*-----------------------
test accuracy:  0.6238738738738738

F-1 score: 0.6239
-----------------------*-----------------------
Epoch 019 Cluster 005 Loss: 0.6965
Epoch 019 Cluster 008 Loss: 0.6851
Epoch 019 Cluster 006 Loss: 0.6692
Epoch 019 Cluster 004 Loss: 0.6737
Epoch 019 Cluster 002 Loss: 0.6769
Epoch 019 Cluster 009 Loss: 0.6685
Epoch 019 Cluster 000 Loss: 0.6691
Epoch 019 Cluster 003 Loss: 0.6662
Epoch 019 Cluster 001 Loss: 0.6635
Epoch 019 Cluster 007 Loss: 0.6631
-----------------------*-----------------------
test accuracy:  0.6238738738738738

F-1 score: 0.6239
-----------------------*-----------------------


In [213]:
sum(predictions == targets)/len(targets)

0.6126126126126126

In [121]:
features = torch.FloatTensor(np.array(data_dict['sg_train_features'][10]))
edges = torch.LongTensor(data_dict['sg_edges'][10]).t()

In [122]:
model = StackedGCN([128,64],120,2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [123]:
print(model)

StackedGCN(
  (layers): ListModule(
    (0): residul_block(
      (gcn): GCN(
        (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (linear_1): Linear(in_features=128, out_features=256, bias=True)
      (linear_2): Linear(in_features=256, out_features=128, bias=True)
      (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): residul_block(
      (gcn): GCN(
        (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (linear_1): Linear(in_features=64, out_features=128, bias=True)
      (linear_2): Linear(in_features=128, out_features=64, bias=True)
      (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    

In [103]:
for i in range(3):
    optimizer.zero_grad()
    model(features)
    optimizer.step()

torch.Size([32, 128])
torch.Size([32, 64])
torch.Size([32, 2])
torch.Size([32, 128])
torch.Size([32, 64])
torch.Size([32, 2])
torch.Size([32, 128])
torch.Size([32, 64])
torch.Size([32, 2])
