In [1]:
import numpy as np
import struct, random
import networkx as nx
import pandas as pd
# import networkx.algorithms.isomorphism.vf2userfunc as vf2
# from networkx.algorithms import isomorphism
from VF2 import GraphMatcher
import random
from networkx.generators import gnm_random_graph
from collections import OrderedDict
import networkx.algorithms.isomorphism as iso
from sklearn.datasets import make_classification
import functools
import sys, os
from os.path import join
from sklearn.metrics import accuracy_score
import json
from functools import partial

In [2]:
# helper functions
def nodeID2node(nodeID, G):
    """
    Find the node of graph G whose 'nodeID' attribute equals `nodeID`.

    Nodes are scanned in the graph's iteration order and the first match
    wins; None is returned when no node carries that nodeID.
    """
    candidates = (n for n, attrs in G.nodes_iter(data=True) if attrs['nodeID'] == nodeID)
    return next(candidates, None)

def node2nodeID(node, G):
    """
    Return the value of the 'nodeID' attribute stored on `node` in graph G.
    """
    attributes = G.node[node]
    return attributes['nodeID']

## Define graph matching model

In [3]:
def checkNodeMatch(g1node, g2node, node_map):
    """
    Node-compatibility test used while matching the two graphs.

    g1node, g2node : node attribute dicts; each must contain 'nodeID'.
    node_map       : dict of known G1 nodeID -> G2 nodeID pairs.

    A G1 node with a known partner is only compatible with that partner;
    a G1 node that is absent from node_map is compatible with any G2 node.
    """
    id_in_g1 = g1node['nodeID']
    id_in_g2 = g2node['nodeID']
    if id_in_g1 not in node_map:
        return True
    return node_map[id_in_g1] == id_in_g2
    
class GraphMatching:
    """
    Aligns two graphs with the GraphMatcher from the local VF2 module,
    seeded with a set of already-known node correspondences.

    G1, G2   : graphs whose nodes carry a 'nodeID' attribute.
    node_map : dict of known G1 nodeID -> G2 nodeID pairs.

    After match() has run, `self.mapping` holds the matcher's resulting mapping.
    """
    def __init__(self, G1, G2, node_map):
        self.G1 = G1
        self.G2 = G2
        self.node_map = node_map

    def match(self, timeLimit=30):
        """
        Run the matcher on the largest connected component of each graph
        and store the resulting mapping in `self.mapping`.

        timeLimit : forwarded unchanged to GraphMatcher.subgraph_is_isomorphic
                    (presumably seconds -- TODO confirm against the VF2 module).
        """
        # Only the largest connected component of each graph takes part in the matching.
        G1_max = max(nx.connected_component_subgraphs(self.G1), key=len)
        G2_max = max(nx.connected_component_subgraphs(self.G2), key=len)

        node_match = functools.partial(checkNodeMatch, node_map=self.node_map)
        GM = GraphMatcher(G1_max, G2_max, node_match=node_match)
        # Seed the matcher with this instance's own map. Reading a module-level
        # `node_map` here would fail (or use the wrong map) outside the notebook.
        GM.mapping = self.node_map
        GM.subgraph_is_isomorphic(timeLimit=timeLimit)
        self.mapping = GM.mapping

    def predict(self, testDataDf):
        """
        Add a 'G2.nodeID' column to testDataDf, derived from its 'G1.nodeID' column.

        The frame is modified in place and also returned. match() must have
        been called first, because `self.mapping` is read here.
        """
        # make a dictionary from the node mapping between G1 and G2,
        # translated from graph nodes to their nodeID attributes
        l_g1 = [node2nodeID(n, G=self.G1) for n in self.mapping.keys()]
        l_g2 = [node2nodeID(n, G=self.G2) for n in self.mapping.values()]
        d_g1_g2 = dict(zip(l_g1, l_g2))

        # NOTE(review): G1 nodeIDs without a match fall back to 1 -- confirm that
        # this placeholder is intended.
        testDataDf['G2.nodeID'] = testDataDf['G1.nodeID'].apply(lambda node_id: d_g1_g2.get(node_id, 1))
        return testDataDf

## Make pipeline

In [4]:
# Status message marking the start of the pipeline cells.
print('building graph matching model ...')

building graph matching model ...


In [5]:
# Dataset locations; every other path is derived from the data directory.
dataDir = "../../data"
rawDataDir = join(dataDir, "raw_data")
rootDir = join(dataDir, "..")

# Stop immediately if any of the expected directories is missing.
assert all(os.path.exists(location) for location in (dataDir, rawDataDir, rootDir))

In [6]:
# Load the two graphs to be aligned from their GML files in the raw-data directory.
print('reading the graphs ...')
G1 = nx.read_gml(os.path.join(rawDataDir, "G1.gml"))
G2 = nx.read_gml(os.path.join(rawDataDir, "G2.gml"))

reading the graphs ...


In [7]:
# Report the node count and edge count of each graph.
print('G1', G1.number_of_nodes(), G1.number_of_edges())
print('G2', G2.number_of_nodes(), G2.number_of_edges())

G1 1000 5521
G2 755 5139


In [8]:
# Load the labelled training pairs and turn them into the seed node map.
print('reading train data (known node mappings) and priming the graph matching process ...')
trainDataDf = pd.read_csv(os.path.join(dataDir, 'trainData.csv'), index_col=0)
trainTargetsDF = pd.read_csv(os.path.join(dataDir, 'trainTargets.csv'), index_col=0)
# Put features and targets side by side (aligned on the row index) and keep the two ID columns.
df = pd.concat([trainDataDf, trainTargetsDF], axis=1)[['G1.nodeID', 'G2.nodeID']]
# Known correspondences: G1 nodeID -> G2 nodeID.
node_map = df.set_index('G1.nodeID')['G2.nodeID'].to_dict()

reading train data (known node mappings) and priming the graph matching process ...


In [9]:
# align the graphs
print('performing the graph matching ...')
# Seed the matcher with the known training pairs, then run the search
# with match()'s default time limit.
gm = GraphMatching(G1, G2, node_map=node_map)
gm.match()

performing the graph matching ...


### Try the model on trainData

In [10]:
# Status message: the model is scored on the training pairs next.
print('trying the model on training data (known mappings) ...')

trying the model on training data (known mappings) ...


In [11]:
# Score the fitted matcher on the training pairs.
train_performance = OrderedDict()
# Re-read the training frame so predict() works on a fresh copy.
trainDataDf = pd.read_csv(join(dataDir, 'trainData.csv'), index_col=0)
prediction = gm.predict(trainDataDf)['G2.nodeID']
train_truth = pd.read_csv(join(dataDir, 'trainTargets.csv'), index_col=0)['G2.nodeID']
accuracy = accuracy_score(train_truth, prediction)

# Nested result record: train -> score -> {metric, value}
train_performance['train'] = OrderedDict()
train_performance['train']['score'] = OrderedDict()
train_performance['train']['score']['metric'] = 'accuracy'
train_performance['train']['score']['value'] = accuracy

print('accuracy on training data:', accuracy)

accuracy on training data: 0.695364238411


## Submit predictions on testData

In [12]:
# Status message: predictions for the test rows are produced next.
print('applying the model on test data (unknown mappings ...)')

applying the model on test data (unknown mappings ...)


In [13]:
# Predict G2 node IDs for the test rows and save them as testTargets.csv.
print('making predictions on testData (assuming that testData is available) ...')
try:
    print('1. reading testData ...')
    testDataDf = pd.read_csv(join(dataDir, 'testData.csv'), index_col=0)
except OSError:
    # Only a missing or unreadable testData file means "redacted dataset".
    # Any other failure (e.g. inside predict or while saving) is a real error
    # and is allowed to surface instead of being reported as missing data.
    print('Looks like this is a redacted dataset. testData is unavailable. Cannot complete this step ...')
else:
    print('2. making predictions ...')
    prediction = pd.DataFrame(gm.predict(testDataDf)['G2.nodeID'])
    print('3. formatting and saving testTargets.csv')
    # first column names the graph that the predicted node IDs belong to
    prediction.insert(0, 'graph', 'G2.gml')
    prediction.to_csv('testTargets.csv')

making predictions on testData (assuming that testData is available) ...
1. reading testData ...
2. making predictions ...
3. formatting and saving testTargets.csv


## Compute performance on testData

In [14]:
# Start with an empty result so the later merge into overall_performance still works
# when the test-scoring cell cannot assign a score.
test_performance = OrderedDict()

In [15]:
# Score the saved predictions against the ground-truth test targets, if those exist.
print('computing performance on testData (assuming the testTargets is available) ...')
try:
    print('1. reading testTargets...')
    # ground truth shipped with the dataset
    y_truth = pd.read_csv(join(dataDir, 'testTargets.csv'))['G2.nodeID']
    print('2. reading predictions ...')
    # predictions file in the working directory
    y_predicted = pd.read_csv('testTargets.csv')['G2.nodeID']
except OSError:
    # Only a missing or unreadable file is treated as "redacted dataset";
    # other failures are real errors and are allowed to surface.
    print('Looks like this is a redacted dataset. testTargets is unavailable. cannot complete this step ...')
else:
    print('3. computing accuracy ...')
    accuracy = accuracy_score(y_truth, y_predicted)
    print('performance on test data:',accuracy)
    print('4. saving the performance score...')
    # Nested result record: test -> score -> {metric, value}
    test_performance = OrderedDict([
        ('test', OrderedDict([
            ('score', OrderedDict([
                    ('metric', 'accuracy'),
                    ('value', accuracy)])
            )
        ]))
    ])

computing performance on testData (assuming the testTargets is available) ...
1. reading testTargets...
2. reading predictions ...
3. computing accuracy ...
performance on test data: 0.683774834437
4. saving the performance score...


In [16]:
# Combine the train and test records (train first), save them as JSON and echo the result.
overall_performance = OrderedDict(
    list(train_performance.items()) + list(test_performance.items())
)

with open('performance.json', 'w', encoding='utf-8') as f:
    json.dump(overall_performance, f, indent=2)
print(json.dumps(overall_performance, indent=2))

{
  "train": {
    "score": {
      "metric": "accuracy",
      "value": 0.695364238410596
    }
  },
  "test": {
    "score": {
      "metric": "accuracy",
      "value": 0.6837748344370861
    }
  }
}
