In [5]:
import sys
sys.path.append("../")

import json
import os
import argparse
from scipy.io import loadmat
import numpy as np
import networkx as nx
from networkx.readwrite import json_graph
from input.data_preprocess import DataPreprocess

import utils.graph_utils as graph_utils

In [43]:



class Dataset:
    """
    this class receives input from graphsage format with predefined folder structure, the data folder must contains these files:
    G.json, id2idx.json, features.npy (optional)

    Arguments:
    - data_dir: Data directory which contains files mentioned above.
    """

    def __init__(self, data_dir):
        self.data_dir = data_dir
        self._load_G()
        self._load_id2idx()
        self._load_features()
        # self.load_edge_features()
        print("Dataset info:")
        print("- Nodes: ", len(self.G.nodes()))
        print("- Edges: ", len(self.G.edges()))

    def _load_G(self):
        G_data = json.load(open(os.path.join(self.data_dir, "G.json")))
        print(G_data)
        self.G = json_graph.node_link_graph(G_data)
        if type(self.G.nodes()[0]) is int:
            mapping = {k: str(k) for k in self.G.nodes()}
            self.G = nx.relabel_nodes(self.G, mapping)

    def _load_id2idx(self):
        print(self.data_dir)
        id2idx_file = os.path.join(self.data_dir, 'id2idx.json')
        conversion = type(self.G.nodes()[0])
        print(conversion)
        self.id2idx = {}
        id2idx = json.load(open(id2idx_file))
        for k, v in id2idx.items():
            print(k)
            print(v)
            self.id2idx[str(k)] = v

    def _load_features(self):
        self.features = None
        feats_path = os.path.join(self.data_dir, 'feats.npy')
        if os.path.isfile(feats_path):
            self.features = np.load(feats_path)
        else:
            self.features = None
        return self.features

    def load_edge_features(self):
        self.edge_features= None
        feats_path = os.path.join(self.data_dir, 'edge_feats.mat')
        if os.path.isfile(feats_path):
            edge_feats = loadmat(feats_path)['edge_feats']
            self.edge_features = np.zeros((len(edge_feats[0]),
                                           len(self.G.nodes()),
                                           len(self.G.nodes())))
            for idx, matrix in enumerate(edge_feats[0]):
                self.edge_features[idx] = matrix.toarray()
        else:
            self.edge_features = None
        return self.edge_features

    def get_adjacency_matrix(self, sparse=False):
        return graph_utils.construct_adjacency(self.G, self.id2idx, sparse=False)

    def get_nodes_degrees(self):
        return graph_utils.build_degrees(self.G, self.id2idx)

    def get_nodes_clustering(self):
        return graph_utils.build_clustering(self.G, self.id2idx)

    def get_edges(self):
        return graph_utils.get_edges(self.G, self.id2idx)

    def check_id2idx(self):
        # print("Checking format of dataset")
        for i, node in enumerate(self.G.nodes()):
            if (self.id2idx[node] != i):
                print("Failed at node %s" % str(node))
                return False
        # print("Pass")
        return True




class SynDataset:
    def __init__(self, num_nodes, p_create_edge, num_feats=0, seed=1, from_graph=None, num_del=0):
        if from_graph is None:
            self.G = nx.generators.random_graphs.gnp_random_graph(num_nodes, p_create_edge, seed=seed)
            self.id2idx = {id: i for i, id in enumerate(self.G.nodes())}
            if num_feats > 0:
                self.features = np.zeros((len(self.G.nodes()), num_feats))
                for i in range(self.features.shape[0]):
                    self.features[i][np.random.randint(0, num_feats)] = 1
            else:
                self.features = None
        else:
            self.G = from_graph.G.copy()
            if num_del > 0:
                count_del = 0
                self.considernodes = []
                self.considernodes2 = []
                for node in self.G.nodes():
                    if len(self.G.neighbors(node)) > 4:
                        for node2 in self.G.neighbors(node):
                            if len(self.G.neighbors(node2)) > 2:
                                self.G.remove_edge(node, node2)
                                self.considernode = node
                                self.considernode2 = node2
                                self.considernodes.append(node)
                                self.considernodes2.append(node2)
                                count_del += 1
                                if count_del == num_del:
                                    break
                            if count_del == num_del:
                                break
                    if count_del == num_del:
                        break
            array = np.arange(len(self.G.nodes()))
            np.random.shuffle(array)
            
            self.id2idx = {id: array[i] for i, id in enumerate(self.G.nodes())}
            if num_feats > 0:
                self.features = from_graph.features[array]
            self.groundtruth = {array[i]:i for i in range(len(from_graph.G.nodes()))}
            self.groundtruth_matrix = np.zeros((len(self.G.nodes()), len(self.G.nodes())))
            self.groundtruth_matrix[array, np.arange(len(self.G.nodes()))] = 1

        self.edge_features = None
        print("Dataset info:")
        print("- Nodes: ", len(self.G.nodes()))
        print("- Edges: ", len(self.G.edges()))
        

    def get_adjacency_matrix(self):
        return graph_utils.construct_adjacency(self.G, self.id2idx)

    def get_nodes_degrees(self):
        return graph_utils.build_degrees(self.G, self.id2idx)

    def get_nodes_clustering(self):
        return graph_utils.build_clustering(self.G, self.id2idx)

    def get_edges(self):
        return graph_utils.get_edges(self.G, self.id2idx)




def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--source_dataset', default="../data/allmv_tmdb/allmv/graphsage/")
    parser.add_argument('--target_dataset', default="../data/allmv_tmdb/tmdb/graphsage/")
    parser.add_argument('--groundtruth', default="../data/allmv_tmdb/dictionaries/groundtruth")
    parser.add_argument('--output_dir', default="../data/allmv_tmdb/statistics/")
    return parser.parse_args([])

def main(args):    
    source_dataset = Dataset(args.source_dataset)
    target_dataset = Dataset(args.target_dataset)
    groundtruth = graph_utils.load_gt(args.groundtruth, source_dataset.id2idx, target_dataset.id2idx, "dict")
    DataPreprocess.evaluateDataset(source_dataset, target_dataset, groundtruth, args.output_dir)








In [44]:
if __name__ == "__main__":
    args = parse_args()
    main(args)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



../data/allmv_tmdb/allmv/graphsage/
<class 'dict'>
2
0
25
1
166
2
265
3
274
4
347
5
458
6
481
7
509
8
537
9
672
10
771
11
917
12
930
13
1137
14
1199
15
1207
16
1282
17
1338
18
1464
19
1545
20
1737
21
1760
22
1809
23
2978
24
3301
25
3363
26
3479
27
3490
28
3542
29
3616
30
3742
31
3833
32
3844
33
3912
34
4285
35
4386
36
4441
37
4564
38
4596
39
4741
40
4884
41
4930
42
5040
43
5140
44
5591
45
5631
46
5671
47
5696
48
6156
49
6225
50
6286
51
6378
52
3
53
290
54
863
55
1586
56
2041
57
2043
58
2326
59
2592
60
2600
61
2948
62
2962
63
3074
64
3085
65
3099
66
3116
67
3121
68
3418
69
3480
70
3622
71
3658
72
3681
73
4189
74
4307
75
4316
76
4385
77
4464
78
4843
79
4854
80
5009
81
5461
82
5704
83
5758
84
6395
85
4
86
11
87
293
88
311
89
380
90
395
91
443
92
475
93
496
94
668
95
737
96
800
97
843
98
918
99
992
100
1058
101
1068
102
1076
103
1139
104
1281
105
1371
106
1711
107
1716
108
2124
109
2258
110
2262
111
2431
112
2439
113
2596
114
2800
115
2807
116
2944
117
2968
118
3080
119
3288
120
3440
121
3

6133
2410
6219
2411
77
2412
799
2413
1250
2414
1584
2415
2629
2416
2778
2417
3000
2418
3198
2419
3426
2420
3512
2421
3572
2422
4184
2423
4713
2424
259
2425
282
2426
1106
2427
1274
2428
1458
2429
1549
2430
2378
2431
2702
2432
2752
2433
3022
2434
3069
2435
4035
2436
4291
2437
4396
2438
4706
2439
4883
2440
5035
2441
5096
2442
5184
2443
5586
2444
79
2445
167
2446
583
2447
701
2448
961
2449
1289
2450
1541
2451
1670
2452
2909
2453
4887
2454
5386
2455
5751
2456
6268
2457
91
2458
240
2459
286
2460
362
2461
547
2462
686
2463
788
2464
798
2465
991
2466
1538
2467
1566
2468
1685
2469
1825
2470
2112
2471
2486
2472
2517
2473
2585
2474
2768
2475
2808
2476
2873
2477
2906
2478
2986
2479
3181
2480
3202
2481
3349
2482
3444
2483
3602
2484
3633
2485
3666
2486
3811
2487
4421
2488
4563
2489
4727
2490
4997
2491
5331
2492
6152
2493
6215
2494
6230
2495
6465
2496
81
2497
423
2498
470
2499
717
2500
911
2501
1483
2502
1570
2503
1941
2504
2299
2505
2336
2506
2357
2507
2396
2508
2421
2509
2422
2510
2450
2511
82
2512

3247
4869
4358
4870
2936
4871
4847
4872
5364
4873
5437
4874
1253
4875
2694
4876
1924
4877
5289
4878
1078
4879
2132
4880
2894
4881
6072
4882
3204
4883
473
4884
3612
4885
4262
4886
6062
4887
6189
4888
3127
4889
951
4890
1127
4891
6213
4892
479
4893
1548
4894
5011
4895
5687
4896
5491
4897
3245
4898
4266
4899
1102
4900
2578
4901
2851
4902
3068
4903
3071
4904
3163
4905
486
4906
1029
4907
2863
4908
4306
4909
1771
4910
5022
4911
5334
4912
5991
4913
6186
4914
516
4915
3132
4916
3146
4917
4893
4918
5554
4919
6377
4920
2111
4921
1335
4922
3232
4923
4672
4924
728
4925
1300
4926
1456
4927
1757
4928
3014
4929
5856
4930
6447
4931
6241
4932
2805
4933
3100
4934
3570
4935
3575
4936
4088
4937
5115
4938
501
4939
998
4940
1327
4941
2030
4942
2641
4943
5797
4944
2773
4945
1701
4946
4430
4947
2697
4948
3682
4949
510
4950
2434
4951
2447
4952
666
4953
5139
4954
512
4955
565
4956
2994
4957
3153
4958
4783
4959
5080
4960
3496
4961
6164
4962
2799
4963
5174
4964
5900
4965
6066
4966
2836
4967
3505
4968
3644
4969
39

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




2055
354
2057
355
2178
356
2322
357
2373
358
2615
359
2669
360
2787
361
2990
362
3016
363
3022
364
3030
365
3201
366
3279
367
3343
368
3395
369
3515
370
3678
371
3683
372
3853
373
3894
374
4038
375
4228
376
4260
377
4263
378
4352
379
4393
380
4508
381
4551
382
4608
383
4653
384
4692
385
4745
386
4787
387
4827
388
4846
389
4855
390
4861
391
4875
392
4940
393
4961
394
4992
395
5135
396
5563
397
5643
398
5678
399
5682
400
5710
401
5735
402
5845
403
10
404
60
405
96
406
616
407
743
408
1478
409
1565
410
1716
411
2886
412
2912
413
2924
414
2939
415
3508
416
3526
417
3593
418
3717
419
3862
420
3914
421
3959
422
3960
423
4425
424
11
425
331
426
387
427
682
428
950
429
952
430
953
431
1224
432
1453
433
1503
434
1515
435
1630
436
1711
437
1721
438
1734
439
1746
440
1776
441
1843
442
1844
443
1958
444
2070
445
2149
446
2223
447
2265
448
2532
449
2654
450
2962
451
3161
452
3223
453
3269
454
3363
455
3512
456
3667
457
3744
458
3838
459
3882
460
3900
461
3923
462
4518
463
4748
464
4885
465
5641
46

2300
2443
2521
2444
2695
2445
2702
2446
3218
2447
3399
2448
3625
2449
3647
2450
3672
2451
4067
2452
4229
2453
4251
2454
4265
2455
4309
2456
4322
2457
4411
2458
4571
2459
4647
2460
4996
2461
5004
2462
5049
2463
5080
2464
5481
2465
5489
2466
5662
2467
105
2468
294
2469
585
2470
801
2471
1103
2472
1689
2473
2345
2474
2693
2475
3988
2476
4447
2477
4956
2478
5514
2479
5793
2480
106
2481
172
2482
384
2483
939
2484
960
2485
999
2486
1083
2487
1112
2488
1586
2489
1704
2490
1747
2491
1787
2492
1948
2493
2637
2494
2967
2495
3407
2496
3491
2497
3550
2498
3622
2499
4720
2500
371
2501
465
2502
2510
2503
108
2504
3733
2505
1352
2506
1440
2507
2776
2508
3143
2509
4003
2510
4277
2511
4471
2512
4935
2513
5214
2514
5853
2515
111
2516
923
2517
1157
2518
1270
2519
1510
2520
1996
2521
2318
2522
3080
2523
3248
2524
3701
2525
4590
2526
5127
2527
112
2528
162
2529
189
2530
492
2531
553
2532
1106
2533
1210
2534
2026
2535
2305
2536
2833
2537
2926
2538
3262
2539
3938
2540
5099
2541
113
2542
346
2543
415
2544
683

2610
4853
822
4854
1132
4855
2390
4856
3751
4857
2009
4858
4606
4859
5191
4860
5353
4861
3253
4862
5219
4863
827
4864
5869
4865
6000
4866
2593
4867
3498
4868
2176
4869
4396
4870
854
4871
1687
4872
3927
4873
5841
4874
4154
4875
840
4876
1120
4877
1426
4878
3612
4879
3816
4880
4077
4881
1970
4882
2988
4883
2071
4884
2124
4885
2538
4886
3160
4887
4122
4888
4708
4889
4906
4890
4993
4891
5231
4892
5239
4893
5286
4894
5501
4895
849
4896
1079
4897
5894
4898
1105
4899
1507
4900
4151
4901
4235
4902
4460
4903
855
4904
4006
4905
2170
4906
2759
4907
3729
4908
5863
4909
865
4910
2016
4911
867
4912
1788
4913
3357
4914
4776
4915
4822
4916
871
4917
1598
4918
2726
4919
3517
4920
4973
4921
874
4922
2777
4923
5527
4924
4146
4925
1407
4926
2597
4927
3296
4928
3749
4929
4706
4930
889
4931
890
4932
2044
4933
3311
4934
892
4935
5187
4936
893
4937
3375
4938
898
4939
4610
4940
1941
4941
4981
4942
5294
4943
5704
4944
5824
4945
2482
4946
4317
4947
4406
4948
905
4949
1642
4950
907
4951
5287
4952
2783
4953
3728
49

TypeError: unhashable type: 'dict'

In [37]:
G_data = json.load(open(os.path.join('../data/allmv_tmdb/allmv/graphsage/', "G.json")))
G = json_graph.node_link_graph(G_data)

In [42]:
G.nodes()

NodeView(('2', '25', '166', '265', '274', '347', '458', '481', '509', '537', '672', '771', '917', '930', '1137', '1199', '1207', '1282', '1338', '1464', '1545', '1737', '1760', '1809', '2978', '3301', '3363', '3479', '3490', '3542', '3616', '3742', '3833', '3844', '3912', '4285', '4386', '4441', '4564', '4596', '4741', '4884', '4930', '5040', '5140', '5591', '5631', '5671', '5696', '6156', '6225', '6286', '6378', '3', '290', '863', '1586', '2041', '2043', '2326', '2592', '2600', '2948', '2962', '3074', '3085', '3099', '3116', '3121', '3418', '3480', '3622', '3658', '3681', '4189', '4307', '4316', '4385', '4464', '4843', '4854', '5009', '5461', '5704', '5758', '6395', '4', '11', '293', '311', '380', '395', '443', '475', '496', '668', '737', '800', '843', '918', '992', '1058', '1068', '1076', '1139', '1281', '1371', '1711', '1716', '2124', '2258', '2262', '2431', '2439', '2596', '2800', '2807', '2944', '2968', '3080', '3288', '3440', '3530', '3587', '3694', '3919', '4245', '4408', '4474'