In [5]:
import pandas as pd
import numpy as np
import networkx as nx
import itertools
import pickle
import os
import random
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [6]:
def get_edges_dict(traindata, cache_path="edgesdic.pkl"):
    """Return a lookup table containing every edge of ``traindata`` in both directions.

    Parameters
    ----------
    traindata : pandas.DataFrame
        Edge list with the columns ``'id_1'`` and ``'id_2'``. The frame's index
        is ignored, so frames with a shuffled or non-contiguous index (for
        example the output of ``train_test_split``) are handled correctly.
    cache_path : str or None, optional
        Pickle file used as a cache. When the file exists its content is
        returned as-is and ``traindata`` is not read at all; otherwise the
        dictionary is built and written there. Pass ``None`` to skip the cache
        entirely. Defaults to ``"edgesdic.pkl"``.

    Returns
    -------
    dict
        Maps ``(id_1, id_2)`` and ``(id_2, id_1)`` to ``1`` for every row.

    Notes
    -----
    The cache is keyed only by file name, so a file left over from a different
    edge list is returned unchanged. Delete it (or use another ``cache_path``)
    when the input data changes.
    """
    if cache_path is not None and os.path.isfile(cache_path):
        # NOTE(security): unpickling runs code embedded in the file; only load
        # cache files that this notebook produced itself.
        with open(cache_path, "rb") as cache_file:
            return pickle.load(cache_file)

    edges = dict()
    # Walk the two columns positionally instead of doing label-based ``.loc``
    # lookups per row: this is independent of the index and far cheaper.
    for source, target in zip(traindata['id_1'].values, traindata['id_2'].values):
        edges[(source, target)] = 1
        edges[(target, source)] = 1

    if cache_path is not None:
        with open(cache_path, "wb") as cache_file:
            pickle.dump(edges, cache_file)
    return edges

In [7]:
def save_pfile(data,dname):
    """Pickle ``data`` to the file ``dname`` unless that file already exists.

    Parameters
    ----------
    data : object
        Any picklable object.
    dname : str
        Destination path. An existing file is never overwritten; a notice is
        printed instead and ``data`` is discarded.

    Returns
    -------
    None
    """
    if os.path.isfile(dname):
        print("data exixt")
    else:
        # The context manager guarantees the handle is flushed and closed even
        # if pickling raises part-way through.
        with open(dname, 'wb') as out_file:
            pickle.dump(data, out_file)

In [8]:
def get_negative_edges(traindata,g):
    """Sample as many non-edges ("negative edges") as ``traindata`` has rows.

    Parameters
    ----------
    traindata : pandas.DataFrame
        Edge list with the integer columns ``'id_1'`` and ``'id_2'``.
    g : graph object exposing ``has_node(node)``
        Only node ids for which ``g.has_node`` is true may appear in a sample.

    Returns
    -------
    set of (int, int)
        Ordered pairs ``(node1, node2)`` with ``node1 != node2`` that are not
        present in the edge lookup returned by ``get_edges_dict(traindata)``.
        Because the pairs are ordered, both ``(a, b)`` and ``(b, a)`` can be
        part of the result.

    Raises
    ------
    ValueError
        If fewer distinct candidate pairs exist than the number requested;
        without this check the sampling loop would never terminate.

    Notes
    -----
    Sampling uses the global ``random`` module state; call ``random.seed``
    beforehand for a reproducible result.
    """
    edges = get_edges_dict(traindata)
    # for missing edges.
    negative_edges = set()
    wanted = len(traindata)
    if wanted == 0:
        # Nothing to sample; also avoids calling max() on empty columns.
        return negative_edges
    maxNodenum = max(max(traindata['id_1']),max(traindata['id_2']))

    # Feasibility check: count the ordered pairs the loop below could accept.
    candidates = set(node for node in range(maxNodenum + 1) if g.has_node(node))
    blocked = sum(
        1
        for (left, right), flag in edges.items()
        if flag != 0 and left != right and left in candidates and right in candidates
    )
    available = len(candidates) * (len(candidates) - 1) - blocked
    if wanted > available:
        raise ValueError(
            "cannot sample %d negative edges: only %d candidate pairs exist"
            % (wanted, available)
        )

    # Generate as many negative edges as the original graph has edges.
    while len(negative_edges) < wanted:
        node1=random.randint(0, maxNodenum)
        node2=random.randint(0, maxNodenum)
        tmp = edges.get((node1,node2),0) # pairs missing from the lookup count as 0
        # Accept the pair only when it is not an existing edge, is not a
        # self-loop, and both endpoints are nodes of the graph.
        if tmp == 0 and node1!=node2 and g.has_node(node1) and g.has_node(node2):
            negative_edges.add((node1,node2))
    return negative_edges

In [9]:
def random_remove_edge(graph):
    """Delete one randomly chosen edge from ``graph`` in place.

    Parameters
    ----------
    graph : graph object exposing ``edges`` and ``remove_edge(u, v)``
        The graph to modify.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        Propagated from ``random.choice`` when the graph has no edges.
    """
    candidate_edges = [edge for edge in graph.edges]
    picked = random.choice(candidate_edges)
    # Only the first two entries of the edge tuple identify its endpoints.
    endpoint_a, endpoint_b = picked[0], picked[1]
    graph.remove_edge(endpoint_a, endpoint_b)

In [35]:
# Load the edge list; the first row of the CSV supplies the column names.
# NOTE(review): the frame displayed further down carries an "Unnamed: 0" column,
# so the file presumably contains a saved index column -- confirm before
# relying on column positions.
train_df = pd.read_csv('facebook_edges.csv', header=0)
# Disabled alternative: read the space-separated 'facebook_combined.txt' edge list instead.
#train_df = pd.read_csv('facebook_combined.txt', names=['id_1', 'id_2'], header=0, sep=' ')

In [36]:
# Build the graph from the edge list: every row becomes an edge between its id_1 and id_2 nodes.
train_G = nx.from_pandas_edgelist(train_df, 'id_1', 'id_2')

In [37]:
# nx.info() was removed in NetworkX 3.0, so print the same summary using the
# graph's own methods; this works on both old and new NetworkX releases.
node_count = train_G.number_of_nodes()
edge_count = train_G.number_of_edges()
print("Name: %s" % train_G.name)
print("Type: %s" % type(train_G).__name__)
print("Number of nodes: %d" % node_count)
print("Number of edges: %d" % edge_count)
if node_count > 0:
    # Every edge adds 2 to the degree sum, so the mean degree is 2E / N.
    print("Average degree: %8.4f" % (2.0 * edge_count / node_count))

Name: 
Type: Graph
Number of nodes: 22470
Number of edges: 171002
Average degree:  15.2205


In [38]:
# Seed the global RNG so the sampled negative edges are identical on every
# Restart & Run All (get_negative_edges draws from the `random` module).
NEGATIVE_SAMPLING_SEED = 42
random.seed(NEGATIVE_SAMPLING_SEED)
negative_edges=get_negative_edges(train_df,train_G)

In [39]:
# Sanity check: get_negative_edges keeps sampling until it holds len(train_df)
# pairs, so this should equal the number of rows in train_df.
len(negative_edges)

171002

In [40]:
# Persist the sampled negatives. save_pfile never overwrites: if
# "negative_edges.p" already exists it is kept and this sample is NOT written.
save_pfile(negative_edges,"negative_edges.p")

In [41]:
# Split the positive edges 50/50. A fixed random_state makes the shuffle, and
# therefore the CSV files written below, reproducible across kernel restarts.
SPLIT_SEED = 42
train,test = train_test_split(train_df, test_size=0.5, random_state=SPLIT_SEED)

In [42]:
# Display the held-out half of the positive edges; its index holds the
# original row labels from train_df, in shuffled order.
test

Unnamed: 0,id_1,id_2
19171,1239,12217
153198,15283,21961
59997,4314,14171
8624,525,4464
136775,12461,19776
...,...,...
166028,14222,19750
10663,688,2237
139434,9900,16646
81045,6181,13121


In [43]:
# Write the training half of the positive edges; index=False drops the shuffled row labels.
train.to_csv('train_pos_data.csv',index=False)

In [44]:
# Write the held-out half of the positive edges; index=False drops the shuffled row labels.
test.to_csv('test_pos_data.csv',index=False)