In [24]:
import csv
import networkx as nx
import matplotlib.pyplot as plt
from concurrent.futures import ProcessPoolExecutor
import numpy as np
from cdlib import algorithms
import igraph as ig
from netgraph import Graph
import networkx.algorithms.community as nxcom
import random
# Function to read node data from CSV file
def draw_communities(graph, communities):
    pos = nx.spring_layout(graph)  # You can use other layouts as well
    
    # Draw nodes and edges
    nx.draw(graph, pos, with_labels=True, node_color='lightgray', edge_color='gray', font_size=8, node_size=300)
    
    # Iterate over communities
    for idx, community in enumerate(communities):
        nx.draw_networkx_nodes(graph, pos, nodelist=community, node_color=plt.cm.tab10.colors[idx], node_size=300)
    
    # Show the plot
    plt.title("Graph with Communities")
    plt.show()
def read_nodes(filename):
    nodes = {}
    with open(filename, 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            nodes[int(row[0])] = int(row[1])
    return nodes

# Function to read edge data from CSV file
def read_edges(filename):
    edges = []
    with open(filename, 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            source, target, timestep = map(int, row)
            edges.append((source, target, timestep))
    return edges

# Function to create a graph for a specific time step
# def create_graph(timestep, nodes, edges):
#     G = nx.DiGraph()
#     for node, value in nodes.items():
#         if value <= timestep:  # Add nodes based on the timestep
#             G.add_node(node, value=value)
#     for edge in edges:
#         if edge[2] <= timestep:
#             G.add_edge(edge[0], edge[1])
#     # giant_component_size = len(max(nx.weakly_connected_components(G), key=len))
#     return G
def create_graph(timestep, nodes, edges):
    G = nx.Graph()
    
    # Randomly sample nodes based on the sampling ratio
   
    for node, value in nodes.items():
        if value <= timestep:  # Add nodes based on the timestep
            G.add_node(node, value=value)
    
    for edge in edges:
        if edge[2] <= timestep:
            # Check if both nodes of the edge are present in the sampled nodes
            if edge[0] in G.nodes() and edge[1] in G.nodes():
                G.add_edge(edge[0], edge[1])
    
    # giant_component_size = len(max(nx.weakly_connected_components(G), key=len))
    return G


In [25]:
nodes = read_nodes('final_nodes_time.csv')
edges = read_edges('final_edges_time.csv')
G = create_graph(50, nodes, edges)
print(nx.info(G))


Graph with 10133 nodes and 37885 edges


In [26]:
#TO ADD NEW NODES TO THE GRAPH:
for node,value in nodes.items():
    if value > 50 and value <= 53:
        G.add_node(node, value=value)

In [27]:
print(nx.info(G))   
lol = G.nodes()

Graph with 11112 nodes and 37885 edges


In [28]:
negetive_edges = []
for u in lol:
    for v in lol:
        if u != v and not G.has_edge(u,v):
            negetive_edges.append((u,v))
print(len(negetive_edges))

123389676


In [106]:
negetive_edges = random.sample(negetive_edges, 5000)
print(len(negetive_edges))
data = []
for i in negetive_edges:
    data.append((i[0],i[1],0))
len(data)

5000


5000

In [107]:
for edge in edges:
    if edge[2] <= 53 and edge[2] >50:
        data.append((edge[0],edge[1],1))
print(len(data))

11650


In [108]:
random.shuffle(data)

In [48]:
mapping_node  = {}
i = 1
for node in G.nodes():
    mapping_node[node]=i
    i+=1
print(mapping_node)
data_actual = []
for i in data:
    if i[0] in G.nodes() and i[1] in G.nodes():
        data_actual.append((mapping_node[i[0]],mapping_node[i[1]],i[2]))
print(data_actual[:10])



{9203201: 1, 9203202: 2, 9203203: 3, 9203204: 4, 9203205: 5, 9203206: 6, 9203207: 7, 9203208: 8, 9203209: 9, 9203210: 10, 9203211: 11, 9203212: 12, 9203213: 13, 9203214: 14, 9203215: 15, 9203216: 16, 9203217: 17, 9203218: 18, 9203219: 19, 9203220: 20, 9203221: 21, 9203222: 22, 9203223: 23, 9203224: 24, 9203225: 25, 9203226: 26, 9203001: 27, 9203060: 28, 9204201: 29, 9204202: 30, 9204203: 31, 9204204: 32, 9204205: 33, 9204206: 34, 9204207: 35, 9204208: 36, 9204209: 37, 9204210: 38, 9204211: 39, 9204212: 40, 9204213: 41, 9204214: 42, 9204215: 43, 9204216: 44, 9204217: 45, 9204218: 46, 9204219: 47, 9204220: 48, 9204221: 49, 9204222: 50, 9204223: 51, 9204224: 52, 9204225: 53, 9204226: 54, 9204227: 55, 9204228: 56, 9204229: 57, 9204230: 58, 9204231: 59, 9204232: 60, 9204233: 61, 9204234: 62, 9204235: 63, 9204236: 64, 9204237: 65, 9204238: 66, 9204239: 67, 9204017: 68, 9205201: 69, 9205202: 70, 9205203: 71, 9205204: 72, 9205205: 73, 9205206: 74, 9205207: 75, 9205208: 76, 9205209: 77, 9205210

In [85]:
import torch
from torch_geometric.data import Data
edge_index = []
for i ,j in G.edges():
    if(i in G.nodes and j in G.nodes()):
        edge_index.append((mapping_node[i],mapping_node[j]))
edge_index=torch.tensor(edge_index,dtype=torch.long).t().contiguous()
x = []
max = 0 
for m in mapping_node.values():
    if m > max:
        max = m
for i in range(1,max+1):
    x.append(1)
x = torch.tensor(x,dtype=torch.float).view(-1,1)
data = Data(x=x, edge_index=edge_index)

In [86]:
data.validate(raise_on_error=True)

True

In [92]:
from torch.nn import Linear
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
  def __init__(self):
    super(GCN, self).__init__()
    torch.manual_seed(42)
    self.conv1 = GCNConv(1, 4)
    self.conv2 = GCNConv(4, 4)
    self.conv3 = GCNConv(4, 2)
    self.classifier = Linear(2, 16)

  def forward(self, x, edge_index):
    h = self.conv1(x, edge_index)
    h = h.tanh()
    h = self.conv2(h, edge_index)
    h = h.tanh()
    h = self.conv3(h, edge_index)
    h = h.tanh()
    out = self.classifier(h)
    return out, h

model = GCN()
print(model)

GCN(
  (conv1): GCNConv(1, 4)
  (conv2): GCNConv(4, 4)
  (conv3): GCNConv(4, 2)
  (classifier): Linear(in_features=2, out_features=16, bias=True)
)


In [93]:
out, h = model(data.x, data.edge_index)

In [94]:
print("Embedding shape:", h.shape)

Embedding shape: torch.Size([11112, 2])


In [100]:
out_numpy = out.detach().numpy()

(11112, 16)


In [104]:
inverse_dict = {v: k for k, v in mapping_node.items()}
node_embedding_map = {}
for i in range(len(out_numpy)):
    node_embedding_map[inverse_dict[i+1]] = out_numpy[i]
# print(node_embedding_map)


In [110]:
result = []
for i, j, _ in data:
    if(i in G.nodes and j in G.nodes):
        result.append(node_embedding_map[i] + node_embedding_map[j])
print(len(result))

11633


In [112]:
from sklearn.model_selection import train_test_split
link = []
for a,b,i in data:
    if( a in G.nodes and b in G.nodes):
        link.append(i)
xtrain, xtest, ytrain, ytest = train_test_split(np.array(result), link, 
                                                test_size = 0.3, 
                                                random_state = 35)

In [113]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(class_weight="balanced")

lr.fit(xtrain, ytrain)

In [114]:
predictions = lr.predict_proba(xtest)
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
# accuracy_node2vec = accuracy_score(ytest, predictions)
roc_auc_score(ytest, predictions[:,1])

0.6539726681000801

In [9]:
from node2vec import Node2Vec

# Generate walks
node2vec = Node2Vec(G, dimensions=100, walk_length=16, num_walks=50)

# train node2vec model
n2w_model = node2vec.fit(window=7, min_count=1)


Computing transition probabilities: 100%|██████████| 11112/11112 [00:05<00:00, 2144.46it/s]
Generating walks (CPU: 1): 100%|██████████| 50/50 [00:26<00:00,  1.91it/s]


In [10]:
import pandas as pd
# Convert list to DataFrame
# df = pd.DataFrame(data, columns =['source', 'target', 'label'])
n2w_model_dict = {}
for node in G.nodes:
    n2w_model_dict[str(node)] = n2w_model.wv[str(node)]
print(int('9901206') in G.nodes)
result = []
for i, j, _ in data:
    if(i in G.nodes and j in G.nodes):
        result.append(n2w_model_dict[str(i)] + n2w_model_dict[str(j)])

False


In [11]:
print(len(result))


11633


In [111]:
from sklearn.model_selection import train_test_split
link = []
for a,b,i in data:
    if( a in G.nodes and b in G.nodes):
        link.append(i)
xtrain, xtest, ytrain, ytest = train_test_split(np.array(result), link, 
                                                test_size = 0.3, 
                                                random_state = 35)

In [13]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(class_weight="balanced")

lr.fit(xtrain, ytrain)

In [14]:
predictions = lr.predict_proba(xtest)

In [15]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
# accuracy_node2vec = accuracy_score(ytest, predictions)
roc_auc_score(ytest, predictions[:,1])


0.7831193677688085