In [1]:
! pip install networkx matplotlib torch torchvision torch-geometric

Collecting torch-geometric
  Downloading torch_geometric-2.5.3-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m586.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Downloading torch_geometric-2.5.3-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.5.3


In [2]:
import json
import os
import networkx as nx
import torch
from torch_geometric.utils import from_networkx
from torch_geometric.data import Data, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch.utils.data import random_split


# function to load json
def load_json(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

# make graph
def build_graph(data):
    G = nx.DiGraph()
    
    # add  root tweet
    root_id = data['id']
    G.add_node(root_id, **data)
    
    # recursive function to add children
    def add_children(children, parent_id):
        for child in children:
            G.add_node(child['id'], **child)
            G.add_edge(parent_id, child['id'])
            if 'children' in child:
                add_children(child['children'], child['id'])
    
    if 'children' in data:
        add_children(data['children'], root_id)
    
    return G



# function to normalize node attributes
def normalize_node_attributes(graph):
    all_attributes = set()
    for _, attrs in graph.nodes(data=True):
        all_attributes.update(attrs.keys())

    for node, attrs in graph.nodes(data=True):
        for attr in all_attributes:
            if attr not in attrs:
                attrs[attr] = None
                
    return graph


def convert_to_data(graph, label):
    data = from_networkx(graph)
    
    # check all nodes have feature vectors, initialize with default feature if not present
    if not hasattr(data, 'x') or data.x is None:
        num_nodes = data.num_nodes
        # initialize with example feature (all ones)
        data.x = torch.ones((num_nodes, 1))
    
    # add labels to all nodes
    data.y = torch.tensor([label] * data.num_nodes, dtype=torch.long)
    
    return data

# eventually would be better to take them out of folder programmatically instead of hand-writing
# json_dict = {"gossipcop_fake/gossipcop-1000240645.json": "fake", 
#             "gossipcop_fake/gossipcop-1000908841.json": 'fake', 
#             "gossipcop_fake/gossipcop-1012123555.json": 'fake',
#             "gossipcop_fake/gossipcop-1014383679.json": 'fake', 
#             "gossipcop_fake/gossipcop-1014616559.json": 'fake', 
#             "gossipcop_real/gossipcop-541230.json": 'real', 
#             "gossipcop_real/gossipcop-561182.json": 'real', 
#             'gossipcop_real/gossipcop-567233.json': 'real', 
#             'gossipcop_real/gossipcop-679264.json': 'real', 
#             'gossipcop_real/gossipcop-681826.json': 'real'
#             }

path = '/kaggle/input/test-network/nx_network_data/' # change to individual path


def create_json_dict(base_path):
    json_dict = {}
    
    for label in ['gossipcop_fake', 'gossipcop_real']:
        folder_path = os.path.join(base_path, label)
        files = os.listdir(folder_path)
        files = [f for f in files if f.endswith('.json')]
        for file in files[:10]:  # take the first 10 files, increase to as much as colab/kaggle can handle
            json_dict[os.path.join(label, file)] = 'fake' if label == 'gossipcop_fake' else 'real'
    
    return json_dict

json_dict = create_json_dict(path)

full_dataset = []

# loop to create mega dataset
for dataset in list(json_dict.keys()):
    file = load_json(path + dataset)
    graph = normalize_node_attributes(build_graph(file))
    data = convert_to_data(graph, 1) if json_dict[dataset] == 'fake' else convert_to_data(graph, 0)
    full_dataset.append(data)

print(full_dataset)

[Data(edge_index=[2, 28], time=[29], type=[29], user=[29], tweet_id=[29], id=[29], children=[29], num_nodes=29, x=[29, 1], y=[29]), Data(edge_index=[2, 2], time=[3], type=[3], user=[3], tweet_id=[3], id=[3], children=[3], num_nodes=3, x=[3, 1], y=[3]), Data(edge_index=[2, 15], time=[16], type=[16], user=[16], tweet_id=[16], id=[16], children=[16], num_nodes=16, x=[16, 1], y=[16]), Data(edge_index=[2, 13], time=[14], type=[14], user=[14], tweet_id=[14], id=[14], children=[14], num_nodes=14, x=[14, 1], y=[14]), Data(edge_index=[2, 6], time=[7], type=[7], user=[7], tweet_id=[7], id=[7], children=[7], num_nodes=7, x=[7, 1], y=[7]), Data(edge_index=[2, 15], time=[16], type=[16], user=[16], tweet_id=[16], id=[16], children=[16], num_nodes=16, x=[16, 1], y=[16]), Data(edge_index=[2, 46], time=[47], type=[47], user=[47], tweet_id=[47], id=[47], children=[47], num_nodes=47, x=[47, 1], y=[47]), Data(edge_index=[2, 41], time=[42], type=[42], user=[42], tweet_id=[42], id=[42], children=[42], num_n

In [54]:
# Split the dataset into training, validation, and test sets
train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

# Create DataLoaders for each set
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

class GNN(nn.Module):
    def __init__(self, num_node_features):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, 2)
    
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        
        return F.log_softmax(x, dim=1)

# Initialize the model, optimizer, and loss function
num_node_features = 1  # Assuming each node has one feature, adjust based on your data
model = GNN(num_node_features)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()

In [56]:
# Training loop
def train(loader):
    model.train()
    total_loss = 0
    for data in loader:
        optimizer.zero_grad()
        out = model(data)
        loss = loss_fn(out, data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

# Evaluation function
def test(loader):
    model.eval()
    correct = 0
    for data in loader:
        with torch.no_grad():
            out = model(data)
            pred = out.argmax(dim=1)
            correct += (pred == data.y).sum().item()
    return correct / sum([data.num_nodes for data in loader.dataset])

# Training the model
for epoch in range(200):
    train_loss = train(train_loader)
    val_accuracy = test(val_loader)
    print(f'Epoch {epoch}, Loss: {train_loss}, Validation Accuracy: {val_accuracy}')

AttributeError: 'list' object has no attribute 'x'