In [1]:
! pip install torch_geometric networkx matplotlib torch torchvision torch-geometric torchsummary layers

Defaulting to user installation because normal site-packages is not writeable




Collecting torch_geometric
  Obtaining dependency information for torch_geometric from https://files.pythonhosted.org/packages/97/f0/66ad3a5263aa16efb534aaf4e7da23ffc28c84efbbd720b0c5ec174f6242/torch_geometric-2.5.3-py3-none-any.whl.metadata
  Downloading torch_geometric-2.5.3-py3-none-any.whl.metadata (64 kB)
     ---------------------------------------- 0.0/64.2 kB ? eta -:--:--
     ---------------------------------------- 0.0/64.2 kB ? eta -:--:--
     ------------------------ ------------- 41.0/64.2 kB 960.0 kB/s eta 0:00:01
     -------------------------------------- 64.2/64.2 kB 691.2 kB/s eta 0:00:00
Collecting layers
  Downloading layers-0.1.5.tar.gz (5.5 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting bashutils (from layers)
  Downloading Bashutils-0.0.4.tar.gz (4.2 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Downloading torch_geometric-2.5.3-py3

In [2]:
! pip install torch-scatter torch-sparse torch-cluster torch-spline-conv

Defaulting to user installation because normal site-packages is not writeable
Collecting torch-scatter
  Downloading torch_scatter-2.1.2.tar.gz (108 kB)
     ---------------------------------------- 0.0/108.0 kB ? eta -:--:--
     --- ------------------------------------ 10.2/108.0 kB ? eta -:--:--
     ------------------------------------ - 102.4/108.0 kB 1.2 MB/s eta 0:00:01
     -------------------------------------- 108.0/108.0 kB 1.0 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting torch-sparse
  Downloading torch_sparse-0.6.18.tar.gz (209 kB)
     ---------------------------------------- 0.0/210.0 kB ? eta -:--:--
     ----------- ---------------------------- 61.4/210.0 kB ? eta -:--:--
     ------------------------------- ------ 174.1/210.0 kB 2.6 MB/s eta 0:00:01
     -------------------------------------- 210.0/210.0 kB 3.2 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing meta

  error: subprocess-exited-with-error
  
  python setup.py bdist_wheel did not run successfully.
  exit code: 1
  
  [33 lines of output]
  running bdist_wheel
  running build
  running build_py
  creating build
  creating build\lib.win-amd64-cpython-311
  creating build\lib.win-amd64-cpython-311\torch_scatter
  copying torch_scatter\placeholder.py -> build\lib.win-amd64-cpython-311\torch_scatter
  copying torch_scatter\scatter.py -> build\lib.win-amd64-cpython-311\torch_scatter
  copying torch_scatter\segment_coo.py -> build\lib.win-amd64-cpython-311\torch_scatter
  copying torch_scatter\segment_csr.py -> build\lib.win-amd64-cpython-311\torch_scatter
  copying torch_scatter\testing.py -> build\lib.win-amd64-cpython-311\torch_scatter
  copying torch_scatter\utils.py -> build\lib.win-amd64-cpython-311\torch_scatter
  copying torch_scatter\__init__.py -> build\lib.win-amd64-cpython-311\torch_scatter
  creating build\lib.win-amd64-cpython-311\torch_scatter\composite
  copying torch_scatte

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
import os
import json
import networkx as nx
import random
from torch_geometric.utils import from_networkx, to_networkx
from torch_geometric.data import Data, DataLoader
from torch.utils.data import random_split
import matplotlib.pyplot as plt
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
# Load the data and check how many nodes don't have the sentiment label
# Function to load JSON
def load_json(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

# Function to build a graph
def build_graph(data):
    G = nx.DiGraph()
    
    # Add root tweet
    root_id = data['id']
    G.add_node(root_id, **data)
    
    # Recursive function to add children
    def add_children(children, parent_id):
        for child in children:
            G.add_node(child['id'], **child)
            G.add_edge(parent_id, child['id'])
            if 'children' in child:
                add_children(child['children'], parent_id)
    
    if 'children' in data:
        add_children(data['children'], root_id)
    
    return G

# Function to count nodes missing sentiment
def count_nodes_missing_sentiment(graph):
    total_missing_sentiment = 0
    for node, attrs in graph.nodes(data=True):
        if 'sentiment' not in attrs or attrs['sentiment'] is None:
            total_missing_sentiment += 1
    return total_missing_sentiment

# Function to normalize node attributes
def normalize_node_attributes(graph):
    all_attributes = set()
    for _, attrs in graph.nodes(data=True):
        all_attributes.update(attrs.keys())
    
    for node, attrs in graph.nodes(data=True):
        for attr in all_attributes:
            if attr not in attrs:
                attrs[attr] = None  # Or use another default value 
    return graph

# Adjusted convert to data
def convert_to_data(graph, label):
    data = from_networkx(graph)
    
    # Ensure node features exist and are consistent
    if not hasattr(data, 'x') or data.x is None:
        num_nodes = data.num_nodes
        degrees = torch.tensor([degree for _, degree in graph.degree()], dtype=torch.float).view(-1, 1)
        clustering = torch.tensor([nx.clustering(graph, node) for node in graph.nodes], dtype=torch.float).view(-1, 1)
        data.x = torch.cat([degrees, clustering], dim=1)
    
    # Ensure all required attributes are included
    if not hasattr(data, 'y') or data.y is None:
        data.y = torch.tensor([label] * data.num_nodes, dtype=torch.long)
    
    return data

# Path to the dataset
path = 'nx_network_data/nx_network_data'  # Change to individual path

# Function to create a dictionary of JSON files
def create_json_dict(base_path):
    json_dict = {}
    
    for label in ['politifact_fake', 'politifact_real']:
        folder_path = os.path.join(base_path, label)
        files = os.listdir(folder_path)
        files = [f for f in files if f.endswith('.json')]
        for file in files:  
            json_dict[os.path.join(label, file)] = 'fake' if label == 'politifact_fake' else 'real'
    
    return json_dict

json_dict = create_json_dict(path)

full_dataset = []
total_missing_sentiment = 0

# Loop to create a mega dataset
for dataset in list(json_dict.keys()):
    file = load_json(os.path.join(path, dataset))  # Correctly join paths
    graph = build_graph(file)
    
    # Count nodes missing sentiment
    missing_sentiment_count = count_nodes_missing_sentiment(graph)
    total_missing_sentiment += missing_sentiment_count
    
    graph = normalize_node_attributes(graph)
    label = 1 if json_dict[dataset] == 'fake' else 0
    data = convert_to_data(graph, label)
    
    if data is not None:
        full_dataset.append(data)

print(f"Total nodes missing 'sentiment': {total_missing_sentiment}")

# Check and clean up the dataset
cleaned_dataset = [data for data in full_dataset if data.y is not None]

# Splitting the dataset
fake_news_data = [data for data in cleaned_dataset if data.y[0].item() == 1]
real_news_data = [data for data in cleaned_dataset if data.y[0].item() == 0]

balanced_full_dataset = fake_news_data + random.choices(fake_news_data, k=len(real_news_data) - len(fake_news_data)) + real_news_data
random.shuffle(balanced_full_dataset)

train_size = int(0.7 * len(balanced_full_dataset))
val_size = int(0.15 * len(balanced_full_dataset))
test_size = len(balanced_full_dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(balanced_full_dataset, [train_size, val_size, test_size])

# Create DataLoaders for each set
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

# Define the GCN model
class GCN(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        self.dropout = nn.Dropout(0.5)  # added dropout layer

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

Total nodes missing 'sentiment': 578173


In [11]:
# Load the data and remove the sentiment attribute
# Function to load JSON
def load_json(file_path):
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

# Function to build a graph
def build_graph(data):
    G = nx.DiGraph()
    
    # Add root tweet
    root_id = data['id']
    if 'sentiment' in data:
        del data['sentiment']  # Remove sentiment attribute
    G.add_node(root_id, **data)
    
    # Recursive function to add children
    def add_children(children, parent_id):
        for child in children:
            if 'sentiment' in child:
                del child['sentiment']  # Remove sentiment attribute
            G.add_node(child['id'], **child)
            G.add_edge(parent_id, child['id'])
            if 'children' in child:
                add_children(child['children'], child['id'])
    
    if 'children' in data:
        add_children(data['children'], root_id)
    
    return G

# Function to normalize node attributes
def normalize_node_attributes(graph):
    all_attributes = set()
    for _, attrs in graph.nodes(data=True):
        all_attributes.update(attrs.keys())
    
    for node, attrs in graph.nodes(data=True):
        for attr in all_attributes:
            if attr not in attrs:
                attrs[attr] = None  # Or use another default value 
    return graph


# Adjusted convert to data
def convert_to_data(graph, label):
    data = from_networkx(graph)
    
    # Ensure node features exist and are consistent
    if not hasattr(data, 'x') or data.x is None:
        num_nodes = data.num_nodes
        degrees = torch.tensor([degree for _, degree in graph.degree()], dtype=torch.float).view(-1, 1)
        clustering = torch.tensor([nx.clustering(graph, node) for node in graph.nodes], dtype=torch.float).view(-1, 1)
        data.x = torch.cat([degrees, clustering], dim=1)
    
    # Ensure all required attributes are included
    if not hasattr(data, 'y') or data.y is None:
        data.y = torch.tensor([label] * data.num_nodes, dtype=torch.long)
    
    return data

# Path to the dataset
path = 'nx_network_data/nx_network_data'    # Change to individual path

# Function to create a dictionary of JSON files
def create_json_dict(base_path):
    json_dict = {}
    
    for label in ['politifact_fake', 'politifact_real']:
        folder_path = os.path.join(base_path, label)
        files = os.listdir(folder_path)
        files = [f for f in files if f.endswith('.json')]
        for file in files:  
            json_dict[os.path.join(label, file)] = 'fake' if label == 'politifact_fake' else 'real'
    
    return json_dict

json_dict = create_json_dict(path)

full_dataset = []

# Loop to create a mega dataset
for dataset in list(json_dict.keys()):
    file = load_json(os.path.join(path, dataset))  # Correctly join paths
    graph = build_graph(file)
    graph = normalize_node_attributes(graph)
    label = 1 if json_dict[dataset] == 'fake' else 0
    data = convert_to_data(graph, label)
    
    if data is not None:
        full_dataset.append(data)

# Check and clean up the dataset
cleaned_dataset = [data for data in full_dataset if data.y is not None]


# Splitting the dataset
fake_news_data = [data for data in cleaned_dataset if data.y[0].item() == 1]
real_news_data = [data for data in cleaned_dataset if data.y[0].item() == 0]

balanced_full_dataset = fake_news_data + random.choices(fake_news_data, k=len(real_news_data) - len(fake_news_data)) + real_news_data
random.shuffle(balanced_full_dataset)

train_size = int(0.7 * len(balanced_full_dataset))
val_size = int(0.15 * len(balanced_full_dataset))
test_size = len(balanced_full_dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(balanced_full_dataset, [train_size, val_size, test_size])

# Create DataLoaders for each set
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)

# Define the GCN model
class GCN(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        self.dropout = nn.Dropout(0.5)  # added dropout layer

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.dropout(x)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1) 

In [12]:
# Define the EarlyStopping class
class EarlyStopping:
    def __init__(self, patience=25, min_delta=0.0001, path='checkpoint.pt'):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
        self.path = path

    def __call__(self, val_loss, model):
        if self.best_loss is None:
            self.best_loss = val_loss
            self.save_checkpoint(val_loss, model)
        elif val_loss > self.best_loss - self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            if val_loss < self.best_loss:
                self.save_checkpoint(val_loss, model)
                self.best_loss = val_loss
                self.counter = 0

    def save_checkpoint(self, val_loss, model):
        torch.save(model.state_dict(), self.path)
        print(f'Validation loss decreased ({self.best_loss:.6f} --> {val_loss:.6f}).  Saving model ...')

# Define the training function
def train_model(epochs, model, optimizer, criterion, train_loader, val_loader, early_stopper, device, checkpoint_path):
    train_losses = []
    val_losses = []
    
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        
        for data in train_loader:
            
            data = data.to(device)
            optimizer.zero_grad()
            output = model(data.x, data.edge_index)
            loss = criterion(output, data.y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        train_losses.append(train_loss / len(train_loader))

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for data in val_loader:
                data = data.to(device)
                output = model(data.x, data.edge_index)
                loss = criterion(output, data.y)
                val_loss += loss.item()

        val_losses.append(val_loss / len(val_loader))
        
        print(f'Epoch: {epoch+1}, Training Loss: {train_loss / len(train_loader):.4f}, Validation Loss: {val_loss / len(val_loader):.4f}')

        early_stopper(val_loss, model)
        if early_stopper.early_stop:
            print("Early stopping triggered.")
            break

    model.load_state_dict(torch.load(checkpoint_path))
    return train_losses, val_losses 

In [None]:
# Initialize the model, optimizer, and loss function
input_dimensions = 2  # Assuming two input features
hidden_dimensions = 16
output_dimensions = 2
gcn_model = GCN(in_channels=input_dimensions, hidden_channels=hidden_dimensions, out_channels=output_dimensions)
optimizer = torch.optim.Adam(gcn_model.parameters(), lr=0.0001, weight_decay=0.0001)
criterion = nn.CrossEntropyLoss()

# Initialize early stopping
early_stopper = EarlyStopping(patience=25, min_delta=0.0001, path='gnn_checkpoint.pt')

# Move model to device
gcn_model.to(device)
print(gcn_model)

# Train the GNN model
epochs = 10
train_losses_gcn, val_losses_gcn = train_model(epochs, gcn_model, optimizer, criterion, train_loader, val_loader, early_stopper, device, 'gnn_checkpoint.pt')

# Plot training and validation losses
plt.plot(train_losses_gcn, label='Training Loss GCN')
plt.plot(val_losses_gcn, label='Validation Loss GCN')
plt.xlabel('Epochs')
plt.ylabel('Loss GCN')
plt.legend()
plt.show()

GCN(
  (conv1): GCNConv(2, 16)
  (conv2): GCNConv(16, 2)
  (dropout): Dropout(p=0.5, inplace=False)
)
Epoch: 1, Training Loss: 278.7524, Validation Loss: 350.9149
Validation loss decreased (16492.999742 --> 16492.999742).  Saving model ...
Epoch: 2, Training Loss: 248.6027, Validation Loss: 324.6030
Validation loss decreased (16492.999742 --> 15256.342283).  Saving model ...
Epoch: 3, Training Loss: 245.1599, Validation Loss: 296.9014
Validation loss decreased (15256.342283 --> 13954.365019).  Saving model ...
Epoch: 4, Training Loss: 219.3559, Validation Loss: 269.2669
Validation loss decreased (13954.365019 --> 12655.545672).  Saving model ...
Epoch: 5, Training Loss: 218.9396, Validation Loss: 239.9239
Validation loss decreased (12655.545672 --> 11276.421448).  Saving model ...
Epoch: 6, Training Loss: 160.5592, Validation Loss: 215.2856
Validation loss decreased (11276.421448 --> 10118.424992).  Saving model ...
Epoch: 7, Training Loss: 178.9537, Validation Loss: 188.3588
Validatio