In [4]:
import torch
import torch_geometric
import numpy as np


## Loading Dataset

In [5]:
from torch_geometric.datasets import Planetoid

dataset = Planetoid(root = 'Citeseer', name = 'Citeseer')

In [None]:
data = dataset[0]
print(dataset)
print(data)
print("number of graphs:\t\t",len(dataset))
print("number of classes:\t\t",dataset.num_classes)
print("number of classes:\t\t",np.unique(data.y))
print(f"Number of nodes:\t\t",data.num_nodes)
print("number of node features:\t",data.num_node_features)
print("number of edge features:\t",data.num_edge_features)
print("X shape: ", data.x.shape)
print("Edge shape: ", data.edge_index.shape)
print("Y shape: ", data.y.shape)

Citeseer()
Data(x=[3327, 3703], edge_index=[2, 9104], y=[3327], train_mask=[3327], val_mask=[3327], test_mask=[3327])
number of graphs:		 1
number of classes:		 6
number of classes:		 [0 1 2 3 4 5]
Number of nodes:		 3327
number of node features:	 3703
number of edge features:	 0
X shape:  torch.Size([3327, 3703])
Edge shape:  torch.Size([2, 9104])
Y shape:  torch.Size([3327])


train mask is a binary mask (a tensor or array of True/False or 1/0 values) that indicates which nodes in the graph should be used during training.

## EDA

In [6]:
import networkx as nx
import matplotlib.pyplot as plt
def visualize_graph(edge_index, labels, num_nodes):
    G = nx.Graph()
    edge_list = edge_index.t().tolist()  
    G.add_edges_from(edge_list)
    G.add_nodes_from(range(num_nodes)) 

    # Ensure labels match number of nodes
    labels = labels[:num_nodes]  # Truncate extra labels if necessary

    # Generate positions for the graph
    pos = nx.spring_layout(G, seed=42)  # Use spring layout for visualization

    plt.figure(figsize=(12, 8))
    nx.draw(
        G,
        pos,
        with_labels=False,
        node_color=labels,
        cmap=plt.cm.rainbow,
        node_size=50,
    )
    plt.title("Citeseer Graph Structure")
    plt.show()



In [7]:
# visualize_graph(data.edge_index, data.y.numpy(), data.num_nodes)

In [8]:
print(data.edge_index)

tensor([[ 628,  158,  486,  ..., 2820, 1643,   33],
        [   0,    1,    1,  ..., 3324, 3325, 3326]])


## Model Training


$$
H^{(l+1)} = \sigma\left(\widehat{A} H^{(l)} W^{(l)}\right)
$$

$$
\widehat{A} = \widetilde{D}^{-1/2} \widetilde{A} \widetilde{D}^{-1/2} 
$$

In [9]:
class GCN(torch.nn.Module):
    def __init__(self):
        super(GCN, self).__init__()
        self.conv1 = torch_geometric.nn.GCNConv(dataset.num_node_features, 16)
        self.conv2 = torch_geometric.nn.GCNConv(16, dataset.num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = torch.nn.functional.relu(self.conv1(x, edge_index))
        x = torch.nn.functional.dropout(x, training= self.training)

        x = torch.nn.functional.log_softmax(self.conv2(x, edge_index), dim= 1)

        return x


In [10]:
def train(model, data, optimizer, criterion, train_mask):
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test(model, data, mask):
    model.eval()
    with torch.no_grad():
        pred1 = model(data)
        pred = pred1.argmax(dim = 1)
        correct = (pred[mask] == data.y[mask]).sum()
        acc = int(correct) / int(mask.sum())
    return acc


In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
data = data.to(device)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.NLLLoss()

GCN(
  (conv1): GCNConv(3703, 16)
  (conv2): GCNConv(16, 6)
)


In [None]:
for epoch in range(200):
    loss = train(model, data, optimizer, criterion, data.train_mask)
    train_acc = test(model, data, data.train_mask)
    val_acc = test(model, data, data.val_mask)
    print(f'Epoch {epoch+1}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')

    if val_acc >0.6500:
        break


# Does each time data changes??

model.eval()
with torch.no_grad():
    pred1 = model(data)
    pred = pred1.argmax(dim=1) 
    correct = (pred[data.test_mask] == data.y[data.test_mask]).sum() 
    acc = int(correct) / int(data.test_mask.sum())
    # f1 = f1_score(pred[data.test_mask].cpu(), data.y[data.test_mask].cpu(), average='macro') 
    print(f'Accuracy: {acc:.4f}') # ,f'F1_score: {f1:.4f}'


Epoch 1, Loss: 1.7936, Train Acc: 0.6250, Val Acc: 0.4220
Epoch 2, Loss: 1.6210, Train Acc: 0.7000, Val Acc: 0.4480
Epoch 3, Loss: 1.4151, Train Acc: 0.7417, Val Acc: 0.4660
Epoch 4, Loss: 1.2427, Train Acc: 0.7833, Val Acc: 0.4680
Epoch 5, Loss: 1.0864, Train Acc: 0.8417, Val Acc: 0.4960
Epoch 6, Loss: 0.9388, Train Acc: 0.9000, Val Acc: 0.5160
Epoch 7, Loss: 0.7937, Train Acc: 0.9250, Val Acc: 0.5660
Epoch 8, Loss: 0.7215, Train Acc: 0.9500, Val Acc: 0.5800
Epoch 9, Loss: 0.6519, Train Acc: 0.9583, Val Acc: 0.5880
Epoch 10, Loss: 0.5484, Train Acc: 0.9667, Val Acc: 0.6060
Epoch 11, Loss: 0.4772, Train Acc: 0.9667, Val Acc: 0.6060
Epoch 12, Loss: 0.4457, Train Acc: 0.9667, Val Acc: 0.6100
Epoch 13, Loss: 0.3863, Train Acc: 0.9917, Val Acc: 0.6120
Epoch 14, Loss: 0.3552, Train Acc: 1.0000, Val Acc: 0.6180
Epoch 15, Loss: 0.2994, Train Acc: 1.0000, Val Acc: 0.6200
Epoch 16, Loss: 0.2632, Train Acc: 1.0000, Val Acc: 0.6200
Epoch 17, Loss: 0.2312, Train Acc: 1.0000, Val Acc: 0.6280
Epoch 

In [13]:
def visualize_features(features, title="Node Features"):
    plt.figure(figsize=(12, 6))
    plt.imshow(features, aspect='auto', cmap='coolwarm')
    plt.colorbar()
    plt.title(title)
    plt.xlabel("Feature Dimension")
    plt.ylabel("Node Index")
    plt.show()

# visualize_features(data.x.numpy(), title="Original Node Features")

# Forward pass visualization
model = GCN(data.num_features, 16, dataset.num_classes)
with torch.no_grad():
    x = data.x
    visualize_features(x.numpy(), title="Input Features to Layer 1")
    x = model.gcn1(x, data.edge_index)
    visualize_features(x.numpy(), title="Output of GCN Layer 1")
    x = model.gcn2(x, data.edge_index)
    visualize_features(x.numpy(), title="Output of GCN Layer 2")


TypeError: GCN.__init__() takes 1 positional argument but 4 were given

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN(data.num_features, 16, dataset.num_classes).to(device)
data = data.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.NLLLoss()

train_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(100):
    loss = train(model, data, optimizer, criterion, data.train_mask)
    train_acc = test(model, data, data.train_mask)
    val_acc = test(model, data, data.val_mask)

    train_losses.append(loss)
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch+1}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}")

# Plot metrics
plt.figure(figsize=(12, 6))
plt.plot(train_losses, label="Train Loss")
plt.plot(train_accuracies, label="Train Accuracy")
plt.plot(val_accuracies, label="Validation Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Value")
plt.title("Training and Validation Metrics")
plt.legend()
plt.show()


In [None]:
def visualize_adjacency(edge_index, num_nodes):
    """
    Visualizes the adjacency matrix of the graph.
    """
    adj_matrix = torch.zeros((num_nodes, num_nodes))
    adj_matrix[edge_index[0], edge_index[1]] = 1
    adj_matrix += torch.eye(num_nodes)
    plt.figure(figsize=(8, 8))
    plt.imshow(adj_matrix.numpy(), cmap='Greys', interpolation='none')
    plt.title("Adjacency Matrix")
    plt.colorbar(label="Edge Presence")
    plt.xlabel("Node")
    plt.ylabel("Node")
    plt.show()

visualize_adjacency(data.edge_index, data.num_nodes)

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

def visualize_embeddings(embeddings, labels, method='PCA', title='Node Embeddings'):
    """
    Visualize high-dimensional node embeddings using PCA or t-SNE.
    """
    if method == 'PCA':
        reducer = PCA(n_components=2)
    elif method == 't-SNE':
        reducer = TSNE(n_components=2, random_state=42)
    else:
        raise ValueError("Unsupported reduction method. Choose 'PCA' or 't-SNE'.")

    reduced_embeddings = reducer.fit_transform(embeddings)
    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(
        reduced_embeddings[:, 0], reduced_embeddings[:, 1], 
        c=labels, cmap=plt.cm.rainbow, s=15
    )
    plt.colorbar(scatter, label="Node Labels")
    plt.title(title)
    plt.show()

# Forward pass to get node embeddings
model = GCN(data.num_features, 16, dataset.num_classes)
with torch.no_grad():
    embeddings1 = model.gcn1(data.x, data.edge_index)
    visualize_embeddings(embeddings1.numpy(), data.y.numpy(), method='t-SNE')

    embeddings2 = model.gcn2(embeddings1, data.edge_index)
    visualize_embeddings(embeddings2.numpy(), data.y.numpy(), method='t-SNE')


In [None]:
def visualize_parameters(model):
    for name, param in model.named_parameters():
        if param.requires_grad:
            plt.figure()
            plt.hist(param.detach().cpu().numpy().flatten(), bins=50)
            plt.title(f"Distribution of {name}")
            plt.xlabel("Value")
            plt.ylabel("Frequency")
            plt.show()

visualize_parameters(model)


In [None]:
# Save the trained model
torch.save(model.state_dict(), "gcn_model.pth")
print("Model saved to gcn_model.pth")


**Recreate the model architecture**

model = GCN(data.num_features, 16, dataset.num_classes)

model.load_state_dict(torch.load("gcn_model.pth"))

model.eval()  # Set the model to evaluation mode

print("Model loaded from gcn_model.pth")
