# MSRC_21

## Graph Classification

In [None]:
import torch_geometric
import matplotlib.pyplot as plt
import networkx as nx
from torch_geometric.utils import to_networkx
import torch_geometric.datasets as datasets
from torch_geometric.loader import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn.functional as F
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv, global_mean_pool

from torch_geometric.data import Data, DataLoader, InMemoryDataset


import warnings
warnings.filterwarnings('ignore')

In [None]:
# Root directory handed to TUDataset for its files.
DATASET_PATH = 'dataset'

# Load the MSRC_21 graph-classification benchmark via the TUDataset wrapper.
dataset = torch_geometric.datasets.TUDataset(
    name="MSRC_21",
    root=DATASET_PATH,
)

In [None]:
# --- Basic dataset statistics ---
num_graphs = len(dataset)            # how many graphs the dataset holds
num_classes = dataset.num_classes    # number of classes reported by the dataset

# One scalar label per graph.
labels = [graph.y.item() for graph in dataset]

# Node / edge totals over the whole dataset, then their per-graph means.
total_nodes = sum(graph.num_nodes for graph in dataset)
total_edges = sum(graph.num_edges for graph in dataset)
avg_nodes = total_nodes / num_graphs
avg_edges = total_edges / num_graphs

# Report everything with a single print call (one item per line).
print(
    f"Number of graphs: {num_graphs}",
    f"Number of classes: {num_classes}",
    f"Labels: {set(labels)}",
    f"Average number of nodes: {avg_nodes}",
    f"Average number of edges: {avg_edges}",
    sep="\n",
)

In [None]:
# Hold out 20% of the graph indices for testing; the fixed random_state keeps
# the split repeatable.
train_idx, test_idx = train_test_split(
    range(len(dataset)),
    test_size=0.2,
    random_state=42,
)
train_dataset, test_dataset = dataset[train_idx], dataset[test_idx]

# Mini-batch loaders: only the training loader reshuffles its graphs.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [None]:
class GAT(torch.nn.Module):
    """Two-layer graph attention network for graph-level classification.

    The network applies two ``GATConv`` layers (each followed by ELU and
    dropout), mean-pools the node embeddings per graph and feeds the result to
    a linear classifier.

    Args:
        hidden_channels: Output width of each attention head / of the second
            attention layer.
        heads: Number of attention heads in the first layer (their outputs are
            concatenated, hence the ``hidden_channels * heads`` input width of
            the second layer).
        dropout: Dropout probability, used both inside the ``GATConv`` layers
            and on the activations between layers.
        in_channels: Number of input node features. Defaults to
            ``dataset.num_node_features`` of the notebook-level ``dataset``.
        num_classes: Number of output classes. Defaults to
            ``dataset.num_classes`` of the notebook-level ``dataset``.
    """

    def __init__(self, hidden_channels, heads=8, dropout=0.6,
                 in_channels=None, num_classes=None):
        super().__init__()
        # Seed the global RNG so that parameter initialisation is repeatable.
        torch.manual_seed(42)

        # Keep the original behaviour (sizes taken from the global ``dataset``)
        # unless the caller supplies explicit sizes.
        if in_channels is None:
            in_channels = dataset.num_node_features
        if num_classes is None:
            num_classes = dataset.num_classes

        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=dropout)
        # Single-head second layer; with heads=1 and concat=False its output
        # width is simply ``hidden_channels``.
        self.conv2 = GATConv(hidden_channels * heads, hidden_channels, heads=1, concat=False, dropout=dropout)
        self.lin = torch.nn.Linear(hidden_channels, num_classes)
        self.dropout = dropout

    def forward(self, data):
        """Classify the graph(s) contained in ``data``.

        Args:
            data: Object exposing ``x``, ``edge_index`` and ``batch``.

        Returns:
            A pair ``(log_probs, attn_weights)``: the log-softmax class scores
            (one row per pooled graph) and whatever ``conv2`` returns for
            ``return_attention_weights=True`` (per the PyG docs this is an
            ``(edge_index, attention_weights)`` tuple).
        """
        x, edge_index = data.x, data.edge_index

        # First GAT layer
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        # Second GAT layer; also ask for its attention coefficients
        x, attn_weights = self.conv2(x, edge_index, return_attention_weights=True)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        # Global mean pooling: node embeddings -> one embedding per graph
        x = global_mean_pool(x, data.batch)

        # Classifier
        x = self.lin(x)

        return F.log_softmax(x, dim=1), attn_weights

# Prefer the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the network, then place it on the chosen device.
hidden_channels = 64
model = GAT(hidden_channels=hidden_channels)
model = model.to(device)

# Adam optimizer with weight decay.
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.005,
    weight_decay=5e-4,
)

In [None]:


def train():
    """Run one optimisation pass over ``train_loader``.

    Uses the module-level ``model``, ``optimizer`` and ``device``.

    Returns:
        The training loss of this pass, averaged per graph.
    """
    model.train()
    loss_sum = 0
    for batch in train_loader:
        batch = batch.to(device)  # move the mini-batch to the model's device
        optimizer.zero_grad()
        log_probs, _attn = model(batch)
        batch_loss = F.nll_loss(log_probs, batch.y)
        batch_loss.backward()
        optimizer.step()
        # Weight each batch loss by its number of graphs so the division
        # below yields a per-graph figure.
        loss_sum += batch_loss.item() * batch.num_graphs
    return loss_sum / len(train_loader.dataset)

def test(loader):
    """Evaluate the module-level ``model`` on every batch of ``loader``.

    Args:
        loader: Iterable of mini-batches; must expose ``dataset`` for its size.

    Returns:
        A triple ``(accuracy, predictions, labels)`` where the two lists hold
        the predicted and the true class of every evaluated graph.
    """
    model.eval()
    hits = 0
    predictions, labels = [], []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)  # move the mini-batch to the model's device
            log_probs, _attn = model(batch)
            pred = log_probs.argmax(dim=1)
            hits += int((pred == batch.y).sum())
            predictions += pred.tolist()
            labels += batch.y.tolist()
    return hits / len(loader.dataset), predictions, labels

# Training loop: one optimisation pass per epoch, then accuracy on both splits.
train_acc_list = []
test_acc_list = []
loss_list = []
num_epochs = 200

for epoch in range(1, num_epochs + 1):
    loss = train()
    train_acc, _, _ = test(train_loader)
    # predictions / true_labels are overwritten every epoch, so after the loop
    # they hold the final epoch's test-set results used by the report below.
    test_acc, predictions, true_labels = test(test_loader)

    train_acc_list.append(train_acc)
    test_acc_list.append(test_acc)
    loss_list.append(loss)

    if epoch % 10 == 0 or epoch == 1:
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}')

# Pin the label set explicitly. Without ``labels=`` scikit-learn only considers
# the classes that actually occur in true_labels / predictions, so a class
# missing from the test split would make classification_report reject the
# target names and would leave the confusion matrix smaller than (and
# misaligned with) the tick labels.
class_ids = list(range(dataset.num_classes))
class_display_names = [f'Class {i}' for i in class_ids]

# Classification report
print("Classification Report:")
print(classification_report(true_labels, predictions,
                            labels=class_ids, target_names=class_display_names))

# Confusion matrix
cm = confusion_matrix(true_labels, predictions, labels=class_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_display_names,
            yticklabels=class_display_names)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()

# Plot accuracy curves
plt.figure(figsize=(8, 6))
plt.plot(range(1, num_epochs + 1), train_acc_list, label="Train Accuracy")
plt.plot(range(1, num_epochs + 1), test_acc_list, label="Test Accuracy")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.legend()
plt.title("Accuracy over Epochs")
plt.show()

# Plot loss curve
plt.figure(figsize=(8, 6))
plt.plot(range(1, num_epochs + 1), loss_list, label="Training Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.title("Training Loss over Epochs")
plt.show()


## Training with subgraphs

In [None]:
# Sliding-window settings used when cutting each graph into subgraphs.
window_size = 62  # nodes per window, i.e. per subgraph
step_size = 5     # how far the window start advances between subgraphs

# (Re)load MSRC_21 from the same root directory.
DATASET_PATH = 'dataset'
dataset = TUDataset(name="MSRC_21", root=DATASET_PATH)



def create_subgraphs(graph, window_size, step_size):
    """Cut ``graph`` into subgraphs over sliding windows of consecutive node ids.

    Each window covers the nodes ``start .. start + window - 1``; only edges
    whose two endpoints both fall inside the window are kept, re-indexed so
    that the window's first node becomes node 0.

    Args:
        graph: Graph object exposing ``num_nodes``, ``x``, ``edge_index``
            and ``y``.
        window_size: Number of nodes per window. If the graph has fewer nodes
            than this, a single window spanning the whole graph is used
            instead of returning nothing (which would silently drop the graph
            from training and evaluation).
        step_size: Offset between the starts of consecutive windows.

    Returns:
        A list of ``Data`` objects, one per window. Each carries the parent
        graph's label ``y`` and an ``original_node_indices`` list mapping its
        local node ids back to the parent's node ids. The list is empty only
        for a graph without nodes.
    """
    num_nodes = graph.num_nodes
    if num_nodes == 0:
        return []

    # Never ask for a window larger than the graph itself.
    span = min(window_size, num_nodes)

    edge_index = graph.edge_index
    sources, targets = edge_index[0], edge_index[1]

    subgraphs = []
    for start in range(0, num_nodes - span + 1, step_size):
        end = start + span

        # Keep only the edges that lie completely inside the window.
        inside = (sources >= start) & (sources < end) & \
                 (targets >= start) & (targets < end)

        # Boolean-mask indexing yields a new tensor, so shifting the ids to
        # window-local numbering never modifies ``graph.edge_index``.
        window_edges = edge_index[:, inside] - start

        if window_edges.size(1) == 0:
            # No edge survived: fall back to one self-loop per node so that
            # edge_index is never empty. Build it on the graph's own device.
            loops = torch.arange(span, device=edge_index.device)
            window_edges = torch.stack([loops, loops], dim=0)

        # The subgraph inherits the label of its parent graph.
        subgraph = Data(x=graph.x[start:end], edge_index=window_edges, y=graph.y)

        # Remember which parent nodes this window covers.
        subgraph.original_node_indices = list(range(start, end))

        subgraphs.append(subgraph)

    return subgraphs





class SubgraphDataset(InMemoryDataset):
    """In-memory dataset holding the sliding-window subgraphs of every input graph.

    Subgraphs are produced by ``create_subgraphs`` using the module-level
    ``window_size`` and ``step_size``.
    """

    def __init__(self, dataset):
        super().__init__(root=DATASET_PATH)

        # Flat list of all subgraphs, plus one label entry per subgraph.
        self.data_list = []
        self.labels = []
        for source_graph in dataset:
            windows = create_subgraphs(source_graph, window_size, step_size)
            self.data_list.extend(windows)
            # Every window inherits the label of the graph it was cut from.
            self.labels.extend(source_graph.y for _ in windows)

        # Collate the flat list into the (data, slices) pair.
        self.data, self.slices = self.collate(self.data_list)

    def get_labels(self):
        """Return the stored per-subgraph labels as a single tensor."""
        return torch.tensor(self.labels)


# Split at graph level first, so windows cut from one graph never end up on
# both sides of the split.
train_graphs, test_graphs = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
)

# Cut every graph of each split into sliding-window subgraphs.
train_subgraph_dataset = SubgraphDataset(train_graphs)
test_subgraph_dataset = SubgraphDataset(test_graphs)

# The loaders batch the flat subgraph lists; only training is shuffled.
train_loader = DataLoader(
    train_subgraph_dataset.data_list, batch_size=32, shuffle=True
)
test_loader = DataLoader(
    test_subgraph_dataset.data_list, batch_size=32, shuffle=False
)

In [None]:
# Hyper-parameters of the subgraph experiment.
hidden_channels, heads, dropout = 64, 8, 0.6

# Fresh model, optimizer and loss for this run.
model = GAT(hidden_channels=hidden_channels, heads=heads, dropout=dropout)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)


def move_to_device(batch, device):
    """Return ``batch`` after moving it to ``device``."""
    return batch.to(device)


# Weakly supervised training: every subgraph is trained against the label of
# the graph it was cut from.
model.train()
for epoch in range(50):
    total_loss = 0
    correct = 0
    total = 0

    for batch in train_loader:
        batch = move_to_device(batch, device)

        optimizer.zero_grad()
        out, attn_weights = model(batch)
        loss = criterion(out, batch.y)
        loss.backward()
        optimizer.step()

        # Running statistics for the epoch summary.
        total_loss += loss.item()
        predicted = torch.max(out, dim=1).indices
        correct += (predicted == batch.y).sum().item()
        total += batch.y.size(0)

    # The loss is averaged per batch, the accuracy per subgraph.
    avg_loss = total_loss / len(train_loader)
    accuracy = correct / total
    print(f'Epoch {epoch + 1}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')


In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report

def evaluate_model_with_attention(model, dataset, k=3):
    """Classify whole graphs from their top-k subgraphs and report the results.

    Every graph is cut into sliding-window subgraphs (``create_subgraphs`` with
    the module-level ``window_size`` / ``step_size``). Each subgraph is scored
    by the mean of the attention coefficients the model returns for it; the
    outputs of the ``k`` highest-scoring subgraphs are averaged and the argmax
    of that average is the graph's predicted class.

    Prints the accuracy and a classification report and shows a confusion
    matrix plot. Relies on the module-level ``device`` and ``class_names``.

    Args:
        model: Model returning ``(class_scores, attention_weights)`` per input.
        dataset: Iterable of graphs, each with a scalar label ``y``.
        k: Maximum number of subgraphs aggregated per graph.

    Returns:
        None.
    """
    model.eval()
    all_predictions = []
    all_true_labels = []
    skipped = 0  # graphs that could not be evaluated (no usable subgraph)

    with torch.no_grad():
        for graph in dataset:
            subgraphs = create_subgraphs(graph, window_size, step_size)
            current_k = min(k, len(subgraphs))
            if current_k <= 0:
                skipped += 1
                continue

            subgraph_outputs = []
            subgraph_attention_scores = []
            for subgraph in subgraphs:
                subgraph = subgraph.to(device)
                output, attn_weights = model(subgraph)

                # The model may hand back a tuple/list; its last element is
                # taken as the attention tensor (for GATConv's
                # return_attention_weights the tuple is
                # (edge_index, attention_weights)).
                if isinstance(attn_weights, (tuple, list)):
                    attention_tensor = attn_weights[-1]
                else:
                    attention_tensor = attn_weights

                # NOTE(review): GATConv attention coefficients are normalised
                # over each node's incoming edges, so this mean presumably
                # tracks edge density more than learned importance. Confirm
                # this is the intended ranking signal.
                subgraph_attention_scores.append(attention_tensor.mean().item())
                subgraph_outputs.append(output.unsqueeze(0))

            subgraph_outputs = torch.cat(subgraph_outputs, dim=0)
            subgraph_attention_scores = torch.tensor(subgraph_attention_scores)

            # Select the top-k subgraphs by attention score.
            top_k_indices = subgraph_attention_scores.topk(
                current_k, dim=0, largest=True, sorted=True
            ).indices
            top_k_subgraphs = subgraph_outputs[top_k_indices.to(subgraph_outputs.device)]

            # Mean-aggregate the selected outputs. Softmax is monotonic, so
            # taking the argmax directly gives the same class as applying
            # softmax first.
            final_prediction = top_k_subgraphs.mean(dim=0)
            final_prediction_class = final_prediction.argmax(dim=1).item()

            all_predictions.append(final_prediction_class)
            all_true_labels.append(graph.y.item())

    total = len(all_true_labels)
    correct = sum(
        predicted == actual
        for predicted, actual in zip(all_predictions, all_true_labels)
    )
    accuracy = correct / total if total > 0 else 0
    print(f'Accuracy: {accuracy:.4f}')
    if skipped:
        # Surface graphs that were left out instead of dropping them silently.
        print(f'Skipped {skipped} graph(s) that produced no subgraphs')
    if total == 0:
        # Nothing was evaluated, so there is nothing to report or plot.
        return

    # Pin the label set to the known classes so that the matrix and the report
    # always line up with ``class_names``, even if a class never occurs in
    # this evaluation set.
    class_ids = list(range(len(class_names)))
    cm = confusion_matrix(all_true_labels, all_predictions, labels=class_ids)
    report = classification_report(all_true_labels, all_predictions,
                                   labels=class_ids, target_names=class_names)

    # Plot confusion matrix
    plt.figure(figsize=(10, 8))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    # One tick per class name (instead of a hard-coded count).
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=45)
    plt.yticks(tick_marks, class_names)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()

    print(report)

# Display names of the classes; evaluate_model_with_attention reads this
# module-level list when it builds its report and its plot.
class_names = ['Class ' + str(class_id) for class_id in range(dataset.num_classes)]

# Score the held-out graphs, aggregating up to 3 top-ranked subgraphs per graph.
evaluate_model_with_attention(model, test_graphs, k=3)


In [None]:
import matplotlib.pyplot as plt
import networkx as nx
import torch
from torch_geometric.utils import to_networkx

def visualize_graph(nx_graph, title, pos, color='lightblue', with_labels=False, node_size=300, edge_color='gray'):
    """
    Draw the whole ``nx_graph`` at the node positions ``pos`` in a new figure
    titled ``title`` and show it.
    """
    draw_options = dict(
        node_color=color,
        with_labels=with_labels,
        node_size=node_size,
        edge_color=edge_color,
    )
    plt.figure(figsize=(8, 6))
    nx.draw(nx_graph, pos, **draw_options)
    plt.title(title)
    plt.show()

def visualize_subgraph_highlight(nx_graph, subgraph, pos, color, title):
    """
    Draw the full graph in light gray and overlay the nodes and edges of
    ``subgraph`` in ``color``.

    ``subgraph.original_node_indices`` is used to translate the subgraph's
    local node ids back to node ids of ``nx_graph``.
    """
    plt.figure(figsize=(8, 6))

    # Background: the complete graph, greyed out.
    nx.draw(nx_graph, pos, node_color='lightgray', with_labels=False, node_size=300, edge_color='lightgray')

    # Foreground: translate the subgraph's local edges to original node ids.
    node_ids = subgraph.original_node_indices
    local_edges = subgraph.edge_index.t().tolist()  # one [src, dst] pair per edge
    highlighted_edges = [(node_ids[src], node_ids[dst]) for src, dst in local_edges]

    nx.draw_networkx_nodes(nx_graph, pos, nodelist=node_ids, node_color=color, node_size=300)
    nx.draw_networkx_edges(nx_graph, pos, edgelist=highlighted_edges, edge_color=color, width=2)

    plt.title(title)
    plt.show()




def evaluate_and_visualize_top_k_separately(model, dataset, k=3, random_seed=42):
    """Plot the top-k attention-ranked subgraphs of the first usable graph.

    Graphs are visited in order; for each one the number of sliding-window
    subgraphs is printed. The first graph that yields at least one subgraph is
    drawn in full, followed by one figure per top-k subgraph highlighted on
    the same layout. The function stops after that graph.

    Uses the module-level ``window_size``, ``step_size`` and ``device``.

    Args:
        model: Model returning ``(class_scores, attention_weights)`` per input.
        dataset: Iterable of graphs.
        k: Maximum number of subgraphs to highlight.
        random_seed: Seed for the spring layout, so every figure of the graph
            shares the same node positions.

    Returns:
        None.
    """
    model.eval()

    with torch.no_grad():
        for graph in dataset:
            subgraphs = create_subgraphs(graph, window_size, step_size)
            print(f'Number of subgraphs: {len(subgraphs)}')

            current_k = min(k, len(subgraphs))
            if current_k <= 0:
                continue

            # Only the attention score of each subgraph is needed here; the
            # class scores returned by the model are not used.
            attention_scores = []
            for subgraph in subgraphs:
                subgraph = subgraph.to(device)
                _, attn_weights = model(subgraph)

                # The model may hand back a tuple/list; its last element is
                # taken as the attention tensor (for GATConv's
                # return_attention_weights the tuple is
                # (edge_index, attention_weights)).
                if isinstance(attn_weights, (tuple, list)):
                    attention_tensor = attn_weights[-1]
                else:
                    attention_tensor = attn_weights

                attention_scores.append(attention_tensor.mean().item())

            subgraph_attention_scores = torch.tensor(attention_scores)

            # Select the top-k subgraphs by attention score.
            top_k_indices = subgraph_attention_scores.topk(
                current_k, dim=0, largest=True, sorted=True
            ).indices.tolist()

            # Build the NetworkX graph and its layout only for the graph that
            # is actually drawn; positions are generated once so that all
            # figures are consistent.
            nx_graph = to_networkx(graph, to_undirected=True)
            pos = nx.spring_layout(nx_graph, seed=random_seed)

            # original graph
            visualize_graph(nx_graph, title="Original Graph", pos=pos)

            # top-k subgraphs
            colors = ['red', 'green', 'blue']  # cycled through when k > 3
            for i, idx in enumerate(top_k_indices):
                top_subgraph = subgraphs[idx]
                visualize_subgraph_highlight(nx_graph, top_subgraph, pos, color=colors[i % len(colors)], title=f'Top-{i+1} Subgraph (Attention Score: {subgraph_attention_scores[idx]:.4f})')

            # Only the first graph with subgraphs is visualised.
            break



# Visualise the top-ranked subgraphs of a held-out graph (default k and seed).
evaluate_and_visualize_top_k_separately(
    model,
    test_graphs,
)
