In [1]:
import torch
from torch_geometric.nn import HeteroConv, SAGEConv
from torch_geometric.datasets import IMDB
import torch_geometric.transforms as T
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import torch.nn.functional as F

import numpy as np



In [2]:
# Check for CUDA availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Define the train_val_test_split function
def train_val_test_split(y, val_frac=0.2, test_frac=0.2):
    y = y.numpy().astype(int)
    train_indices, temp_indices = train_test_split(
        np.arange(len(y)),
        test_size=val_frac + test_frac,
        stratify=y
    )
    val_indices, test_indices = train_test_split(
        temp_indices,
        test_size=test_frac / (val_frac + test_frac),
        stratify=y[temp_indices]
    )
    train_mask = torch.zeros(len(y), dtype=torch.bool)
    train_mask[train_indices] = True
    val_mask = torch.zeros(len(y), dtype=torch.bool)
    val_mask[val_indices] = True
    test_mask = torch.zeros(len(y), dtype=torch.bool)
    test_mask[test_indices] = True
    return train_mask, val_mask, test_mask

In [4]:
# Load the dataset
transform = T.ToUndirected()
data = IMDB(root="/content/drive/MyDrive/IMDB", transform=transform)[0]

# Prepare the response variable
data["movie"].y[data["movie"].y == 2] = 1
data["movie"].y = data["movie"].y.to(torch.float32)

In [5]:
# Split the dataset
train_mask, val_mask, test_mask = train_val_test_split(
    y=data["movie"].y,
    val_frac=0.2,
    test_frac=0.2
)

# Assign masks to data
data["movie"].train_mask = train_mask
data["movie"].val_mask = val_mask
data["movie"].test_mask = test_mask


In [6]:

# Define the model classes
class OutputLayer(torch.nn.Module):
    def __init__(self, data, node_type="movie"):
        super().__init__()
        self.node_type = node_type
        mp_dict = {}

        for meta_step in data.metadata()[1]:
            if meta_step[2] == self.node_type:
                src_channels = data[meta_step[0]].x.shape[1]
                dst_channels = data[meta_step[2]].x.shape[1]
                mp = SAGEConv((src_channels, dst_channels), 1, aggr='mean')
                mp_dict[meta_step] = mp

        # Add self-loop convolutions for node_type
        mp = SAGEConv((data[node_type].x.shape[1], data[node_type].x.shape[1]), 1, aggr='mean')
        mp_dict[(node_type, 'self', node_type)] = mp

        self.conv = HeteroConv(mp_dict, aggr="sum")

    def forward(self, x_dict, edge_index_dict):
        return self.conv(x_dict, edge_index_dict)

class InnerLayer(torch.nn.Module):
    def __init__(self, data):
        super().__init__()
        mp_dict = {}

        for meta_step in data.metadata()[1]:
            src_channels = data[meta_step[0]].x.shape[1]
            dst_channels = data[meta_step[2]].x.shape[1]
            mp = SAGEConv((src_channels, dst_channels), dst_channels, aggr='mean')
            mp_dict[meta_step] = mp

        # Add self-loop convolutions
        for node_type in data.node_types:
            mp = SAGEConv((data[node_type].x.shape[1], data[node_type].x.shape[1]), dst_channels, aggr='mean')
            mp_dict[(node_type, 'self', node_type)] = mp

        self.conv = HeteroConv(mp_dict, aggr="sum")

    def forward(self, x_dict, edge_index_dict):
        return self.conv(x_dict, edge_index_dict)

In [7]:

class HMPNN_sum_3Layer(torch.nn.Module):
    def __init__(self, data, node_type="movie"):
        super().__init__()
        self.node_type = node_type
        self.conv1 = InnerLayer(data)
        self.conv2 = InnerLayer(data)
        self.conv3 = OutputLayer(data, node_type=self.node_type)

    def forward(self, x_dict, edge_index_dict):
        x_dict_updates = self.conv1(x_dict, edge_index_dict)
        for node_type in x_dict_updates.keys():
            x_dict[node_type] = torch.sigmoid(x_dict_updates[node_type])
        x_dict_updates = self.conv2(x_dict, edge_index_dict)
        for node_type in x_dict_updates.keys():
            x_dict[node_type] = torch.sigmoid(x_dict_updates[node_type])
        x_dict = self.conv3(x_dict, edge_index_dict)
        return x_dict[self.node_type].squeeze(-1)  # Squeeze the output

In [8]:
# Initialize model
node_type_to_classify = "movie"
learning_rate = 1e-3
weight_decay = 1e-5
min_epochs = 50
max_epochs = 50
print_learning_progress_freq = 50

# Move model to GPU
model = HMPNN_sum_3Layer(data, node_type_to_classify).to(device)

# Move data to GPU
data = data.to(device)



In [9]:
# Define the train_model function
def train_model(model, data, node_type, learning_rate, weight_decay, min_epochs, max_epochs, print_freq):
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = torch.nn.BCEWithLogitsLoss()
    train_hist = []
    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(max_epochs):
        model.train()
        optimizer.zero_grad()
        out = model(data.x_dict, data.edge_index_dict)
        loss = criterion(out[data[node_type].train_mask], data[node_type].y[data[node_type].train_mask])
        loss.backward()
        optimizer.step()
        train_hist.append(loss.item())

        if epoch % print_freq == 0:
            model.eval()
            with torch.no_grad():
                val_out = model(data.x_dict, data.edge_index_dict)
                val_loss = criterion(val_out[data[node_type].val_mask], data[node_type].y[data[node_type].val_mask])
                print(f"Epoch {epoch}, Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}")

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= 10:
                    print("Early stopping triggered.")
                    break

    return model, train_hist

In [10]:
#  Plot training history
def plot_training_hist(train_hist):
    plt.plot(train_hist)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss Over Epochs')
    plt.show()

def compute_accuracy(pred, y, mask):
    pred_labels = (pred[mask].squeeze(-1) > 0.5).float()
    correct = torch.sum(pred_labels == y[mask])
    acc = correct / mask.sum()
    return acc.item()

In [11]:
# Plot ROC curves
def plot_roc_curves(data, pred, node_type):
    from sklearn.metrics import roc_curve, auc
    pred_prob = F.sigmoid(pred[data[node_type].test_mask]).cpu().numpy()
    y_true = data[node_type].y[data[node_type].test_mask].cpu().numpy()
    fpr, tpr, _ = roc_curve(y_true, pred_prob)
    roc_auc = auc(fpr, tpr)
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# Train the model
model, train_hist = train_model(
    model,
    data,
    node_type_to_classify,
    learning_rate,
    weight_decay,
    min_epochs,
    max_epochs,
    print_learning_progress_freq,
)

In [None]:
# Plot training history
plot_training_hist(train_hist)

In [None]:
# Evaluate the model
with torch.no_grad():
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict)
    train_acc = compute_accuracy(pred, data["movie"].y, data["movie"].train_mask)
    val_acc = compute_accuracy(pred, data["movie"].y, data["movie"].val_mask)
    test_acc = compute_accuracy(pred, data["movie"].y, data["movie"].test_mask)
    print(f"Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}, Test Acc: {test_acc:.4f}")

In [None]:
# Plot ROC curves
plot_roc_curves(data, pred, node_type_to_classify)

In [23]:
data

HeteroData(
  movie={
    x=[4278, 3066],
    y=[4278],
    train_mask=[4278],
    val_mask=[4278],
    test_mask=[4278],
  },
  director={ x=[2081, 3066] },
  actor={ x=[5257, 3066] },
  (movie, to, director)={ edge_index=[2, 4278] },
  (movie, to, actor)={ edge_index=[2, 12828] },
  (director, to, movie)={ edge_index=[2, 4278] },
  (actor, to, movie)={ edge_index=[2, 12828] },
  (director, rev_to, movie)={ edge_index=[2, 4278] },
  (actor, rev_to, movie)={ edge_index=[2, 12828] },
  (movie, rev_to, director)={ edge_index=[2, 4278] },
  (movie, rev_to, actor)={ edge_index=[2, 12828] }
)

## Encoder - Decoder

In [14]:
from torch.nn import Linear
from sklearn.metrics import accuracy_score

In [15]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
from torch_geometric.nn import to_hetero
from sklearn.metrics import accuracy_score
from torch_geometric.nn import HeteroConv, NNConv, GATConv
from torch_geometric.nn.conv import GeneralConv
import torch_geometric.transforms as T


from torch import nn

In [16]:
class MLP(nn.Module):
    def __init__(self, in_channels, hidden_layers, out_channels):
        super(MLP, self).__init__()
        
        # Create a list to hold the layers
        layers = []
        
        # Input layer
        layers.append(nn.Linear(in_channels, hidden_layers[0]))
        layers.append(nn.ReLU())
        
        # Hidden layers
        for i in range(1, len(hidden_layers)):
            layers.append(nn.Linear(hidden_layers[i - 1], hidden_layers[i]))
            layers.append(nn.ReLU())
        
        # Output layer
        layers.append(nn.Linear(hidden_layers[-1], out_channels))
        
        # Combine all layers into a sequential model
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        return self.mlp(x)

In [17]:
class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)
        
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


In [18]:
class GNNEncoder2(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GATConv((-1, -1), hidden_channels, add_self_loops=False)
        self.conv2 = GATConv((-1, -1), out_channels, add_self_loops=False)
        
    def forward(self, x, edge_index, edge_attr):
        x = self.conv1(x, edge_index, edge_attr).relu()
        x = self.conv2(x, edge_index, edge_attr)
        return x

In [19]:
class GNNEncoder3(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GeneralConv((-1, -1), hidden_channels, in_edge_channels=-1)
        self.conv2 = GeneralConv((-1, -1), out_channels, in_edge_channels=-1)
        
    def forward(self, x, edge_index, edge_attr):
        x = self.conv1(x, edge_index, edge_attr).relu()
        x = self.conv2(x, edge_index, edge_attr)
        return x

In [20]:
class Decoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels, label):
        super().__init__()
        self.label = label
        hidden_layers = [128,64, 32]
        self.linear = MLP(in_channels, hidden_layers, out_channels)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, z_dict):
        z = z_dict[self.label]
        output = self.linear(z)
        return self.sigmoid(output)

In [29]:
class Model(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='mean')
        self.decoder = Decoder(hidden_channels, out_channels)
        
    def forward(self, x_dict, edge_index_dict):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict)

In [30]:
class Model2(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, label):
        super().__init__()
        self.encoder = GNNEncoder2(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='mean')
        self.decoder = Decoder(hidden_channels, out_channels, label)
        
    def forward(self, x_dict, edge_index_dict, edge_attr):
        z_dict = self.encoder(x_dict, edge_index_dict, edge_attr)
        return self.decoder(z_dict)

In [40]:
def train(num_epochs, model, optimizer, loss_fn, data, label='movie', device='cpu', patience=50):
    best_test_accuracy = 0.0
    best_test_loss = float('inf')
    patience_counter = 0
    best_state = None

    # Move data to device
    data = data.to(device)
    model = model.to(device)

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        
        # Forward pass
        out = model(data.x_dict, data.edge_index_dict, None).squeeze(1)
        
        # Get training masks and compute loss
        train_mask = data[label]['train_mask']
        train_loss = loss_fn(out[train_mask], data[label].y[train_mask])
        
        # Backward pass
        train_loss.backward()
        optimizer.step()
        
        # Compute training metrics
        with torch.no_grad():
            train_pred = out[train_mask]
            train_true = data[label].y[train_mask].cpu()
            train_pred_binary = (train_pred > 0.5).float()
            train_accuracy = accuracy_score(train_true.cpu(), train_pred_binary.cpu())

        # Evaluation phase
        model.eval()
        with torch.no_grad():
            # Forward pass for testing
            test_mask = data[label]['test_mask']
            test_pred = out[test_mask]
            test_pred_binary = (test_pred > 0.5).float()
            test_true = data[label].y[test_mask].cpu()
            
            # Compute test metrics
            test_loss = loss_fn(out[test_mask], data[label].y[test_mask])
            test_accuracy = accuracy_score(test_true.cpu(), test_pred_binary.cpu())

        # Print epoch metrics
        print(f"Epoch {epoch + 1}/{num_epochs}")
        print(f"Train Loss: {train_loss.item():.4f}, Train Accuracy: {train_accuracy:.4f}")
        print(f"Test Loss: {test_loss.item():.4f}, Test Accuracy: {test_accuracy:.4f}")
        print("----------------------------------------")

        # Check for improvement
        if test_loss < best_test_loss:
            best_test_loss = test_loss
            best_test_accuracy = test_accuracy
            best_state = model.state_dict().copy()
            patience_counter = 0
        else:
            patience_counter += 1
        
        # Early stopping
        if patience_counter >= patience:
            print(f"Early stopping triggered at epoch {epoch + 1}")
            break
    
    # Load best model
    if best_state:
        model.load_state_dict(best_state)
    return model, best_test_accuracy

In [41]:
label = 'movie'

In [42]:
model = Model2(hidden_channels=64, out_channels=1, label='movie')
optimizer = torch.optim.Adam(model.parameters())
loss_fn = torch.nn.BCELoss()

In [43]:
trained_model, best_accuracy = train(
    num_epochs=100, 
    model=model, 
    optimizer=optimizer, 
    loss_fn=loss_fn, 
    data=data,  # Your HeteroData object
    label='movie'
)

Epoch 1/100
Train Loss: 0.6655, Train Accuracy: 0.7346
Test Loss: 0.6654, Test Accuracy: 0.7348
----------------------------------------
Epoch 2/100
Train Loss: 0.6631, Train Accuracy: 0.7346
Test Loss: 0.6631, Test Accuracy: 0.7348
----------------------------------------
Epoch 3/100
Train Loss: 0.6609, Train Accuracy: 0.7346
Test Loss: 0.6609, Test Accuracy: 0.7348
----------------------------------------
Epoch 4/100
Train Loss: 0.6587, Train Accuracy: 0.7346
Test Loss: 0.6588, Test Accuracy: 0.7348
----------------------------------------
Epoch 5/100
Train Loss: 0.6565, Train Accuracy: 0.7346
Test Loss: 0.6565, Test Accuracy: 0.7348
----------------------------------------
Epoch 6/100
Train Loss: 0.6541, Train Accuracy: 0.7346
Test Loss: 0.6541, Test Accuracy: 0.7348
----------------------------------------
Epoch 7/100
Train Loss: 0.6515, Train Accuracy: 0.7346
Test Loss: 0.6516, Test Accuracy: 0.7348
----------------------------------------
Epoch 8/100
Train Loss: 0.6488, Train Acc