In [2]:
# !pip install torch-geometric

In [78]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
import numpy as np
import pandas as pd
from torch_geometric.data import Data
import ast
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score, precision_score, recall_score, f1_score
import os

In [4]:
def string_to_array(embedding_str):
    """Convert string representation of embedding to numpy array"""
    # Remove any whitespace and split by spaces
    try:
        # Clean the string and split numbers
        cleaned_str = embedding_str.strip('[]').replace('\n', ' ')
        numbers = [float(x) for x in cleaned_str.split() if x]
        return np.array(numbers)
    except:
        print(f"Error processing embedding string: {embedding_str[:100]}...")
        return None

# Load and process the data
def load_and_process_data():
    # Load the dataframe with embeddings
    print("Loading dataframe...")
    df = pd.read_csv('/kaggle/input/journal-recomm2/updated_dataframe.csv')
    
    print("Processing embeddings...")
    # Convert string representation of embeddings to numpy arrays
    df['embedding'] = df['embedding'].apply(string_to_array)
    
    # Remove any rows where embedding conversion failed
    df = df.dropna(subset=['embedding'])
    
    print("Loading node mapping...")
    # Load node mapping
    node_mapping = pd.read_csv('/kaggle/input/journal-recomm2/node_mapping.csv')
    
    print("Loading adjacency matrix...")
    # Load adjacency matrix
    matrix_data = np.load('/kaggle/input/journal-recomm2/adjacency_matrix.npz')
    adj_matrix = matrix_data['matrix']
    
    return df, node_mapping, adj_matrix

def modify_adjacency_matrix(adj_matrix):
    num_journals = 66
    
    # Set journal-journal submatrix to 1
    adj_matrix[:num_journals, :num_journals] = 1
    
    # Set diagonal to 0 (no self-loops)
    np.fill_diagonal(adj_matrix, 0)
    
    return adj_matrix

def initialize_embeddings(df, node_mapping, embedding_dim=384):
    num_nodes = len(node_mapping)
    num_journals = 66
    
    # Initialize all embeddings
    all_embeddings = np.zeros((num_nodes, embedding_dim))
    
    # For journals, initialize with small random values
    # These will be completely replaced by learned representations
    np.random.seed(42)
    journal_embeddings = np.random.normal(0, 0.01, (num_journals, embedding_dim))
    all_embeddings[:num_journals] = journal_embeddings
    
    # Fill paper embeddings from the dataframe
    for idx, row in df.iterrows():
        node_id = row['id']
        node_idx = node_mapping[node_mapping['node_id'] == node_id].index
        if len(node_idx) > 0:
            all_embeddings[node_idx[0]] = row['embedding']
    
    return all_embeddings


def create_mappings(journal_info_df):
    """
    Create mappings using journal info DataFrame
    Using node_id from journal info as the index
    """
    # Create journal name to index mapping using node_id from journal info
    journal_name_to_idx = {}
    for _, row in journal_info_df.iterrows():
        journal_name_to_idx[row['journal_name']] = row['node_id']
    return journal_name_to_idx

def create_correct_adjacency_matrix(df, journal_info_df, original_adj_matrix):
    """
    Create corrected adjacency matrix while preserving other relations
    Only update the paper-journal connections (first 66 columns)
    """
    journal_name_to_idx = create_mappings(journal_info_df)
    
    num_journals = 66
    
    # Create a copy of the original adjacency matrix to preserve other relations
    new_adj_matrix = original_adj_matrix.copy()
    
    # Zero out only the paper-journal connections
    new_adj_matrix[num_journals:, :num_journals] = 0
    new_adj_matrix[:num_journals, num_journals:] = 0
    
    print("Creating paper-journal connections...")
    connections_count = 0
    
    for _, row in df.iterrows():
        paper_id = row['id']
        journal_name = row['journal']
        
        if journal_name in journal_name_to_idx:
            # Get paper index (after journal indices)
            paper_idx = int(paper_id.replace('cs_', '')) + num_journals - 1
            journal_idx = journal_name_to_idx[journal_name]
            
            # Set connection in adjacency matrix
            new_adj_matrix[paper_idx, journal_idx] = 1
            new_adj_matrix[journal_idx, paper_idx] = 1  # Make it symmetric
            connections_count += 1
    
    return new_adj_matrix

In [17]:
# Load data
print("Loading data...")
df, node_mapping, adj_matrix = load_and_process_data()

adj_matrix = modify_adjacency_matrix(adj_matrix)
embeddings = initialize_embeddings(df, node_mapping)

journal_info_df = pd.read_csv('/kaggle/input/journal-recomm2/journal_nodes.csv')
print("Creating corrected adjacency matrix...")
adj_matrix = create_correct_adjacency_matrix(df, journal_info_df, adj_matrix)

Loading data...
Loading dataframe...
Processing embeddings...
Loading node mapping...
Loading adjacency matrix...
Creating corrected adjacency matrix...
Creating paper-journal connections...


In [19]:
def prepare_data(adj_matrix, embeddings, df, journal_info_df):
    """
    Prepare data for PyTorch Geometric with correct labels
    """
    journal_name_to_idx = create_mappings(journal_info_df)
    num_journals = 66
    
    # Create edge index
    edge_index = torch.tensor(np.array(np.where(adj_matrix > 0)), dtype=torch.long)
    
    # Convert embeddings to tensor
    embeddings_tensor = torch.tensor(embeddings, dtype=torch.float)
    
    # Create labels
    paper_labels = []
    for paper_idx in range(num_journals, adj_matrix.shape[0]):
        paper_id = f"cs_{paper_idx - num_journals + 1}"
        if paper_id in df['id'].values:
            journal_name = df[df['id'] == paper_id]['journal'].values[0]
            if journal_name in journal_name_to_idx:
                journal_idx = journal_name_to_idx[journal_name]
                paper_labels.append(journal_idx)
            else:
                print(f"Warning: Unknown journal name: {journal_name}")
                paper_labels.append(-1)
        else:
            paper_labels.append(-1)
    
    paper_labels = torch.tensor(paper_labels, dtype=torch.long)
    return edge_index, embeddings_tensor, paper_labels


Preparing data for model...


In [20]:
class JournalGCN(nn.Module):
    def __init__(self, num_journals, num_papers, embedding_dim=384):
        super(JournalGCN, self).__init__()
        
        # Learnable journal embeddings
        self.journal_embeddings = nn.Parameter(torch.randn(num_journals, embedding_dim) * 0.01)
        
        # Fixed paper embeddings (not parameters)
        self.register_buffer('paper_embeddings', torch.zeros(num_papers, embedding_dim))
        
        # GCN layers
        self.conv1 = GCNConv(embedding_dim, embedding_dim)
        self.conv2 = GCNConv(embedding_dim, embedding_dim)
        
        # Batch normalization and dropout
        self.batch_norm = nn.BatchNorm1d(embedding_dim)
        self.dropout = nn.Dropout(0.2)

    def set_paper_embeddings(self, embeddings):
        """Set the fixed paper embeddings"""
        self.paper_embeddings.copy_(embeddings)

    def forward(self, edge_index):
        # Combine journal and paper embeddings
        x = torch.cat([self.journal_embeddings, self.paper_embeddings], dim=0)
        
        # First GCN layer
        x = self.conv1(x, edge_index)
        x = self.batch_norm(x)
        x = F.relu(x)
        x = self.dropout(x)
        
        # Second GCN layer
        x = self.conv2(x, edge_index)
        
        # L2 normalize embeddings
        x = F.normalize(x, p=2, dim=1)
        
        # Split back into journal and paper embeddings
        journal_emb = x[:len(self.journal_embeddings)]
        paper_emb = x[len(self.journal_embeddings):]
        
        return journal_emb, paper_emb

In [49]:
def prepare_data_with_split(adj_matrix, embeddings, df, journal_info_df, val_size=0.1, test_size=0.1):
    """
    Prepare data for PyTorch Geometric with train/val/test splits,
    handling cases where some journals have too few papers
    """
    journal_name_to_idx = create_mappings(journal_info_df)
    num_journals = 66
    
    # Create edge index
    edge_index = torch.tensor(np.array(np.where(adj_matrix > 0)), dtype=torch.long)
    
    # Convert embeddings to tensor
    embeddings_tensor = torch.tensor(embeddings, dtype=torch.float)
    
    # Create labels and indices for papers
    paper_labels = []
    paper_indices = []
    
    for paper_idx in range(num_journals, adj_matrix.shape[0]):
        paper_id = f"cs_{paper_idx - num_journals + 1}"
        if paper_id in df['id'].values:
            journal_name = df[df['id'] == paper_id]['journal'].values[0]
            if journal_name in journal_name_to_idx:
                journal_idx = journal_name_to_idx[journal_name]
                paper_labels.append(journal_idx)
                paper_indices.append(paper_idx - num_journals)
    
    paper_labels = np.array(paper_labels)
    paper_indices = np.array(paper_indices)
    
    # Count samples per class
    unique_labels, label_counts = np.unique(paper_labels, return_counts=True)
    
    # Identify labels with enough samples for stratification (at least 3 samples)
    stratifiable_mask = np.isin(paper_labels, unique_labels[label_counts >= 3])
    
    # Split indices into stratifiable and non-stratifiable
    stratifiable_indices = paper_indices[stratifiable_mask]
    stratifiable_labels = paper_labels[stratifiable_mask]
    non_stratifiable_indices = paper_indices[~stratifiable_mask]
    
    # First split: train and temporary (val+test) for stratifiable data
    if len(stratifiable_indices) > 0:
        train_indices, temp_indices = train_test_split(
            stratifiable_indices,
            test_size=(val_size + test_size),
            stratify=stratifiable_labels,
            random_state=42
        )
        
        # Second split: val and test from temporary
        val_indices, test_indices = train_test_split(
            temp_indices,
            test_size=test_size/(val_size + test_size),
            random_state=42
        )
    else:
        train_indices = np.array([])
        val_indices = np.array([])
        test_indices = np.array([])
    
    # Handle non-stratifiable data: random split
    if len(non_stratifiable_indices) > 0:
        non_strat_train, non_strat_temp = train_test_split(
            non_stratifiable_indices,
            test_size=(val_size + test_size),
            random_state=42
        )
        
        non_strat_val, non_strat_test = train_test_split(
            non_strat_temp,
            test_size=test_size/(val_size + test_size),
            random_state=42
        )
        
        # Combine stratifiable and non-stratifiable splits
        train_indices = np.concatenate([train_indices, non_strat_train])
        val_indices = np.concatenate([val_indices, non_strat_val])
        test_indices = np.concatenate([test_indices, non_strat_test])
    
    # Create masks
    train_mask = torch.zeros(len(paper_indices), dtype=torch.bool)
    val_mask = torch.zeros(len(paper_indices), dtype=torch.bool)
    test_mask = torch.zeros(len(paper_indices), dtype=torch.bool)
    
    # Convert indices to positions in the original array
    train_positions = np.where(np.isin(paper_indices, train_indices))[0]
    val_positions = np.where(np.isin(paper_indices, val_indices))[0]
    test_positions = np.where(np.isin(paper_indices, test_indices))[0]
    
    train_mask[train_positions] = True
    val_mask[val_positions] = True
    test_mask[test_positions] = True
    
    # Convert labels to tensor
    paper_labels = torch.tensor(paper_labels, dtype=torch.long)
    
    # Print split statistics
    print(f"\nSplit Statistics:")
    print(f"Total samples: {len(paper_labels)}")
    print(f"Train samples: {train_mask.sum().item()}")
    print(f"Validation samples: {val_mask.sum().item()}")
    print(f"Test samples: {test_mask.sum().item()}")
    
    return edge_index, embeddings_tensor, paper_labels, train_mask, val_mask, test_mask

In [50]:
# Replace the original data preparation with:
edge_index, embeddings_tensor, paper_labels, train_mask, val_mask, test_mask = prepare_data_with_split(
    adj_matrix, 
    embeddings,
    df,
    journal_info_df
)

# Create model
num_journals = 66
num_papers = len(paper_labels)
model = JournalGCN(num_journals, num_papers)

# Set paper embeddings
paper_embeddings = embeddings_tensor[num_journals:]
model.set_paper_embeddings(paper_embeddings)


Split Statistics:
Total samples: 14012
Train samples: 11209
Validation samples: 1401
Test samples: 1402


In [59]:
def train_model_with_validation(model, edge_index, paper_labels, train_mask, val_mask, num_epochs=500, lr=0.01):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    edge_index = edge_index.to(device)
    paper_labels = paper_labels.to(device)
    train_mask = train_mask.to(device)
    val_mask = val_mask.to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=0.01)
    
    best_val_acc = 0
    best_model_state = None
    patience = 50
    counter = 0
    
    for epoch in range(num_epochs):
        # Training
        model.train()
        optimizer.zero_grad()
        
        journal_embeddings, paper_embeddings = model(edge_index)
        similarities = torch.mm(paper_embeddings, journal_embeddings.t())
        temperature = 0.1
        logits = similarities / temperature
        
        # Calculate loss only on training set
        train_loss = F.cross_entropy(logits[train_mask], paper_labels[train_mask])
        
        train_loss.backward()
        optimizer.step()
        
        # Evaluation
        model.eval()
        with torch.no_grad():
            # Training accuracy
            train_pred = torch.argmax(similarities[train_mask], dim=1)
            train_acc = (train_pred == paper_labels[train_mask]).float().mean()
            
            # Validation accuracy
            val_pred = torch.argmax(similarities[val_mask], dim=1)
            val_acc = (val_pred == paper_labels[val_mask]).float().mean()
            
            # Save best model
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                best_model_state = model.state_dict()
                counter = 0
            else:
                counter += 1
            
            # Early stopping
            if counter >= patience:
                print(f"Early stopping at epoch {epoch}")
                break
        
        if (epoch + 1) % 10 == 0:
            print(f'Epoch {epoch+1}/{num_epochs}:')
            print(f'Train Loss: {train_loss.item():.4f}')
            print(f'Train Accuracy: {train_acc.item():.4f}')
            print(f'Validation Accuracy: {val_acc.item():.4f}')
    
    # Load best model
    model.load_state_dict(best_model_state)
    return model

In [60]:
# Replace the original training call with:
trained_model = train_model_with_validation(
    model, 
    edge_index, 
    paper_labels, 
    train_mask, 
    val_mask
)

Epoch 10/500:
Train Loss: 2.3903
Train Accuracy: 0.3761
Validation Accuracy: 0.3555
Epoch 20/500:
Train Loss: 1.7582
Train Accuracy: 0.5596
Validation Accuracy: 0.5175
Epoch 30/500:
Train Loss: 1.7611
Train Accuracy: 0.5544
Validation Accuracy: 0.5111
Epoch 40/500:
Train Loss: 1.3083
Train Accuracy: 0.6649
Validation Accuracy: 0.6517
Epoch 50/500:
Train Loss: 1.5415
Train Accuracy: 0.6173
Validation Accuracy: 0.5846
Epoch 60/500:
Train Loss: 1.1982
Train Accuracy: 0.7283
Validation Accuracy: 0.6959
Epoch 70/500:
Train Loss: 1.0674
Train Accuracy: 0.7478
Validation Accuracy: 0.7302
Epoch 80/500:
Train Loss: 1.1019
Train Accuracy: 0.7459
Validation Accuracy: 0.7116
Epoch 90/500:
Train Loss: 1.0384
Train Accuracy: 0.7338
Validation Accuracy: 0.7088
Epoch 100/500:
Train Loss: 1.0650
Train Accuracy: 0.7539
Validation Accuracy: 0.7402
Epoch 110/500:
Train Loss: 1.0417
Train Accuracy: 0.7659
Validation Accuracy: 0.7302
Epoch 120/500:
Train Loss: 0.9960
Train Accuracy: 0.7771
Validation Accura

In [62]:
# MODIFIED FUNCTION - Replaced original evaluate_model
def evaluate_model_split(model, edge_index, paper_labels, mask, k=5):
    """Evaluate model with top-k accuracy on a specific split"""
    device = next(model.parameters()).device
    
    edge_index = edge_index.to(device)
    paper_labels = paper_labels.to(device)
    mask = mask.to(device)
    
    model.eval()
    with torch.no_grad():
        journal_embeddings, paper_embeddings = model(edge_index)
        similarities = torch.mm(paper_embeddings, journal_embeddings.t())
        
        # Get top-k predictions for masked samples
        _, top_k_indices = torch.topk(similarities[mask], k, dim=1)
        correct = torch.any(top_k_indices == paper_labels[mask].unsqueeze(1), dim=1)
        accuracy = correct.float().mean()
        
        return accuracy.item()

# Separate function for final evaluation on test set
def final_evaluation(model, edge_index, paper_labels, test_mask):
    """
    Perform final evaluation on test set with different k values
    Should only be called once after training is complete
    """
    print("\nFinal Test Set Evaluation:")
    metrics = {}
    for k in [1, 3, 5, 10]:
        acc = evaluate_model_split(model, edge_index, paper_labels, test_mask, k=k)
        metrics[f'top_{k}_accuracy'] = acc
        print(f"Top-{k} Accuracy: {acc:.4f}")
    return metrics

In [64]:
print("Performing final evaluation on test set...")
test_metrics = final_evaluation(trained_model, edge_index, paper_labels, test_mask)

Performing final evaluation on test set...

Final Test Set Evaluation:
Top-1 Accuracy: 0.8466
Top-3 Accuracy: 0.9287
Top-5 Accuracy: 0.9501
Top-10 Accuracy: 0.9772


In [83]:
def compute_metrics(model, edge_index, paper_labels, mask, k_values=[1, 3, 5, 10]):
    """
    Compute comprehensive evaluation metrics
    
    Args:
        model: Trained GCN model
        edge_index: Edge indices tensor
        paper_labels: Ground truth journal labels
        mask: Mask for selecting evaluation samples
        k_values: List of k values for top-k metrics
    
    Returns:
        dict: Dictionary containing all computed metrics
    """
    device = next(model.parameters()).device
    edge_index = edge_index.to(device)
    paper_labels = paper_labels.to(device)
    mask = mask.to(device)
    
    model.eval()
    metrics = {}
    
    with torch.no_grad():
        # Get model predictions
        journal_embeddings, paper_embeddings = model(edge_index)
        similarities = torch.mm(paper_embeddings[mask], journal_embeddings.t())
        
        # Convert to probabilities
        probabilities = F.softmax(similarities, dim=1)
        
        # Get ground truth labels for masked samples
        true_labels = paper_labels[mask].cpu().numpy()
        prob_scores = probabilities.cpu().numpy()
        
        # Create one-hot encoded ground truth for NDCG
        n_samples = len(true_labels)
        n_classes = prob_scores.shape[1]
        y_true = np.zeros((n_samples, n_classes))
        y_true[np.arange(n_samples), true_labels] = 1
        
        # Compute NDCG for different k values
        for k in k_values:
            ndcg = ndcg_score(y_true, prob_scores, k=k)
            metrics[f'ndcg@{k}'] = ndcg
        
        # Compute Top-k accuracy
        for k in k_values:
            _, top_k_indices = torch.topk(similarities, k, dim=1)
            correct = torch.any(top_k_indices == paper_labels[mask].unsqueeze(1), dim=1)
            accuracy = correct.float().mean().item()
            metrics[f'top_{k}_accuracy'] = accuracy
        
        # Compute MRR (Mean Reciprocal Rank)
        rankings = (similarities.argsort(dim=1, descending=True) == paper_labels[mask].unsqueeze(1)).nonzero()
        mrr = (1.0 / (rankings[:, 1].float() + 1)).mean().item()
        metrics['mrr'] = mrr
        
        # Compute MAP (Mean Average Precision)
        ap_sum = 0
        for i, label in enumerate(paper_labels[mask]):
            ranking = similarities[i].argsort(descending=True)
            rank = (ranking == label).nonzero().item() + 1
            ap_sum += 1.0 / rank
        map_score = ap_sum / len(paper_labels[mask])
        metrics['map'] = map_score
        
        # Compute Coverage
        _, top_k_indices = torch.topk(similarities, max(k_values), dim=1)
        unique_recommendations = len(torch.unique(top_k_indices))
        coverage = unique_recommendations / len(journal_embeddings)
        metrics['coverage'] = coverage
        
        # Compute Average Rank of True Label
        ranks = []
        for i, label in enumerate(paper_labels[mask]):
            ranking = similarities[i].argsort(descending=True)
            rank = (ranking == label).nonzero().item() + 1
            ranks.append(rank)
        avg_rank = np.mean(ranks)
        metrics['average_rank'] = avg_rank
        
    return metrics

In [84]:
def evaluate_recommendations(model, edge_index, paper_labels, masks):
    """
    Evaluate model performance on all data splits
    
    Args:
        model: Trained GCN model
        edge_index: Edge indices tensor
        paper_labels: Ground truth journal labels
        masks: Dictionary containing train/val/test masks
    """
    k_values = list(range(1, 11))  # k from 1 to 10
    
    results = {}
    for split_name, mask in masks.items():
        print(f"\n{split_name} Set Metrics:")
        metrics = compute_metrics(model, edge_index, paper_labels, mask, k_values)
        
        # Print metrics in a formatted way
        print("\nTop-k Accuracy:")
        for k in k_values:
            print(f"Top-{k}: {metrics[f'top_{k}_accuracy']:.4f}")
        
        print("\nNDCG Scores:")
        for k in k_values:
            print(f"NDCG@{k}: {metrics[f'ndcg@{k}']:.4f}")
        
        print("\nOther Metrics:")
        print(f"MRR: {metrics['mrr']:.4f}")
        print(f"MAP: {metrics['map']:.4f}")
        print(f"Coverage: {metrics['coverage']:.4f}")
        
        results[split_name] = metrics
    
    return results

In [85]:

# After training the model
masks = {
    'Train': train_mask,
    'Validation': val_mask,
    'Test': test_mask
}

evaluation_results = evaluate_recommendations(
    model=trained_model,
    edge_index=edge_index,
    paper_labels=paper_labels,
    masks=masks
)


Train Set Metrics:

Top-k Accuracy:
Top-1: 0.8508
Top-2: 0.9030
Top-3: 0.9300
Top-4: 0.9437
Top-5: 0.9524
Top-6: 0.9588
Top-7: 0.9633
Top-8: 0.9675
Top-9: 0.9723
Top-10: 0.9761

NDCG Scores:
NDCG@1: 0.8508
NDCG@2: 0.8838
NDCG@3: 0.8972
NDCG@4: 0.9032
NDCG@5: 0.9065
NDCG@6: 0.9088
NDCG@7: 0.9103
NDCG@8: 0.9116
NDCG@9: 0.9131
NDCG@10: 0.9142

Other Metrics:
MRR: 0.8958
MAP: 0.8958
Coverage: 1.0000

Validation Set Metrics:

Top-k Accuracy:
Top-1: 0.8430
Top-2: 0.8936
Top-3: 0.9179
Top-4: 0.9293
Top-5: 0.9415
Top-6: 0.9500
Top-7: 0.9557
Top-8: 0.9607
Top-9: 0.9636
Top-10: 0.9700

NDCG Scores:
NDCG@1: 0.8430
NDCG@2: 0.8749
NDCG@3: 0.8871
NDCG@4: 0.8920
NDCG@5: 0.8967
NDCG@6: 0.8997
NDCG@7: 0.9016
NDCG@8: 0.9032
NDCG@9: 0.9041
NDCG@10: 0.9059

Other Metrics:
MRR: 0.8875
MAP: 0.8875
Coverage: 1.0000

Test Set Metrics:

Top-k Accuracy:
Top-1: 0.8466
Top-2: 0.9016
Top-3: 0.9287
Top-4: 0.9401
Top-5: 0.9501
Top-6: 0.9586
Top-7: 0.9665
Top-8: 0.9686
Top-9: 0.9750
Top-10: 0.9772

NDCG Scores:
NDCG

In [71]:
def save_model_outputs(model, edge_index, test_mask, journal_info_df, save_dir='outputs'):
    """
    Save journal embeddings and test indices with proper device management
    """
    import os
    
    # Create output directory if it doesn't exist
    os.makedirs(save_dir, exist_ok=True)
    
    # Determine device
    device = next(model.parameters()).device
    
    # Move tensors to the same device as the model
    edge_index = edge_index.to(device)
    test_mask = test_mask.to(device)
    
    # Get journal embeddings
    model.eval()
    with torch.no_grad():
        journal_embeddings, _ = model(edge_index)
        # Move embeddings to CPU before converting to numpy
        journal_embeddings = journal_embeddings.cpu().numpy()
    
    # Create DataFrame with journal embeddings
    journal_embeddings_df = pd.DataFrame(
        journal_embeddings,
        columns=[f'dim_{i}' for i in range(journal_embeddings.shape[1])]
    )
    
    # Add journal information
    journal_embeddings_df['journal_name'] = journal_info_df['journal_name'].values
    journal_embeddings_df['node_id'] = journal_info_df['node_id'].values
    
    # Save journal embeddings
    embeddings_path = os.path.join(save_dir, 'journal_embeddings.csv')
    journal_embeddings_df.to_csv(embeddings_path, index=False)
    print(f"Saved journal embeddings to {embeddings_path}")
    
    # Get and save test indices (move to CPU first)
    test_indices = torch.where(test_mask.cpu())[0].numpy()
    indices_path = os.path.join(save_dir, 'test_indices.npy')
    np.save(indices_path, test_indices)
    print(f"Saved test indices to {indices_path}")
    
    # Save test indices with paper IDs
    test_indices_df = pd.DataFrame({
        'index': test_indices,
        'paper_id': [f'cs_{idx - 65}' for idx in test_indices]  # -65 because first 66 nodes are journals
    })
    indices_csv_path = os.path.join(save_dir, 'test_indices.csv')
    test_indices_df.to_csv(indices_csv_path, index=False)
    print(f"Saved test indices with paper IDs to {indices_csv_path}")
    
    return journal_embeddings_df, test_indices, test_indices_df

In [73]:
# After training the model
x,y,z = save_model_outputs(
    model=trained_model,
    edge_index=edge_index,
    test_mask=test_mask,
    journal_info_df=journal_info_df,
    save_dir='/kaggle/working/'
)

Saved journal embeddings to /kaggle/working/journal_embeddings.csv
Saved test indices to /kaggle/working/test_indices.npy
Saved test indices with paper IDs to /kaggle/working/test_indices.csv


In [74]:

# To load the saved files later:
def load_saved_outputs(load_dir='outputs'):
    """
    Load the saved journal embeddings and test indices
    """
    # Load journal embeddings
    embeddings_path = os.path.join(load_dir, 'journal_embeddings.csv')
    journal_embeddings_df = pd.read_csv(embeddings_path)
    
    # Load test indices
    indices_path = os.path.join(load_dir, 'test_indices.npy')
    test_indices = np.load(indices_path)
    
    # Load test indices with paper IDs
    indices_csv_path = os.path.join(load_dir, 'test_indices.csv')
    test_indices_df = pd.read_csv(indices_csv_path)
    
    return journal_embeddings_df, test_indices, test_indices_df

In [77]:
# Example of loading:

journal_embeddings_df, test_indices, test_indices_df = load_saved_outputs('/kaggle/working/')