In [1]:
import torch
TORCH = torch.__version__.split('+')[0]
CUDA = 'cu' + torch.version.cuda.replace('.', '')

# 2. Install torch-scatter, torch-sparse, and finally, torch-geometric
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-geometric

Looking in links: https://data.pyg.org/whl/torch-2.9.0+cu126.html
Collecting torch-scatter
  Downloading torch_scatter-2.1.2.tar.gz (108 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: torch-scatter
  Building wheel for torch-scatter (setup.py) ... [?25l[?25hdone
  Created wheel for torch-scatter: filename=torch_scatter-2.1.2-cp312-cp312-linux_x86_64.whl size=664339 sha256=9f3fc469fdcec0327dbe5c356e25c8ba431ecf28e185fc9d7438d80fa8af021b
  Stored in directory: /root/.cache/pip/wheels/84/20/50/44800723f57cd798630e77b3ec83bc80bd26a1e3dc3a672ef5
Successfully built torch-scatter
Installing collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2
Collecting torch-geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [2]:
import torch
from torch_geometric.datasets import Planetoid
import networkx as nx
from torch_geometric.utils import to_networkx
import pandas as pd
import numpy as np
dataset = Planetoid(root='./data/Cora', name='Cora')
data = dataset[0]
G = to_networkx(data, to_undirected=True)

print(f"Nodes: {data.num_nodes}")
print(f"Features: {data.num_node_features}")
print(f"Classes: {dataset.num_classes}")

edge_index = data.edge_index


num_edges_to_print = min(5, edge_index.shape[1])

# Print the total number of edges
print(f"\nTotal number of directed edges: {data.num_edges}")


Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...


Nodes: 2708
Features: 1433
Classes: 7

Total number of directed edges: 10556


Done!


In [3]:

degree_centrality = nx.degree_centrality(G)

closeness_centrality = nx.closeness_centrality(G)

betweenness_centrality = nx.betweenness_centrality(G)

node_properties_df = pd.DataFrame({
    'node_id': list(G.nodes()),
    'degree_centrality': [degree_centrality[n] for n in G.nodes()],
    'closeness_centrality': [closeness_centrality[n] for n in G.nodes()],
    'betweenness_centrality': [betweenness_centrality[n] for n in G.nodes()]
}).set_index('node_id')

print("Sample Node Properties:")
print(node_properties_df.head())

Sample Node Properties:
         degree_centrality  closeness_centrality  betweenness_centrality
node_id                                                                 
0                 0.001108              0.144255            9.766154e-07
1                 0.001108              0.151453            1.080477e-03
2                 0.001847              0.179168            4.050816e-03
3                 0.000369              0.000369            0.000000e+00
4                 0.001847              0.153266            5.511762e-04


In [4]:
# --- 1. Calculate and Add Rank to DataFrame ---

# The TANS method provides the rank (percentile) of each property
# among all nodes to the LLM (Table 2 in the paper).
for col in node_properties_df.columns:
    # Use .rank(pct=True) to calculate percentile rank (0 to 1)
    # Multiply by 100 to get the percentage rank (0 to 100)
    node_properties_df[f'{col}_rank'] = node_properties_df[col].rank(pct=True) * 100

print("\nSample Node Properties with Ranks:")
print(node_properties_df[['degree_centrality', 'degree_centrality_rank']].head())


Sample Node Properties with Ranks:
         degree_centrality  degree_centrality_rank
node_id                                           
0                 0.001108               49.667651
1                 0.001108               49.667651
2                 0.001847               79.431315
3                 0.000369                8.973412
4                 0.001847               79.431315


In [5]:
# --- 2. Helper Functions for Text Retrieval ---

def get_original_text(node_id):
    """
    Placeholder: In a full TANS implementation, this reads the actual title
    and abstract text from the raw Cora dataset files.
    """
    # Using a generic placeholder for demonstration
    return "A paper discussing graph convolutional networks and deep learning for node classification. This is the abstract content."


def get_neighbor_texts(graph, node_id, num_neighbors=5):
    """
    Placeholder: Fetches the text (e.g., titles/abstracts) of k=5 neighboring nodes.

    The paper specifies randomly selecting k=5 neighbors to provide additional context
    (Prompt 3: Optional Neighbor Text).
    """
    neighbors = list(graph.neighbors(node_id))

    if not neighbors:
        return "No connected nodes found."

    # Randomly select up to num_neighbors texts
    selected_neighbors = np.random.choice(neighbors, min(len(neighbors), num_neighbors), replace=False)

    neighbor_descriptions = []
    for n in selected_neighbors:
        # Recursively call the original text function for the neighbor
        neighbor_descriptions.append(f"Node {n}: {get_original_text(n)[:40]}...")

    return "\n".join(neighbor_descriptions)

In [6]:
# --- 3. Final TANS Prompt Generation Function (Steps 2 & 3) ---

def generate_tans_prompt(graph, node_id, properties_df, classes):
    """
    Generates the complete, structured TANS prompt using all information.
    """
    # Retrieve properties and ranks
    degree = properties_df.loc[node_id, 'degree_centrality']
    rank_degree = properties_df.loc[node_id, 'degree_centrality_rank']

    # 1. Get original text (Prompt 2)
    original_text = get_original_text(node_id)

    # 2. Get neighbor texts (Prompt 3)
    neighbor_texts = get_neighbor_texts(graph, node_id, num_neighbors=5)

    # 3. Construct the full prompt (Prefix, Text, Neighbor, Property, Suffix)
    prompt = f"""
Given a node from a citation network graph, where the node type is paper.
The original node description is: "{original_text}".

The following are the textual information of 5 connected nodes. The descriptions are:
{neighbor_texts}

Node Properties:
- Degree Centrality value: {degree:.4f}, ranked as {rank_degree:.2f}% among all nodes.
- Closeness Centrality value: {properties_df.loc[node_id, 'closeness_centrality']:.4f}.
- Betweenness Centrality value: {properties_df.loc[node_id, 'betweenness_centrality']:.4f}.

Output the potential class of the node among the following classes: {classes}.
Provide reasons for your assessment. Your answer should be less than 200 words.
"""
    return prompt.strip()

In [7]:
# Example call with the required arguments
classes_cora = ["Neural Networks", "Probabilistic Methods", "Genetic Algorithms", "Theory", "Case Based", "Reinforcement Learning", "Rule Learning"]
sample_node_id = list(G.nodes())[100] # Assuming G is defined from your previous steps

final_prompt = generate_tans_prompt(
    G, # Pass the NetworkX graph
    sample_node_id,
    node_properties_df,
    classes_cora
)
print("\n--- Final Generated TANS Prompt Example ---")
print(final_prompt)


--- Final Generated TANS Prompt Example ---
Given a node from a citation network graph, where the node type is paper.
The original node description is: "A paper discussing graph convolutional networks and deep learning for node classification. This is the abstract content.".

The following are the textual information of 5 connected nodes. The descriptions are:
Node 1602: A paper discussing graph convolutional n...
Node 2056: A paper discussing graph convolutional n...

Node Properties:
- Degree Centrality value: 0.0007, ranked as 28.69% among all nodes.
- Closeness Centrality value: 0.1419.
- Betweenness Centrality value: 0.0001.

Output the potential class of the node among the following classes: ['Neural Networks', 'Probabilistic Methods', 'Genetic Algorithms', 'Theory', 'Case Based', 'Reinforcement Learning', 'Rule Learning']. 
Provide reasons for your assessment. Your answer should be less than 200 words.


In [None]:
# Install the necessary library (if not already done)
# pip install google-genai

import os
from google import genai
from google.genai.errors import APIError

# --- IMPORTANT: Set your API Key ---
# It's best practice to load your API key from an environment variable.
os.environ['GEMINI_API_KEY'] = 'YOUR API KEY HERE'
client = genai.Client()
# Assuming the client is initialized globally or passed in
def query_llm_and_generate_description_gemini(prompt):
    """
    Calls the Gemini API with the TANS prompt and returns the generated text.
    """
    try:
        # Initialize client inside if not done globally
        client = genai.Client()

        # Call the Gemini API
        response = client.models.generate_content(
            model='gemini-2.5-flash', # Use a capable model like flash or pro
            contents=prompt
        )

        # The TANS explanation is the generated text
        llm_explanation = response.text
        return llm_explanation

    except APIError as e:
        print(f"Gemini API Error: {e}")
        return "Error: Could not generate description due to API error."
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return "Error: An unexpected error occurred."

# Example Usage (replace the placeholder call):
llm_generated_text = query_llm_and_generate_description_gemini(final_prompt)
print(f"Gemini-Generated TANS Description:\n{llm_generated_text}")

Gemini-Generated TANS Description:
The potential class of the node is **Neural Networks**.

**Reasons:**

1.  **Node Description:** The paper explicitly discusses "graph convolutional networks" and "deep learning." Graph Convolutional Networks (GCNs) are a prominent type of neural network designed for graph-structured data, and "deep learning" is the core paradigm within the field of neural networks.
2.  **Connected Nodes:** The descriptions of connected nodes (Node 1602, Node 2056) also mention "graph convolutional n...", reinforcing the focus on this specific neural network architecture.
3.  **Class Fit:** Among the given options, "Neural Networks" is the direct and most accurate classification for content involving "graph convolutional networks" and "deep learning." The other classes (Probabilistic Methods, Genetic Algorithms, etc.) do not align with these specific technical terms.

The centrality measures describe the node's position in the network but do not provide direct evidenc

In [9]:
import torch
from torch_geometric.datasets import Planetoid
import networkx as nx
from torch_geometric.utils import to_networkx
import pandas as pd
import numpy as np

# Load the Pubmed dataset
print("--- Loading Pubmed Dataset ---")
dataset = Planetoid(root='./data/Pubmed', name='Pubmed')
data = dataset[0]
print(f"Nodes: {data.num_nodes}, Original Features: {data.num_node_features}, Classes: {dataset.num_classes}")

# Convert to NetworkX
G = to_networkx(data, to_undirected=True)

# Calculate Centralities (Required TANS Step 1)
# Note: This is a slow step (especially Betweenness) and is conceptually similar to Cora.
degree_centrality = nx.degree_centrality(G)
closeness_centrality = nx.closeness_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)

# Create DataFrame
node_properties_df = pd.DataFrame({
    'node_id': list(G.nodes()),
    'degree_centrality': [degree_centrality[n] for n in G.nodes()],
    'closeness_centrality': [closeness_centrality[n] for n in G.nodes()],
    'betweenness_centrality': [betweenness_centrality[n] for n in G.nodes()]
}).set_index('node_id')

# Calculate Ranks (Required TANS Step 2 for Prompting)
for col in node_properties_df.columns:
    node_properties_df[f'{col}_rank'] = node_properties_df[col].rank(pct=True) * 100

# --- Global Variables for Pubmed Prompting ---
# Pubmed has 3 classes (e.g., specific types of diabetic papers)
classes_pubmed = ["Experimental Diabetes", "Diabetes Mellitus", "Type 1 Diabetes"] # Placeholder for actual class names
sample_node_id = list(G.nodes())[500]
sample_text = "A paper discussing a novel finding related to insulin resistance in mice."

print("Pubmed setup complete.")

--- Loading Pubmed Dataset ---


Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.pubmed.test.index
Processing...
Done!


Nodes: 19717, Original Features: 500, Classes: 3
Pubmed setup complete.


In [10]:
# --- USA Airport Network (Text-Free) Setup ---

# This dataset is not part of Planetoid. You would need to load it from a source
# like the official TANS repository or the original data source (e.g., using NetworkX
# after downloading the edge list).
# --- Conceptual Loading ---
# G_usa = nx.read_edgelist('usa.edgelist', nodetype=int)

# --- MOCKING DATA FOR CONTINUITY ---
# Since direct loading is complex, we mock a small text-free graph for demonstration.
G_usa_mock = nx.random_geometric_graph(n=1190, radius=0.1) # Mock USA graph (1,190 nodes)

# Calculate Centralities
degree_centrality_usa = nx.degree_centrality(G_usa_mock)
# ... (rest of centrality calculations)

# --- Global Variables for USA Prompting ---
# Airport classes relate to activity level
classes_usa = ["High Activity", "Moderate Activity", "Moderately Low Activity", "Low Activity"] # 4 classes
sample_node_id_usa = list(G_usa_mock.nodes())[50]
sample_text_usa = "" # CRITICAL: Text-free means the original text is empty.

# When generating the prompt for text-free graphs,
# you use the 'generate_tans_prompt' function but pass an empty string
# for 'original_text' and the 'get_neighbor_texts' function should be adapted
# to return "No textual descriptions available" for its output.

In [12]:
# --- Re-using the prompt logic with Text-Free adaptation ---

def get_original_text_text_free(node_id):
    """Returns empty string for text-free graphs."""
    return ""

def get_neighbor_texts_text_free(graph, node_id, num_neighbors=5):
    """Returns a placeholder indicating no neighbor text exists."""
    return "No textual descriptions available for connected nodes."


# --- Example Prompt for a Text-Free Graph (USA) ---

# 1. Extract node properties as a *DataFrame*, not a Series
high_activity_props = node_properties_df.loc[[sample_node_id_usa]].copy()

# 2. Add new values safely
high_activity_props.loc[sample_node_id_usa, "degree_centrality"] = 0.1749
high_activity_props.loc[sample_node_id_usa, "degree_centrality_rank"] = 99.58  # mock rank

# 3. Now generate the prompt using the retained 2D structure
final_prompt_usa = generate_tans_prompt(
    G_usa_mock,
    sample_node_id_usa,
    high_activity_props,   # <-- now a DataFrame
    classes_usa
)

print("\n--- Final Generated TANS Prompt Example (USA - Text-Free) ---")
print(final_prompt_usa)



--- Final Generated TANS Prompt Example (USA - Text-Free) ---
Given a node from a citation network graph, where the node type is paper.
The original node description is: "A paper discussing graph convolutional networks and deep learning for node classification. This is the abstract content.".

The following are the textual information of 5 connected nodes. The descriptions are:
Node 231: A paper discussing graph convolutional n...
Node 128: A paper discussing graph convolutional n...
Node 427: A paper discussing graph convolutional n...
Node 205: A paper discussing graph convolutional n...
Node 24: A paper discussing graph convolutional n...

Node Properties:
- Degree Centrality value: 0.1749, ranked as 99.58% among all nodes.
- Closeness Centrality value: 0.1458.
- Betweenness Centrality value: 0.0000.

Output the potential class of the node among the following classes: ['High Activity', 'Moderate Activity', 'Moderately Low Activity', 'Low Activity']. 
Provide reasons for your assess