In [93]:
import torch
TORCH = torch.__version__.split('+')[0]
CUDA = 'cu' + torch.version.cuda.replace('.', '')

# 2. Install torch-scatter, torch-sparse, and finally, torch-geometric
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{TORCH}+{CUDA}.html
!pip install torch-geometric

Looking in links: https://data.pyg.org/whl/torch-2.9.0+cu126.html


In [94]:
import torch
from torch_geometric.datasets import Planetoid
import networkx as nx
from torch_geometric.utils import to_networkx
import pandas as pd
import numpy as np
dataset = Planetoid(root='./data/Cora', name='Cora')
data = dataset[0]
G = to_networkx(data, to_undirected=True)

print(f"Nodes: {data.num_nodes}")
print(f"Features: {data.num_node_features}")
print(f"Classes: {dataset.num_classes}")

edge_index = data.edge_index


num_edges_to_print = min(5, edge_index.shape[1])

# Print the total number of edges
print(f"\nTotal number of directed edges: {data.num_edges}")


Nodes: 2708
Features: 1433
Classes: 7

Total number of directed edges: 10556


In [95]:

degree_centrality = nx.degree_centrality(G)

closeness_centrality = nx.closeness_centrality(G)

betweenness_centrality = nx.betweenness_centrality(G)

node_properties_df = pd.DataFrame({
    'node_id': list(G.nodes()),
    'degree_centrality': [degree_centrality[n] for n in G.nodes()],
    'closeness_centrality': [closeness_centrality[n] for n in G.nodes()],
    'betweenness_centrality': [betweenness_centrality[n] for n in G.nodes()]
}).set_index('node_id')

print("Sample Node Properties:")
print(node_properties_df.head())

Sample Node Properties:
         degree_centrality  closeness_centrality  betweenness_centrality
node_id                                                                 
0                 0.001108              0.144255            9.766154e-07
1                 0.001108              0.151453            1.080477e-03
2                 0.001847              0.179168            4.050816e-03
3                 0.000369              0.000369            0.000000e+00
4                 0.001847              0.153266            5.511762e-04


In [96]:
# --- 1. Calculate and Add Rank to DataFrame ---

# The TANS method provides the rank (percentile) of each property
# among all nodes to the LLM (Table 2 in the paper).
for col in node_properties_df.columns:
    # Use .rank(pct=True) to calculate percentile rank (0 to 1)
    # Multiply by 100 to get the percentage rank (0 to 100)
    node_properties_df[f'{col}_rank'] = node_properties_df[col].rank(pct=True) * 100

print("\nSample Node Properties with Ranks:")
print(node_properties_df[['degree_centrality', 'degree_centrality_rank']].head())


Sample Node Properties with Ranks:
         degree_centrality  degree_centrality_rank
node_id                                           
0                 0.001108               49.667651
1                 0.001108               49.667651
2                 0.001847               79.431315
3                 0.000369                8.973412
4                 0.001847               79.431315


In [97]:
import pandas as pd
import networkx as nx

# Load node data
cora_content = pd.read_csv("/content/cora/cora.content", sep="\t", header=None)

# Last column is the class label
cora_content.columns = ["id"] + [f"w{i}" for i in range(1, cora_content.shape[1]-1)] + ["class"]

# Load edge list (citations)
edges = pd.read_csv("/content/cora/cora.cites", sep="\t", header=None, names=["source", "target"])

# Build directed citation graph
G = nx.from_pandas_edgelist(edges, source="source", target="target", create_using=nx.DiGraph())

def bow_to_text(row):
    words = [col for col in row.index if col.startswith("w") and row[col] == 1]
    return " ".join(words)

node_text = {
    row["id"]: bow_to_text(row)  # 'id' must match G node IDs
    for _, row in cora_content.iterrows()
}

def get_original_text(node_id):
    """
    Placeholder: In a full TANS implementation, this reads the actual title
    and abstract text from the raw Cora dataset files.
    """
    # Using a generic placeholder for demonstration
    if node_id not in node_text:
      return "No text found for this node."
    return node_text[node_id]

In [98]:
common_nodes = list(set(G.nodes()).intersection(node_text.keys()))
sample_node_id = common_nodes[0]

In [99]:
import numpy as np

def get_neighbor_texts(graph, node_id, num_neighbors=5):
    """
    Retrieves up to num_neighbors real texts of connected nodes.
    """
    if node_id not in graph:
        return "Node not found in graph."

    neighbors = list(graph.neighbors(node_id))

    if len(neighbors) == 0:
        return "No connected nodes found."

    selected = np.random.choice(
        neighbors,
        min(num_neighbors, len(neighbors)),
        replace=False
    )

    return "\n".join(
        [
            f"{n}: {get_original_text(n)[:200]}..."
            for n in selected
        ]
    )
test_node = cora_content.iloc[0]["id"]

print("--- Original Text ---")
print(get_original_text(test_node))

print("\n--- Neighbor Texts ---")
print(get_neighbor_texts(G, test_node, num_neighbors=5))

--- Original Text ---
w119 w126 w177 w253 w352 w457 w508 w522 w620 w649 w699 w703 w735 w846 w903 w1206 w1210 w1237 w1353 w1427

--- Neighbor Texts ---
686532: w133 w174 w212 w329 w330 w336 w435 w522 w565 w704 w726 w730 w798 w1171 w1209 w1212 w1258 w1302 w1329 w1340 w1424 w1426...
31349: w457 w649 w903 w1210 w1274...
1129442: w133 w136 w232 w238 w251 w265 w331 w469 w699 w875 w903 w1020 w1098 w1136 w1274 w1349 w1353 w1360...


In [100]:
# --- 3. Final TANS Prompt Generation Function (Steps 2 & 3) ---

def generate_tans_prompt(graph, node_id, properties_df, classes):
    """
    Generates the complete, structured TANS prompt using all information.
    """
    # Retrieve properties and ranks
    degree = properties_df.loc[node_id, 'degree_centrality']
    rank_degree = properties_df.loc[node_id, 'degree_centrality_rank']

    # 1. Get original text (Prompt 2)
    original_text = get_original_text(node_id)

    # 2. Get neighbor texts (Prompt 3)
    neighbor_texts = get_neighbor_texts(graph, node_id, num_neighbors=5)

    # 3. Construct the full prompt (Prefix, Text, Neighbor, Property, Suffix)
    prompt = f"""
Given a node from a citation network graph, where the node type is paper.
The original node description is: "{original_text}".

The following are the textual information of 5 connected nodes. The descriptions are:
{neighbor_texts}

Node Properties:
- Degree Centrality value: {degree:.4f}, ranked as {rank_degree:.2f}% among all nodes.
- Closeness Centrality value: {properties_df.loc[node_id, 'closeness_centrality']:.4f}.
- Betweenness Centrality value: {properties_df.loc[node_id, 'betweenness_centrality']:.4f}.

Output the potential class of the node among the following classes: {classes}.
Provide reasons for your assessment. Your answer should be less than 200 words.
"""
    return prompt.strip()

In [101]:
# Make sure node_properties_df is indexed by node_id

# Find nodes common to G, node_text, and node_properties_df
common_nodes = list(
    set(G.nodes()).intersection(node_text.keys()).intersection(node_properties_df.index)
)

# Pick a sample node
sample_node_id = common_nodes[0]  # safe


In [102]:
print("Sample node ID:", sample_node_id)
print("In G:", sample_node_id in G)
print("In node_text:", sample_node_id in node_text)
print("In node_properties_df:", sample_node_id in node_properties_df.index)

Sample node ID: 128
In G: True
In node_text: True
In node_properties_df: True


In [103]:

classes_cora = ["Neural Networks", "Probabilistic Methods", "Genetic Algorithms", "Theory", "Case Based", "Reinforcement Learning", "Rule Learning"]


common_nodes = list(
    set(G.nodes()).intersection(node_text.keys()).intersection(node_properties_df.index)
)

# Pick a sample node
sample_node_id = common_nodes[0]

# Generate TANS prompt
final_prompt = generate_tans_prompt(
    G,
    sample_node_id,
    node_properties_df,
    classes_cora
)
print("\n--- Final Generated TANS Prompt Example ---")
print(final_prompt)




--- Final Generated TANS Prompt Example ---
Given a node from a citation network graph, where the node type is paper.
The original node description is: "w2 w42 w188 w213 w358 w405 w465 w506 w508 w582 w636 w875 w989 w1072 w1231 w1232 w1259 w1264 w1275 w1394".

The following are the textual information of 5 connected nodes. The descriptions are:
20526: w100 w241 w331 w335 w549 w582 w633 w649 w830 w875 w1072 w1119 w1132 w1156 w1178 w1193 w1207 w1264 w1275 w1360 w1433...
91975: w158 w212 w238 w357 w447 w521 w595 w605 w624 w649 w656 w724 w830 w875 w940 w1072 w1264 w1275 w1309 w1360 w1424...
1114125: w94 w100 w335 w402 w582 w605 w774 w981 w1156 w1178 w1264 w1293 w1307 w1315 w1321 w1382...
39403: w127 w293 w335 w549 w582 w605 w626 w774 w912 w973 w989 w1133 w1156 w1263 w1264 w1293 w1307 w1315 w1321 w1382...

Node Properties:
- Degree Centrality value: 0.0015, ranked as 67.06% among all nodes.
- Closeness Centrality value: 0.1304.
- Betweenness Centrality value: 0.0007.

Output the potential c

In [104]:
# Install the necessary library (if not already done)
# pip install google-genai

import os
from google import genai
from google.genai.errors import APIError

# --- IMPORTANT: Set your API Key ---
# It's best practice to load your API key from an environment variable.
os.environ['GEMINI_API_KEY'] = 'AIzaSyAjX7evzLq__dhbzcJo8uNVkpEyp6JeQQY'
client = genai.Client()
# Assuming the client is initialized globally or passed in
def query_llm_and_generate_description_gemini(prompt, class_list):
    """
    Calls the Gemini API with the TANS prompt and returns the generated text.
    """
    try:
        # Initialize client inside if not done globally
        client = genai.Client()

        # Call the Gemini API
        response = client.models.generate_content(
            model='gemini-2.5-flash', # Use a capable model like flash or pro
            contents=prompt
        )

        # The TANS explanation is the generated text
        llm_explanation = response.text
        predicted_class = None
        for cls in class_list:
            if cls.lower() in llm_explanation.lower():
                predicted_class = cls
                break

        return predicted_class, llm_explanation

    except APIError as e:
        print(f"Gemini API Error: {e}")
        return "Error: Could not generate description due to API error."
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return "Error: An unexpected error occurred."

# Example Usage (replace the placeholder call):
classes_cora = [
    "Neural Networks", "Probabilistic Methods", "Genetic Algorithms",
    "Theory", "Case Based", "Reinforcement Learning", "Rule Learning"
]
predicted_class, llm_generated_text = query_llm_and_generate_description_gemini(final_prompt, classes_cora)
print(f"Predicted Class: {predicted_class}")
print(f"Gemini-Generated TANS Description:\n{llm_generated_text}")

Predicted Class: Neural Networks
Gemini-Generated TANS Description:
The node and its connected papers exhibit a strong overlap in their textual descriptions, with several anonymous keywords appearing frequently across multiple nodes. Most notably, `w1264` is present in all five papers, and `w582` appears in four. Keywords `w875`, `w1072`, and `w1275` are also shared extensively.

This high co-occurrence of specific, technical terms suggests a focused research domain with a distinct vocabulary. The node's moderate Degree Centrality (67.06%) indicates it's well-connected within its community, while its low Betweenness Centrality suggests it's not bridging disparate areas. This profile aligns with a paper deeply embedded in a specialized field.

Among the given options, **Neural Networks** are characterized by highly specific architectures, algorithms, and components, which often leads to a concentrated and consistently shared technical vocabulary among related papers. This pattern of foc

In [105]:
import torch
from torch_geometric.datasets import Planetoid
import networkx as nx
from torch_geometric.utils import to_networkx
import pandas as pd
import numpy as np

# Load the Pubmed dataset
print("--- Loading Pubmed Dataset ---")
dataset = Planetoid(root='./data/Pubmed', name='Pubmed')
data = dataset[0]
print(f"Nodes: {data.num_nodes}, Original Features: {data.num_node_features}, Classes: {dataset.num_classes}")

# Convert to NetworkX
G = to_networkx(data, to_undirected=True)

# Calculate Centralities (Required TANS Step 1)
# Note: This is a slow step (especially Betweenness) and is conceptually similar to Cora.
degree_centrality = nx.degree_centrality(G)
closeness_centrality = nx.closeness_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)

# Create DataFrame
node_properties_df = pd.DataFrame({
    'node_id': list(G.nodes()),
    'degree_centrality': [degree_centrality[n] for n in G.nodes()],
    'closeness_centrality': [closeness_centrality[n] for n in G.nodes()],
    'betweenness_centrality': [betweenness_centrality[n] for n in G.nodes()]
}).set_index('node_id')

# Calculate Ranks (Required TANS Step 2 for Prompting)
for col in node_properties_df.columns:
    node_properties_df[f'{col}_rank'] = node_properties_df[col].rank(pct=True) * 100

# --- Global Variables for Pubmed Prompting ---
# Pubmed has 3 classes (e.g., specific types of diabetic papers)
classes_pubmed = ["Experimental Diabetes", "Diabetes Mellitus", "Type 1 Diabetes"] # Placeholder for actual class names
sample_node_id = list(G.nodes())[500]
sample_text = "A paper discussing a novel finding related to insulin resistance in mice."

print("Pubmed setup complete.")

--- Loading Pubmed Dataset ---
Nodes: 19717, Original Features: 500, Classes: 3
Pubmed setup complete.


In [109]:
import numpy as np
import pandas as pd
import networkx as nx

# --- 1. SIMULATE PUBMED TEXT CONTENT ---
# The original texts (abstracts/titles) are not included in the PyG Planetoid object.
# We must simulate a DataFrame that holds the text content, similar to your 'cora_content'.

# Get all node IDs from your NetworkX graph
all_node_ids = list(G.nodes())

# Create placeholder texts for demonstration purposes
# In a real scenario, you would load the actual Pubmed abstracts/titles here.
def generate_placeholder_text(node_id):
    """Generates a unique placeholder abstract for a node."""
    # Use the node's true class from the data tensor for a more realistic simulation
    node_index = list(G.nodes()).index(node_id)
    true_label = data.y[node_index].item()

    # Class labels from your setup: ["Experimental Diabetes", "Diabetes Mellitus", "Type 1 Diabetes"]
    class_name = classes_pubmed[true_label]

    # Generate the text
    text = (
        f"Research paper ID {node_id}. This article, categorized as '{class_name}', "
        f"investigates the efficacy of novel genetic biomarkers in predicting "
        f"long-term outcomes for patients with {class_name}. "
        f"The findings suggest a strong correlation between the expression of "
        f"the GNG-23 receptor and improved therapeutic response."
    )
    return text

# Create the simulated content DataFrame
pubmed_texts_df = pd.DataFrame({
    'id': all_node_ids,
    'text': [generate_placeholder_text(n) for n in all_node_ids]
}).set_index('id')

print("Simulated Pubmed Text Content (pubmed_texts_df) created.")

# --- 2. DEFINE GET_ORIGINAL_TEXT ---
def get_pubmed_text(node_id):
    """
    Retrieves the simulated original text (abstract/title) for a given node ID.
    """
    try:
        return pubmed_texts_df.loc[node_id, 'text']
    except KeyError:
        return f"Text content not found for node ID {node_id}."


# --- 3. DEFINE GET_NEIGHBOR_TEXTS (ADAPTED) ---
def get_neighbor_texts_pubmed(graph, node_id, num_neighbors=5):
    """
    Retrieves up to num_neighbors simulated texts of connected nodes for Pubmed.
    """
    if node_id not in graph:
        return "Node not found in graph."

    neighbors = list(graph.neighbors(node_id))

    if len(neighbors) == 0:
        return "No connected nodes found."

    # Use NumPy random choice for selecting neighbors
    selected = np.random.choice(
        neighbors,
        min(num_neighbors, len(neighbors)),
        replace=False
    )

    # Format the output string
    return "\n".join(
        [
            f"Node {n}: {get_pubmed_text(n)[:150]}..."
            for n in selected
        ]
    )

# --- 4. TEST WITH A SAMPLE NODE ---
# We'll use the sample_node_id you defined earlier
test_node = sample_node_id

print("\n" + "="*50)
print(f"--- TEST RESULTS FOR PUBMED (Node ID: {test_node}) ---")
print("="*50)

print("--- Original Text ---")
print(get_pubmed_text(test_node))

print("\n--- Neighbor Texts ---")
# Use the graph G you created and the new function
print(get_neighbor_texts_pubmed(G, test_node, num_neighbors=5))

Simulated Pubmed Text Content (pubmed_texts_df) created.

--- TEST RESULTS FOR PUBMED (Node ID: 500) ---
--- Original Text ---
Research paper ID 500. This article, categorized as 'Type 1 Diabetes', investigates the efficacy of novel genetic biomarkers in predicting long-term outcomes for patients with Type 1 Diabetes. The findings suggest a strong correlation between the expression of the GNG-23 receptor and improved therapeutic response.

--- Neighbor Texts ---
Node 3831: Research paper ID 3831. This article, categorized as 'Type 1 Diabetes', investigates the efficacy of novel genetic biomarkers in predicting long-term ...
Node 17301: Research paper ID 17301. This article, categorized as 'Type 1 Diabetes', investigates the efficacy of novel genetic biomarkers in predicting long-term...
Node 14008: Research paper ID 14008. This article, categorized as 'Type 1 Diabetes', investigates the efficacy of novel genetic biomarkers in predicting long-term...
Node 18728: Research paper ID 18728.

In [117]:
import numpy as np
import pandas as pd

# Use the node_properties_df created earlier, which contains centrality scores
# and ranks for all Pubmed nodes.

print("Finding Most Influential Node")

influential_node_info = node_properties_df.sort_values(
    'betweenness_centrality',
    ascending=False
).iloc[0]

# Get the ID of the most central node
most_central_node = influential_node_info.name

print(f"The node with the highest Betweenness Centrality is: {most_central_node}")
print(f"Betweenness Centrality Score: {influential_node_info['betweenness_centrality']:.4f}")



print("\n" + "="*50)
print(f"TEST RESULTS FOR PUBMED (Node ID: {most_central_node} - Highly Central)")
print("="*50)

# Print the text of the central article
print("Central Article Text")
print(get_pubmed_text(most_central_node))

# Print the text of the articles that cite this central article (its neighbors)
print("\n Neighbor Texts (Citing/Cited Articles) ")
print(get_neighbor_texts_pubmed(G, most_central_node, num_neighbors=5))

--- Finding Most Influential Node ---
The node with the highest Betweenness Centrality is: 11024
Betweenness Centrality Score: 0.1429

TEST RESULTS FOR PUBMED (Node ID: 11024 - Highly Central)
Central Article Text
Research paper ID 11024. This article, categorized as 'Type 1 Diabetes', investigates the efficacy of novel genetic biomarkers in predicting long-term outcomes for patients with Type 1 Diabetes. The findings suggest a strong correlation between the expression of the GNG-23 receptor and improved therapeutic response.

 Neighbor Texts (Citing/Cited Articles) 
Node 574: Research paper ID 574. This article, categorized as 'Type 1 Diabetes', investigates the efficacy of novel genetic biomarkers in predicting long-term o...
Node 1416: Research paper ID 1416. This article, categorized as 'Diabetes Mellitus', investigates the efficacy of novel genetic biomarkers in predicting long-ter...
Node 9723: Research paper ID 9723. This article, categorized as 'Type 1 Diabetes', investigates t

In [111]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        # First GCN layer: takes node features (in_channels) and outputs hidden_channels
        self.conv1 = GCNConv(in_channels, hidden_channels)
        # Second GCN layer: takes hidden_channels and outputs the number of classes (out_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        # 1. First Convolutional Layer
        x = self.conv1(x, edge_index)
        x = F.relu(x)  # Apply ReLU activation function
        x = F.dropout(x, p=0.5, training=self.training) # Dropout for regularization

        # 2. Second Convolutional Layer
        x = self.conv2(x, edge_index)

        # 3. Final Output (Log Softmax for classification)
        return F.log_softmax(x, dim=1)

In [112]:
# Assuming you ran the initial PyG loading step:
# dataset = Planetoid(root='./data/Pubmed', name='Pubmed')
# data = dataset[0]

# Model Parameters
INPUT_DIM = data.num_node_features # 500 features (Bag-of-Words vectors)
HIDDEN_DIM = 16                    # Standard hidden layer size for GCNs
OUTPUT_DIM = dataset.num_classes   # 3 classes (types of papers)

# Initialize the GCN model
model = GCN(in_channels=INPUT_DIM, hidden_channels=HIDDEN_DIM, out_channels=OUTPUT_DIM)

# Define the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.NLLLoss() # Negative Log Likelihood Loss (matches log_softmax output)

print(f"GCN Model initialized with: Input={INPUT_DIM}, Hidden={HIDDEN_DIM}, Output={OUTPUT_DIM}")

GCN Model initialized with: Input=500, Hidden=16, Output=3


In [113]:
def train():
    model.train() # Set the model to training mode
    optimizer.zero_grad() # Clear gradients

    # Forward pass: Pass node features (x) and graph structure (edge_index)
    out = model(data.x, data.edge_index)

    # Calculate loss only on the nodes designated for training
    # data.train_mask is a boolean mask provided by the Planetoid dataset
    loss = criterion(out[data.train_mask], data.y[data.train_mask])

    # Backward pass
    loss.backward()
    optimizer.step()

    return loss.item()

In [114]:
def test():
    model.eval() # Set the model to evaluation mode
    out = model(data.x, data.edge_index)

    # Get the predicted class (index of the max log probability)
    pred = out.argmax(dim=1)

    accuracies = {}

    # Evaluate on the validation mask
    correct_val = pred[data.val_mask] == data.y[data.val_mask]
    accuracies['val'] = int(correct_val.sum()) / int(data.val_mask.sum())

    # Evaluate on the test mask
    correct_test = pred[data.test_mask] == data.y[data.test_mask]
    accuracies['test'] = int(correct_test.sum()) / int(data.test_mask.sum())

    return accuracies

In [115]:
print("\n--- Starting GNN Training ---")
for epoch in range(1, 201):
    loss = train()
    accs = test()

    if epoch % 20 == 0 or epoch == 1:
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, '
              f'Val Acc: {accs["val"]:.4f}, Test Acc: {accs["test"]:.4f}')

print("--- Training Complete ---")


--- Starting GNN Training ---
Epoch: 001, Loss: 1.0925, Val Acc: 0.5900, Test Acc: 0.6150
Epoch: 020, Loss: 0.7055, Val Acc: 0.7360, Test Acc: 0.6970
Epoch: 040, Loss: 0.3440, Val Acc: 0.7580, Test Acc: 0.7600
Epoch: 060, Loss: 0.2268, Val Acc: 0.7840, Test Acc: 0.7740
Epoch: 080, Loss: 0.1267, Val Acc: 0.7900, Test Acc: 0.7810
Epoch: 100, Loss: 0.1213, Val Acc: 0.7820, Test Acc: 0.7800
Epoch: 120, Loss: 0.1242, Val Acc: 0.7840, Test Acc: 0.7840
Epoch: 140, Loss: 0.1042, Val Acc: 0.7880, Test Acc: 0.7850
Epoch: 160, Loss: 0.1096, Val Acc: 0.7880, Test Acc: 0.7950
Epoch: 180, Loss: 0.0856, Val Acc: 0.7820, Test Acc: 0.7900
Epoch: 200, Loss: 0.0791, Val Acc: 0.7880, Test Acc: 0.7950
--- Training Complete ---


In [106]:
# --- USA Airport Network (Text-Free) Setup ---

# This dataset is not part of Planetoid. You would need to load it from a source
# like the official TANS repository or the original data source (e.g., using NetworkX
# after downloading the edge list).
# --- Conceptual Loading ---
# G_usa = nx.read_edgelist('usa.edgelist', nodetype=int)

# --- MOCKING DATA FOR CONTINUITY ---
# Since direct loading is complex, we mock a small text-free graph for demonstration.
G_usa_mock = nx.random_geometric_graph(n=1190, radius=0.1) # Mock USA graph (1,190 nodes)

# Calculate Centralities
degree_centrality_usa = nx.degree_centrality(G_usa_mock)

classes_usa = ["High Activity", "Moderate Activity", "Moderately Low Activity", "Low Activity"] # 4 classes
sample_node_id_usa = list(G_usa_mock.nodes())[50]
sample_text_usa = ""


In [119]:

def get_original_text_text_free(node_id):
    """Returns empty string for text-free graphs."""
    return ""

def get_neighbor_texts_text_free(graph, node_id, num_neighbors=5):
    """Returns a placeholder indicating no neighbor text exists."""
    return "No textual descriptions available for connected nodes."


# --- Example Prompt for a Text-Free Graph (USA) ---

# 1. Extract node properties as a *DataFrame*, not a Series
high_activity_props = node_properties_df.loc[[sample_node_id_usa]].copy()

# 2. Add new values safely
high_activity_props.loc[sample_node_id_usa, "degree_centrality"] = 0.1749
high_activity_props.loc[sample_node_id_usa, "degree_centrality_rank"] = 99.58  # mock rank

# 3. Now generate the prompt using the retained 2D structure
final_prompt_usa = generate_tans_prompt(
    G_usa_mock,
    sample_node_id_usa,
    high_activity_props,
    classes_usa
)

print("\n--- Final Generated TANS Prompt Example (USA - Text-Free) ---")
print(final_prompt_usa)



--- Final Generated TANS Prompt Example (USA - Text-Free) ---
Given a node from a citation network graph, where the node type is paper.
The original node description is: "No text found for this node.".

The following are the textual information of 5 connected nodes. The descriptions are:
744: No text found for this node....
518: No text found for this node....
952: No text found for this node....
532: No text found for this node....
226: No text found for this node....

Node Properties:
- Degree Centrality value: 0.1749, ranked as 99.58% among all nodes.
- Closeness Centrality value: 0.1458.
- Betweenness Centrality value: 0.0000.

Output the potential class of the node among the following classes: ['High Activity', 'Moderate Activity', 'Moderately Low Activity', 'Low Activity']. 
Provide reasons for your assessment. Your answer should be less than 200 words.
