In [None]:
# graph_reasoning_colab.ipynb (the Python code cells below are meant to be run in a notebook)

# Step 1: Install Required Packages
!pip install networkx huggingface_hub pandas numpy scikit-learn transformers accelerate

# Step 2: Import Libraries
import networkx as nx
import pandas as pd
import numpy as np
import pickle
from huggingface_hub import hf_hub_download
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Step 3: Load the Graph
# Hugging Face Hub repository the graph file is fetched from.
repo_id = 'lamm-mit/bio-graph-1K'
# hf_hub_download returns the local path of the downloaded file.
graph_file = hf_hub_download(
    filename='large_graph_simple_giant.graphml',
    repo_id=repo_id,
)
# Parse the GraphML file into a NetworkX graph and report its size.
G = nx.read_graphml(graph_file)
print(f"Graph loaded with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")

# Step 4: Load the Embeddings and Flatten
embedding_file = hf_hub_download(repo_id=repo_id, filename='embeddings_simple_giant_ge-large-en-v1.5.pkl')
# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only run this cell if the repository the file was downloaded from is trusted.
with open(embedding_file, 'rb') as f:
    raw_embeddings = pickle.load(f)

# Drop singleton axes from every stored vector (e.g. a (1, d) array becomes
# (d,)) so the vectors can later be stacked into a single 2-D matrix.
embeddings = {k: np.array(v).squeeze() for k, v in raw_embeddings.items()}

# Peek at one entry without materialising the full key list, and tolerate an
# empty mapping instead of crashing with IndexError.
example_key = next(iter(embeddings), None)
if example_key is None:
    print("No embeddings were loaded.")
else:
    print(f"Sample embedding shape: {embeddings[example_key].shape}")

# Step 5: Node Similarity Function
def get_similar_nodes(node_id, top_k=5):
    """Return the nodes whose embeddings are most cosine-similar to ``node_id``.

    Args:
        node_id: Key into the module-level ``embeddings`` mapping.
        top_k: Maximum number of neighbours to return.

    Returns:
        A list of ``(node, similarity)`` tuples ordered by descending
        similarity. The query node itself is never included. An empty list
        is returned when ``node_id`` has no embedding or ``top_k`` is not
        positive.
    """
    if node_id not in embeddings or top_k <= 0:
        return []

    keys = list(embeddings)
    # ravel() keeps the stack 2-D even if an entry is not already 1-D.
    mat = np.stack([np.ravel(embeddings[k]) for k in keys])

    query_idx = keys.index(node_id)
    sims = cosine_similarity(mat[query_idx:query_idx + 1], mat)[0]

    # Drop the query node by index instead of assuming it sorts to rank 0:
    # with duplicate vectors or floating-point ties another node can outrank
    # it, and slicing off position 0 would then discard a real neighbour
    # while returning the query itself. A stable sort makes the order of
    # tied neighbours deterministic.
    order = np.argsort(-sims, kind="stable")
    order = order[order != query_idx][:top_k]
    return [(keys[i], sims[i]) for i in order]

# Step 6: Shortest Path Finder
def find_path(start_node, end_node):
    """Find a shortest path between two nodes of the module-level graph ``G``.

    Args:
        start_node: Node at which the path begins.
        end_node: Node at which the path ends.

    Returns:
        The list of nodes produced by ``nx.shortest_path`` on success.
        The string ``"Invalid node(s)"`` if either endpoint is not in ``G``.
        The string ``"No path found"`` if the endpoints are not connected.
    """
    # Guard clause: both endpoints must exist in the graph.
    for endpoint in (start_node, end_node):
        if endpoint not in G:
            return "Invalid node(s)"

    try:
        return nx.shortest_path(G, source=start_node, target=end_node)
    except nx.NetworkXNoPath:
        return "No path found"

# Step 7: Open-Source Reasoning with Mistral
# Checkpoint name handed to both the tokenizer and the model loader.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# device_map="auto" leaves the placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
)
# Bundle model and tokenizer into a text-generation pipeline.
pipe = pipeline(task="text-generation", tokenizer=tokenizer, model=model)

def query_local_model(prompt, max_tokens=200):
    """Generate text for ``prompt`` with the module-level ``pipe`` pipeline.

    Args:
        prompt: Text passed to the pipeline as input.
        max_tokens: Forwarded to the pipeline as ``max_new_tokens``.

    Returns:
        The ``"generated_text"`` field of the first result the pipeline
        returns.
    """
    # NOTE(review): text-generation pipelines usually echo the prompt inside
    # "generated_text" unless return_full_text=False is passed; confirm
    # whether callers expect the prompt to be included.
    generations = pipe(
        prompt,
        max_new_tokens=max_tokens,
        do_sample=True,  # sampling is enabled, so repeated calls may differ
        temperature=0.7,
    )
    first_result = generations[0]
    return first_result["generated_text"]

# Example Use
# path = find_path("mineralized collagen fibers", "schreger pattern")
# print(path)

# similar_nodes = get_similar_nodes("hydroxyapatite")
# print(similar_nodes)

# prompt = "How is mineralized collagen structurally similar to nacre?"
# print(query_local_model(prompt))
