# Guide to Converting a Dataset to a Sentence Graph Dataset

In [1]:
import graph_tool as gt

In [2]:
from sent_graph_rag.Datasets import SentenceGraphCreator
import spacy
import pickle

In [3]:
from spacy.pipeline import EntityLinker
from fastcoref import spacy_component

  from .autonotebook import tqdm as notebook_tqdm


## Load spacy model

In [4]:
from datasets.utils.logging import disable_progress_bar
import torch

# Turn off the `datasets` progress bars so they do not flood the cell output.
disable_progress_bar()

# Use the first GPU only when CUDA is actually available; otherwise fall back to
# the CPU so the notebook still survives Restart & Run All on CPU-only machines.
COREF_DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

nlp = spacy.load("en_core_web_sm")
# Append the fastcoref component to the pipeline. Left as the last expression so
# the created component is shown as the cell output.
nlp.add_pipe("fastcoref", config={"device": COREF_DEVICE, "enable_progress_bar": False})

04/04/2025 14:24:50 - INFO - 	 missing_keys: []
04/04/2025 14:24:50 - INFO - 	 unexpected_keys: []
04/04/2025 14:24:50 - INFO - 	 mismatched_keys: []
04/04/2025 14:24:50 - INFO - 	 error_msgs: []
04/04/2025 14:24:50 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M


<fastcoref.spacy_component.spacy_component.FastCorefResolver at 0x781a16e5c450>

## Load MIT Corpus

In [5]:
corpus_path = "../data/mit_wiki.txt"
# Read the file exactly once. A second f.read() on the same handle starts at EOF
# and always returns "", which previously left `corpus2` silently empty.
# The encoding is explicit so the result does not depend on the platform locale.
with open(corpus_path, "r", encoding="utf-8") as f:
    corpus = f.read()
# Same text as `corpus`; kept defined so any cell that references `corpus2`
# keeps working, now with real content instead of an empty string.
corpus2 = corpus


In [6]:
# Build sentence graphs for a list of input texts (here a single document).
# NOTE(review): graph_type="gt" presumably selects graph-tool output — confirm
# against SentenceGraphCreator.create_graphs.
grapher = SentenceGraphCreator(nlp, verbose=True)
graphs = grapher.create_graphs([corpus], graph_type="gt")
# Each entry unpacks into the graph and a mapping from string key to vertex.
graph, vertex_map = graphs[0]
# graph2, vertex_map2 = graphs[1]


04/04/2025 14:24:54 - INFO - 	 Tokenize 1 inputs...
04/04/2025 14:24:54 - INFO - 	 ***** Running Inference on 1 texts *****


Inference time: 2.150508403778076
Graph Build Time: 0.19897103309631348


In [7]:
# Look up two vertices by their string keys in vertex_map.
# NOTE(review): keys look like "<entity text>_<NER label>" with an optional
# "_TERMINAL" suffix — confirm against SentenceGraphCreator.
v1 = vertex_map["The Massachusetts Institute of Technology_ORG_TERMINAL"]
v2 = vertex_map["The Massachusetts Institute of Technology_ORG"]

# all_edges = graph.edge(v1, v2, all_edges=True)
# for i, edge in enumerate(all_edges):
#   print(i, graph.edge_properties["sentence"][edge])

# Print every entry of the "aliases" vertex property stored on v2, one per
# block, separated by a dashed line.
for alias in graph.vertex_properties["aliases"][v2]:
    print(alias)
    print("-----------")


The Massachusetts Institute of Technology
-----------
the Institute
-----------
MIT's
-----------
The Institute
-----------
the Institute's
-----------
the Massachusetts Institute of Technology
-----------
MIT
-----------


In [8]:
# Show the "terminal" vertex property of v1 as a plain Python bool.
print(bool(graph.vertex_properties["terminal"][v1]))


True


## Test saving graph

In [9]:
# Write the graph to disk so the next cell can reload it from this path.
# NOTE(review): the format is presumably inferred from the ".gt" extension — confirm.
graph.save("../data/mit.gt")


In [10]:
from graph_tool.all import load_graph

# Round-trip check: reload the graph saved in the previous cell and list the
# "sentence" property of every parallel edge between v1 and v2.
# v1 and v2 were looked up on the original `graph`; they are reused here as
# endpoints on the reloaded copy.
graph2 = load_graph("../data/mit.gt")
all_edges = graph2.edge(v1, v2, all_edges=True)
for i, edge in enumerate(all_edges):
    print(i, graph2.edge_properties["sentence"][edge])


0 The institute also has a strong entrepreneurial culture and MIT alumni have founded or co-founded many notable companies.
1 The institute also has a strong entrepreneurial culture and MIT alumni have founded or co-founded many notable companies.
2 The fledgling school still suffered from chronic financial shortages which diverted the attention of the MIT leadership.
3 The fledgling school still suffered from chronic financial shortages which diverted the attention of the MIT leadership.
4 Unlike Ivy League schools, MIT catered more to middle-class families, and depended more on tuition than on endowments or grants for its funding.
5 Unlike Ivy League schools, MIT catered more to middle-class families, and depended more on tuition than on endowments or grants for its funding.
6 We might call it a university limited in its objectives but unlimited in the breadth and the thoroughness with which it pursues these objectives.



7 We might call it a university limited in its objectives but

In [11]:
import io


def graph_to_string(graph):
    """Serialize ``graph`` into an in-memory byte string.

    Despite the name, the return value is ``bytes`` (the full contents of a
    ``BytesIO`` buffer), not ``str``.

    Args:
        graph: Object exposing ``save(file_like)``; in this notebook a
            graph-tool graph.

    Returns:
        bytes: Everything ``graph.save`` wrote into the buffer.
    """
    with io.BytesIO() as sink:
        # No format argument is passed, so the graph's own default is used.
        graph.save(sink)
        return sink.getvalue()


# Nest the serialized graph bytes inside a plain dict to mimic a dataset record
# that carries its graph alongside other fields.
test_dict = {"data": {"more_data": graph_to_string(graph)}}

# Pickle the whole dict; the graph travels as an ordinary bytes value.
with open("../data/dataset_with_graphs.pkl", "wb") as f:
    pickle.dump(test_dict, f)


In [12]:
def string_to_graph(graph_string):
    """Rebuild a graph from the bytes produced by ``graph_to_string``.

    Args:
        graph_string (bytes): Serialized graph data.

    Returns:
        A new ``gt.Graph`` populated by loading ``graph_string`` from an
        in-memory buffer.
    """
    restored = gt.Graph()
    # Wrap the raw bytes in a file-like object so the graph can load from it.
    restored.load(io.BytesIO(graph_string))
    return restored


# Load the pickled dict back from disk.
# NOTE: pickle.load can execute arbitrary code; only use it on files this
# notebook wrote itself, never on untrusted input.
with open("../data/dataset_with_graphs.pkl", "rb") as f:
    test_dict_retrieved = pickle.load(f)

# Rebuild the graph from the stored bytes and list the "sentence" property of
# every parallel edge between v1 and v2, mirroring the save/load check above.
graph_retrieved = string_to_graph(test_dict_retrieved["data"]["more_data"])
all_edges = graph_retrieved.edge(v1, v2, all_edges=True)
# print(test_dict_retrieved["data"])
for i, edge in enumerate(all_edges):
    print(i, graph_retrieved.edge_properties["sentence"][edge])


0 The institute also has a strong entrepreneurial culture and MIT alumni have founded or co-founded many notable companies.
1 The institute also has a strong entrepreneurial culture and MIT alumni have founded or co-founded many notable companies.
2 The fledgling school still suffered from chronic financial shortages which diverted the attention of the MIT leadership.
3 The fledgling school still suffered from chronic financial shortages which diverted the attention of the MIT leadership.
4 Unlike Ivy League schools, MIT catered more to middle-class families, and depended more on tuition than on endowments or grants for its funding.
5 Unlike Ivy League schools, MIT catered more to middle-class families, and depended more on tuition than on endowments or grants for its funding.
6 We might call it a university limited in its objectives but unlimited in the breadth and the thoroughness with which it pursues these objectives.



7 We might call it a university limited in its objectives but

## Visualize Graph

In [13]:
def graphtool_to_networkx_multigraph(gt_graph):
    """
    Converts a graph-tool undirected multigraph to a NetworkX MultiGraph.

    Vertex properties are copied onto the NetworkX nodes. Edge properties are
    dropped, so the result should only be used for visualizing the graph.

    Parallel edges stay distinct: each edge between the same two vertices gets
    its own integer key, counted per *unordered* vertex pair. Counting per
    ordered (source, target) pair would give edges stored as (a, b) and (b, a)
    the same key, and an undirected MultiGraph would then treat the second as
    an update of the first instead of a new edge.

    Relies on ``nx`` (networkx) being imported in the notebook namespace before
    the function is called.

    Args:
        gt_graph (graph_tool.Graph): The input graph-tool graph (undirected multigraph).

    Returns:
        nx_multigraph (networkx.MultiGraph): The equivalent NetworkX MultiGraph.
    """
    nx_multigraph = nx.MultiGraph()

    # Nodes are keyed by the integer vertex index and carry every vertex property.
    for v in gt_graph.vertices():
        node_properties = {name: prop_map[v] for name, prop_map in gt_graph.vp.items()}
        nx_multigraph.add_node(int(v), **node_properties)

    # Next free key for each unordered vertex pair.
    next_key = {}
    for e in gt_graph.edges():
        source = int(e.source())
        target = int(e.target())

        # Normalize the endpoint order so (a, b) and (b, a) share one counter.
        pair = (source, target) if source <= target else (target, source)
        edge_key = next_key.get(pair, 0)
        next_key[pair] = edge_key + 1

        # Edge properties are intentionally not copied (visualization only).
        nx_multigraph.add_edge(source, target, key=edge_key)

    return nx_multigraph

In [21]:
from pyvis.network import Network
import networkx as nx
from IPython.display import IFrame
import matplotlib.pyplot as plt


def visualize_graph(G, interactive=True, output_path="graph.html"):
    """Render a graph-tool graph either as a PyVis HTML file or a static plot.

    The graph is first converted with ``graphtool_to_networkx_multigraph``, so
    edge properties are not part of the rendering.

    Args:
        G: graph-tool graph to visualize.
        interactive (bool): When True, build a PyVis network with physics
            disabled and save it as HTML. When False, draw the graph with
            ``nx.draw`` and show it through matplotlib.
        output_path (str): Destination of the HTML file in interactive mode.
            Defaults to "graph.html", the previously hard-coded name.

    Returns:
        None
    """
    nx_graph = graphtool_to_networkx_multigraph(G)

    if not interactive:
        # Static rendering does not need PyVis, so no network is built here.
        nx.draw(nx_graph, with_labels=True)
        plt.show()
        return

    net = Network(
        notebook=False,
        height="1100px",  # tall canvas
        width="100%",  # full width of the page
    )
    net.from_nx(nx_graph)
    # Disable the physics simulation in the rendered page.
    net.set_options("""
        var options = {
            "physics": {
            "enabled": false
            }
        }
    """)
    net.save_graph(output_path)


In [23]:
# Render the MIT graph in interactive mode; this produces no cell output, the
# result is saved as an HTML file by visualize_graph.
visualize_graph(graph, interactive=True)


## Define Dataset Analysis Tools

### Diameter Approximator

In [24]:
from graph_tool.all import Graph, GraphView, label_largest_component, pseudo_diameter


def approximate_diameter(graph):
    """Approximate the diameter of the largest connected component of ``graph``.

    Args:
        graph: graph-tool graph to analyse.

    Returns:
        The first element of the ``pseudo_diameter`` result, computed on a view
        of ``graph`` filtered to its largest component.
    """
    # Filter the graph to the vertices labelled as the largest component.
    component_view = GraphView(graph, vfilt=label_largest_component(graph))

    # pseudo_diameter returns a pair; only the first element is reported.
    return pseudo_diameter(component_view)[0]


# print("Diameter: ", approximate_diameter(graph))


### % of parallel edges

In [25]:
print("Terminal edges:")
for e in graph.edges():
    # Only edges flagged with the "terminal" edge property are reported.
    if not graph.edge_properties["terminal"][e]:
        continue
    # One line per edge: source label and NER label, target label and id, then
    # the sentence stored on the edge.
    print(
        f"Source: {graph.vertex_properties['label'][e.source()]} "
        f":label {graph.vertex_properties['ner_label'][e.source()]} , "
        f"Target: label: {graph.vertex_properties['label'][e.target()]} "
        f"id: {e.target()}, "
        f"Sentence: {graph.edge_properties['sentence'][e]}"
    )


Terminal edges:
Source: The Massachusetts Institute of Technology :label ORG , Target: label: terminal_node id: 36, Sentence: The institute also has a strong entrepreneurial culture and MIT alumni have founded or co-founded many notable companies.
Source: The Massachusetts Institute of Technology :label ORG , Target: label: terminal_node id: 36, Sentence: The institute also has a strong entrepreneurial culture and MIT alumni have founded or co-founded many notable companies.
Source: The Massachusetts Institute of Technology :label ORG , Target: label: terminal_node id: 36, Sentence: The fledgling school still suffered from chronic financial shortages which diverted the attention of the MIT leadership.
Source: The Massachusetts Institute of Technology :label ORG , Target: label: terminal_node id: 36, Sentence: The fledgling school still suffered from chronic financial shortages which diverted the attention of the MIT leadership.
Source: The Massachusetts Institute of Technology :label O

In [26]:
from collections import Counter
from graph_tool.all import Graph


# Add example edges (if needed)
# g.add_edge(source, target)
def count_parallel_edges(g):
    """Return the percentage of edges in ``g`` that are parallel edges.

    Non-terminal edges are grouped by their unordered endpoint pair; every edge
    in a group with more than one member counts as parallel. The count is then
    divided by the total number of edges in the graph.

    NOTE(review): terminal edges are excluded from the numerator but included
    in the denominator (``g.num_edges()``) — confirm this is intended.

    Args:
        g: graph-tool graph with a "terminal" edge property.

    Returns:
        The percentage (0-100) as a float, or 0 when the graph has no edges.
    """
    # Tally non-terminal edges per endpoint pair; min/max makes the pair
    # independent of the direction in which the edge was stored.
    pair_tally = Counter()
    for edge in g.edges():
        if g.edge_properties["terminal"][edge]:
            continue
        pair_tally[
            (min(edge.source(), edge.target()), max(edge.source(), edge.target()))
        ] += 1

    # Every edge belonging to a pair seen more than once is a parallel edge.
    parallel_total = sum(n for n in pair_tally.values() if n > 1)

    edge_total = g.num_edges()
    if edge_total > 0:
        return (parallel_total / edge_total) * 100
    return 0


# print(f"Percentage of parallel edges: {count_parallel_edges(graph):.2f}%")
