# Guide to Converting a Dataset to a Sentence Graph Dataset

In [1]:
from sent_graph_rag.Datasets import SentenceGraphCreator
import spacy
import pickle
import graph_tool as gt



## Load spacy model

In [None]:
# Turn off the Hugging Face `datasets` progress bars before building the pipeline.
from datasets.utils.logging import disable_progress_bar
disable_progress_bar()
# Base English pipeline; the coreference component is appended to it below.
nlp = spacy.load("en_core_web_sm")
# GPU variant of the same component, kept for reference:
# nlp.add_pipe("fastcoref",  config={'device': 'cuda:0', "enable_progress_bar": False})
# NOTE(review): the ValueError [E002] recorded under this cell means no
# "fastcoref" factory was registered with spaCy when this line ran.
# Presumably the registering import (`from fastcoref import spacy_component`)
# is missing -- TODO confirm against the fastcoref documentation.
nlp.add_pipe("fastcoref",  config={'device': 'cpu', "enable_progress_bar": False})

ValueError: [E002] Can't find factory for 'fastcoref' for language English (en). This usually happens when spaCy calls `nlp.create_pipe` with a custom component name that's not registered on the current language class. If you're using a custom component, make sure you've added the decorator `@Language.component` (for function components) or `@Language.factory` (for class components).

Available factories: attribute_ruler, tok2vec, merge_noun_chunks, merge_entities, merge_subtokens, token_splitter, doc_cleaner, parser, beam_parser, lemmatizer, trainable_lemmatizer, entity_linker, entity_ruler, tagger, morphologizer, ner, beam_ner, senter, sentencizer, spancat, spancat_singlelabel, span_finder, future_entity_ruler, span_ruler, textcat, textcat_multilabel, en.lemmatizer

## Load MIT Corpus

In [None]:
# Plain-text MIT Wikipedia corpus on the mounted Google Drive.
corpus_path = "/content/drive/MyDrive/NLP_Project/mit_wiki.txt"
# Decode explicitly as UTF-8 so the result does not depend on the platform default.
with open(corpus_path, "r", encoding="utf-8") as f:
    corpus = f.read()
# A second f.read() on the same handle starts at end-of-file and returns "",
# so reuse the text that was already read to get a real second copy.
corpus2 = corpus


In [None]:
# Build sentence graphs for a list of documents; only the MIT corpus is passed.
# NOTE(review): graph_type="gt" presumably selects the graph-tool backend -- confirm
# against SentenceGraphCreator.
grapher = SentenceGraphCreator(nlp, verbose=True)
graphs = grapher.create_graphs([corpus], graph_type="gt")
# The first result unpacks into the graph and its vertex lookup table.
graph, vertex_map = graphs[0]
# graph2, vertex_map2 = graphs[1]


In [None]:
# Look up two vertices by their vertex_map keys.
# NOTE(review): keys look like "<entity text>_<NER label>", with a "_TERMINAL"
# suffix for the terminal variant -- confirm against SentenceGraphCreator.
v1 = vertex_map["The Massachusetts Institute of Technology_ORG_TERMINAL"]
v2 = vertex_map["The Massachusetts Institute of Technology_ORG"]

# Disabled: list the "sentence" property of every edge between v1 and v2.
# all_edges = graph.edge(v1, v2, all_edges=True)
# for i, edge in enumerate(all_edges):
#   print(i, graph.edge_properties["sentence"][edge])

# Print each entry of the "aliases" vertex property of v2, followed by a separator line.
for alias in graph.vertex_properties["aliases"][v2]:
    print(alias)
    print("-----------")


In [None]:
# Show the "terminal" vertex property of v1 as a plain bool.
print(bool(graph.vertex_properties["terminal"][v1]))


## Test saving graph

In [None]:
# Persist the graph to Drive so the next cell can reload it.
# NOTE(review): the ".gt" extension presumably selects graph-tool's native format -- confirm.
graph.save("/content/drive/MyDrive/NLP_Project/Data/mit.gt")


In [None]:
from graph_tool.all import load_graph

# Reload the graph written by the previous cell.
graph2 = load_graph("/content/drive/MyDrive/NLP_Project/Data/mit.gt")
# v1 and v2 were looked up on the original `graph`.
# NOTE(review): this assumes vertex indices are unchanged by save/load -- confirm.
all_edges = graph2.edge(v1, v2, all_edges=True)
# Print the "sentence" edge property of every edge between the two vertices.
for i, edge in enumerate(all_edges):
    print(i, graph2.edge_properties["sentence"][edge])


In [None]:
import io


def graph_to_string(graph):
    """Serialize ``graph`` into an in-memory byte string.

    Despite the name, the result is ``bytes``, not ``str``: the graph is asked
    to ``save`` itself into a binary buffer and the buffer contents are
    returned.

    Args:
        graph: Object exposing ``save(file_like)``.

    Returns:
        bytes: Everything ``graph.save`` wrote into the buffer.
    """
    with io.BytesIO() as sink:
        # No explicit format is passed, so the graph's default is used.
        graph.save(sink)
        return sink.getvalue()


# Nest the serialized graph bytes inside a dict to mimic a dataset record.
test_dict = {"data": {"more_data": graph_to_string(graph)}}

# Pickle the whole dict to Drive; the cell below reads it back.
with open("/content/drive/MyDrive/NLP_Project/Data/dataset_with_graphs.pkl", "wb") as f:
    pickle.dump(test_dict, f)


In [None]:
def string_to_graph(graph_string):
    """Rebuild a graph-tool graph from its serialized bytes.

    Args:
        graph_string (bytes): Binary data, e.g. as returned by ``graph_to_string``.

    Returns:
        A new ``gt.Graph`` populated from ``graph_string``.
    """
    restored = gt.Graph()
    # Wrap the bytes in a file-like object so ``load`` can read from memory.
    restored.load(io.BytesIO(graph_string))
    return restored


# NOTE: unpickling can execute arbitrary code; only load pickle files that
# were written by this notebook.
with open("/content/drive/MyDrive/NLP_Project/Data/dataset_with_graphs.pkl", "rb") as f:
    test_dict_retrieved = pickle.load(f)

# Rebuild the graph from the stored bytes and repeat the edge listing as a
# round-trip check.
graph_retrieved = string_to_graph(test_dict_retrieved["data"]["more_data"])
# v1 and v2 were looked up on the original `graph`.
# NOTE(review): this assumes vertex indices survive serialization -- confirm.
all_edges = graph_retrieved.edge(v1, v2, all_edges=True)
# print(test_dict_retrieved["data"])
for i, edge in enumerate(all_edges):
    print(i, graph_retrieved.edge_properties["sentence"][edge])


## Visualize Graph

In [None]:
def graphtool_to_networkx_multigraph(gt_graph):
    """
    Converts a graph-tool undirected multigraph to a NetworkX MultiGraph.

    Vertex properties are copied onto the NetworkX nodes. Edge properties are
    dropped on purpose, so the result should only be used for visualizing the
    graph. ``networkx`` must already be imported as ``nx`` when this is called.

    Args:
        gt_graph (graph_tool.Graph): The input graph-tool graph (undirected multigraph).

    Returns:
        nx_multigraph (networkx.MultiGraph): The equivalent NetworkX MultiGraph,
        with nodes keyed by integer vertex index and one edge per input edge.
    """
    nx_multigraph = nx.MultiGraph()

    # Copy every vertex together with the value of each vertex property.
    for v in gt_graph.vertices():
        node_properties = {name: vprop[v] for name, vprop in gt_graph.vp.items()}
        nx_multigraph.add_node(int(v), **node_properties)

    # Let NetworkX allocate the edge key. A hand-rolled counter keyed on the
    # ordered (source, target) pair hands out the same key to parallel edges
    # stored as (a, b) and (b, a); in an undirected MultiGraph the second
    # add_edge would then overwrite the first instead of adding an edge.
    for e in gt_graph.edges():
        nx_multigraph.add_edge(int(e.source()), int(e.target()))

    return nx_multigraph

In [None]:
from pyvis.network import Network
import networkx as nx
from IPython.core.display import display, HTML


def visualize_graph(G):
    """Draw a graph-tool graph as an interactive PyVis page in the notebook.

    The graph is converted with ``graphtool_to_networkx_multigraph``, loaded
    into a PyVis ``Network`` with physics disabled, written to ``graph.html``
    and then displayed through IPython.

    Args:
        G: graph-tool graph to visualize.
    """
    # PyVis options that switch the physics simulation off.
    physics_off = """
        var options = {
            "physics": {
            "enabled": false
            }
        }
    """
    page = "graph.html"

    viewer = Network(notebook=True)
    viewer.from_nx(graphtool_to_networkx_multigraph(G))
    viewer.set_options(physics_off)

    viewer.show(page)
    display(HTML(page))


In [None]:
# Render the MIT sentence graph built earlier in the notebook.
visualize_graph(graph)


## Define Dataset Analysis Tools

### Diameter Approximator

In [None]:
from graph_tool.all import Graph, GraphView, label_largest_component, pseudo_diameter


def approximate_diameter(graph):
    """Approximate the diameter of the largest connected component of ``graph``.

    Args:
        graph: graph-tool graph to analyse.

    Returns:
        The first element of the ``pseudo_diameter`` result, computed on a
        view of ``graph`` restricted to its largest component.
    """
    # Vertex filter marking the members of the largest component.
    component_mask = label_largest_component(graph)
    component_view = GraphView(graph, vfilt=component_mask)

    # pseudo_diameter returns a pair; only the first element is needed here.
    return pseudo_diameter(component_view)[0]


# print("Diameter: ", approximate_diameter(graph))


### % of parallel edges

In [None]:
# List every edge whose "terminal" edge property is truthy, together with the
# label / NER label of its endpoints and the sentence stored on the edge.
print("Terminal edges:")
for e in graph.edges():
    if graph.edge_properties["terminal"][e]:  # Check if the edge is terminal
        print(
            f"Source: {graph.vertex_properties['label'][e.source()]} :label {graph.vertex_properties['ner_label'][e.source()]} , Target: label: {graph.vertex_properties['label'][e.target()]} id: {e.target()}, Sentence: {graph.edge_properties['sentence'][e]}"
        )


In [None]:
from collections import Counter
from graph_tool.all import Graph


# Add example edges (if needed)
# g.add_edge(source, target)
def count_parallel_edges(g):
    """Return the percentage of edges that belong to a parallel-edge group.

    Only edges whose "terminal" edge property is falsy are grouped. Two such
    edges are parallel when they join the same unordered pair of vertices.
    The denominator is ``g.num_edges()``, i.e. it also counts terminal edges.

    Args:
        g: Graph exposing ``edges()``, ``num_edges()`` and a "terminal" edge
            property.

    Returns:
        Percentage in the range 0-100, or ``0`` when the graph has no edges.
    """
    pair_counts = Counter()
    for e in g.edges():
        if g.edge_properties["terminal"][e]:
            continue
        u, v = e.source(), e.target()
        # Order the endpoints so (a, b) and (b, a) land in the same bucket.
        pair_counts[(min(u, v), max(u, v))] += 1

    # Every edge in a bucket holding more than one edge counts as parallel.
    duplicated = sum(n for n in pair_counts.values() if n > 1)

    total = g.num_edges()
    if total <= 0:
        return 0
    return (duplicated / total) * 100


# print(f"Percentage of parallel edges: {count_parallel_edges(graph):.2f}%")


## Convert SQuAD Dataset

### Schema Explorer

In [None]:
def print_json_schema(data, indent=0):
    """Print an indented outline of the structure of a JSON-like value.

    Dicts list each key with the type name of its value and recurse into
    nested dicts and lists. Lists are described by their first element only
    (or marked as empty). Any other value prints just its type name.

    Args:
        data: The value to describe.
        indent: Number of leading spaces for this nesting level.
    """
    pad = " " * indent

    if isinstance(data, dict):
        print(f"{pad}{{")
        key_pad = " " * (indent + 2)
        for name, item in data.items():
            nested = isinstance(item, (dict, list))
            # Containers get an arrow and are expanded on the following lines.
            suffix = " ->" if nested else ","
            print(f'{key_pad}"{name}": {type(item).__name__}{suffix}')
            if nested:
                print_json_schema(item, indent + 4)
        print(f"{pad}}}")
        return

    if isinstance(data, list):
        print(f"{pad}[")
        if data:
            # Only the first element is inspected to describe the list.
            print_json_schema(data[0], indent + 2)
        else:
            print(" " * (indent + 2) + "Empty list")
        print(f"{pad}]")
        return

    print(f"{pad}{type(data).__name__}")

### Download Dataset