In [11]:
import pickle
from pathlib import Path

import networkx as nx
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the pre-chunked legal cases from CSV; later cells use its
# "chunk_id" and "text" columns.
CHUNKED_CASES_CSV = "chunked_law_cases.csv"
chunked_cases_df = pd.read_csv(CHUNKED_CASES_CSV)

In [3]:
# Initialize an empty directed graph. Later cells add one node per chunk and
# directed, weighted edges pointing from each chunk to its most similar chunks.
G = nx.DiGraph()

In [4]:
# Instantiate the sentence-embedding model used to vectorise chunk text.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [5]:
# Embed every chunk's text in one batch. Row i of the resulting tensor
# corresponds to row i of chunked_cases_df.
chunk_texts = chunked_cases_df["text"].tolist()
chunk_embeddings = model.encode(chunk_texts, convert_to_tensor=True)

In [6]:
# Add one node per chunk, keyed by chunk_id, carrying the chunk text and its
# embedding as node attributes.

# Guard against a stale `chunk_embeddings` left in the kernel from a different
# version of the DataFrame: zip() below would otherwise truncate silently.
assert len(chunk_embeddings) == len(chunked_cases_df), (
    "chunk_embeddings and chunked_cases_df are out of sync; re-run the embedding cell"
)

# Pair rows with embeddings by position rather than by DataFrame index label,
# so the pairing stays correct even if the frame is filtered or re-indexed.
node_records = zip(
    chunked_cases_df["chunk_id"],
    chunked_cases_df["text"],
    chunk_embeddings,
)
for chunk_id, chunk_text, chunk_embedding in node_records:
    G.add_node(chunk_id, text=chunk_text, embedding=chunk_embedding)

In [7]:
# Establish logical dependencies between chunks: every chunk gets a directed,
# weighted edge to each of its TOP_K most cosine-similar chunks.
TOP_K = 5  # number of nearest neighbours linked from each chunk

# The tensor -> NumPy conversion does not depend on the loop variable, so do it
# once instead of copying the whole embedding matrix on every iteration.
embedding_matrix = chunk_embeddings.cpu().numpy()
chunk_ids = chunked_cases_df["chunk_id"].tolist()

# A chunk cannot be its own neighbour, so at most n - 1 candidates exist.
neighbour_count = min(TOP_K, len(chunk_ids) - 1)

for position, chunk_id in enumerate(chunk_ids):
    # cosine_similarity returns a (1, n) matrix here; keep the single row.
    similarities = cosine_similarity(
        embedding_matrix[position : position + 1], embedding_matrix
    )[0]

    # Exclude the chunk itself explicitly. Relying on it sorting last is not
    # safe: duplicate chunks or floating-point rounding can rank another chunk
    # at or above it, which would create a self-loop and drop a real neighbour.
    similarities[position] = -np.inf

    if neighbour_count <= 0:
        # Single-chunk dataset: there is nothing to link to.
        continue

    # Indices of the most similar chunks, in ascending order of similarity.
    top_related = np.argsort(similarities)[-neighbour_count:]

    for related_index in top_related:
        G.add_edge(
            chunk_id,
            chunk_ids[related_index],
            weight=similarities[related_index],
        )

In [12]:
# Save graph using pickle (more suitable for complex objects)
# NOTE(review): node "embedding" attributes are whatever model.encode returned
# with convert_to_tensor=True, so loading this file presumably requires the
# same tensor library to be installed - confirm with downstream consumers.
# Only unpickle this file from a trusted location: pickle can execute code.
graph_path = Path("../data/processed/logical_reasoning/graph_data.pkl")

# Opening a file for writing does not create missing parent directories;
# create them so a fresh checkout does not fail with FileNotFoundError.
graph_path.parent.mkdir(parents=True, exist_ok=True)

with graph_path.open("wb") as f:
    pickle.dump(G, f)