In [4]:
import csv
import json
import re
import random
import math
import os

def parse_cluster(file_path):
    """Parse a clusters text file into a dict keyed by integer cluster ID.

    Two kinds of lines are recognised (after stripping whitespace):

    * Header: contains the words "Index", "Cluster" and "topic" and matches
      ``Cluster <digits> - topic : <topic>``. It opens a new cluster whose
      ID is the number after "Cluster" (not the one after "Index").
    * Item: has the form ``(<item number>) <text>`` and is attached to the
      most recently opened cluster. Items seen before any header are ignored.

    All other lines are skipped.

    Args:
        file_path: Path of the UTF-8 encoded clusters file.

    Returns:
        dict: ``{cluster_id: {'topic': str, 'content': [item, ...]}}`` where
        ``cluster_id`` is an ``int`` and each item is
        ``{'item_number': str, 'text': str}``. If the same cluster ID appears
        in several headers, the later header replaces the earlier entry.

    Raises:
        OSError: If the file cannot be opened.
    """
    header_re = re.compile(r'Cluster (\d+) - topic : (.+)')
    item_re = re.compile(r'\(([^)]+)\) (.+)$')
    header_words = ("Index", "Cluster", "topic")

    clusters = {}
    cluster_id = None
    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily; there is no need to hold every line in memory.
        for raw_line in f:
            line = raw_line.strip()

            # Only lines carrying all three keywords are header candidates.
            header = None
            if all(word in line for word in header_words):
                header = header_re.search(line)

            if header:
                cluster_id = int(header.group(1))
                clusters[cluster_id] = {
                    'topic': header.group(2),
                    'content': []
                }
                continue

            if cluster_id is None:
                # No cluster opened yet, so there is nothing to attach an item to.
                continue

            # A line that mentions the header keywords but is not a real header
            # reaches this point too, so an item whose text happens to contain
            # those words is kept instead of being silently dropped.
            item = item_re.match(line)
            if item:
                clusters[cluster_id]['content'].append({
                    'item_number': item.group(1),
                    'text': item.group(2)
                })
    return clusters




In [11]:
# Parse the cluster definitions from disk; the bare name on the last line
# makes the notebook display the resulting mapping.
clusters = parse_cluster(file_path="cluster/clusters_final.txt")
clusters

{0: {'topic': 'a person buy something to do something',
  'content': [{'item_number': 'XX463',
    'text': 'a person go to buy another thing for something'},
   {'item_number': '20950',
    'text': 'a person buy another thing necessary to do something'},
   {'item_number': '16634',
    'text': 'a person buy another thing for something'},
   {'item_number': '29412',
    'text': 'a person buy another person something to aid in the use of another thing'}]},
 1: {'topic': 'a person buy something from another person',
  'content': [{'item_number': 'X1491',
    'text': 'another person buy something from a person'},
   {'item_number': '13525',
    'text': 'a person buy something from another person'},
   {'item_number': '38540', 'text': 'a person buy something from a person'},
   {'item_number': '26693',
    'text': 'a person want to buy something from another person'}]},
 2: {'topic': 'a person buy something for another person',
  'content': [{'item_number': 'X5505',
    'text': 'a person bu

In [7]:
def parse_graph(file_path):
    """Parse the graph CSV file into directed links between clusters.

    The first row is treated as a header and skipped. Every other row must
    have at least three columns: two cluster cells, each containing
    ``(<digits>) topic``, followed by a relation code. Rows that are too
    short, or whose cluster cells do not match, are ignored.

    Relation codes (compared after stripping whitespace):

    * ``'C'`` -> edge from the first cluster to the second.
    * ``'E'`` -> edge from the second cluster to the first.
    * anything else -> no edge is created.

    Duplicate directed edges are emitted only once, in first-seen order.

    Args:
        file_path: Path of the UTF-8 encoded CSV file.

    Returns:
        tuple: ``(links, connected_nodes)`` where ``links`` is a list of
        ``{'source': str, 'target': str}`` dicts and ``connected_nodes`` is
        the set of cluster IDs (as strings) seen in any row with two valid
        cluster cells, including rows whose relation code produced no edge.
        An empty file yields ``([], set())``.

    Raises:
        OSError: If the file cannot be opened.
    """
    cluster_ref_re = re.compile(r'\((\d+)\) topic')

    links = []
    connected_nodes = set()
    seen_edges = set()  # (source, target) pairs already emitted

    # newline='' lets the csv module do its own line-ending handling, as its
    # documentation requires for file objects.
    with open(file_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        # Skip the header row; the default avoids StopIteration on an empty file.
        next(reader, None)

        for row in reader:
            if len(row) < 3:
                continue

            # Extract cluster IDs from format "(ID) topic : description"
            match_a = cluster_ref_re.search(row[0].strip())
            match_b = cluster_ref_re.search(row[1].strip())
            if not (match_a and match_b):
                continue

            id_a = match_a.group(1)
            id_b = match_b.group(1)

            # Both IDs count as connected regardless of the relation code.
            connected_nodes.update([id_a, id_b])

            relation = row[2].strip()
            if relation == 'C':
                edge = (id_a, id_b)
            elif relation == 'E':
                edge = (id_b, id_a)
            else:
                continue

            if edge not in seen_edges:
                seen_edges.add(edge)
                links.append({'source': edge[0], 'target': edge[1]})

    return links, connected_nodes


In [12]:
# Read the directed edges and the set of node IDs that appear in the graph
# file; the bare name on the last line makes the notebook display the edges.
links, connected_nodes = parse_graph(file_path="graph/graph_final.csv")
links

[{'source': '2', 'target': '149'},
 {'source': '7', 'target': '2'},
 {'source': '2', 'target': '432'},
 {'source': '358', 'target': '5'},
 {'source': '376', 'target': '6'},
 {'source': '6', 'target': '1072'},
 {'source': '336', 'target': '6'},
 {'source': '6', 'target': '998'},
 {'source': '284', 'target': '9'},
 {'source': '10', 'target': '119'},
 {'source': '10', 'target': '951'},
 {'source': '51', 'target': '10'},
 {'source': '769', 'target': '10'},
 {'source': '324', 'target': '10'},
 {'source': '493', 'target': '10'},
 {'source': '1112', 'target': '10'},
 {'source': '652', 'target': '10'},
 {'source': '11', 'target': '347'},
 {'source': '11', 'target': '323'},
 {'source': '11', 'target': '55'},
 {'source': '11', 'target': '933'},
 {'source': '612', 'target': '11'},
 {'source': '321', 'target': '11'},
 {'source': '11', 'target': '1054'},
 {'source': '507', 'target': '11'},
 {'source': '57', 'target': '11'},
 {'source': '828', 'target': '12'},
 {'source': '12', 'target': '776'},
 {'

In [13]:
# Graph node IDs are strings while the cluster dict is keyed by ints, so the
# comparison is done on the string form of the cluster keys.
all_cluster_ids = {str(cluster_key) for cluster_key in clusters}
# Nodes referenced by the graph that have no matching cluster entry.
missing_nodes = connected_nodes.difference(all_cluster_ids)
missing_nodes

set()