In [7]:
import csv
import json

def parse_cluster(file_path):
    """Read a cluster listing and return it as a dict keyed by cluster id.

    A line that contains both "Cluster" and "- TOPIC:" starts a new cluster:
    its second whitespace-separated token is converted with ``int`` and used
    as the id, and the text after "- TOPIC:" (stripped) becomes the topic.
    Every later non-blank line is recorded as an item of the most recently
    started cluster: the first token with its first and last characters
    removed is the item number, and the remaining tokens joined by single
    spaces form the text. Non-blank lines that appear before the first
    cluster header are ignored.

    Args:
        file_path: Path of the text file to read.

    Returns:
        dict: ``{cluster_id: {'topic': str, 'content': [{'item_number': str,
        'text': str}, ...]}}``.
    """
    parsed = {}
    current_id = None

    with open(file_path, 'r') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()

            # Header line: open a fresh cluster entry.
            if "Cluster" in stripped and "- TOPIC:" in stripped:
                current_id = int(stripped.split()[1])
                topic = stripped.split("- TOPIC:")[1].strip()
                parsed[current_id] = {'topic': topic, 'content': []}
                continue

            # Skip blank lines and anything seen before the first header.
            if not stripped or current_id is None:
                continue

            tokens = stripped.split()
            parsed[current_id]['content'].append({
                # [1:-1] drops the characters wrapping the number, e.g. "[7]".
                'item_number': tokens[0][1:-1],
                'text': ' '.join(tokens[1:]),
            })

    return parsed


def parse_graph(file_path):
    """Turn a relation CSV into a list of directed links.

    The file is read with ``csv.DictReader``, so its first row must be a
    header naming the columns ``node_i``, ``node_j`` and ``relation``.
    Each data row is translated by its ``relation`` value:

    * ``'C'``  -> ``{'source': node_i, 'target': node_j}``
    * ``'E'``  -> ``{'source': node_j, 'target': node_i}``
    * anything else -> the row is skipped

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        list[dict]: One ``{'source': ..., 'target': ...}`` dict per kept
        row, in file order. Values are the raw strings from the CSV.

    Raises:
        KeyError: If the header has no ``relation`` column, or a kept row
            lacks ``node_i`` / ``node_j``.
    """
    links = []
    # newline='' hands line-ending handling to the csv module; without it,
    # universal-newline translation rewrites "\r\n" inside quoted fields.
    with open(file_path, 'r', newline='') as f:
        for row in csv.DictReader(f):
            relation = row['relation']
            if relation == 'C':
                links.append(
                    {'source': row['node_i'], 'target': row['node_j']})
            elif relation == 'E':
                # 'E' rows point the other way round: node_j -> node_i.
                links.append(
                    {'source': row['node_j'], 'target': row['node_i']})
    return links


def create_json(cluster_path, graph_path, output_path):
    """Combine a cluster listing and a relation CSV into one JSON file.

    The clusters returned by ``parse_cluster`` become the ``nodes`` list
    (one entry per cluster, id converted to ``str``, every node given the
    category ``'Default Behaviour'``), and the result of ``parse_graph``
    becomes the ``links`` list. The document is written to ``output_path``
    with two-space indentation.

    Args:
        cluster_path: Path passed to ``parse_cluster``.
        graph_path: Path passed to ``parse_graph``.
        output_path: Destination of the JSON document (overwritten).
    """
    parsed_clusters = parse_cluster(cluster_path)
    parsed_links = parse_graph(graph_path)

    node_records = []
    for cluster_id, cluster in parsed_clusters.items():
        node_records.append({
            'id': str(cluster_id),
            'topic': cluster['topic'],
            'category': 'Default Behaviour',
            'content': cluster['content'],
        })

    payload = {'nodes': node_records, 'links': parsed_links}

    with open(output_path, 'w') as out_file:
        json.dump(payload, out_file, indent=2)


# Build data_v2.json from the cluster listing (clusters_v2.txt) and the
# relation table (graph_v2.csv). All three paths are relative, so they are
# resolved against the notebook's current working directory.
create_json("clusters_v2.txt", "graph_v2.csv", "data_v2.json")


In [8]:
# Read data_v2.json back from disk and keep the parsed document in
# `output_data`, which the dataset-building cell below consumes.
with open('data_v2.json') as f:
    output_data = json.load(f)
    

In [15]:
import json
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import networkx as nx
import random
import json
# Fetch the NLTK resources this cell relies on (tokenizer models, the
# stop-word corpus and the POS-tagger model). Packages that are already
# installed are reported as up-to-date instead of being downloaded again.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# Twenty hex colours; json_to_dataset cycles through this list to give each
# node category its own cluster colour.
tableau_20 = ['#1f77b4', '#aec7e8', '#ff7f0e', '#ffbb78', '#2ca02c', '#98df8a', 
              '#d62728', '#ff9896', '#9467bd', '#c5b0d5', '#8c564b', '#c49c94', 
              '#e377c2', '#f7b6d2', '#7f7f7f', '#c7c7c7', '#bcbd22', '#dbdb8d', 
              '#17becf', '#9edae5']

# English stop words from the NLTK corpus, held in a set for fast lookups.
stop_words = set(stopwords.words('english'))

from nltk import pos_tag

def find_most_common_word(texts):
    """Return the highest-weighted content word found across ``texts``.

    Every text is tokenized with ``word_tokenize``. Tokens are kept only if
    they are purely alphabetic and, once lowercased, are not in the English
    stop-word list extended with 'person', 'people', 'thing', 'things' and
    'another'. The kept (lowercased) words are POS-tagged as one sequence;
    a word tagged with a tag starting with 'VB' counts twice, any other
    word counts once. The word with the largest total wins; on a tie the
    word that appeared first wins.

    Args:
        texts: Iterable of strings.

    Returns:
        str | None: The winning lowercased word, or ``None`` when no token
        survives the filtering (for example when ``texts`` is empty).
    """
    ignored = set(stopwords.words('english'))
    ignored.update(['person', 'people', 'thing', 'things', 'another'])

    candidates = []
    for text in texts:
        for token in word_tokenize(text):
            lowered = token.lower()
            # The stop list is matched against the lowercased token. Testing
            # the raw token let capitalised stop words such as "The" or
            # "People" through, after which they were lowercased and counted.
            if token.isalpha() and lowered not in ignored:
                candidates.append(lowered)

    if not candidates:
        return None

    weights = Counter()
    for word, tag in pos_tag(candidates):
        # Verbs are deliberately favoured: each occurrence counts double.
        weights[word] += 2 if tag.startswith('VB') else 1

    return weights.most_common(1)[0][0]




def calculate_score(nodes, edges):
    """Attach a betweenness-centrality ``'score'`` to every node dict.

    An undirected ``nx.Graph`` is built from the nodes' ``'key'`` values and
    from ``edges`` (each edge is unpacked into ``Graph.add_edge``). The
    centrality of each node is then stored under ``node['score']``.

    Args:
        nodes: List of dicts, each with a ``'key'`` entry.
        edges: Iterable of edges; each one is unpacked as the arguments of
            ``Graph.add_edge``.

    Returns:
        The same ``nodes`` list; its dicts are modified in place.
    """
    graph = nx.Graph()

    for entry in nodes:
        graph.add_node(entry['key'])
    for endpoints in edges:
        graph.add_edge(*endpoints)

    betweenness = nx.betweenness_centrality(graph)

    for entry in nodes:
        key = entry['key']
        entry['score'] = betweenness[key]

    return nodes


def calculate_layout(nodes, edges, scale=2000, k=0.15, seed=42):
    """Assign ``'x'`` / ``'y'`` coordinates to every node via a spring layout.

    An undirected ``nx.Graph`` is built from the nodes' ``'key'`` values and
    from ``edges``, laid out with ``nx.spring_layout``, and the resulting
    position of each node is written to ``node['x']`` and ``node['y']``.

    Args:
        nodes: List of dicts, each with a ``'key'`` entry.
        edges: Iterable of edges; each one is unpacked as the arguments of
            ``Graph.add_edge``.
        scale: Passed to ``spring_layout`` as ``scale`` (default 2000).
        k: Passed to ``spring_layout`` as ``k`` (default 0.15).
        seed: Passed to ``spring_layout`` as ``seed``. The layout starts from
            random positions, so a fixed default keeps the coordinates the
            same across re-runs; pass ``None`` for a different layout on
            every call.

    Returns:
        The same ``nodes`` list; its dicts are modified in place.
    """
    G = nx.Graph()
    for node in nodes:
        G.add_node(node['key'])

    for edge in edges:
        G.add_edge(*edge)

    pos = nx.spring_layout(G, scale=scale, k=k, seed=seed)

    for node in nodes:
        x, y = pos[node['key']]
        # Store built-in floats so the dicts serialise with json regardless
        # of the numeric type the layout routine returns.
        node['x'], node['y'] = float(x), float(y)

    return nodes


def json_to_dataset(json_data, default_cluster_color, default_tag_image):
    """Convert the nodes/links document into the dataset dict.

    For every entry of ``json_data['nodes']`` one node and one cluster are
    produced, both keyed by ``str(node['id'])`` and labelled with the result
    of ``find_most_common_word`` over the node's content texts. Each distinct
    ``category`` gets the next colour from ``tableau_20`` (wrapping around
    when the palette is exhausted) and becomes a tag. Links are kept as
    ``[source, target]`` pairs only when both endpoints are known nodes.
    Nodes then receive ``'x'``/``'y'`` from ``calculate_layout`` and
    ``'score'`` from ``calculate_score``.

    Args:
        json_data: Dict with ``'nodes'`` (each having ``'id'``,
            ``'category'`` and ``'content'``, a list of dicts with
            ``'text'``) and ``'links'`` (dicts with ``'source'`` and
            ``'target'``).
        default_cluster_color: Accepted for interface compatibility; it is
            not used, because cluster colours are derived from the category.
        default_tag_image: Value stored as ``'image'`` on every tag and
            every label.

    Returns:
        dict: With keys ``'nodes'``, ``'edges'``, ``'clusters'``, ``'tags'``
        and ``'labels'``. Tags and labels are listed in first-seen order so
        that repeated runs produce the same output. A label may be ``None``
        when a node has no usable words.
    """
    dataset = {
        'nodes': [],
        'edges': [],
        'clusters': [],
        'tags': [],
        'labels': [],
    }
    known_keys = set()
    # Dicts keep insertion order, so they serve as ordered sets here; the
    # previous plain sets made the order of tags/labels vary between runs.
    tag_colors = {}
    labels = {}

    # Create nodes and their one-to-one clusters
    for node in json_data['nodes']:
        key = str(node['id'])
        category = node['category']
        text_contents = [item['text'] for item in node['content']]
        most_common_word = find_most_common_word(text_contents)

        # First time a category is seen it takes the next palette colour.
        if category not in tag_colors:
            tag_colors[category] = tableau_20[len(tag_colors) % len(tableau_20)]

        dataset['nodes'].append({
            'key': key,
            'label': most_common_word,
            'tag': category,  # Use category as tag
            'URL': '',  # URL is not provided in the initial data
            # Must equal the cluster's 'key' below, hence the str() form
            # rather than the raw id (which may not be a string).
            'cluster': key,
            'textContent': node['content'],
        })

        # Add a unique cluster for each node
        dataset['clusters'].append({
            'key': key,
            'color': tag_colors[category],  # Cluster color depends on the tag
            'clusterLabel': most_common_word,
            'clusterTextContent': text_contents,
        })

        known_keys.add(key)
        labels[most_common_word] = None

    # Create edges, dropping links whose endpoints are not known nodes
    for link in json_data['links']:
        source = str(link['source'])
        target = str(link['target'])
        if source in known_keys and target in known_keys:
            dataset['edges'].append([source, target])

    # Calculate layout
    dataset['nodes'] = calculate_layout(dataset['nodes'], dataset['edges'])

    # Create tags (tag image is not provided in the initial data)
    dataset['tags'] = [
        {'key': tag, 'image': default_tag_image} for tag in tag_colors
    ]

    # Create labels
    dataset['labels'] = [
        {'key': label, 'image': default_tag_image} for label in labels
    ]

    # Calculate scores
    dataset['nodes'] = calculate_score(dataset['nodes'], dataset['edges'])

    return dataset

# Build the dataset from `output_data`, passing '#6c3e81' as the default
# cluster colour and 'unknown.svg' as the image used for every tag and label.
dataset = json_to_dataset(output_data, '#6c3e81', 'unknown.svg')

# Save the dataset as indented JSON. The relative path is resolved against
# the notebook's working directory, and an existing file is overwritten.
with open('../public/dataset.json', 'w') as f:
    json.dump(dataset, f, indent=2)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/songhaifan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/songhaifan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/songhaifan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
