# Convert JSON backup files to GEXF format for graphing in Gephi

In [22]:
pip install networkx -q

You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [41]:
#Imports

import networkx as nx
from collections import defaultdict
from datetime import datetime
import json


In [42]:
# Paths to the saved JSON backup files. TODO: switch to dynamic paths or the backups folder on the HPC.

def load_json(path):
    """Open the file at `path` as UTF-8 text and return its parsed JSON content."""
    with open(path, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

# Root of the local project checkout. Every backup path below is derived from
# it, so this is the only value to change when running on another machine.
PROJECT_ROOT = "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH"

likes_path = f"{PROJECT_ROOT}/notebooks/likes_backup_2025-03-23_19-07-24.json"
posts_path = f"{PROJECT_ROOT}/data/backups_test/posts/posts_backup_1.json"
follows_path = f"{PROJECT_ROOT}/notebooks/follows_backup_2025-03-23_19-08-02.json"
reposts_path = f"{PROJECT_ROOT}/notebooks/reposts_backup_2025-03-23_19-08-34.json"

In [43]:
# Only the posts backup is loaded here; the likes, follows and reposts loads
# are currently disabled (commented out).
#likes = load_json(likes_path)
posts = load_json(posts_path)
#follows = load_json(follows_path)
#reposts = load_json(reposts_path)

In [44]:
# Reference example of one record, kept as a bare string so the cell just echoes it.
# Presumably a single entry of the posts backup: its keys (repo, timestamp, text,
# langs, cid, uri) are the fields read from each post further down.
"""
    {"repo": "did:plc:rmwb2eivxpcver26ueawtxcn", "timestamp": "2025-03-23T18:07:11.422Z", "seq": 6894436866, "text": "This is an absolute WOW!", 
    "langs": ["en"], "cid": "bafyreicdz4pdq5m5adkr7673mgayrf4llvesnigmrjru7elso5b5qomrum", 
    "uri": "at://did:plc:rmwb2eivxpcver26ueawtxcn/app.bsky.feed.post/3ll2tho47oc26"}
"""

'\n    {"repo": "did:plc:rmwb2eivxpcver26ueawtxcn", "timestamp": "2025-03-23T18:07:11.422Z", "seq": 6894436866, "text": "This is an absolute WOW!", \n    "langs": ["en"], "cid": "bafyreicdz4pdq5m5adkr7673mgayrf4llvesnigmrjru7elso5b5qomrum", \n    "uri": "at://did:plc:rmwb2eivxpcver26ueawtxcn/app.bsky.feed.post/3ll2tho47oc26"}\n'

In [49]:
import re

def clean_text(text):
    """Return `text` with characters that are unsafe in the XML-based GEXF output removed.

    Removes ASCII control characters (tab, newline and carriage return are
    kept), DEL, lone UTF-16 surrogates and the non-characters U+FFFE / U+FFFF.
    Lone surrogates can appear in strings parsed from JSON escapes and cannot
    be encoded as UTF-8 when the graph file is written.

    Falsy input (None or an empty string) yields an empty string.
    """
    if not text:
        return ""
    return re.sub(
        r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\uD800-\uDFFF\uFFFE\uFFFF]",
        "",
        text,
    )


In [50]:
# Directed graph with one node per post and "authored" edges from the author
# (DID) to each of their posts.
G = nx.DiGraph()

# Per-author aggregates collected while iterating over the posts.
user_meta = defaultdict(lambda: {
    "post_count": 0,
    "first_post": None,    # earliest successfully parsed timestamp (datetime) or None
    "last_post": None,     # latest successfully parsed timestamp (datetime) or None
    "langs": set(),
    "sample_texts": []     # at most 3 cleaned post texts
})

# Extract key fields from posts:

for post in posts:
    did = post.get("repo")
    timestamp = post.get("timestamp")
    langs = post.get("langs") or []
    text = post.get("text") or ""
    cid = post.get("cid")
    uri = post.get("uri")

    # Skip posts without an author or without a cid: both are used as node ids.
    if not did or not cid:
        continue

    meta = user_meta[did]
    meta["post_count"] += 1
    meta["langs"].update(langs)
    # TODO: only the first 3 texts per author are kept as samples.
    if text and len(meta["sample_texts"]) < 3:
        # Clean here so the joined sample text is safe to write into the GEXF file.
        meta["sample_texts"].append(clean_text(text.strip()))

    # Parse the timestamp into a datetime for comparisons; it is converted back
    # to ISO format when the user nodes are written for GEXF / Gephi.
    try:
        ts = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
        if meta["first_post"] is None or ts < meta["first_post"]:
            meta["first_post"] = ts
        if meta["last_post"] is None or ts > meta["last_post"]:
            meta["last_post"] = ts
    except (AttributeError, TypeError, ValueError):
        # Best effort: a missing or non-string timestamp (AttributeError), an
        # unparseable value (ValueError) or a naive/aware datetime mix
        # (TypeError) leaves the first/last post markers untouched.
        pass

    G.add_node(
        cid,
        label=f"post:{cid[:6]}...",
        type="post",
        author=did,
        timestamp=timestamp or "",
        text=clean_text(text[:100].strip()),
        langs=", ".join(langs),
        # The GEXF writer rejects None attribute values, so fall back to "".
        uri=uri or ""
    )

    G.add_edge(
        did,
        cid,
        label="authored",
        timestamp=timestamp or ""
    )

In [51]:
# Attach the aggregated per-author metadata to each author node. The node
# already exists if an "authored" edge was added for it; add_node then only
# sets its attributes.
for did, meta in user_meta.items():
    G.add_node(
        did,
        label=did,
        type="user",
        post_count=meta["post_count"],
        first_post=meta["first_post"].isoformat() if meta["first_post"] else "",
        last_post=meta["last_post"].isoformat() if meta["last_post"] else "",
        # Sort the set so the exported attribute is identical across kernel restarts.
        langs=", ".join(sorted(meta["langs"])),
        sample_texts=" | ".join(meta["sample_texts"])
    )

# --- Export to GEXF ---
output_path = "posts_with_edges.gexf"
nx.write_gexf(G, output_path)
print(f"GEXF saved to {output_path} with {len(G.nodes)} nodes and {len(G.edges)} edges.")



GEXF saved to posts_with_edges.gexf with 18063 nodes and 9999 edges.


In [39]:
# Number of records in the loaded `posts` list.
len(posts)

10000

In [54]:
import re
import networkx as nx
from collections import defaultdict
from datetime import datetime
import json

# --- Utility to clean text for GEXF compatibility ---
def clean_text(text):
    """Strip characters from `text` that would corrupt the XML-based GEXF export.

    Dropped: ASCII control characters other than tab, newline and carriage
    return; DEL; lone UTF-16 surrogates (possible in JSON-decoded strings and
    not encodable as UTF-8); and the non-characters U+FFFE / U+FFFF.

    Returns an empty string for falsy input (None or "").
    """
    if not text:
        return ""
    return re.sub(
        r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\uD800-\uDFFF\uFFFE\uFFFF]",
        "",
        text,
    )

# --- Load your JSON files ---
def load_json(path):
    """Parse the UTF-8 JSON file located at `path` and return the decoded object."""
    with open(path, encoding="utf-8", mode="r") as source:
        data = json.load(source)
    return data

# Folder containing the test backups. Both input paths are derived from it,
# so update this single value when the data lives elsewhere.
BACKUPS_DIR = "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/data/backups_test"

posts_path = f"{BACKUPS_DIR}/posts/posts_backup_1.json"
follows_path = f"{BACKUPS_DIR}/follows/follows_backup_1.json"

# Parse the posts and follows backups into memory.
posts = load_json(posts_path)
follows = load_json(follows_path)

# --- Initialize directed graph ---
G = nx.DiGraph()

# --- Collect user metadata from posts ---
# One record per author DID, filled in while iterating over the posts.
user_meta = defaultdict(lambda: {
    "post_count": 0,
    "first_post": None,    # earliest successfully parsed timestamp (datetime) or None
    "last_post": None,     # latest successfully parsed timestamp (datetime) or None
    "langs": set(),
    "sample_texts": []     # at most 3 cleaned post texts
})

for post in posts:
    did = post.get("repo")
    timestamp = post.get("timestamp")
    langs = post.get("langs") or []
    text = post.get("text") or ""

    # A post without an author cannot be attributed to any user.
    if not did:
        continue

    meta = user_meta[did]
    meta["post_count"] += 1
    meta["langs"].update(langs)
    if text and len(meta["sample_texts"]) < 3:
        meta["sample_texts"].append(clean_text(text.strip()))

    try:
        # fromisoformat does not accept a trailing "Z" on older Python
        # versions, so rewrite it as an explicit UTC offset first.
        ts = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
        if meta["first_post"] is None or ts < meta["first_post"]:
            meta["first_post"] = ts
        if meta["last_post"] is None or ts > meta["last_post"]:
            meta["last_post"] = ts
    except (AttributeError, TypeError, ValueError):
        # Best effort: a missing or non-string timestamp (AttributeError), an
        # unparseable value (ValueError) or a naive/aware datetime mix
        # (TypeError) leaves the first/last post markers untouched.
        pass

# --- Add users from posts ---
# One node per author, carrying the aggregates gathered from their posts.
for did, meta in user_meta.items():
    G.add_node(
        did,
        label=did,
        type="user",
        post_count=meta["post_count"],
        first_post=meta["first_post"].isoformat() if meta["first_post"] else "",
        last_post=meta["last_post"].isoformat() if meta["last_post"] else "",
        # Sort the set so the exported attribute is identical across kernel restarts.
        langs=", ".join(sorted(meta["langs"])),
        sample_texts=" | ".join(meta["sample_texts"])
    )

# --- Add users from follows (ensure both ends are nodes) ---
for follow in follows:
    source = follow.get("repo")            # Follower
    target = follow.get("followed_user")   # Followed

    # Both endpoints are required to form an edge.
    if not source or not target:
        continue

    # Users seen only in follows get a minimal node; existing nodes keep the
    # attributes that were set from their posts.
    for did in (source, target):
        if did not in G:
            G.add_node(did, label=did, type="user")

    # `or ""` also covers an explicit null timestamp: the GEXF writer rejects
    # None attribute values, and .get(key, "") only handles a missing key.
    G.add_edge(source, target, label="follows", timestamp=follow.get("timestamp") or "")

# --- Export to GEXF ---
output_path = "user_follow_graph.gexf"
nx.write_gexf(G, output_path)
print(f"GEXF saved to {output_path} with {len(G.nodes)} nodes and {len(G.edges)} edges.")


GEXF saved to user_follow_graph.gexf with 19137 nodes and 9968 edges.


In [55]:
# TODO: save the exported GEXF files to data/gexf