# Convert JSON files to GEXF format for Gephi Graphing:

In [22]:
pip install networkx -q

You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [23]:
#Imports

import networkx as nx
from collections import defaultdict
from datetime import datetime
import json


In [24]:
# Paths to the saved JSON backup files. Note: switch to dynamic path handling or the backups folder on HPC.

def load_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
    with open(path, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

# Folder holding the backup dumps. Kept in a single constant so relocating the
# backups is a one-line change instead of four.
BACKUP_DIR = "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/notebooks"

# One path per backup file; the resulting strings are identical to the previous literals.
likes_path = f"{BACKUP_DIR}/likes_backup_2025-03-23_19-07-24.json"
posts_path = f"{BACKUP_DIR}/posts_backup_2025-03-23_19-08-29.json"
follows_path = f"{BACKUP_DIR}/follows_backup_2025-03-23_19-08-02.json"
reposts_path = f"{BACKUP_DIR}/reposts_backup_2025-03-23_19-08-34.json"

In [25]:
# Read the four backups in a fixed order: likes, posts, follows, reposts.
likes, posts, follows, reposts = [
    load_json(backup_path)
    for backup_path in (likes_path, posts_path, follows_path, reposts_path)
]

In [None]:
"""
    {"repo": "did:plc:rmwb2eivxpcver26ueawtxcn", "timestamp": "2025-03-23T18:07:11.422Z", "seq": 6894436866, "text": "This is an absolute WOW!", 
    "langs": ["en"], "cid": "bafyreicdz4pdq5m5adkr7673mgayrf4llvesnigmrjru7elso5b5qomrum", 
    "uri": "at://did:plc:rmwb2eivxpcver26ueawtxcn/app.bsky.feed.post/3ll2tho47oc26"}
"""

In [26]:
# NOTE: change the path to the backup folder on HPC and load the backups
# dynamically into one large JSON.
posts_path = "/Users/tobiasmichelsen/Bachelor_Project/DS_BachelorProject_PH/notebooks/posts_backup_2025-03-23_19-08-29.json"
posts = load_json(posts_path)


# --- Aggregate per-user post statistics, keyed by the post's "repo" value ---
user_meta = defaultdict(lambda: {
    "post_count": 0, #remove when adding edges
    "first_post": None,
    "last_post": None
})

# Number of posts whose timestamp was missing or could not be used. They still
# count towards post_count, but are reported instead of being dropped silently.
skipped_timestamps = 0

for post in posts:
    did = post.get("repo")
    timestamp = post.get("timestamp")

    # A post without an author identifier cannot be attributed to any node.
    if not did:
        continue

    meta = user_meta[did]
    meta["post_count"] += 1

    try:
        # datetime.fromisoformat only accepts a trailing "Z" from Python 3.11 on,
        # so it is rewritten as an explicit UTC offset first.
        ts = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
        if meta["first_post"] is None or ts < meta["first_post"]:
            meta["first_post"] = ts
        if meta["last_post"] is None or ts > meta["last_post"]:
            meta["last_post"] = ts
    except (AttributeError, TypeError, ValueError):
        # AttributeError: timestamp is missing or not a string.
        # ValueError: the string is not in a format fromisoformat understands.
        # TypeError: timezone-aware and naive datetimes cannot be compared.
        skipped_timestamps += 1

if skipped_timestamps:
    print(f"Warning: {skipped_timestamps} posts had a missing or unparseable timestamp.")

# --- Build Graph and Add Nodes Only ---
G = nx.Graph()  # No edges yet

for did, meta in user_meta.items():
    # Timestamps are stored as ISO strings; an empty string marks "no usable timestamp".
    G.add_node(
        did,
        label=did,
        post_count=meta["post_count"],
        first_post=meta["first_post"].isoformat() if meta["first_post"] else "",
        last_post=meta["last_post"].isoformat() if meta["last_post"] else ""
    )

# --- Save GEXF ---
output_path = "posts_users_graph.gexf"
nx.write_gexf(G, output_path)
print(f"GEXF saved to {output_path} with {len(G.nodes)} nodes.")


GEXF saved to posts_users_graph.gexf with 4368 nodes.


In [27]:
# NOTE(review): get_user_nodes_from_posts is not defined in any cell visible above;
# presumably it comes from a deleted cell or leftover kernel state. Confirm it is
# defined or imported earlier so this cell survives Restart & Run All.
get_user_nodes_from_posts(posts)

({'did:plc:gi23djbyhqpx2ykajeuzzy3d',
  'did:plc:bsbak762ynou5vd2upxlset2',
  'did:plc:rrr3bhy3xtn7xpxvjuys5syz',
  'did:plc:lcwohl527n6un23jwlisy5db',
  'did:plc:5skzt22ub7h67d2p5p4pgk6c',
  'did:plc:44rsqirylhht3lul4dzd55ak',
  'did:plc:xj4jw744nk2nz7777zvdtzko',
  'did:plc:gc6xqchmpzkwlakd2gu24gm2',
  'did:plc:abfjxvcxyh3tz7djttejlm6b',
  'did:plc:xxk3vlzs7f5uocvkxvd5chxn',
  'did:plc:lnrh26pzh7633kayne4via4n',
  'did:plc:pauh5sxufu3nov2okvexgkg5',
  'did:plc:ahqle3mha3ajudjigm2ptqij',
  'did:plc:sfg7stlgumtynct7d6qw2yi4',
  'did:plc:onc7gdafjz4nqt63bbuios7g',
  'did:plc:2s7ovijrpzod2jr25ukwc7sv',
  'did:plc:tsc2gwchmoom4sor2e2puisw',
  'did:plc:t4kspsivbqiihv2pqsg5y4p7',
  'did:plc:7sjmv2eqcljuuw6qyyx3kuju',
  'did:plc:4qhgeiq766g7nxbh6sqlni6d',
  'did:plc:o3oejwzhgtzwxswcjp3tvd7h',
  'did:plc:k3kpevprdqs4hx4f6dbixivu',
  'did:plc:vrffug3y2bt5ovra5os6dfb2',
  'did:plc:q4mk3cmiwyozgfsnjirpav2o',
  'did:plc:qkewonm5zwnj6aoyl74465nc',
  'did:plc:hxvidvpsiqlgp5dboyo66zeq',
  'did:plc:m