In [1]:
"""
Parses all posts as text and creates a "semantic map", grouping
similar posts together on a 2D plane. Basically:

1. Sentence embeddings
2. Umap
3. Some semi-random links between nodes to make it look smarter than it actually is
4. Graphviz to SVG
5. Display SVG on the site

This notebook also contains some other small utilities used in the site build process,
but their importance is negligible therefore I do not document them all.

---

Notes on the previous version from the preGPT era:

Credit: https://flavioclesio.com/cosine-similarity-search-for-new-documents-using-scikit-learn
Source SO post: https://stackoverflow.com/questions/44862712/td-idf-find-cosine-similarity-between-new-document-and-dataset/44863365#44863365

Disclaimer: I am not a data scientist, just a random guy who wanted to make an automatic
"related posts" section generator. This script is by no means proffesional
nor comprehensive in any way, it's just a quick hack that is good enough for me.
"""

import math
import numpy
import subprocess
import warnings
from datetime import datetime
from dataclasses import dataclass, field
from pathlib import Path
from IPython.display import display, HTML
import textwrap

import markdown
from bs4 import BeautifulSoup

import json

import graphviz
import umap
from sentence_transformers import SentenceTransformer, util

# Consider using "all-MiniLM-L6-v2" for slightly lower accuracy and 5x performance
sentence_transformer = SentenceTransformer("all-mpnet-base-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
RECENTLY_MODIFIED_FILES = subprocess.check_output(
    "git log --pretty=format: --name-only | grep '.md' | awk '!seen[$0]++' | head -n 3",
    shell=True,
).decode()

print("Recently modified:")
for line in RECENTLY_MODIFIED_FILES.split("\n")[
    :-1
]:  # Last line is empty, skip it with :-1
    print("-", line)


def was_recently_modified(file_path: Path) -> bool:
    for file in RECENTLY_MODIFIED_FILES.split("\n"):
        if str(file_path) in file:
            print(file_path, "was recently modified")
            return True

    return False


def extract_title(file_path: Path, content: str) -> str:
    title = None

    for line in content.split("\n"):
        if "title: " in line:
            title = line.replace("title: ", "").strip()

    if not title:
        raise ValueError(f"Title not found in {file_path}")

    return title


def extract_tags(file_path: Path, content: str) -> list[str]:
    tags = []

    for line in content.split("\n"):
        if line.startswith("tags: "):
            tags = line.replace("tags: ", "").split(",")
            tags = [tag.strip() for tag in tags]

    if len(tags) == 0:
        warnings.warn(f"No tags defined for {file_path}")

    return tags


@dataclass
class RelatedPost:
    similarity: float
    post: "Post"
    # level: int # 1 = very related, 2 less related, etc


@dataclass
class Post:
    title: str
    content: str
    path: Path
    recently_modified: bool

    embeddings: numpy.ndarray
    tags: list[str] = field(default_factory=list)
    related_posts: list[RelatedPost] = field(default_factory=list)

    # To be filled by UMAP
    _x: float | None = None
    _y: float | None = None

    @property
    def x(self):
        if self._x is None:
            raise ValueError("Value of X coordinate not filled")
        return self._x

    @property
    def y(self):
        if self._y is None:
            raise ValueError("Value of Y coordinate not filled")
        return self._y

    @property
    def link(self) -> str:
        post_link = "/" + str(
            self.path.relative_to("content").parent
            / self.path.relative_to("content").stem
        )

        return post_link

    @classmethod
    def from_path(cls, path: Path):
        if path.suffix == ".md":
            content_raw = path.read_text()
            # No easy markdown to text convertsion available at the moment :/
            html = markdown.markdown(content_raw)
            html_tree = BeautifulSoup(html, features="html.parser")
            content = html_tree.text

            title = extract_title(path, content_raw)
            tags = extract_tags(path, content_raw)

        elif path.suffix == ".html":
            content_raw = path.read_text()
            html_tree = BeautifulSoup(content_raw, features="html.parser")
            html_tree.nav.decompose()
            content = html_tree.text.replace("\n", "")
            tags = []

            title = path.name

            to_trim = content.rfind("Incoming:")
            if to_trim != -1:
                content = content[to_trim:]
            # else:
            #     print("No Incoming")
            #     print(content)

        embeddings = sentence_transformer.encode([content])[0]
        return cls(
            title=title,
            content=content,
            path=path,
            recently_modified=was_recently_modified(path),
            tags=tags,
            embeddings=embeddings,
        )

    def __hash__(self) -> None:
        return hash(self.path)

    def distance_to(self, post: "Post") -> float:
        return math.sqrt((self.x - post.x) ** 2 + (self.y - post.y) ** 2)
        # return -util.cos_sim(self.embeddings, post.embeddings)

    def distance_embedding(self, post: "Post") -> float:
        # return math.sqrt((self.x - post.x) ** 2 + (self.y - post.y) ** 2)
        result = -util.cos_sim(self.embeddings, post.embeddings)
        return float(result)


def get_all_posts() -> list[Post]:
    all_posts = []
    all_posts_paths = Path.glob(Path("./content/"), "**/*.md")

    for post_path in all_posts_paths:
        all_posts.append(Post.from_path(post_path))

    embeddings = [post.embeddings for post in all_posts]

    umap_result = umap.UMAP().fit_transform(embeddings)
    # umap_result = manifold.TSNE(
    #     n_components=2,
    #     learning_rate='auto',
    #     init='random',
    #     perplexity=3).fit_transform(numpy.array(embeddings))
    # from sklearn.decomposition import PCA
    # umap_result = PCA(n_components=2).fit_transform(numpy.array(embeddings))

    for post, umap_result in zip(all_posts, umap_result):
        post._x, post._y = umap_result

    weirdness_level_embedding = sentence_transformer.encode(
        ["things that are artistic, weird or nerdy"]
    )

    for post in all_posts:
        post.weirdness = float(util.cos_sim(weirdness_level_embedding, post.embeddings))

    max_weirdness = max(post.weirdness for post in all_posts)

    for post in all_posts:
        post.weirdness /= max_weirdness

    for post in all_posts:
        post.related_posts = [
            RelatedPost(post=similar_post, similarity=post.distance_to(similar_post))
            for similar_post in sorted(
                all_posts, key=lambda post_to_sort: post_to_sort.distance_to(post)
            )
            if similar_post != post
        ]

    return all_posts

Recently modified:
- content/waiting-room.md
- content/now.md
- content/how-to-think.md


In [3]:
all_posts = get_all_posts()

content/now.md was recently modified
content/how-to-think.md was recently modified
content/waiting-room.md was recently modified


In [4]:
all_posts[0].title

'Bookmarks'

In [5]:
all_posts[0].related_posts[1].post.title

'Contact'

In [6]:
graph = graphviz.Graph(
    comment="All Relations",
    graph_attr={
        "bgcolor": "transparent",
        "overlap": "false",
        "outputorder": "edgesfirst",
    },
    format="svg",
    node_attr={"shape": "box", "nodesep": "0.55"},
    engine="neato",
)


existing_connections: list[set[Post, Post]] = []
ACCENT_COLOR = "#82AAFF"
UMAP_POSITION_TO_GRAPHVIZ_MULTIPLIER = 0.40


def make_node(graph: graphviz.Graph, post: Post, accent_style: bool = False):
    if accent_style:
        color = ACCENT_COLOR
    else:
        color = "#ffffff"
        transparency = f"{int(255 * post.weirdness):02x}"
        color += transparency

    xlabel = ""
    if post.recently_modified:
        xlabel = "!"

    graph.node(
        post.title,
        # label="< <B>" + graphviz.nohtml(post.title) + "</B> >",
        label="\n".join(textwrap.wrap(post.title, width=16)),
        color=color,
        fillcolor="#263238",
        style="filled",
        fontcolor=ACCENT_COLOR if accent_style else "white",
        penwidth="2.0" if accent_style else "1.0",
        xlabel=xlabel,
        URL="/" + post.path.with_suffix("").name,
        pos=f"{post.x * UMAP_POSITION_TO_GRAPHVIZ_MULTIPLIER},{post.y * UMAP_POSITION_TO_GRAPHVIZ_MULTIPLIER}!",
    )


for post in all_posts:
    make_node(graph, post)

    related_posts = [
        similiar_post
        for similiar_post in sorted(
            all_posts, key=lambda post_to_sort: post_to_sort.distance_to(post)
        )
        if similiar_post.path != post.path
    ]

    related_posts_json_path = Path("./generated") / post.path.relative_to(
        "content"
    ).with_suffix(".json")
    related_posts_json_path.parent.mkdir(parents=True, exist_ok=True)

    with open(related_posts_json_path, "w", encoding="utf-8") as relations_file:
        json.dump(
            {
                "posts": [
                    {"title": related_post.title, "url": related_post.link}
                    for related_post in related_posts[:2]
                ]
            },
            relations_file,
        )

    connections_created = 0
    for related in related_posts:
        if {post, related} not in existing_connections:
            graph.edge(
                post.title,
                related.title,
                color="white" if connections_created == 0 else "gray",
            )
            existing_connections.append({post, related})
            connections_created += 1

        if connections_created > 1:
            break
# Note: it actually renders to connections.svg
graph.render("./generated/connections")
display(
    HTML('<img style="background-color: black" src="./generated/connections.svg"/>')
)

In [7]:
for current_post in all_posts:
    graph = graphviz.Graph(
        comment="Relations for node",
        graph_attr={
            "bgcolor": "transparent",
            "overlap": "false",
            "outputorder": "edgesfirst",
        },
        format="svg",
        node_attr={"shape": "box", "nodesep": "0.55"},
        engine="neato",
    )

    posts_nearby = sorted(all_posts, key=lambda p: p.distance_to(current_post))

    nodes_in_graph = {current_post}
    make_node(graph, current_post, accent_style=True)

    for post in posts_nearby[1:6]:
        print(" Making node", post.title)
        nodes_in_graph.add(post)
        make_node(graph, post, accent_style=False)

    for connection in existing_connections:
        if len(connection & nodes_in_graph) == 2:
            node_1 = list(connection)[0]
            node_2 = list(connection)[1]

            # print("Linking", node_1.title, "with", node_2.title)
            graph.edge(
                node_1.title,
                node_2.title,
                color="white" if (id(node_1) // 10) % 2 else "gray"
                # color="white" if target in [ post.post for post in post.related_posts][:1] else "gray",
            )

    related_posts_svg_path = Path("./generated") / current_post.path.relative_to(
        "content"
    ).with_suffix("")
    related_posts_svg_path.parent.mkdir(parents=True, exist_ok=True)
    graph.render(related_posts_svg_path)
    print(related_posts_svg_path)
    # graph

 Making node Website experience
 Making node Contact
 Making node Registry-based search engine
 Making node Decentralisation
 Making node Caddy
generated/bookmarks
 Making node Daily open-source software guide
 Making node NixOs
 Making node Caddy
 Making node Decentralisation
 Making node Colour scheme
generated/alternatives
 Making node Decentralisation
 Making node Contact
 Making node Daily open-source software guide
 Making node Content creation workflow
 Making node Software alternatives
generated/caddy
 Making node System configuration
 Making node Colour scheme
 Making node Software alternatives
 Making node About
 Making node 2024's Devlog
generated/nixos
 Making node 2024's Devlog
 Making node 2022's Devlog
 Making node 2023's Devlog
 Making node NixOs
 Making node Listening
generated/about
 Making node Registry-based search engine
 Making node Bookmarks
 Making node Map
 Making node Contact
 Making node Exocortex
generated/website-experience
 Making node Resources on audio &

In [8]:
import json

recent_updates = [ { "title": post.title, "url": post.link } for post in all_posts if post.recently_modified ]

with open("./generated/recently_updated.json", "w") as recent_updates_file:
    print(recent_updates)
    json.dump(recent_updates, recent_updates_file)

[{'title': "2024's Devlog", 'url': '/now'}, {'title': 'How to think', 'url': '/how-to-think'}, {'title': 'The Waiting room', 'url': '/waiting-room'}]


In [9]:
with open("./generated/build_date.txt", "w") as build_date:
    build_date.write(
        datetime.strftime(datetime.now(), '%d/%m/%y %H:%M')
    )

In [7]:
import re
from datetime import datetime
from git import Repo

ATOM_FEED_HEAD = """
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">

<title>Baczek.me Devlog</title>
<link href="https://baczek.me/"/>
<updated>2003-12-13T18:30:02Z</updated>
<author>
<name>Mateusz Bączek</name>
</author>
<id>https://baczek.me/</id>
"""

ATOM_FEED_TAIL =""" 
</feed>
"""

DEVLOG_URL = "https://baczek.me/now"

def generate_entry(link: str, updated: datetime, commit_hash: str) -> str:
    return f"""
      <entry>
        <title>{updated.strftime('%d.%m.%Y')} Devlog update</title>
        <link href="{link}"/>
        <updated>{updated.isoformat()}</updated>
        <summary>DevLog available at {link}</summary>
        <id>https://baczek.me/now#f{commit_hash}</id>
      </entry>
  """

with open("./static/atom.xml", "w") as feed_file:
    repo = Repo('./')
    devlog_changes = list(repo.iter_commits(all=True, paths="./content/now.md"))  # Gets the last 10 commits from all branches.
    devlog_changes.reverse()

    feed_file.write(ATOM_FEED_HEAD)

    for commit in devlog_changes:
        feed_file.write(generate_entry(DEVLOG_URL, commit.committed_datetime, commit.hexsha))

    feed_file.write(ATOM_FEED_TAIL)

print("Atom feed generated")

Atom feed generated
