In [1]:
"""
Credit: https://flavioclesio.com/cosine-similarity-search-for-new-documents-using-scikit-learn
Source SO post: https://stackoverflow.com/questions/44862712/td-idf-find-cosine-similarity-between-new-document-and-dataset/44863365#44863365

Disclaimer: I am not a data scientist, just a random guy who wanted to make an automatic
"related posts" section generator. This script is by no means professional
or comprehensive in any way; it's just a quick hack that is good enough for me.
"""

import math
import numpy
import subprocess
import warnings
from dataclasses import dataclass, field
from pathlib import Path
from IPython.core.display import display, HTML

import markdown
from bs4 import BeautifulSoup

import json

import graphviz
import umap
from sentence_transformers import SentenceTransformer, util

sentence_transformer = SentenceTransformer("all-MiniLM-L6-v2")

  from IPython.core.display import display, HTML
  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [9]:
# Newline-separated paths of the three most recently committed files whose
# path contains ".md", newest first, de-duplicated by the awk filter.
# `grep -F` matches ".md" literally; as a plain regex the dot would match any
# character, letting paths such as "cmd.py" take one of the three slots.
RECENTLY_MODIFIED_FILES = subprocess.check_output(
    "git log --pretty=format: --name-only | grep -F '.md' | awk '!seen[$0]++' | head -n 3",
    shell=True,
).decode()


def was_recently_modified(file_path: Path) -> bool:
    """Tell whether ``file_path`` shows up in ``RECENTLY_MODIFIED_FILES``.

    The check is a substring test of ``str(file_path)`` against every line of
    the git output, so the path only has to appear somewhere inside a line.
    """
    needle = str(file_path)
    return any(needle in entry for entry in RECENTLY_MODIFIED_FILES.split("\n"))


def extract_title(file_path: Path, content: str) -> str:
    """Return the title declared in a post's front matter.

    The first line that starts with ``title: `` (leading whitespace ignored)
    wins. Matching is anchored to the start of the line, in the same spirit as
    ``extract_tags``, so prose such as "The subtitle: ..." further down the
    document can no longer override the real title. Only the leading prefix is
    removed, so a title that itself contains "title: " stays intact.

    Args:
        file_path: Where the content came from; only used in the error message.
        content: Raw text of the post.

    Returns:
        The title with surrounding whitespace stripped.

    Raises:
        ValueError: If no non-empty title line is present.
    """
    prefix = "title: "

    for line in content.splitlines():
        candidate = line.lstrip()
        if not candidate.startswith(prefix):
            continue

        title = candidate.removeprefix(prefix).strip()
        if title:
            return title

    raise ValueError(f"Title not found in {file_path}")


def extract_tags(file_path: Path, content: str) -> list[str]:
    """Collect the comma-separated tags from the ``tags: `` line of a post.

    When several lines start with ``tags: `` the last one is used. A warning
    mentioning ``file_path`` is emitted when no tags were found; an empty list
    is returned in that case.
    """
    tag_lines = [row for row in content.split("\n") if row.startswith("tags: ")]

    if tag_lines:
        raw_tags = tag_lines[-1].replace("tags: ", "")
        found = [piece.strip() for piece in raw_tags.split(",")]
    else:
        found = []

    if not found:
        warnings.warn(f"No tags defined for {file_path}")

    return found

@dataclass
class RelatedPost:
    """Pairs a post with a score describing how close it is to another post."""

    # Closeness score. get_all_posts() stores the result of Post.distance_to()
    # here, i.e. a distance: smaller values mean the posts are closer together.
    similarity: float
    # The related post itself (forward reference; Post is defined further down).
    post: "Post"

# eq=False keeps the default identity-based comparison and hashing. The
# generated __eq__ would compare the numpy `embeddings` arrays of two distinct
# but otherwise equal posts, which raises "truth value of an array is ambiguous".
@dataclass(eq=False)
class Post:
    """One piece of site content together with the data derived from it.

    Instances are normally built with ``Post.from_path``. The ``_x``/``_y``
    coordinates, ``weirdness`` and ``related_posts`` are filled in afterwards
    by ``get_all_posts``.
    """

    title: str
    content: str
    path: Path
    recently_modified: bool

    # Sentence-transformer embedding of `content`.
    embeddings: numpy.ndarray
    tags: list[str] = field(default_factory=list)
    # Kept out of repr(): related posts point back at each other, so printing
    # them recursively walks the whole graph.
    related_posts: list[RelatedPost] = field(default_factory=list, repr=False)

    # To be filled by UMAP
    _x: float | None = None
    _y: float | None = None

    # Score assigned by get_all_posts(); None until that has run.
    weirdness: float | None = None

    @property
    def x(self):
        """X coordinate of the post; raises ValueError while it is unset."""
        if self._x is None:
            raise ValueError("Value of X coordinate not filled")
        return self._x

    @property
    def y(self):
        """Y coordinate of the post; raises ValueError while it is unset."""
        if self._y is None:
            raise ValueError("Value of Y coordinate not filled")
        return self._y

    @property
    def link(self) -> str:
        """Site-relative URL of the post, e.g. ``content/notes/foo.md`` -> ``/notes/foo``.

        Raises:
            ValueError: If ``path`` is not located under ``content`` (raised by
                ``Path.relative_to``).
        """
        relative = self.path.relative_to("content").with_suffix("")
        # as_posix() keeps forward slashes in the URL on every platform.
        return "/" + relative.as_posix()

    @classmethod
    def from_path(cls, path: Path) -> "Post":
        """Load a Markdown or HTML file and compute its embedding.

        Markdown is rendered to HTML and reduced to plain text; title and tags
        come from the raw Markdown. HTML files get their ``<nav>`` element
        removed (when present), use the file name as title and have no tags.

        Args:
            path: File to load; its suffix must be ``.md`` or ``.html``.

        Raises:
            ValueError: For any other suffix, or when a Markdown file has no title.
        """
        if path.suffix not in {".md", ".html"}:
            # Previously this fell through and failed later on unbound locals.
            raise ValueError(f"Unsupported file type {path.suffix!r} for {path}")

        # Read explicitly as UTF-8 instead of the platform default encoding.
        content_raw = path.read_text(encoding="utf-8")

        if path.suffix == ".md":
            # No easy markdown to text conversion available at the moment :/
            html = markdown.markdown(content_raw)
            content = BeautifulSoup(html, features="html.parser").text

            title = extract_title(path, content_raw)
            tags = extract_tags(path, content_raw)
        else:
            html_tree = BeautifulSoup(content_raw, features="html.parser")
            # Not every page has a <nav>; only strip it when it exists.
            if html_tree.nav is not None:
                html_tree.nav.decompose()
            content = html_tree.text.replace("\n", "")
            tags = []

            title = path.name

            to_trim = content.rfind("Incoming:")
            if to_trim != -1:
                # NOTE(review): this keeps the text from the last "Incoming:"
                # onwards and drops everything before it. If the intent was to
                # cut that trailing section off, it should be content[:to_trim]
                # -- confirm before changing.
                content = content[to_trim:]

        embeddings = sentence_transformer.encode([content])[0]
        return cls(
            title=title,
            content=content,
            path=path,
            recently_modified=was_recently_modified(path),
            tags=tags,
            embeddings=embeddings,
        )

    def distance_to(self, post: "Post") -> float:
        """Euclidean distance between the two posts' ``x``/``y`` coordinates."""
        return math.sqrt((self.x - post.x) ** 2 + (self.y - post.y) ** 2)

    def distance_embedding(self, post: "Post") -> float:
        """Negated cosine similarity of the two embeddings (lower means more alike)."""
        result = -util.cos_sim(self.embeddings, post.embeddings)
        return float(result)


def get_all_posts() -> list[Post]:
    """Load every Markdown post under ``./content/`` and derive its metadata.

    For each post this fills in:

    * ``_x`` / ``_y``: 2-D coordinates from a UMAP projection of the embeddings,
    * ``weirdness``: cosine similarity to a fixed reference phrase, divided by
      the largest such value among all posts,
    * ``related_posts``: every other post, ordered by ascending
      ``Post.distance_to``.

    Returns:
        The loaded posts ordered by path, or an empty list when no Markdown
        file is found.
    """
    # Sorted so the post order, and everything derived from it, does not
    # depend on the order in which the filesystem enumerates files.
    all_posts = [
        Post.from_path(post_path)
        for post_path in sorted(Path("./content/").glob("**/*.md"))
    ]

    if not all_posts:
        # Nothing to project or rank; avoids max() failing on an empty sequence.
        return all_posts

    embeddings = [post.embeddings for post in all_posts]

    # NOTE(review): no random_state is passed to UMAP, so the layout is
    # presumably not reproducible between runs -- confirm whether that matters.
    coordinates = umap.UMAP().fit_transform(embeddings)

    for post, point in zip(all_posts, coordinates):
        post._x, post._y = point

    weirdness_level_embedding = sentence_transformer.encode(
        ["things that are artistic, weird or nerdy"]
    )

    for post in all_posts:
        post.weirdness = float(util.cos_sim(weirdness_level_embedding, post.embeddings))

    max_weirdness = max(post.weirdness for post in all_posts)

    for post in all_posts:
        post.weirdness /= max_weirdness

    for post in all_posts:
        # One distance computation per pair; the stable sort keeps the original
        # post order among equally distant candidates.
        neighbours = [
            RelatedPost(post=other, similarity=post.distance_to(other))
            for other in all_posts
            if other is not post
        ]
        neighbours.sort(key=lambda related: related.similarity)
        post.related_posts = neighbours

    return all_posts

In [10]:
# Load and embed every post once; the cells below reuse this list.
all_posts = get_all_posts()

In [11]:
# Spot-check: title of the first loaded post.
all_posts[0].title

'Music Transcribing'

In [12]:
# Spot-check: title of the second entry in the first post's related-posts ranking.
all_posts[0].related_posts[1].post.title

'Resources on audio & DSP'

In [14]:
# Undirected graph of all posts, laid out by neato and rendered as SVG.
# NOTE(review): "nodesep" is passed as a node attribute here; Graphviz
# documents it as a graph-level attribute, so it may have no effect -- confirm.
graph = graphviz.Graph(
    engine="neato",
    format="svg",
    comment="All Relations",
    graph_attr=dict(
        bgcolor="transparent",
        overlap="false",
        outputorder="edgesfirst",
    ),
    node_attr=dict(shape="box", nodesep="0.55"),
)


def make_edge_id(post_1: "Post", post_2: "Post") -> tuple["Post", "Post"]:
    """Return an order-independent identifier for the edge between two posts.

    The two posts are ordered by title, so ``make_edge_id(a, b)`` and
    ``make_edge_id(b, a)`` produce the same pair. The result is a real tuple,
    as the annotation promises (a list was returned before).
    """
    first, second = sorted((post_1, post_2), key=lambda p: p.title)
    return (first, second)

# Edge ids already drawn, so each pair of posts is connected at most once.
existing_connections: list[tuple[Post, Post]] = []


for post in all_posts:
    # The border's alpha channel encodes weirdness. Clamp to [0, 1] first: a
    # negative score would otherwise format as e.g. "-1a" and yield an invalid
    # colour string.
    alpha = int(255 * min(max(post.weirdness, 0.0), 1.0))
    color = f"#ffffff{alpha:02x}"

    graph.node(
        post.title,
        color=color,
        fillcolor="#263238",
        style="filled",
        fontcolor="white",
        xlabel="!" if post.recently_modified else "",
        # NOTE(review): this URL uses only the file name, while Post.link also
        # keeps the sub-directory -- confirm which form the site expects.
        URL="/" + post.path.with_suffix("").name,
        # The trailing "!" pins the node at the given (UMAP-derived) position.
        pos=f"{post.x/2.0},{post.y/2.0}!",
    )

    # get_all_posts() already ranked the other posts by ascending distance, so
    # reuse that ranking instead of sorting all posts again for every node.
    related_posts = [related.post for related in post.related_posts]

    # Mirror the content tree under ./generated, one JSON file per post.
    related_posts_json_path = Path("./generated") / post.path.relative_to(
        "content"
    ).with_suffix(".json")
    related_posts_json_path.parent.mkdir(parents=True, exist_ok=True)

    with open(related_posts_json_path, "w", encoding="utf-8") as relations_file:
        json.dump(
            {
                "posts": [
                    {"title": related_post.title, "url": related_post.link}
                    for related_post in related_posts[:2]
                ]
            },
            relations_file,
        )

    # Draw up to two new edges per post, nearest candidates first; pairs that
    # are already connected are skipped and do not count towards the limit.
    connections_created = 0
    for related in related_posts:
        edge_id = make_edge_id(post, related)

        if edge_id not in existing_connections:
            graph.edge(
                post.title,
                related.title,
                # The first new edge of a post is highlighted in white.
                color="white" if connections_created == 0 else "gray",
            )
            existing_connections.append(edge_id)
            connections_created += 1

        if connections_created > 1:
            break
# The format extension is appended on render, so the file that ends up on disk
# is ./generated/connections.svg.
graph.render("./generated/connections")

# Preview the rendered SVG inline on a black backdrop.
connections_preview = HTML(
    '<img style="background-color: black" src="./generated/connections.svg"/>'
)
display(connections_preview)