In [1]:
from related_generator.post import Post, get_all_posts
import nltk
import umap
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path
import graphviz
import json

from bs4 import BeautifulSoup


In [2]:
# Load every post via the project helper; the cells below iterate over this
# collection and index into it by position.
all_posts = get_all_posts()



In [3]:
# Built once and reused: the vectorizer invokes `tokenizer` once per document,
# so constructing the stemmer inside the function repeats the same work.
_STEMMER = SnowballStemmer("english")


def tokenizer(text: str) -> list[str]:
    """Split ``text`` into word tokens and reduce each one to its English stem.

    Args:
        text: Raw document text.

    Returns:
        The stemmed tokens, in the order ``word_tokenize`` produced them.
    """
    return [_STEMMER.stem(token) for token in word_tokenize(text)]

In [4]:
# Replace each post's source text with the plain text of its rendered page,
# keeping the original around in `_old_content`.
for post in all_posts:
    # Underscores are swapped for hyphens across the whole path string,
    # exactly as before, to locate the rendered page under ./public.
    html_path = Path(
        str(Path("./public") / post.path.stem / "index.html").replace("_", "-")
    )
    # Explicit UTF-8: the pages contain non-ASCII text, and read_text() also
    # closes the file handle that a bare open(...).read() would leak.
    html_content = html_path.read_text(encoding="utf-8")

    html_tree = BeautifulSoup(html_content, features="html.parser")
    article = html_tree.article
    if article is None:
        raise ValueError(f"No <article> element found in {html_path}")

    # Only back up the source text the first time; re-running this cell must
    # not overwrite the backup with the already-replaced rendered text.
    if getattr(post, "_old_content", None) is None:
        post._old_content = post.content
    post.content = article.text.replace("\n", " ")


In [5]:
# Spot-check: display the text now stored on the second post.
all_posts[1].content

"  The word *Bączek* in Polish  My last name, Bączek, has multiple meanings. Here I've collected my favourites:  Spinning top (the toy that spins) Spins, barrel rolls and other similar aerial maneuvers (in Polish we call that kręcić bączki, which roughly translates to spin a bączek) Fart (which leads to many funny/cringey) situations as people often call me by my surname rather than my name  And the winner goes to:  A diminutive form of the word Bąk (Bumblebee)  Here's a collection of hand-picked Bączek pictures, following this interpretation:   Rabbit bumblebee from @Łukasz    Pug bumblebee from @Amanda. The pug is a rolling joke at SatRev, but that's a story for another time :)    Another one from @Amanda    A drawing done by me or some of my friends long time ago, backed up by @Maja (thanks a lot!)    This monstrosity, from @Olek    Christmas Bączek, thank you @Roksana!   "

In [9]:

# Fixed seed so the 2-D layout (and therefore the related-post ranking derived
# from it) is the same on every Restart & Run All.
UMAP_RANDOM_STATE = 42

# Convert the collection of post texts into a matrix of TF-IDF features,
# tokenizing and stemming with the custom `tokenizer`.
vectorizer = TfidfVectorizer(tokenizer=tokenizer)

# Learn vocabulary and idf, return the term-document matrix.
tfidf = vectorizer.fit_transform([post.content for post in all_posts])

# Array mapping from feature integer indices to feature names.
words = vectorizer.get_feature_names_out()

# Project the TF-IDF rows down to UMAP's default two components.
umap_result = umap.UMAP(random_state=UMAP_RANDOM_STATE).fit_transform(tfidf)

# One embedding row per post, in the same order as `all_posts`. A distinct
# loop name keeps `umap_result` bound to the full array afterwards.
for post, post_embedding in zip(all_posts, umap_result):
    post.x, post.y = post_embedding

# How many nearest posts are recorded as "related" for each post.
RELATED_POSTS_LIMIT = 3

print("Generating related posts...")
for post_index, post in enumerate(all_posts):
    # Rank every *other* post by its distance to the current one. Sorting the
    # indices directly avoids looking posts up again by title, which was
    # quadratic per post and gave duplicate/wrong indices whenever two posts
    # shared a title. `sorted` is stable, so ties keep `all_posts` order.
    nearest_first = sorted(
        (
            candidate_index
            for candidate_index in range(len(all_posts))
            if candidate_index != post_index
        ),
        key=lambda candidate_index: all_posts[candidate_index].distance_to(post),
    )

    post.related_post_ids = nearest_first[:RELATED_POSTS_LIMIT]

# Start from a clean slate so re-running this cell does not pile new counts on
# top of the previous run's already-normalised values.
# NOTE(review): assumes a freshly loaded Post starts this counter at 0.
for post in all_posts:
    post.posts_linking_to_this = 0

# Count, for every post, how many other posts list it as related.
for post in all_posts:
    for post_id in post.related_post_ids:
        all_posts[post_id].posts_linking_to_this += 1

# `default=0` keeps this working when there are no posts at all.
max_num_of_links = max(
    (post.posts_linking_to_this for post in all_posts), default=0
)

# Normalise to the 0..1 range; skipped when nothing links anywhere, which
# would otherwise divide by zero.
if max_num_of_links > 0:
    for post in all_posts:
        post.posts_linking_to_this /= max_num_of_links

# Undirected graph of all posts, laid out with neato and rendered as SVG.
relations_graph = graphviz.Graph(
    comment="All Relations",
    graph_attr={"bgcolor": "transparent", "overlap": "false"},
    format="svg",
    node_attr={"shape": "box"},
    engine="neato",
)

for post in all_posts:
    # Alpha channel derived from the normalised incoming-link share. This line
    # had been commented out while `transparency` was still used below, so the
    # cell raised NameError on a fresh kernel. The share is clamped to 0..1 so
    # the alpha is always a valid two-digit hex value.
    link_share = min(max(post.posts_linking_to_this, 0), 1)
    transparency = f"{int(255 * link_share):02x}"
    color = "#ffffff" + transparency

    relations_graph.node(
        post.title,
        color=color,
        fontcolor="white",
        xlabel="🆕" if post.recently_modified else "",
        URL="/" + post.path.with_suffix("").name,
    )

# For each post, the paths of posts it already shares an edge with, so every
# pair is drawn only once regardless of which side lists the other.
linked_with = {post.path: [] for post in all_posts}

for post in all_posts:
    # Mirror the post's location under content/ into generated/, as .json.
    related_posts_json_path = Path("./generated") / post.path.relative_to(
        "content"
    ).with_suffix(".json")
    related_posts_json_path.parent.mkdir(parents=True, exist_ok=True)

    related_posts_json = []

    for post_id in post.related_post_ids:
        related_post = all_posts[post_id]

        already_linked = (
            related_post.path in linked_with[post.path]
            or post.path in linked_with[related_post.path]
        )
        if not already_linked:
            # NOTE(review): graphviz splits edge endpoint names on ":" into
            # node/port, so a title containing a colon may need escaping.
            relations_graph.edge(post.title, related_post.title, color="white")
            linked_with[post.path].append(related_post.path)
            linked_with[related_post.path].append(post.path)

        # Site-relative URL: path under content/ without the file extension.
        # as_posix() guarantees forward slashes even on Windows.
        post_link = (
            "/" + related_post.path.relative_to("content").with_suffix("").as_posix()
        )

        related_posts_json.append({"title": related_post.title, "url": post_link})

    with open(related_posts_json_path, "w", encoding="utf-8") as relations_file:
        json.dump({"posts": related_posts_json}, relations_file)

# The graph's format is "svg", so this ends up as ./generated/connections.svg.
relations_graph.render("./generated/connections")
print("Done!")



Generating related posts...
Done!
