## Data

#### Imports

In [28]:

from pprint import pprint
from uuid import uuid4

from tqdm import tqdm

from datetime import datetime


import os
import sys
import re
import pypandoc


from dotenv import load_dotenv


# for accesjng github api
from github import Github, Auth, ContentFile

# markdown parsing to HTML and reSt to md
from rst2gfm import convert_rst_to_md
import mistletoe

# for parsing HTML to extract links and names
import lxml
from bs4 import BeautifulSoup

# langchain for the RAG pipeline
from langchain_text_splitters import MarkdownTextSplitter, RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage

from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI

import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy


load_dotenv()

True

#### Globals

In [None]:
# Context Enrichment / Summarization
OLLAMA_LANGUAGE_MODEL = "gemma3:270m"
# "gemma3:270m"
# "qwen3:1.7b"
# "qwen3:0.6b"
# "qwen2.5:0.5b"
# "gemma3:1b-it-qat"
LANGUAGE_MODEL_TEMPERATURE = 0.1
LANGUAGE_MODEL_REASONING = False

# Embedding
OLLAMA_EMBEDDING_MODEL = "embeddinggemma:300m"
TOKEN_CONTEXT_LENGTH = 1024
DOCUMENT_PREFIX = f"title: none | text: "
DOCUMENT_PREFIX_WITH_TITLE = lambda x : f"title: {x if x is not None else 'none'} | text: "
QUERY_PREFIX = "task: search result | query: "

#### Auth

In [30]:
gh_token = os.getenv("GITHUB_TOKEN") 

auth = Auth.Token(token = gh_token) # type: ignore

gh = Github(auth=auth)

print(f"Logged in as: {gh.get_user().login}")

Logged in as: VladBonciu


#### Link Extraction

In [31]:

awesome_lists = \
[
    "vinta/awesome-python",
    "josephmisiti/awesome-machine-learning",
    "sorrycc/awesome-javascript",
    "sindresorhus/awesome-nodejs",
    "enaqx/awesome-react",
    "vuejs/awesome-vue",
    "nepaul/awesome-web-development",
    "sbilly/awesome-security",
    "oxnr/awesome-bigdata",
    # "JStumpp/awesome-android",
    # "vsouza/awesome-ios",
    # "gztchan/awesome-design",
    "wmariuss/awesome-devops",
    "Kiloreux/awesome-robotics",
    "awesome-selfhosted/awesome-selfhosted",
    "fffaraz/awesome-cpp",
    "agarrharr/awesome-cli-apps"
]

structured = {}

all_links = []

for route in awesome_lists:

    repo = gh.get_repo(route)

    print("Extracting links from:", repo.name)

    file_content = repo.get_readme() # type: ignore

    # extract the raw markdown of the awesome list readme
    readme_content = file_content.decoded_content.decode("utf-8")  # type: ignore

    # parse it into a html formatted string
    rendered = mistletoe.markdown(readme_content)

    # parse the html string  using bs
    soup = BeautifulSoup(rendered, features='lxml')

    # repos extraction process
    tech_lists = soup.select("ul")

    dict_tech_lists = {}

    for tech_list in tech_lists:
        
        links = []

        try:

            # (sub)title of list
            subtitle = tech_list.find_all_previous()[0].text.splitlines()[0]

            for item in tech_list.select("li"):
                
                link : str = item.a.get("href", None)  # type: ignore
                
                if link.startswith("#"):
                    continue

                # im currently ignoring the other "awesomez-..." lists featured
                if "awesome-" in link:
                    # print("\t---",link.replace("https://github.com/", ""), "---")
                    continue

                if link.startswith("https://github.com/"):
                    
                    if link.endswith("/"):
                        link = link[:-1]

                    link = link.replace("https://github.com/", "")

                    link = "/".join(link.split("/")[:2])

                    if link.count("/") != 1:
                        continue

                    if link in all_links:
                        continue

                    links.append(link)
                    all_links.append(link)


        except:

            continue


        if len(links) > 0:

            # remove duplicates
            links = list(dict.fromkeys(links))

            dict_tech_lists.update({subtitle : links})

            # print(subtitle)
            # pprint(links)

    # pprint(dict_tech_lists)

    structured.update({repo.name : dict_tech_lists})

print("Done!")

print(len(all_links))
        

Extracting links from: awesome-python
Extracting links from: awesome-machine-learning
Extracting links from: awesome-javascript
Extracting links from: awesome-nodejs
Extracting links from: awesome-react
Extracting links from: awesome-vue
Extracting links from: awesome-web-development
Extracting links from: awesome-security
Extracting links from: awesome-bigdata
Extracting links from: awesome-devops
Extracting links from: awesome-robotics
Extracting links from: awesome-selfhosted
Extracting links from: awesome-cpp
Extracting links from: awesome-cli-apps
Done!
4994


In [32]:
# pprint(structured, sort_dicts=False)

gh.rate_limiting
# datetime.strptime(str(gh.rate_limiting_resettime), '%Y%m%d') 
print(gh.rate_limiting_resettime)

datetime.fromtimestamp(gh.rate_limiting_resettime)

1767972090


datetime.datetime(2026, 1, 9, 17, 21, 30)

#### Repo Parsing and Processing

In [33]:
def get_clean_md(readme_file):

    name = readme_file.name.lower()

    content = readme_file.decoded_content.decode("utf-8")
    
    # strip YAML front matter (if exists)
    content = re.sub(r'^---\s*\n.*?\n---\s*\n', '', content, flags=re.DOTALL)

    # find format for pandoc
    ext_map = \
    {
        '.rst': 'rst',
        '.adoc': 'asciidoc',
        '.asciidoc': 'asciidoc',
        '.textile': 'textile',
        '.org': 'org',
        '.md': 'markdown',
        '.markdown': 'markdown'
    }
    
    # get extension, default to 'markdown'
    from_format = 'markdown'
    for ext, fmt in ext_map.items():
        if name.endswith(ext):
            from_format = fmt
            break

    # convert anything to md first
    try:
        raw_md = pypandoc.convert_text(content, 'gfm-raw_html', format=from_format)
    except:
        # fallback 
        raw_md = content 

    # remove Images: ![alt](url)
    clean_md = re.sub(r'!\[.*?\]\(.*?\)', r' ', raw_md)

    # convert links to plain text: [Text](url) -> Text
    clean_md = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', clean_md)

    # remove reST artifacts that Pandoc might have left over
    clean_md = re.sub(r'(?m)^\s*(\.\.\ |:)(image|target|alt|align|width|height):.*$', '', clean_md)
    
    # Remove any leftover unknown HTML tags (like <details> or <div>)
    clean_md = re.sub(r'<[^>]+>', '', clean_md)

    # this finds lines with whitespace and turns them into true empty lines
    clean_md = re.sub(r'(?m)^[ \t]+$', '', clean_md) 
    clean_md = re.sub(r'\n{3,}', '\n\n', clean_md)
    
    return clean_md.strip()

In [34]:
import json
import pickle
from pathlib import Path

# simple checkpoint system
CHECKPOINT_DIR = Path("checkpoints")
CHECKPOINT_DIR.mkdir(exist_ok=True)

def load_progress():
    """Load which repos were already processed."""
    checkpoint_file = CHECKPOINT_DIR / "processed_repos.json"
    if checkpoint_file.exists():
        with open(checkpoint_file, 'r') as f:
            return set(json.load(f))
    return set()

def save_progress(processed_repos):
    """Save which repos have been processed."""
    checkpoint_file = CHECKPOINT_DIR / "processed_repos.json"
    with open(checkpoint_file, 'w') as f:
        json.dump(list(processed_repos), f)

def load_documents():
    """Load previously processed documents."""
    summary_file = CHECKPOINT_DIR / "summary_docs.pkl"
    docs_file = CHECKPOINT_DIR / "docs.pkl"
    
    summary_docs = []
    docs = []
    
    if summary_file.exists():
        with open(summary_file, 'rb') as f:
            summary_docs = pickle.load(f)
    
    if docs_file.exists():
        with open(docs_file, 'rb') as f:
            docs = pickle.load(f)
    
    return summary_docs, docs

def save_documents(summary_docs, docs):
    """Save processed documents."""
    with open(CHECKPOINT_DIR / "summary_docs.pkl", 'wb') as f:
        pickle.dump(summary_docs, f)
    
    with open(CHECKPOINT_DIR / "docs.pkl", 'wb') as f:
        pickle.dump(docs, f)

# Main processing code with checkpoints
text_splitter = MarkdownTextSplitter\
(
    chunk_size=TOKEN_CONTEXT_LENGTH - 80, # -80 ish tokens for the embedding prefix and the summarization of the repo
    chunk_overlap=128,
)



In [35]:
# Load previous progress
processed_repos = load_progress()
summary_docs, docs = load_documents()

print(f"Already processed: {len(processed_repos)} repos")
print(f"Loaded: {len(summary_docs)} summary docs, {len(docs)} repo docs")

llm = ChatOllama \
(
    model= OLLAMA_LANGUAGE_MODEL,
    temperature = LANGUAGE_MODEL_TEMPERATURE,
    reasoning = LANGUAGE_MODEL_REASONING,
    num_predict = 2000,
    keep_alive="24h"
)

save_counter = 0
SAVE_EVERY = 10  # Save progress every 10 repos



for awesome_list_name, sublists_dict in structured.items():

    print(f"\n --- {awesome_list_name} --- \n")

    for sublist_name, repo_list in sublists_dict.items():
        
        print(f"* {sublist_name}")

        for repo_path in repo_list:

            if repo_path in processed_repos:
                continue

            try:

                repo = gh.get_repo(repo_path)
                
                readme_file = repo.get_readme() # type: ignore

                md_content = get_clean_md(readme_file=readme_file)

                # cut the md content to the first 8000 chars
                md_content = md_content[:8000]

                # initial chunks of repo
                repo_chunks = text_splitter.split_text(md_content)

                # summarization llm
                llm_reponse = llm.invoke \
                (
                    input = \
                    [
                        # SystemMessage("You are an expert software engineer, with afinities for discovering new technologies."),
                        HumanMessage(f"""Write a one-paragraph summary (6-7 sentences) of the repository content provided below. 

STRICT RULES:
1. NO PREAMBLE. Do not say "Here is" or "The text is". 
2. FIRST SENTENCE MUST BE: [Name] is a [Type] that [Z], built in [Language].
3. Output the one paragraph ONLY. (no enters / other markdown)

---
CONTENT:
{md_content}
---
SUMMARY: """),
AIMessage(f"{repo.name} is")
                    ]
                )

                # special summary doc
                summary_doc = Document \
                (
                    page_content = str(DOCUMENT_PREFIX_WITH_TITLE(repo.name) + f"{repo.name} is" + str(llm_reponse.content)),
                    metadata = \
                    { 
                        "awesome_list_origin" : awesome_list_name ,
                        "sublist_origin" : sublist_name , 
                        "repo_origin" : repo_path, 
                        "repo_name" : repo.name, 
                        "repo_description" : repo.description,
                        "repo_star_count" : repo.stargazers_count, 
                        "repo_fork_count" : repo.forks_count,
                        "repo_license_name" : repo.license.name if repo.license is not None else "-",
                        "repo_last_update_date" : repo.updated_at,
                        "repo_archived" : repo.archived
                    }
                )

                summary_docs.append(summary_doc)

                # only use the first 10 chunks , as if there were more chunks for each then  there would be a disproportionate amount of chunks for some repos
                repo_chunks = repo_chunks[:10]

                # pprint(repo_chunks, width=1000)

                # context enrichment on the  repo chunks for better retrieval
                repo_chunks = [ f"{DOCUMENT_PREFIX_WITH_TITLE(repo.name)}[{repo.name} is{str(llm_reponse.content).split(sep ='.')[0]}]\n{original_chunk}" for original_chunk in repo_chunks ]

                # print(summary_doc.page_content)
                # print(summary_doc.metadata)

                repo_docs = text_splitter.create_documents\
                (
                    texts = repo_chunks, 
                    metadatas = list(
                    { 
                        "awesome_list_origin" : awesome_list_name ,
                        "sublist_origin" : sublist_name , 
                        "repo_origin" : repo_path, 
                        "repo_name" : repo.name, 
                        "repo_description" : repo.description,
                        "repo_star_count" : repo.stargazers_count, 
                        "repo_fork_count" : repo.forks_count,
                        "repo_license_name" : repo.license.name if repo.license is not None else "-",
                        "repo_license_link" : repo.license.url if repo.license is not None else "-",
                        "repo_last_update_date" : repo.updated_at,
                        "repo_archived" : repo.archived
                    } for i in range(len(repo_chunks)))
                )



                docs.extend(repo_docs)

                processed_repos.add(repo_path)

                # Save progress periodically
                save_counter += 1
                if save_counter % SAVE_EVERY == 0:
                    save_progress(processed_repos)
                    save_documents(summary_docs, docs)
                    print(f" üíæ Progress saved ({len(processed_repos)} repos)")

                print(f" - {repo_path} ({len(repo_chunks)})")

                # sys.exit(0)
            
            except KeyboardInterrupt:
                print("\n‚ö†Ô∏è  Interrupted! Saving progress...")
                save_progress(processed_repos)
                save_documents(summary_docs, docs)
                print(f"üíæ Progress saved. Processed: {len(processed_repos)} repos")
                print("You can resume by running this cell again.")
                raise
                
            except Exception as e:

                print(f" x {repo_path}: {str(e)[:50]}")
                processed_repos.add(repo_path)  # Mark as processed even if failed
                continue
            

pprint(docs)        

Already processed: 475 repos
Loaded: 483 summary docs, 4315 repo docs

 --- awesome-python --- 

* Libraries for administrative interfaces.
 - unfoldadmin/django-unfold (8)
 - offerrall/FuncToWeb (8)
* awesome-algorithms
* ASGI

‚ö†Ô∏è  Interrupted! Saving progress...
üíæ Progress saved. Processed: 477 repos
You can resume by running this cell again.


KeyboardInterrupt: 

In [None]:
# summary_docs, docs = load_documents()


# pprint(len(summary_docs))

# lista  = [doc.metadata['repo_origin'] for doc in summary_docs]

# save_progress(lista)

434


In [46]:
summary_docs, docs = load_documents()

In [47]:
summary_docs, docs = load_documents()
pprint(len(summary_docs))
pprint(len(docs))

485
4339


#### Embedding and Saving Documents

In [48]:
embeddings = OllamaEmbeddings \
(
    model=OLLAMA_EMBEDDING_MODEL,
    num_ctx=TOKEN_CONTEXT_LENGTH,
    mirostat_eta = 0.1,
    mirostat_tau = 5.0,
    mirostat = 0,
    tfs_z=1.0
)

In [49]:
batch_size = 10
db_repo = None
db_summary = None

In [50]:

for i in tqdm(range(0, len(summary_docs), batch_size), desc="Indexing Summary Documents"):

    batch = summary_docs[i : i + batch_size]
    
    if db_summary is None:
        # Create the store with the first batch
        db_summary = FAISS.from_documents(batch, embeddings)
    else:
        # Add subsequent batches
        db_summary.add_documents(batch)

db_summary.save_local("data/summary")  # type: ignore

for i in tqdm(range(0, len(docs), batch_size), desc="Indexing Repo Documents"):

    batch = docs[i : i + batch_size]
    
    if db_repo is None:
        # Create the store with the first batch
        db_repo = FAISS.from_documents(batch, embeddings)
    else:
        # Add subsequent batches
        db_repo.add_documents(batch)

db_repo.save_local("data/repo")  # type: ignore

Indexing Summary Documents: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 49/49 [01:24<00:00,  1.73s/it]
Indexing Repo Documents: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 434/434 [15:58<00:00,  2.21s/it]


In [None]:
db_repo = FAISS.load_local("data/repo", embeddings, allow_dangerous_deserialization=True)
db_summary = FAISS.load_local("data/summary", embeddings, allow_dangerous_deserialization=True)

In [None]:
# db_repo.save_local("data/repo")  # type: ignore
# db_summary.save_local("data/summary")  # type: ignore

In [53]:
retriever_repo = db_repo.as_retriever()
retriever_summary = db_summary.as_retriever()

# pprint(list((doc.metadata.get('awesome_list_origin'), doc.metadata.get('repo_name'), doc.page_content) for doc in db.max_marginal_relevance_search("fast web app", k = 100)), width=10000)

# pprint(list(f"{doc.metadata.get('repo_name')} ; {val} : {doc.page_content[:200]} " for (doc, val) in db_repo.similarity_search_with_score(QUERY_PREFIX + "python content management system", k=100)), width=1000)
pprint(list(f"{doc.metadata.get('repo_name')} ; {val} : {doc.page_content.splitlines()[0]} " for (doc, val) in db_summary.similarity_search_with_score(QUERY_PREFIX + " web app builder", k=100)), width=1000)

# pprint(db.max_marginal_relevance_search("fast web app", k = 100))

['streamlit ; 0.9404366612434387 : title: streamlit | text: streamlit is a fast, easy-to-use Python web app builder that allows users to easily create interactive data apps. It offers a simple and Pythonic approach to data visualization and collaboration, with a vibrant community for support and inspiration. ',
 'pybuilder ; 1.092275857925415 : title: pybuilder | text: pybuilder is a Python build automation tool designed for easy and efficient Python development. It leverages dependency-based programming principles to build build life cycles similar to those found in other popular build tools. PyBuilder offers a powerful plugin mechanism for creating build life cycles similar to those used in Java build tools, allowing for more complex and scalable build processes. The tool is available on the Python 3.9, 3.10, 3.11, 3.12, 3.13, and PyPy 3.8, 3.9, and 3.10 versions. The tool is available via GitHub Actions Workflow, and the PyPI for more information is available at [https://github.com/

In [None]:
# retriever = vector_store.as_retriever()

In [None]:
# ddocs = retriever.

In [None]:
# # pprint(list(x.previous_siblings for x in soup.select("ul")))
# # pprint(list(zip((x.previous_sibling for x in soup.select("ul")), soup.select("ul"))))




# tech_lists = soup.select("ul")

# dict_tech_lists = {}

# for tech_list in tech_lists:
    
#     links = []

#     try:

#         # (sub)title of list
#         subtitle = tech_list.find_all_previous()[0].text.splitlines()[0]

#         for item in tech_list.select("li"):
            
#             link : str = item.a.get("href", None)  # type: ignore
            
#             if link.startswith("#"):
#                 continue

#             # im currently ignoring the other "awesomez-..." lists featured
#             if "awesome-" in link:
#                 # print("\t---",link.replace("https://github.com/", ""), "---")
#                 continue

#             if link.startswith("https://github.com/"):
                
#                 if link.endswith("/"):
#                     link = link[:-1]

#                 link = link.replace("https://github.com/", "")


#                 links.append(link)

#     except:

#         continue

#     if len(links) > 0:

#         dict_tech_lists.update({subtitle : links})

#         # print(subtitle)
#         # pprint(links)

# pprint(dict_tech_lists)

