In [1]:
# !pip install tqdm numpy scikit-learn networkx bs4 matplotlib requests rank_bm25 openai faiss-cpu python-dotenv

In [2]:
from tqdm import tqdm
import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import os

from rank_bm25 import BM25Okapi
from openai import OpenAI
import faiss
from dotenv import load_dotenv

from utils import *



In [3]:
# Numeric ids of the course pages to load; each id is appended to the
# course URL prefix below.
course_numbers = [
    1043, 1045, 1046, 1047, 1048, 1049, 1050, 1054, 1055, 1056,
    1057, 1058, 1059, 1060, 1061, 1062, 1063, 1064, 1065, 1066,
    1067, 1068, 1069, 1221
]

courses = []
certificates = []
certificate_htmls_location = 'certificate_htmls'

# os.listdir() yields entries in arbitrary, platform-dependent order. Sorting
# keeps the position of every certificate (and therefore the document indices
# used by the search index and the order of tied results) stable across runs.
# Certificate is presumably provided by the `utils` star import -- TODO confirm.
for html in tqdm(sorted(os.listdir(certificate_htmls_location))):
    certificate = Certificate(f'{certificate_htmls_location}/{html}')
    certificates.append(certificate)

# One Course object per course id, built from its public URL.
# NOTE(review): Course presumably downloads the page on construction -- confirm.
for n in tqdm(course_numbers):
    new_course = Course(f'https://certification.adobe.com/courses/{n}')
    courses.append(new_course)



  0%|          | 0/54 [00:00<?, ?it/s]

100%|██████████| 54/54 [00:01<00:00, 32.25it/s]
100%|██████████| 24/24 [00:11<00:00,  2.02it/s]


In [4]:
# `documents` holds the text of every source (courses first, then
# certificates); its positions are the document ids used by the search code.
# `doc2source` maps a document's text back to the object it came from.
documents = []
doc2source: dict[str, Source] = {}

for source in [*courses, *certificates]:
    # Render the text once so the list entry and the dict key are guaranteed
    # to be the same string (and the rendering work is not done twice).
    text = source.to_text()
    documents.append(text)
    # NOTE(review): two sources with identical text would share one key here,
    # the later one winning.
    doc2source[text] = source


In [5]:
# Read a local .env file (if any) into the process environment, then pick up
# the OpenAI key from there.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Toy corpus for quick experiments (disabled):
# documents = [
#     "Neural networks are a key part of deep learning.",
#     "BM25 is a ranking function used in search engines.",
#     "Reinforcement learning is used in decision-making.",
#     "Graph neural networks process graph-structured data."
# ]

# --- Keyword search ---
# Plain whitespace tokenisation; queries are split the same way at search time.
tokenized_corpus = [text.split() for text in documents]
bm25 = BM25Okapi(tokenized_corpus)

# --- Semantic search ---
# Client used by get_embedding() for all embedding requests.
client = OpenAI(api_key=openai_api_key)

def get_embedding(text):
    """
    Request an embedding for ``text`` and return it.

    Sends one request through the module-level ``client`` using the
    "text-embedding-ada-002" model and returns the ``embedding`` field of the
    first item in the response's ``data``.

    :param text: Input passed as ``input`` to the embeddings endpoint.
    :return: The embedding of the first (only) returned item.
    """
    response = client.embeddings.create(input=text, model="text-embedding-ada-002")
    first_item = response.data[0]
    return first_item.embedding

# One embedding request per document; row i of `embeddings` belongs to
# documents[i], so FAISS ids line up with positions in `documents`.
embeddings = np.array([get_embedding(doc) for doc in documents])
d = embeddings.shape[1]  # Embedding dimension
index = faiss.IndexFlatL2(d)
# FAISS works on contiguous float32 matrices, while np.array() over Python
# floats produces float64. Convert explicitly instead of relying on the
# implicit conversion that only newer FAISS releases perform.
index.add(np.ascontiguousarray(embeddings, dtype=np.float32))

In [None]:
def rrf_fusion(results, k=30):
    """
    Computes Reciprocal Rank Fusion (RRF) for multiple ranked lists.

    Every document receives ``1 / (k + rank)`` from each method that ranked
    it; the contributions are summed per document.

    :param results: Dict of {method_name: {doc_id: rank_position}}.
    :param k: Smoothing constant added to every rank (default: 30). Larger
        values flatten the difference between high and low ranks.
    :return: List of (document_id, RRF score), highest score first. Documents
        with equal scores keep the order in which they were first seen.
    """
    rrf_scores = {}

    # Method names are not needed for the score, only the rankings.
    for ranked_docs in results.values():
        for doc_id, rank in ranked_docs.items():
            rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + 1 / (k + rank)

    # sorted() is stable, so ties preserve first-seen order.
    return sorted(rrf_scores.items(), key=lambda item: item[1], reverse=True)


def keyword_search(query: str, bm25: BM25Okapi, top_n=10) -> dict[int, int]:
    """
    Rank the corpus against ``query`` with BM25.

    The query is split on whitespace, scored against every document via
    ``bm25.get_scores``, and the ``top_n`` highest-scoring document indices
    are kept.

    :param query: Free-text query.
    :param bm25: BM25 model providing ``get_scores``.
    :param top_n: Maximum number of documents to return.
    :return: Dict of {document_index: rank}, where rank 1 is the best match.
    """
    scores = bm25.get_scores(query.split())

    # Ascending argsort, reversed, gives indices from best to worst score.
    best_first = np.flip(np.argsort(scores))
    shortlist = best_first[:top_n]

    ranking = {}
    for position, doc_idx in enumerate(shortlist, start=1):
        ranking[doc_idx.item()] = position  # .item() -> plain Python int key
    return ranking


def semantic_search(query: str, index: faiss.IndexFlatL2, top_n=10) -> dict[int, int]:
    """
    Perform FAISS vector search and return ranked results.

    The query is embedded with ``get_embedding`` and the ``top_n`` nearest
    vectors in ``index`` are looked up.

    :param query: Free-text query to embed.
    :param index: FAISS index to search.
    :param top_n: Maximum number of neighbours to retrieve.
    :return: Dict of {document_index: rank}, where rank 1 is the closest
        vector. Contains fewer than ``top_n`` entries when the index holds
        fewer vectors.
    """
    # FAISS expects a float32 matrix with one row per query vector.
    query_embedding = np.asarray(get_embedding(query), dtype=np.float32).reshape(1, -1)
    _, vector_top_n = index.search(query_embedding, top_n)  # Retrieve top vector matches

    # FAISS pads the result with -1 when fewer than top_n vectors exist. A -1
    # id must not leak out: used as a list index it silently selects the last
    # document. Padding sits at the end, so real ranks are unaffected.
    return {
        i.item(): rank + 1  # Rank position starts from 1
        for rank, i in enumerate(vector_top_n[0])
        if i >= 0
    }


def hybrid_search(query, bm25: BM25Okapi, index: faiss.IndexFlatL2, top_n=10, k=60):
    """
    Combine keyword and vector search with Reciprocal Rank Fusion.

    Runs ``keyword_search`` and ``semantic_search`` with the same ``top_n``,
    fuses both rankings through ``rrf_fusion`` and resolves each document
    index against the module-level ``documents`` list.

    :param query: Free-text query.
    :param bm25: BM25 model for the keyword side.
    :param index: FAISS index for the vector side.
    :param top_n: Number of candidates requested from each method.
    :param k: RRF constant forwarded to ``rrf_fusion``.
    :return: List of (document_text, fused_score) in the order produced by
        ``rrf_fusion``.
    """
    rankings = {
        "BM25": keyword_search(query, bm25, top_n=top_n),
        "Vector": semantic_search(query, index, top_n=top_n),
    }
    fused = rrf_fusion(rankings, k=k)

    hits = []
    for doc_idx, fused_score in fused:
        hits.append((documents[doc_idx], fused_score))
    return hits


# Example query run through the hybrid (BM25 + vector) pipeline.
query = "What courses are good for beginner programmers?"
results = hybrid_search(query, bm25, index)

# Print one line per hit: position, the source's display name, its type and
# the fused score. The source object is recovered from the document text.
for rank, hit in enumerate(results, start=1):
    doc, score = hit
    source = doc2source[doc]
    print(f"{rank}. { source.display} (type: {source.type}) (Score: {score:.4f})")


1. Adobe Campaign Classic Developer Professional (Type: certificate) (Score: 0.0296)
2. Adobe Experience Manager Foundations (Type: course) (Score: 0.0294)
3. Adobe Workfront Project Manager Professional (Type: certificate) (Score: 0.0164)
4. Adobe Experience Manager Technical Foundations Professional (Type: certificate) (Score: 0.0164)
5. Adobe Workfront Core Developer Expert (Type: certificate) (Score: 0.0161)
6. Adobe Captivate Certified Professional (Type: certificate) (Score: 0.0161)
7. Adobe Advertising DSP Business Practitioner Professional (Type: certificate) (Score: 0.0159)
8. Adobe Commerce Front-End Developer Expert (NEW) (Type: certificate) (Score: 0.0159)
9. Adobe Commerce for Developers - Professional (Type: course) (Score: 0.0156)
10. Adobe Experience Manager Sites Developer Professional (NEW) (Type: certificate) (Score: 0.0156)
11. Adobe Workfront for Experience Manager Enhanced Connector Expert (Type: certificate) (Score: 0.0154)
12. Adobe Campaign Classic Developer Ex