In [13]:
from sentence_transformers import SentenceTransformer, util
import os
import gzip
import json
import numpy as np

### Question A

In [14]:
filepath = "data/simplewiki-2020-11-01.jsonl.gz"
if not os.path.exists(filepath):
    util.http_get("http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz", filepath)

In [15]:
# Initialize containers for collection
articles = []
paragraphs = []
sentences = []

# Read and process the dataset
with gzip.open(filepath, "rt", encoding="utf8") as f:
    for line in f:
        data = json.loads(line)
        article_id = data["id"]
        article_title = data["title"]

        # Store article-level information
        articles.append({"id": article_id, "title": article_title})

        for i, paragraph_text in enumerate(data["paragraphs"]):
            paragraph_id = f"{article_id}:{i}"

            # Store paragraph-level information
            paragraphs.append(
                {"article_id": article_id, "id": paragraph_id, "text": paragraph_text}
            )

            # Split paragraph into sentences
            paragraph_sentences = paragraph_text.split(".")

            for j, sentence_text in enumerate(paragraph_sentences):
                sentence_id = f"{paragraph_id}:{j}"

                # Store sentence-level information
                sentences.append(
                    {
                        "paragraph_id": paragraph_id,
                        "id": sentence_id,
                        "text": sentence_text.strip(),
                    }
                )

# Output sample data for verification
print(
    f"Loaded {len(articles)} articles, {len(paragraphs)} paragraphs, and {len(sentences)} sentences."
)
print("Sample article:", articles[0] if articles else "No articles found.")
print("Sample paragraph:", paragraphs[0] if paragraphs else "No paragraphs found.")
print("Sample sentence:", sentences[0] if sentences else "No sentences found.")

Loaded 169597 articles, 509663 paragraphs, and 2025547 sentences.
Sample article: {'id': '9822', 'title': 'Ted Cassidy'}
Sample paragraph: {'article_id': '9822', 'id': '9822:0', 'text': 'Ted Cassidy (July 31, 1932 - January 16, 1979) was an American actor. He was best known for his roles as Lurch and Thing on "The Addams Family".'}
Sample sentence: {'paragraph_id': '9822:0', 'id': '9822:0:0', 'text': 'Ted Cassidy (July 31, 1932 - January 16, 1979) was an American actor'}


### Question B

In [16]:
# Load the sentence transformer model
model_name = "all-MiniLM-L6-v2"  # Choose 'all-mpnet-base-v2' for best quality
model = SentenceTransformer(model_name)

# Encode sentences
sentence_texts = [s["text"] for s in sentences]
sentence_embeddings = model.encode(
    sentence_texts, convert_to_numpy=True, show_progress_bar=True
)

# Check embedding dimensions
embedding_dim = sentence_embeddings.shape[1]
print(
    f"Encoded {len(sentence_embeddings)} sentences with embedding dimension: {embedding_dim}"
)

Batches: 100%|██████████| 63299/63299 [19:17<00:00, 54.71it/s]  


Encoded 2025547 sentences with embedding dimension: 384


### Question C

In [17]:
# Semantic search example
def find_similar_sentences(query, top_k=5):
    query_embedding = model.encode(query, convert_to_numpy=True)
    hits = util.semantic_search(query_embedding, sentence_embeddings, top_k=top_k)
    results = [
        {
            "sentence": sentence_texts[hit["corpus_id"]],
            "paragraph_id": sentences[hit["corpus_id"]]["paragraph_id"],
            "score": hit["score"],
        }
        for hit in hits[0]
    ]
    return results


# Generate prompt
def generate_prompt(query, context):
    instruction = "You are a question answering bot. Use the context below to answer the user's question."
    prompt = f"{instruction}\n\nContext:\n{context}\n\nQuestion:\n{query}"
    return prompt

### Question D

In [18]:
# Expand context to include neighboring paragraphs
def expand_context(paragraph_id):
    context = []
    article_id, paragraph_idx = paragraph_id.split(":")
    paragraph_idx = int(paragraph_idx)

    for offset in [-1, 0, 1]:
        neighbor_idx = paragraph_idx + offset
        neighbor_id = f"{article_id}:{neighbor_idx}"
        neighbor_paragraph = next(
            (p["text"] for p in paragraphs if p["id"] == neighbor_id), None
        )
        if neighbor_paragraph:
            context.append(neighbor_paragraph)

    return "\n\n".join(context)


# Combine similar paragraphs and expand context
def construct_context(similar_sentences):
    paragraph_ids = set()
    context = []
    for result in similar_sentences:
        if result["paragraph_id"] not in paragraph_ids:
            paragraph_ids.add(result["paragraph_id"])
            expanded_context = expand_context(result["paragraph_id"])
            context.append(expanded_context)
    return "\n\n".join(context)

In [19]:
# Example usage
query = "What is the capital of France?"
similar_sentences = find_similar_sentences(query)
context = construct_context(similar_sentences)
prompt = generate_prompt(query, context)

print("Generated prompt:")
print(prompt)

# Optionally, save embeddings and related data for later use
np.save("data/sentence_embeddings.npy", sentence_embeddings)
print("Saved sentence embeddings to data/sentence_embeddings.npy")

Generated prompt:
You are a question answering bot. Use the context below to answer the user's question.

Context:
Île-de-France is a region of France. The capital city is Paris. It is also the capital city of France. In 2013 about 12 million people lived in the region. About 2.1 million people live in the city of Paris.

There are 8 departments in the region. They are:

The capital of France is Paris. In the course of history, the national capital has been in many locations other than Paris.

During the Protestant Reformation, a huge massacre of French Protestants started there in 1572, called the Saint Bartholomew Day Massacre. Paris saw many other troubles over the years of the ""Ancien Régime"" (Old Kingdom), then in 1789, the French Revolution began in Paris, leading to more massacres.

Paris was the Capital of the French Empire which, as well as France, covered Spain, Belgium, Holland, Luxembourg, Switzerland, Italy, most of Germany and some of Austria, Croatia, Slovenia and Pola