In [None]:
import os, bz2, json, tarfile
from pathlib import Path

def chunk_text(text, chunk_size=200, overlap=50):
    """Split text into overlapping, word-based chunks.

    The text is split on whitespace and a window of ``chunk_size`` words is
    slid across it in steps of ``chunk_size - overlap`` words. Chunking stops
    as soon as a window reaches the end of the text, so the last chunk is
    never a pure repetition of the tail of the previous one.

    Args:
        text: Input string.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must be
            smaller than ``chunk_size`` so the window always advances.

    Returns:
        List of chunk strings (words re-joined with single spaces). Empty or
        whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size`` (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # This window already covers the last word; another chunk would only
        # repeat words that are present in this one.
        if end >= len(words):
            break
    return chunks

# Output file: receives one JSON chunk record per line
OUT = "wiki_chunks.jsonl"

# Upper bound on the number of articles read from the dump.
# Use None to process the entire dump. Rough guidance (approximate figures):
#   - quick testing: 100 articles (~30-50 chunks)
#   - development:   1000 articles (~300-500 chunks)
#   - production:    10000+ articles, or None for the full dump
MAX_ARTICLES = 500  # 500 gives a reasonably sized test run

# The dump is read straight from the tar.bz2 archive; no extracted
# directory is expected on disk.
WIKI_PATH = Path("../data/raw/enwiki-20171001-pages-meta-current-withlinks-abstracts.tar.bz2")

def wiki_json_generator(max_articles=None):
    """
    Yield Wikipedia articles read directly from the tar.bz2 archive at WIKI_PATH.

    Every regular ``.bz2`` member of the archive is decompressed in memory and
    parsed as JSON Lines (one JSON object per line). Members are visited
    lazily, in archive order, so a small ``max_articles`` does not force a
    scan of the whole dump before the first article is produced.

    Args:
        max_articles: Maximum number of articles to yield. ``None`` (or 0)
            means no limit. Only lines that parse successfully count
            towards the limit.

    Yields:
        The decoded JSON value of each non-blank, well-formed line.

    Notes:
        Members that cannot be extracted, decompressed or decoded as UTF-8
        are reported on stdout and skipped. Malformed JSON lines are skipped
        silently.
    """
    print(f"Opening Wikipedia dump: {WIKI_PATH}")
    article_count = 0

    with tarfile.open(WIKI_PATH, 'r:bz2') as tar:
        # Iterate lazily instead of calling tar.getmembers(): listing every
        # member up front requires decompressing the entire archive.
        for member in tar:
            if max_articles and article_count >= max_articles:
                print(f"Reached article limit: {max_articles}")
                break

            # Only regular files with a .bz2 suffix hold article data
            if not (member.isfile() and member.name.endswith('.bz2')):
                continue

            # Best effort per member: one unreadable file must not abort the run
            try:
                f = tar.extractfile(member)
                if f is None:
                    continue
                with f:
                    decompressed = bz2.decompress(f.read()).decode('utf-8')
            except Exception as e:
                print(f"Error processing {member.name}: {e}")
                continue

            # Each line is a separate JSON object
            for line in decompressed.strip().split('\n'):
                if not line.strip():
                    continue

                if max_articles and article_count >= max_articles:
                    break

                try:
                    article = json.loads(line)
                except json.JSONDecodeError:
                    continue

                # Count only after a successful parse so that malformed
                # lines do not use up the article limit.
                article_count += 1
                yield article

# Driver: stream articles from the dump, chunk each one, and write the
# chunks to OUT as JSON Lines.
print("Starting to process Wikipedia articles and create chunks...")
print(f"Article limit: {MAX_ARTICLES if MAX_ARTICLES else 'None (all articles)'}")
chunk_count = 0

with open(OUT, "w") as out:
    for art in wiki_json_generator(max_articles=MAX_ARTICLES):
        title = art.get("title", "")

        # The dump stores the text as a list of sentences; any other value
        # is simply converted to its string form.
        text_list = art.get("text", [])
        text = ' '.join(text_list) if isinstance(text_list, list) else str(text_list)

        # Drop articles shorter than 50 words (empty text counts as 0 words)
        if len(text.split()) < 50:
            continue

        chunks = chunk_text(text, chunk_size=200, overlap=50)
        for i, ch in enumerate(chunks):
            # One record per line; the id is the title plus the chunk index
            print(
                json.dumps({
                    "id": f"{title}_{i}",
                    "title": title,
                    "text": ch
                }),
                file=out,
            )
            chunk_count += 1

            # Report progress after every 50th chunk
            if not chunk_count % 50:
                print(f"Processed {chunk_count} chunks...")

print(f"\n✅ Done! Created {chunk_count} chunks in {OUT}")

# Preview the first record that was written
if chunk_count > 0:
    with open(OUT, "r") as f:
        sample = json.loads(f.readline())
        print(f"\nSample chunk:")
        print(f"  Title: {sample['title']}")
        print(f"  Text: {sample['text'][:150]}...")

In [None]:
import os
from mistralai import Mistral
import json
import numpy as np
from tqdm import tqdm
import time

# Initialize Mistral client (raises KeyError if MISTRAL_API_KEY is not set)
client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])

embeddings = []
metadatas = []

# Checkpoint interval: a cumulative backup of all vectors collected so far is
# written after every BATCH_SIZE successfully embedded chunks. API requests
# themselves still carry one chunk each.
BATCH_SIZE = 100
batch_num = 0

# Attempts per chunk before it is given up on (covers transient errors such
# as rate limiting)
MAX_RETRIES = 4

print("Loading chunks from wiki_chunks.jsonl...")

# Count total chunks first so the progress bar has a known length
with open("wiki_chunks.jsonl", "r") as f:
    total_chunks = sum(1 for _ in f)

print(f"Total chunks to embed: {total_chunks}")

try:
    with open("wiki_chunks.jsonl", "r") as f:
        for idx, line in enumerate(tqdm(f, desc="Embedding chunks", total=total_chunks)):
            obj = json.loads(line)
            text = obj["text"]

            # Retry with exponential backoff instead of dropping the chunk
            # on the first failure.
            vector = None
            for attempt in range(1, MAX_RETRIES + 1):
                try:
                    # Call Mistral API (note: parameter is 'inputs' not 'input')
                    resp = client.embeddings.create(
                        model="mistral-embed",
                        inputs=[text]
                    )
                    vector = resp.data[0].embedding
                    break
                except Exception as e:
                    print(f"\nError embedding chunk {idx}: {e}")
                    # Wait before the next request in case of rate limit:
                    # 1s, 2s, 4s, ...
                    time.sleep(2 ** (attempt - 1))

            if vector is None:
                print(f"⚠️ Skipping chunk {idx} after {MAX_RETRIES} failed attempts")
                continue

            embeddings.append(np.array(vector, dtype="float32"))
            metadatas.append({
                "id": obj["id"],
                "title": obj["title"],
                "text": obj["text"]
            })

            # Checkpoint on the number of vectors actually collected, so a
            # failed chunk can never cause a checkpoint to be skipped.
            if len(embeddings) % BATCH_SIZE == 0:
                batch_num += 1
                try:
                    np.save(f"wiki_vectors_batch{batch_num}.npy", np.vstack(embeddings))
                    print(f"\n✅ Saved batch {batch_num} ({len(embeddings)} vectors)")
                except OSError as e:
                    # A failed backup must not abort the embedding run
                    print(f"\n⚠️ Could not save batch {batch_num}: {e}")

except KeyboardInterrupt:
    # Fall through so everything embedded so far is still saved below
    print("\n⚠️ Processing interrupted by user")

# Save final results
if embeddings:
    print(f"\nSaving final results...")
    # Stack once and reuse for both saving and the statistics
    final_matrix = np.vstack(embeddings)
    np.save("wiki_vectors.npy", final_matrix)
    print(f"✅ Saved {len(embeddings)} embeddings to wiki_vectors.npy")

    # Metadata rows are written in the same order as the vector rows
    with open("wiki_meta.jsonl", "w") as out:
        for m in metadatas:
            out.write(json.dumps(m) + "\n")
    print(f"✅ Saved metadata to wiki_meta.jsonl")

    print(f"\nFinal statistics:")
    print(f"  - Total embeddings: {len(embeddings)}")
    print(f"  - Embedding dimension: {final_matrix.shape[1]}")
    print(f"  - Total size: {final_matrix.nbytes / (1024**2):.2f} MB")
else:
    print("⚠️ No embeddings were created")

In [None]:
import numpy as np

# Make sure FAISS is importable; if it is missing, install faiss-cpu on the fly
try:
    import faiss
    print("✅ FAISS is installed")
except ImportError:
    print("❌ FAISS not found. Installing faiss-cpu...")
    import subprocess
    import sys
    # Run pip through the interpreter that executes this notebook; a bare
    # "pip" on PATH may belong to a different Python environment, in which
    # case the import below would still fail.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "faiss-cpu", "-q"])
    import faiss
    print("✅ FAISS installed successfully")

# Load vectors
print("\nLoading embeddings...")
vectors = np.load("wiki_vectors.npy")
print(f"✅ Loaded {vectors.shape[0]} vectors with {vectors.shape[1]} dimensions")

d = vectors.shape[1]   # Expected to be 1024 for mistral-embed (not verified here)

# Create FAISS index for cosine similarity
print("\nCreating FAISS index...")
index = faiss.IndexFlatIP(d)  # Inner product; equals cosine on unit-length vectors

# Normalize vectors in place so that inner product == cosine similarity
print("Normalizing vectors...")
faiss.normalize_L2(vectors)

# Add vectors to index
print("Adding vectors to index...")
index.add(vectors)

# Save index to disk
print("Saving index...")
faiss.write_index(index, "wiki_faiss.index")

print(f"\n✅ FAISS index created successfully!")
print(f"   - Index size: {index.ntotal:,} vectors")
print(f"   - Dimensions: {d}")
print(f"   - Index type: Flat (exact search)")
print(f"   - Saved to: wiki_faiss.index")

In [None]:
import os
from mistralai import Mistral
import faiss
import numpy as np
import json

# Create the Mistral API client; the key is read from the environment
client = Mistral(api_key=os.environ["MISTRAL_API_KEY"])

# Restore the FAISS index from disk
print("Loading FAISS index...")
index = faiss.read_index("wiki_faiss.index")
print(f"✅ Loaded index with {index.ntotal} vectors")

# Read the chunk metadata: one JSON object per line of the file
print("Loading metadata...")
with open("wiki_meta.jsonl") as f:
    metas = list(map(json.loads, f))
print(f"✅ Loaded {len(metas)} metadata entries")

def dense_retrieval(query, k=5):
    """
    Retrieve top-k most relevant chunks for a query using dense retrieval.

    The query is embedded with the ``mistral-embed`` model, L2-normalized and
    searched against the module-level FAISS ``index``. Hits are mapped to
    entries of the module-level ``metas`` list by position.

    Args:
        query: Query string
        k: Number of results to return

    Returns:
        List of metadata dictionaries (copies, each with an added ``score``
        key holding the similarity as a float), best match first. The list
        has fewer than ``k`` entries when the index holds fewer than ``k``
        vectors, and is empty when ``k`` is not positive.
    """
    # Nothing to retrieve; also avoids a pointless embedding request
    if k <= 0:
        return []

    # Create embedding for query (note: parameter is 'inputs' not 'input')
    resp = client.embeddings.create(
        model="mistral-embed",
        inputs=[query]
    )
    q_vec = np.array(resp.data[0].embedding, dtype="float32").reshape(1, -1)

    # Normalize for cosine similarity
    faiss.normalize_L2(q_vec)

    # Search
    scores, ids = index.search(q_vec, k)

    results = []
    for score, idx in zip(scores[0], ids[0]):
        # FAISS pads the result with id -1 when it finds fewer than k
        # neighbours; metas[-1] would silently return the last chunk.
        if idx < 0:
            continue
        result = metas[int(idx)].copy()
        result['score'] = float(score)
        results.append(result)

    return results


# Smoke-test the retriever with a few sample questions
test_queries = [
    "Who was Barack Obama's vice president?",
    "What is the capital of France?",
    "Who invented the telephone?"
]

for query in test_queries:
    # Banner: blank line, rule, the query, rule
    print("\n" + "="*80, f"QUERY: {query}", "="*80, sep="\n")

    results = dense_retrieval(query, k=3)

    # Show each hit with its rank, title, score and the first 200 characters
    for i, r in enumerate(results, start=1):
        print(f"\n{i}. TITLE: {r['title']}")
        print(f"   SCORE: {r['score']:.4f}")
        print(f"   TEXT: {r['text'][:200]}...")
        print("   ---")