<a href="https://colab.research.google.com/github/PoojithaaReddy/Semantic-Search-on-Twitter-API-Documentation/blob/main/T1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!git clone https://github.com/xdevplatform/postman-twitter-api

Cloning into 'postman-twitter-api'...
remote: Enumerating objects: 65, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 65 (delta 9), reused 0 (delta 0), pack-reused 53 (from 1)[K
Receiving objects: 100% (65/65), 125.58 KiB | 1.34 MiB/s, done.
Resolving deltas: 100% (31/31), done.


In [None]:
!pip install sentence-transformers



In [None]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m81.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.0


In [None]:
!pip install requests beautifulsoup4



In [None]:
!pip install pydantic



In [None]:
import os
import json
import numpy as np# You might need to install 'langchain-text-splitters' or implement this class manually in a local setup. For Colab, a simple implementation or an equivalent library is often used.
from sentence_transformers import SentenceTransformer
import faiss

# --- 0. Setup and Installation ---

# 1. Install necessary libraries
print("--- Installing Dependencies ---")
!pip install sentence-transformers faiss-cpu pydantic langchain-text-splitters > /dev/null

# 2. Clone the documentation repository
REPO_URL = "https://github.com/xdevplatform/postman-twitter-api"
REPO_DIR = "postman-twitter-api"
print(f"--- Cloning Repository: {REPO_URL} ---")
if not os.path.exists(REPO_DIR):
    !git clone {REPO_URL} > /dev/null
else:
    print(f"Repository {REPO_DIR} already exists.")

# --- 1. Data Loading and Chunking Strategy ---
import os
import json
import numpy as np
# NOTE: `from text_splitter import RecursiveCharacterTextSplitter` is intentionally not used; a local splitter class is defined below instead.
from sentence_transformers import SentenceTransformer
import faiss

# --- Local text splitter class definition ---

# Simple implementation of text splitter for robust Colab execution
class RecursiveCharacterTextSplitter:
    """Fixed-size sliding-window text splitter.

    Despite the name, this is not a recursive, separator-aware splitter: it
    cuts text into windows of ``chunk_size`` characters, with consecutive
    windows sharing ``chunk_overlap`` characters. ``separators`` is stored on
    the instance but is not consulted by ``split_text``.
    """

    def __init__(self, chunk_size, chunk_overlap, separators):
        """Store the window configuration.

        Args:
            chunk_size: Maximum number of characters per chunk; must be > 0.
            chunk_overlap: Characters shared by consecutive chunks; must
                satisfy ``0 <= chunk_overlap < chunk_size``.
            separators: Kept on the instance; currently unused.

        Raises:
            ValueError: If the sizes would keep the window from advancing,
                which would make ``split_text`` loop forever.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                "chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size, "
                f"got chunk_overlap={chunk_overlap}, chunk_size={chunk_size}"
            )
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.separators = separators

    def split_text(self, text):
        """Return the list of overlapping chunks covering ``text``.

        An empty ``text`` yields an empty list. The final chunk is the first
        window that reaches the end of ``text`` and may be shorter than
        ``chunk_size``.
        """
        step = self.chunk_size - self.chunk_overlap
        chunks = []
        start = 0
        while start < len(text):
            end = min(start + self.chunk_size, len(text))
            chunks.append(text[start:end])
            if end == len(text):
                # The window reached the end; further windows would only repeat the tail.
                break
            start += step
        return chunks

# --- END OF CLASS DEFINITION ---

# --- Documentation loading ---
# load_and_chunk_documentation (below) reads the repository README and splits it into chunks.

def load_and_chunk_documentation(repo_path):
    """Load ``README.md`` from ``repo_path`` and split it into text chunks.

    Only the README is read; the structured JSON files of the repository are
    not parsed here. If the README is missing, a short built-in placeholder
    text is chunked instead so the rest of the pipeline can still run.

    Args:
        repo_path: Directory expected to contain ``README.md``.

    Returns:
        list[str]: Chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    DOC_FILE_PATH = os.path.join(repo_path, 'README.md')
    separators = ["\n\n", "\n", " ", ""]

    if not os.path.exists(DOC_FILE_PATH):
        print(f"ERROR: Documentation file not found at {DOC_FILE_PATH}. Using placeholder text.")
        # Placeholder data if the file is missing
        placeholder_text = (
            "The Twitter API v2 allows fetching tweets with expansions. "
            "Expansions are key fields used to retrieve related objects like users or media. "
            "To use expansions, append `expansions=author_id,attachments.media_keys` to your query. "
            "The Postman collection contains endpoints for tweets, users, and spaces. "
            "Rate limits apply to all v2 endpoints."
        )
        # The placeholder is short, so it gets a smaller window than the real README.
        fallback_splitter = RecursiveCharacterTextSplitter(
            chunk_size=100,
            chunk_overlap=20,
            separators=separators
        )
        return fallback_splitter.split_text(placeholder_text)

    print(f"Loading and chunking data from: {DOC_FILE_PATH}")
    with open(DOC_FILE_PATH, 'r', encoding='utf-8') as f:
        full_doc_text = f.read()

    # Small chunks keep individual documentation sections granular.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=50,
        separators=separators
    )
    return splitter.split_text(full_doc_text)

# Simple implementation of text splitter for robust Colab execution
class RecursiveCharacterTextSplitter:
    """Fixed-size sliding-window text splitter.

    Despite the name, this is not a recursive, separator-aware splitter: it
    cuts text into windows of ``chunk_size`` characters, with consecutive
    windows sharing ``chunk_overlap`` characters. ``separators`` is stored on
    the instance but is not consulted by ``split_text``.

    NOTE(review): a class with this same name is also defined earlier in this
    cell; this later definition is the one bound when the pipeline below runs.
    """

    def __init__(self, chunk_size, chunk_overlap, separators):
        """Store the window configuration.

        Args:
            chunk_size: Maximum number of characters per chunk; must be > 0.
            chunk_overlap: Characters shared by consecutive chunks; must
                satisfy ``0 <= chunk_overlap < chunk_size``.
            separators: Kept on the instance; currently unused.

        Raises:
            ValueError: If the sizes would keep the window from advancing,
                which would make ``split_text`` loop forever.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                "chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size, "
                f"got chunk_overlap={chunk_overlap}, chunk_size={chunk_size}"
            )
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.separators = separators

    def split_text(self, text):
        """Return the list of overlapping chunks covering ``text``.

        An empty ``text`` yields an empty list. The final chunk is the first
        window that reaches the end of ``text`` and may be shorter than
        ``chunk_size``.
        """
        step = self.chunk_size - self.chunk_overlap
        chunks = []
        start = 0
        while start < len(text):
            end = min(start + self.chunk_size, len(text))
            chunks.append(text[start:end])
            if end == len(text):
                # The window reached the end; further windows would only repeat the tail.
                break
            start += step
        return chunks

doc_chunks = load_and_chunk_documentation(REPO_DIR)
print(f"Successfully created {len(doc_chunks)} documentation chunks.")

# --- 2. Embedding and Index Creation ---

# 2.1 Choose and Load Embedding Model
MODEL_NAME = 'all-MiniLM-L6-v2'
print(f"--- Loading Embedding Model: {MODEL_NAME} ---")
model = SentenceTransformer(MODEL_NAME)
EMBEDDING_DIM = model.get_sentence_embedding_dimension()

# 2.2 Generate embeddings (This can be computationally intensive)
print("--- Generating Chunk Embeddings ---")
chunk_embeddings = model.encode(
    doc_chunks,
    convert_to_numpy=True,
    show_progress_bar=True
).astype('float32') # FAISS often prefers float32

# 2.3 Build the Vector Index (FAISS)
print(f"--- Building FAISS Index (Dim: {EMBEDDING_DIM}) ---")
# IndexFlatL2 uses Euclidean (L2) distance for simple nearest neighbor search
index = faiss.IndexFlatL2(EMBEDDING_DIM)
index.add(chunk_embeddings)

print(f"FAISS index created with {index.ntotal} vectors.")

# --- 3. Semantic Retrieval and Output ---

def semantic_retrieve(query, index, model, doc_chunks, k=5):
    """Embed ``query`` and return the closest chunks found in ``index``.

    Args:
        query: Query string to embed.
        index: Object exposing a FAISS-style ``search(vectors, k)`` that
            returns ``(distances, indices)``.
        model: Object exposing ``encode(list_of_str, convert_to_numpy=True)``;
            it should be the model that produced the indexed vectors.
        doc_chunks: Chunk texts, positionally aligned with the indexed vectors.
        k: Maximum number of results to return.

    Returns:
        list[dict]: At most ``k`` dicts with ``rank`` (1-based), ``score``
        (the distance reported by the index; smaller is closer for the
        ``IndexFlatL2`` built in this notebook) and ``content``.
    """
    query_embedding = model.encode([query], convert_to_numpy=True).astype('float32')
    distances, indices = index.search(query_embedding, k)

    ranked_results = []
    for distance, idx in zip(distances[0], indices[0]):
        if idx < 0:
            # FAISS pads the result with -1 when fewer than k vectors exist;
            # indexing doc_chunks with -1 would silently return the last chunk.
            continue
        ranked_results.append({
            "rank": len(ranked_results) + 1,
            "score": float(distance),  # native float so the result is JSON-serializable
            "content": doc_chunks[idx],
        })

    return ranked_results

# Required invocation query example
QUERY = "How do I fetch tweets with expansions?"
TOP_K = 5 # Retrieve 5 most relevant chunks

print(f"\n--- Searching for: '{QUERY}' (Top K={TOP_K}) ---")

# Run the search
top_k_chunks = semantic_retrieve(
    query=QUERY,
    index=index,
    model=model,
    doc_chunks=doc_chunks,
    k=TOP_K
)

# --- 4. Final Output JSON to stdout ---

# Final Output: JSON printed to stdout with ranked chunks
final_output = {
    "query": QUERY,
    "ranked_chunks": top_k_chunks
}

# Print the JSON to standard output (stdout)
print("\n--- FINAL OUTPUT JSON TO STDOUT ---")
# Use print() to send output to stdout, required by the challenge
print(json.dumps(final_output, indent=4))

--- Installing Dependencies ---
--- Cloning Repository: https://github.com/xdevplatform/postman-twitter-api ---
Repository postman-twitter-api already exists.
Loading and chunking data from: postman-twitter-api/README.md
Successfully created 5 documentation chunks.
--- Loading Embedding Model: all-MiniLM-L6-v2 ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

--- Generating Chunk Embeddings ---


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

--- Building FAISS Index (Dim: 384) ---
FAISS index created with 5 vectors.

--- Searching for: 'How do I fetch tweets with expansions?' (Top K=5) ---

--- FINAL OUTPUT JSON TO STDOUT ---
{
    "query": "How do I fetch tweets with expansions?",
    "ranked_chunks": [
        {
            "rank": 1,
            "score": 0.9849801063537598,
            "content": "This is a Postman Collection for the Twitter API v2 endpoints.\n\nRefer to the main [Twitter API documentation](https://developer.twitter.com/en/docs) for more details.\n\nIf you have an API-related question, you can also discuss in the developer [community forum](https://twittercommunity.com).\n\n## Inst"
        },
        {
            "rank": 2,
            "score": 1.1066023111343384,
            "content": "tps://github.com/twitterdev/postman-twitter-api\n\n## Environment\n\nThis Collection includes a pre-configured Environment. You will need to set up the following variables in order to run each request (depending on th

In [None]:
import os
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss

# --- Simple implementation of text splitter (Option 1 Fix) ---
# This class needs to be defined BEFORE it is used in load_and_chunk_documentation
class RecursiveCharacterTextSplitter:
    """Fixed-size sliding-window text splitter.

    Despite the name, this is not a recursive, separator-aware splitter: it
    cuts text into windows of ``chunk_size`` characters, with consecutive
    windows sharing ``chunk_overlap`` characters. ``separators`` is stored on
    the instance but is not consulted by ``split_text``.
    """

    def __init__(self, chunk_size, chunk_overlap, separators):
        """Store the window configuration.

        Args:
            chunk_size: Maximum number of characters per chunk; must be > 0.
            chunk_overlap: Characters shared by consecutive chunks; must
                satisfy ``0 <= chunk_overlap < chunk_size``.
            separators: Kept on the instance; currently unused.

        Raises:
            ValueError: If the sizes would keep the window from advancing,
                which would make ``split_text`` loop forever.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                "chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size, "
                f"got chunk_overlap={chunk_overlap}, chunk_size={chunk_size}"
            )
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.separators = separators

    def split_text(self, text):
        """Return the list of overlapping chunks covering ``text``.

        An empty ``text`` yields an empty list. The final chunk is the first
        window that reaches the end of ``text`` and may be shorter than
        ``chunk_size``.
        """
        step = self.chunk_size - self.chunk_overlap
        chunks = []
        start = 0
        while start < len(text):
            end = min(start + self.chunk_size, len(text))
            chunks.append(text[start:end])
            if end == len(text):
                # The window reached the end; further windows would only repeat the tail.
                break
            start += step
        return chunks
# --- End of Simple Splitter ---


# --- 0. Setup and Installation ---

# 1. Install necessary libraries
print("--- Installing Dependencies ---")
!pip install sentence-transformers faiss-cpu pydantic > /dev/null

# 2. Clone the documentation repository
REPO_URL = "https://github.com/xdevplatform/postman-twitter-api"
REPO_DIR = "postman-twitter-api"
print(f"--- Cloning Repository: {REPO_URL} ---")
if not os.path.exists(REPO_DIR):
    !git clone {REPO_URL} > /dev/null
else:
    print(f"Repository {REPO_DIR} already exists.")

# --- 1. Data Loading and Chunking Strategy (MODIFIED) ---

def load_and_chunk_documentation(repo_path):
    """Load the README, chunk it, and attach an id and a guessed HTTP method.

    If ``README.md`` is missing under ``repo_path``, a short built-in
    placeholder text is chunked instead.

    The method is only a keyword heuristic, not a parse of the API
    definitions: a chunk is tagged by the first of GET/FETCH, POST/CREATE,
    DELETE that occurs in it as a whole word (case-insensitive). Whole-word
    matching keeps "Postman" from counting as POST and "together" as GET.

    Args:
        repo_path: Directory expected to contain ``README.md``.

    Returns:
        tuple[list[dict], list[str]]: ``(doc_metadata, doc_text_list)`` where
        each metadata dict has ``id`` (chunk position), ``method`` and
        ``content``, and ``doc_text_list`` holds the same chunk texts in the
        same order for embedding.
    """
    import re  # local import: this cell's import block does not provide `re`

    DOC_FILE_PATH = os.path.join(repo_path, 'README.md')

    if not os.path.exists(DOC_FILE_PATH):
        print(f"ERROR: Documentation file not found at {DOC_FILE_PATH}. Using placeholder text.")
        full_doc_text = (
            "The Twitter API v2 allows fetching tweets with expansions (GET /2/tweets). "
            "Expansions are key fields used to retrieve related objects like users or media. "
            "To use expansions, append `expansions=author_id,attachments.media_keys` to your query. "
            "The Postman collection contains endpoints for tweets, users, and spaces (POST /2/users)."
        )
    else:
        print(f"Loading and chunking data from: {DOC_FILE_PATH}")
        with open(DOC_FILE_PATH, 'r', encoding='utf-8') as f:
            full_doc_text = f.read()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=50,
        separators=["\n\n", "\n", " ", ""]
    )
    text_chunks = splitter.split_text(full_doc_text)

    doc_metadata = []   # one metadata dict per chunk
    doc_text_list = []  # chunk texts only, for the embedding model

    for i, chunk_text in enumerate(text_chunks):
        # Compare against whole alphabetic words rather than raw substrings.
        words = set(re.findall(r"[A-Z]+", chunk_text.upper()))
        if words & {"GET", "FETCH"}:
            method = "GET"
        elif words & {"POST", "CREATE"}:
            method = "POST"
        elif "DELETE" in words:
            method = "DELETE"
        else:
            method = "UNKNOWN"

        doc_metadata.append({
            "id": i,
            "method": method,
            "content": chunk_text
        })
        doc_text_list.append(chunk_text)

    return doc_metadata, doc_text_list

doc_metadata, doc_text_list = load_and_chunk_documentation(REPO_DIR)
print(f"Successfully created {len(doc_metadata)} documentation chunks with metadata.")

# --- 2. Embedding and Index Creation ---

# 2.1 Choose and Load Embedding Model
MODEL_NAME = 'all-MiniLM-L6-v2'
print(f"--- Loading Embedding Model: {MODEL_NAME} ---")
model = SentenceTransformer(MODEL_NAME)
EMBEDDING_DIM = model.get_sentence_embedding_dimension()

# 2.2 Generate embeddings
print("--- Generating Chunk Embeddings ---")
chunk_embeddings = model.encode(
    doc_text_list, # Use the list of TEXT content for embedding
    convert_to_numpy=True,
    show_progress_bar=True
).astype('float32')

# 2.3 Build the Vector Index (FAISS)
print(f"--- Building FAISS Index (Dim: {EMBEDDING_DIM}) ---")
index = faiss.IndexFlatL2(EMBEDDING_DIM)
index.add(chunk_embeddings)

print(f"FAISS index created with {index.ntotal} vectors.")

# --- 3. Semantic Retrieval and Output (MODIFIED) ---

def semantic_retrieve(query, index, model, doc_metadata, k=5):
    """Embed ``query`` and return the closest chunks with their metadata.

    Args:
        query: Query string to embed.
        index: Object exposing a FAISS-style ``search(vectors, k)`` that
            returns ``(distances, indices)``.
        model: Object exposing ``encode(list_of_str, convert_to_numpy=True)``;
            it should be the model that produced the indexed vectors.
        doc_metadata: Dicts with ``id``, ``method`` and ``content`` keys,
            positionally aligned with the indexed vectors.
        k: Maximum number of results to return.

    Returns:
        list[dict]: At most ``k`` dicts with ``rank`` (1-based), ``score``
        (the distance reported by the index), ``chunk_id``, ``method`` and
        ``content``.
    """
    query_embedding = model.encode([query], convert_to_numpy=True).astype('float32')
    distances, indices = index.search(query_embedding, k)

    ranked_results = []
    for distance, idx in zip(distances[0], indices[0]):
        if idx < 0:
            # FAISS pads the result with -1 when fewer than k vectors exist;
            # doc_metadata[-1] would silently return the last chunk instead.
            continue
        metadata = doc_metadata[idx]
        ranked_results.append({
            "rank": len(ranked_results) + 1,
            "score": float(distance),  # native float so the result is JSON-serializable
            "chunk_id": metadata["id"],
            "method": metadata["method"],
            "content": metadata["content"],
        })

    return ranked_results

# Required invocation query example
QUERY = "How do I fetch tweets with expansions?"
TOP_K = 5 # Retrieve 5 most relevant chunks

print(f"\n--- Searching for: '{QUERY}' (Top K={TOP_K}) ---")

# Run the search
top_k_chunks = semantic_retrieve(
    query=QUERY,
    index=index,
    model=model,
    doc_metadata=doc_metadata, # Pass the metadata list
    k=TOP_K
)

# --- 4. Final Output JSON to stdout ---

final_output = {
    "query": QUERY,
    "ranked_chunks": top_k_chunks
}

# Print the JSON to standard output (stdout)
print("\n--- FINAL OUTPUT JSON TO STDOUT ---")
print(json.dumps(final_output, indent=4))

--- Installing Dependencies ---
--- Cloning Repository: https://github.com/xdevplatform/postman-twitter-api ---
Repository postman-twitter-api already exists.
Loading and chunking data from: postman-twitter-api/README.md
Successfully created 5 documentation chunks with metadata.
--- Loading Embedding Model: all-MiniLM-L6-v2 ---
--- Generating Chunk Embeddings ---


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

--- Building FAISS Index (Dim: 384) ---
FAISS index created with 5 vectors.

--- Searching for: 'How do I fetch tweets with expansions?' (Top K=5) ---

--- FINAL OUTPUT JSON TO STDOUT ---
{
    "query": "How do I fetch tweets with expansions?",
    "ranked_chunks": [
        {
            "rank": 1,
            "score": 0.9849801063537598,
            "chunk_id": 0,
            "method": "POST",
            "content": "This is a Postman Collection for the Twitter API v2 endpoints.\n\nRefer to the main [Twitter API documentation](https://developer.twitter.com/en/docs) for more details.\n\nIf you have an API-related question, you can also discuss in the developer [community forum](https://twittercommunity.com).\n\n## Inst"
        },
        {
            "rank": 2,
            "score": 1.1066023111343384,
            "chunk_id": 2,
            "method": "POST",
            "content": "tps://github.com/twitterdev/postman-twitter-api\n\n## Environment\n\nThis Collection includes a pre-con

In [None]:
import os
import json
import numpy as np
import sys
from sentence_transformers import SentenceTransformer
import faiss

# --- Simple implementation of text splitter (Fix for ModuleNotFoundError) ---
class RecursiveCharacterTextSplitter:
    """Fixed-size sliding-window text splitter.

    Despite the name, this is not a recursive, separator-aware splitter: it
    cuts text into windows of ``chunk_size`` characters, with consecutive
    windows sharing ``chunk_overlap`` characters. ``separators`` is stored on
    the instance but is not consulted by ``split_text``.
    """

    def __init__(self, chunk_size, chunk_overlap, separators):
        """Store the window configuration.

        Args:
            chunk_size: Maximum number of characters per chunk; must be > 0.
            chunk_overlap: Characters shared by consecutive chunks; must
                satisfy ``0 <= chunk_overlap < chunk_size``.
            separators: Kept on the instance; currently unused.

        Raises:
            ValueError: If the sizes would keep the window from advancing,
                which would make ``split_text`` loop forever.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                "chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size, "
                f"got chunk_overlap={chunk_overlap}, chunk_size={chunk_size}"
            )
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.separators = separators

    def split_text(self, text):
        """Return the list of overlapping chunks covering ``text``.

        An empty ``text`` yields an empty list. The final chunk is the first
        window that reaches the end of ``text`` and may be shorter than
        ``chunk_size``.
        """
        step = self.chunk_size - self.chunk_overlap
        chunks = []
        start = 0
        while start < len(text):
            end = min(start + self.chunk_size, len(text))
            chunks.append(text[start:end])
            if end == len(text):
                # The window reached the end; further windows would only repeat the tail.
                break
            start += step
        return chunks
# --- End of Simple Splitter ---


# --- 0. Setup and Installation ---

# 1. Install necessary libraries
print("--- Installing Dependencies ---", file=sys.stderr)
# Using sys.stderr to ensure install messages don't contaminate required stdout JSON output
!pip install sentence-transformers faiss-cpu pydantic > /dev/null

# 2. Clone the documentation repository
REPO_URL = "https://github.com/xdevplatform/postman-twitter-api"
REPO_DIR = "postman-twitter-api"
print(f"--- Cloning Repository: {REPO_URL} ---", file=sys.stderr)
if not os.path.exists(REPO_DIR):
    !git clone {REPO_URL} > /dev/null
else:
    print(f"Repository {REPO_DIR} already exists.", file=sys.stderr)

# --- MOCK DATA SIMULATION ---
def get_mock_endpoint_data():
    """
    Return hard-coded endpoint records standing in for a parsed Postman Collection.

    Nothing is read from disk: the four records below are fixed sample data.
    Each record is a dict with the keys "name", "method", "url" and
    "documentation", in that order. A real implementation would build these
    from the collection JSON in the cloned repo instead.
    """
    field_names = ("name", "method", "url", "documentation")
    rows = (
        (
            "Full archive search",
            "GET",
            "https://api.twitter.com/2/tweets/search/all",
            "Search across the complete history of public Tweets matching a search query. This endpoint supports up to 10 years of history. Use query parameters like 'query' and 'expansions' (e.g., author_id, attachments.media_keys). Rate limits apply based on your product track.",
        ),
        (
            "Recent search",
            "GET",
            "https://api.twitter.com/2/tweets/search/recent",
            "Search for tweets published within the last 7 days. This is the standard search endpoint. Supports all the same query parameters as the Full archive search, but is limited to recent data.",
        ),
        (
            "Create Tweet",
            "POST",
            "https://api.twitter.com/2/tweets",
            "Allows an authenticated user to post a new Tweet. The request body must contain the 'text' field. You can include media IDs, poll options, or replies to a parent tweet.",
        ),
        (
            "Delete Tweet",
            "DELETE",
            "https://api.twitter.com/2/tweets/:id",
            "Allows the authenticated user to delete a Tweet by its ID. Only the Tweet author can delete the Tweet. Requires the 'id' path parameter.",
        ),
    )
    # Pair every row with the field names to get one dict per endpoint.
    return [dict(zip(field_names, row)) for row in rows]

# --- 1. Data Loading and Chunking Strategy (MODIFIED) ---

def load_and_chunk_documentation():
    """
    Turn the mock endpoint records into chunk metadata and chunk texts.

    Each endpoint is rendered as
    "<name> | METHOD: <method> | URL: <url> | DESC: <documentation>" and that
    string is split into chunks. Returns (doc_metadata, doc_text_list): one
    dict per chunk with "id" (endpoint name with spaces replaced by
    underscores, plus the chunk's position within that endpoint), "method"
    and "text", and the chunk texts alone in the same order.
    """
    endpoints = get_mock_endpoint_data()

    metadata_records = []
    embedding_texts = []

    print("--- Formatting and Chunking Documentation ---", file=sys.stderr)

    # The splitter settings never vary between endpoints, so one instance serves all.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=150,
        chunk_overlap=30,
        separators=[" | ", ". ", "\n\n", "\n", " ", ""]
    )

    for endpoint in endpoints:
        formatted_text = (
            f"{endpoint['name']} | METHOD: {endpoint['method']} | URL: {endpoint['url']} | "
            f"DESC: {endpoint['documentation']}"
        )

        for position, piece in enumerate(splitter.split_text(formatted_text)):
            # The endpoint name plus the position keeps ids unique across endpoints.
            piece_id = f"{endpoint['name'].replace(' ', '_')}_{position}"
            record = {
                "id": piece_id,
                "method": endpoint['method'],
                "text": piece
            }
            metadata_records.append(record)
            embedding_texts.append(piece)

    return metadata_records, embedding_texts

doc_metadata, doc_text_list = load_and_chunk_documentation()
print(f"Successfully created {len(doc_metadata)} documentation chunks with metadata.", file=sys.stderr)

# --- 2. Embedding and Index Creation ---

# 2.1 Choose and Load Embedding Model
MODEL_NAME = 'all-MiniLM-L6-v2'
print(f"--- Loading Embedding Model: {MODEL_NAME} ---", file=sys.stderr)
model = SentenceTransformer(MODEL_NAME)
EMBEDDING_DIM = model.get_sentence_embedding_dimension()

# 2.2 Generate embeddings
print("--- Generating Chunk Embeddings ---", file=sys.stderr)
chunk_embeddings = model.encode(
    doc_text_list,
    convert_to_numpy=True,
    show_progress_bar=True
).astype('float32')

# 2.3 Build the Vector Index (FAISS)
print(f"--- Building FAISS Index (Dim: {EMBEDDING_DIM}) ---", file=sys.stderr)
index = faiss.IndexFlatL2(EMBEDDING_DIM)
index.add(chunk_embeddings)

# --- 3. Semantic Retrieval and Output (MODIFIED) ---

def semantic_retrieve(query, index, model, doc_metadata, k=5):
    """Embed ``query`` and return the closest chunks with their metadata.

    Args:
        query: Query string to embed.
        index: Object exposing a FAISS-style ``search(vectors, k)`` that
            returns ``(distances, indices)``.
        model: Object exposing ``encode(list_of_str, convert_to_numpy=True)``;
            it should be the model that produced the indexed vectors.
        doc_metadata: Dicts with ``id``, ``method`` and ``text`` keys,
            positionally aligned with the indexed vectors.
        k: Maximum number of results to return.

    Returns:
        list[dict]: At most ``k`` dicts with ``rank`` (1-based), ``score``
        (the distance reported by the index), ``chunk_id``, ``method`` and
        ``text``.
    """
    query_embedding = model.encode([query], convert_to_numpy=True).astype('float32')
    distances, indices = index.search(query_embedding, k)

    ranked_results = []
    for distance, idx in zip(distances[0], indices[0]):
        if idx < 0:
            # FAISS pads the result with -1 when fewer than k vectors exist;
            # doc_metadata[-1] would silently return the last chunk instead.
            continue
        metadata = doc_metadata[idx]
        ranked_results.append({
            "rank": len(ranked_results) + 1,
            "score": float(distance),  # native float so the result is JSON-serializable
            "chunk_id": metadata["id"],
            "method": metadata["method"],
            "text": metadata["text"],
        })

    return ranked_results

# Required invocation query example
# In a command-line script, this would be read from sys.argv
QUERY = "How do I fetch tweets with expansions?"
TOP_K = 3

print(f"\n--- Searching for: '{QUERY}' (Top K={TOP_K}) ---", file=sys.stderr)

# Run the search
top_k_chunks = semantic_retrieve(
    query=QUERY,
    index=index,
    model=model,
    doc_metadata=doc_metadata,
    k=TOP_K
)

# --- 4. Final Output JSON to stdout ---

final_output = {
    "query": QUERY,
    "ranked_chunks": top_k_chunks
}

# The JSON document is the only thing written to stdout; the banner goes to
# stderr like the other progress messages in this cell, so stdout stays parseable.
print("\n--- FINAL OUTPUT JSON TO STDOUT ---", file=sys.stderr)
print(json.dumps(final_output, indent=4))

--- Installing Dependencies ---
--- Cloning Repository: https://github.com/xdevplatform/postman-twitter-api ---
Repository postman-twitter-api already exists.
--- Formatting and Chunking Documentation ---
Successfully created 10 documentation chunks with metadata.
--- Loading Embedding Model: all-MiniLM-L6-v2 ---
--- Generating Chunk Embeddings ---


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


--- FINAL OUTPUT JSON TO STDOUT ---
{
    "query": "How do I fetch tweets with expansions?",
    "ranked_chunks": [
        {
            "rank": 1,
            "score": 0.7123546600341797,
            "chunk_id": "Full_archive_search_1",
            "method": "GET",
            "text": "history of public Tweets matching a search query. This endpoint supports up to 10 years of history. Use query parameters like 'query' and 'expansions'"
        },
        {
            "rank": 2,
            "score": 0.8605637550354004,
            "chunk_id": "Full_archive_search_0",
            "method": "GET",
            "text": "Full archive search | METHOD: GET | URL: https://api.twitter.com/2/tweets/search/all | DESC: Search across the complete history of public Tweets match"
        },
        {
            "rank": 3,
            "score": 0.9622856378555298,
            "chunk_id": "Recent_search_0",
            "method": "GET",
            "text": "Recent search | METHOD: GET | URL: https://a

--- Building FAISS Index (Dim: 384) ---

--- Searching for: 'How do I fetch tweets with expansions?' (Top K=3) ---


In [None]:
import os, json
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from tqdm import tqdm

# --------------------------------------
# LOAD & PARSE TWITTER POSTMAN COLLECTION
# --------------------------------------

def extract_from_json(path):
    """Collect one searchable text record per request found in a JSON file.

    The parsed document is walked depth-first. Every dict that has a
    ``"request"`` key contributes a record of the form
    ``"<name> | METHOD: <method> | URL: <url> | DESC: <description>"``,
    emitted before its children are visited. A file containing no such dict
    yields an empty list.

    Accepted shapes (following the Postman Collection format):
      * request: an object, or a bare URL string;
      * url: an object with a ``raw`` field, or a plain string;
      * description: a string, an object with a ``content`` field, or null.
    A missing or unusable value becomes an empty string in the record; a
    ``"request"`` value of any other type produces no record.

    Args:
        path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        list[str]: The records, in traversal order.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    records = []

    def url_text(url):
        # A URL is either a plain string or an object carrying the string under "raw".
        if isinstance(url, str):
            return url
        if isinstance(url, dict):
            return url.get("raw", "")
        return ""

    def description_text(desc):
        # Object-form descriptions keep their text under "content"; null becomes "".
        if isinstance(desc, dict):
            desc = desc.get("content", "")
        return desc or ""

    def format_record(name, req):
        if isinstance(req, str):
            # Shorthand request: the string itself is the URL.
            method, url, desc = "", req, ""
        elif isinstance(req, dict):
            method = req.get("method", "")
            url = url_text(req.get("url", ""))
            desc = description_text(req.get("description", ""))
        else:
            return None
        return f"{name} | METHOD: {method} | URL: {url} | DESC: {desc}"

    def walk(node):
        if isinstance(node, dict):
            if "request" in node:
                record = format_record(node.get("name", ""), node["request"])
                if record is not None:
                    records.append(record)
            # Folders nest requests at arbitrary depth, so descend into every value.
            for value in node.values():
                walk(value)
        elif isinstance(node, list):
            for item in node:
                walk(item)

    walk(data)
    return records


DOCS_DIR = "postman-twitter-api"
all_records = []

# load all JSON files
for root, dirs, files in os.walk(DOCS_DIR):
    for f in files:
        if f.endswith(".json"):
            path = os.path.join(root, f)
            print("Loading:", path)
            all_records.extend(extract_from_json(path))

print("Total extracted records:", len(all_records))


# ---------------------
# CHUNKING (simple)
# ---------------------
def chunk_text(text, chunk_size=50):
    """Split ``text`` on whitespace and regroup it into space-joined pieces.

    Every piece holds at most ``chunk_size`` words; pieces do not overlap and
    the last one may be shorter. Text without any words yields an empty list.
    """
    tokens = text.split()
    pieces = []
    for offset in range(0, len(tokens), chunk_size):
        window = tokens[offset:offset + chunk_size]
        pieces.append(" ".join(window))
    return pieces

chunks = []
for rec in all_records:
    chunks.extend(chunk_text(rec))

print("Total chunks:", len(chunks))


# ---------------------
# EMBEDDINGS
# ---------------------
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
embeddings = model.encode(chunks, batch_size=16, show_progress_bar=True)
embeddings = np.array(embeddings).astype("float32")


# ---------------------
# BUILD VECTOR INDEX
# ---------------------
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)


# ---------------------
# SEMANTIC SEARCH
# ---------------------
def semantic_search(query, top_k=5):
    """Return the chunks closest to ``query`` in the module-level index.

    Uses the notebook globals ``model`` (to embed the query), ``index`` (to
    search) and ``chunks`` (to look up the text of each hit).

    Args:
        query: Query string to embed.
        top_k: Maximum number of results to return.

    Returns:
        list[dict]: At most ``top_k`` dicts with ``rank`` (1-based), ``score``
        (the distance reported by the index), ``chunk_id`` (position in
        ``chunks``), ``method`` (the text between ``"METHOD:"`` and the next
        ``"|"``, or ``""`` when the chunk has no ``"METHOD:"`` marker) and
        ``text``.
    """
    q = model.encode([query]).astype("float32")
    D, I = index.search(q, top_k)

    results = []
    for score, idx in zip(D[0], I[0]):
        if idx < 0:
            # FAISS pads the result with -1 when fewer than top_k vectors exist;
            # chunks[-1] would silently return the last chunk instead.
            continue
        text = chunks[idx]

        # Only a record's first chunk carries the "METHOD:" marker; the rest get "".
        method = ""
        if "METHOD:" in text:
            method = text.split("METHOD:")[1].split("|")[0].strip()

        results.append({
            "rank": len(results) + 1,
            "score": float(score),
            "chunk_id": int(idx),
            "method": method,
            "text": text
        })

    return results


# ---------------------
# TEST QUERY
# ---------------------
query = "How do I fetch tweets with expansions?"
res = semantic_search(query, top_k=5)

print(json.dumps(res, indent=2))


Loading: postman-twitter-api/Twitter API v2.postman_collection.json
Loading: postman-twitter-api/Twitter API v2.postman_environment.json
Total extracted records: 56
Total chunks: 61


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

[
  {
    "rank": 1,
    "score": 0.7721138596534729,
    "chunk_id": 2,
    "method": "GET",
    "text": "Multiple Tweets | METHOD: GET | URL: https://api.twitter.com/2/tweets?ids= | DESC: This endpoint returns details about up to 100 Tweets specified by the requested IDs. For full details, see the [API reference](https://developer.twitter.com/en/docs/twitter-api/tweets/lookup/api-reference/get-tweets) for this endpoint."
  },
  {
    "rank": 2,
    "score": 0.7721211910247803,
    "chunk_id": 0,
    "method": "GET",
    "text": "Single Tweet | METHOD: GET | URL: https://api.twitter.com/2/tweets/:id | DESC: This endpoint returns details about the Tweet specified by the requested ID. For full details, see the [API reference](https://developer.twitter.com/en/docs/twitter-api/tweets/lookup/api-reference/get-tweets-id) for this endpoint."
  },
  {
    "rank": 3,
    "score": 0.7963993549346924,
    "chunk_id": 1,
    "method": "GET",
    "text": "Single Tweet Usercontext | METHOD: GET | U