In [11]:
import numpy as np
import os
import time
import requests
import json

In [5]:
# Base URL of the Elasticsearch node; taken from the ES_HOST environment
# variable when it is set, otherwise the default below is used.
ES_HOST = os.environ.get("ES_HOST", "http://es01:9200")
# Path of the word list that index_words_in_batches() reads line by line.
WORDS_FILE = "/app/words.txt"

def wait_for_elasticsearch(max_wait=None, poll_interval=3):
    """
    Block until Elasticsearch answers HTTP 200 on ES_HOST.

    Args:
        max_wait: Upper bound, in seconds, on the total time spent waiting.
            ``None`` (the default) waits indefinitely, which is what this
            function always did before the parameter existed.
        poll_interval: Seconds to sleep between two attempts (default 3).

    Raises:
        TimeoutError: If ``max_wait`` is set and Elasticsearch has still not
            answered with HTTP 200 once that much time has elapsed.
    """
    # monotonic() is immune to wall-clock adjustments while we are waiting.
    deadline = None if max_wait is None else time.monotonic() + max_wait

    while True:
        try:
            r = requests.get(ES_HOST, timeout=3)
            if r.status_code == 200:
                print(f"Elasticsearch is up at {ES_HOST}.")
                return
        except requests.exceptions.RequestException:
            # Connection errors and timeouts simply mean "not ready yet".
            pass

        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Elasticsearch at {ES_HOST} was not ready after {max_wait} seconds."
            )

        print(f"Waiting for Elasticsearch to be ready at {ES_HOST}...")
        time.sleep(poll_interval)

        
wait_for_elasticsearch()

Elasticsearch is up at http://es01:9200.


In [7]:
def create_autocomplete_index():
    """
    Create the 'autocomplete' index with a 'completion' field mapping.

    Sends ``PUT {ES_HOST}/autocomplete`` with a mapping that declares the
    'suggest' field as type 'completion', then prints the outcome:

    * 200/201: the index was created.
    * the response body names an "already exists" error: the index is already
      there; this call did not change it.
    * anything else: a real failure, printed with status code and body.

    Returns:
        None. The outcome is only reported on stdout.
    """
    print("Creating the 'autocomplete' index with completion mapping...")

    # Define the mapping for the 'suggest' field
    mapping = {
        "mappings": {
            "properties": {
                "suggest": {
                    "type": "completion"
                }
            }
        }
    }

    url = f"{ES_HOST}/autocomplete"
    # Bound the call so an unresponsive node cannot hang the notebook forever.
    response = requests.put(url, json=mapping, timeout=30)

    if response.status_code in (200, 201):
        print("Index created or updated successfully.")
    elif "already_exists_exception" in response.text:
        # Matches resource_already_exists_exception (and the older
        # index_already_exists_exception name). Re-running the cell is harmless,
        # but the existing mapping is NOT replaced by this request.
        print("Index 'autocomplete' already exists; leaving it unchanged.")
    else:
        # Anything else is a genuine error the reader should look at.
        print(f"Index creation response ({response.status_code}): {response.text}")

create_autocomplete_index()

Creating the 'autocomplete' index with completion mapping...
Index created or updated successfully.


In [16]:
def bulk_index_in_batches(words, batch_size=10000):
    """
    Index 'words' into the 'autocomplete' index through the Bulk API.

    The input is cut into consecutive chunks of at most 'batch_size' items.
    Each chunk is sent as one NDJSON bulk request and its outcome is printed.
    A failed batch is reported and the remaining batches are still attempted.

    Args:
        words: Sequence of strings supporting len() and slicing. Each item is
            indexed as the document ``{"suggest": word}``.
        batch_size: Maximum number of documents per bulk request. Must be at
            least 1.

    Raises:
        ValueError: If 'batch_size' is smaller than 1.
    """
    if batch_size < 1:
        # With a zero or negative size the window would never move past the
        # end of the list, so the batching loop could not terminate.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # The action/metadata line is the same for every document.
    action_line = json.dumps({"index": {"_index": "autocomplete"}})

    def generate_bulk_payload(batch):
        """Render a batch as NDJSON: one action line plus one body line per word."""
        lines = []
        for word in batch:
            lines.append(action_line)
            lines.append(json.dumps({"suggest": word}))
        # The payload is newline-delimited and ends with a trailing newline.
        return "\n".join(lines) + "\n"

    # Loop-invariant request settings, built once instead of once per batch.
    bulk_url = f"{ES_HOST}/_bulk?refresh=wait_for"
    headers = {"Content-Type": "application/x-ndjson"}

    total_docs = len(words)
    for start in range(0, total_docs, batch_size):
        end = min(start + batch_size, total_docs)
        payload = generate_bulk_payload(words[start:end])

        # Encode explicitly so the bytes on the wire do not depend on how the
        # HTTP stack chooses to encode a str body.
        response = requests.post(bulk_url, data=payload.encode("utf-8"), headers=headers)

        if response.status_code not in (200, 201):
            print(f"Batch [{start}:{end}] failed with status {response.status_code}")
            print(response.text)
            continue

        resp_json = response.json()
        if resp_json.get("errors"):
            print(f"Batch [{start}:{end}] had partial failures:")
            for item in resp_json.get("items", []):
                # Every action sent above is an "index" action.
                result = item.get("index", {})
                if "error" in result:
                    print(json.dumps(result["error"], indent=2))
        else:
            print(f"Batch [{start}:{end}] indexed successfully.")

def index_words_in_batches(batch_size=10000):
    """
    Load the word list from WORDS_FILE and bulk-index it.

    Each line of the file is stripped of surrounding whitespace; lines that
    are empty after stripping are ignored. If the file does not exist, a
    message is printed and nothing is indexed.

    Args:
        batch_size: Number of words per bulk request, forwarded to
            bulk_index_in_batches(). Defaults to 10000.
    """
    if not os.path.isfile(WORDS_FILE):
        print("No words.txt file found; skipping.")
        return

    with open(WORDS_FILE, "r", encoding="utf-8") as f:
        # Strip each line once, then drop the ones that end up empty.
        words = [word for word in (line.strip() for line in f) if word]

    # The batch size is tunable by the caller instead of being fixed here.
    bulk_index_in_batches(words, batch_size=batch_size)

# Then call:
index_words_in_batches()

Batch [0:10000] indexed successfully.
Batch [10000:20000] indexed successfully.
Batch [20000:30000] indexed successfully.
Batch [30000:40000] indexed successfully.
Batch [40000:50000] indexed successfully.
Batch [50000:60000] indexed successfully.
Batch [60000:70000] indexed successfully.
Batch [70000:80000] indexed successfully.
Batch [80000:90000] indexed successfully.
Batch [90000:100000] indexed successfully.
Batch [100000:110000] indexed successfully.
Batch [110000:120000] indexed successfully.
Batch [120000:130000] indexed successfully.
Batch [130000:140000] indexed successfully.
Batch [140000:150000] indexed successfully.
Batch [150000:160000] indexed successfully.
Batch [160000:170000] indexed successfully.
Batch [170000:180000] indexed successfully.
Batch [180000:190000] indexed successfully.
Batch [190000:200000] indexed successfully.
Batch [200000:210000] indexed successfully.
Batch [210000:220000] indexed successfully.
Batch [220000:230000] indexed successfully.
Batch [2300

In [18]:
def autocomplete(prefix, size=5):
    """
    Query the completion suggester of the 'autocomplete' index.

    Args:
        prefix: Text the suggestions must start with.
        size: Maximum number of suggestions to request. The default of 5 is
            the suggester's own default, so omitting it gives the same number
            of results as before the parameter existed.

    Returns:
        A list with the 'text' of each suggestion option. An empty list is
        returned when the server answers with a non-200 status; if the
        response does not have the expected structure, whatever was collected
        before the problem (usually nothing) is returned.
    """
    url = f"{ES_HOST}/autocomplete/_search"
    headers = {"Content-Type": "application/json"}

    # Suggest query body
    suggest_query = {
        "suggest": {
            "word-suggest": {
                "prefix": prefix,
                "completion": {
                    "field": "suggest",
                    "size": size
                }
            }
        }
    }

    # Bound the call so an unresponsive node cannot hang the notebook forever.
    response = requests.post(url, headers=headers, json=suggest_query, timeout=10)
    if response.status_code != 200:
        print(f"Error fetching suggestions. Status: {response.status_code}")
        print(response.text)
        return []

    data = response.json()

    # The options live under data["suggest"]["word-suggest"][0]["options"];
    # only their "text" is returned.
    suggestions = []
    try:
        for option in data["suggest"]["word-suggest"][0]["options"]:
            suggestions.append(option["text"])
    except (KeyError, IndexError, TypeError):
        # TypeError covers a container of the wrong kind (e.g. a list where a
        # dict was expected), which is just another unexpected shape.
        print("Unexpected response format:")
        print(json.dumps(data, indent=2))

    return suggestions


# Sample prefixes used to exercise autocomplete() against the indexed words.
test_prefixes = ["wo", "he", "wor", "xyz"]

# Print the suggestions returned for each prefix, one line per prefix.
for prefix in test_prefixes:
    results = autocomplete(prefix)
    print(f"Prefix: '{prefix}' -> Suggestions: {results}")


Prefix: 'wo' -> Suggestions: ['WO', 'woa', 'woad', 'woad-leaved', 'woad-painted']
Prefix: 'he' -> Suggestions: ['HE', 'he-all', 'he-balsam', 'he-broom', 'he-cabbage-tree']
Prefix: 'wor' -> Suggestions: ['Worcester', 'Worcestershire', 'Word', 'worble', 'word-beat']
Prefix: 'xyz' -> Suggestions: ['xyz']
