# Compare how different Elasticsearch analyzers and tokenizers tokenize the same text

In [1]:
import json
import os

import requests
from requests.auth import HTTPBasicAuth

# Config
# Connection settings default to the local instance on port 9201. Each one can
# be overridden through an environment variable (ES_URL, ES_USER, ES_PASSWORD)
# so real credentials never have to be edited into this file.
es_url = os.environ.get("ES_URL", "http://localhost:9201")
es_user = os.environ.get("ES_USER", "elastic")
es_password = os.environ.get("ES_PASSWORD", "apppw")

# Text to analyze
text_to_analyze = "The quick brown fox jumps over the lazy dog!"

# Analyzers/tokenizers to test.
# "name" is the analyzer or tokenizer to run; "type" says which of the two it
# is ("analyzer" or "tokenizer").
analyzers = [
    {"name": "standard", "type": "analyzer"},
    {"name": "simple", "type": "analyzer"},
    {"name": "whitespace", "type": "tokenizer"},
    {"name": "letter", "type": "tokenizer"},
]

def analyze(text, config, timeout=10):
    """Tokenize ``text`` via the ``_analyze`` endpoint and return the tokens.

    Args:
        text: The string to analyze.
        config: Mapping with a "name" key (the analyzer or tokenizer name)
            and a "type" key, either "analyzer" or "tokenizer", which
            selects the request field the name is sent under.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The list of "token" values from the response. An empty list is
        returned (after printing an error line) when the config type is
        unsupported, the request cannot be completed, or the server answers
        with a non-200 status.
    """
    kind = config["type"]
    if kind not in ("analyzer", "tokenizer"):
        # Bail out before any network traffic: a payload without an analyzer
        # or tokenizer field would not test what the caller asked for.
        print(f"Error with {config['name']}: unsupported type {kind!r}")
        return []

    # The request field carries the same name as the type itself.
    payload = {"text": text, kind: config["name"]}

    try:
        response = requests.post(
            f"{es_url}/_analyze",
            auth=HTTPBasicAuth(es_user, es_password),
            json=payload,  # serializes the body and sets the JSON Content-Type
            timeout=timeout,  # without this the call can block indefinitely
        )
    except requests.RequestException as exc:
        # Report connection problems the same way as HTTP errors so one
        # unreachable server does not abort the whole comparison.
        print(f"Error with {config['name']}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error with {config['name']}: {response.text}")
        return []

    return [t["token"] for t in response.json().get("tokens", [])]

# Run analysis: tokenize the sample text with every configured entry and
# print a labelled result followed by a blank separator line.
for a in analyzers:
    # Call analyze() first so any error line it prints precedes the header.
    tokens = analyze(text_to_analyze, a)
    header = f"--- {a['name']} ({a['type']}) ---"
    print(header, tokens, "", sep="\n")




--- standard (analyzer) ---
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

--- simple (analyzer) ---
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

--- whitespace (tokenizer) ---
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog!']

--- letter (tokenizer) ---
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

