# File to see the difference in tokenization

### Helper methods

In [None]:
import json
import os

import requests
from requests.auth import HTTPBasicAuth

# Config
# Connection settings for the Elasticsearch instance used by analyze().
# Each value can be overridden with an environment variable so the URL and
# credentials do not have to be edited into the notebook; the defaults are the
# values this notebook has always used.
es_url = os.environ.get("ES_URL", "http://localhost:9201")
es_user = os.environ.get("ES_USER", "elastic")
es_password = os.environ.get("ES_PASSWORD", "apppw")

def analyze(text, payload):
    """Send ``text`` to the Elasticsearch ``_analyze`` endpoint and return its tokens.

    Args:
        text: The string to analyze.
        payload: Dict describing the analysis chain to apply, e.g.
            ``{"analyzer": "standard"}`` or a tokenizer/filter combination.
            The dict is left untouched; ``text`` is added to a copy.

    Returns:
        A list holding the ``token`` value of every entry in the response's
        ``tokens`` array. When the response status is not 200, the request
        body and the response text are printed and an empty list is returned.
    """
    endpoint = f"{es_url}/_analyze"

    # Build the request body on a copy so the caller's dict is not mutated
    # (the same payload dicts are reused for labels after the call).
    body = {**payload, "text": text}

    response = requests.post(
        endpoint,
        auth=HTTPBasicAuth(es_user, es_password),
        headers={"Content-Type": "application/json"},
        data=json.dumps(body),
        # Without a timeout requests waits indefinitely on an unresponsive server.
        timeout=10,
    )

    if response.status_code != 200:
        print(f"Error with {body}: {response.text}")
        return []

    return [t["token"] for t in response.json().get("tokens", [])]

### Standard analysers

In [None]:
import requests
from requests.auth import HTTPBasicAuth
import json

# The connection settings (es_url, es_user, es_password) and analyze() are
# defined in the "Helper methods" cell. They are not repeated here so there is
# a single place to change them.

# Text to analyze
text_to_analyze = "The quick brown fox jumps over the lazy dog!"

# Analyzers to test
analyzers = [
    {"analyzer": "standard"},
    {"analyzer": "simple"},
    {"analyzer": "whitespace"},
    {"analyzer": "stop"},
    {"analyzer": "keyword"},
    {"analyzer": "pattern"},
    {"analyzer": "fingerprint"},
]

# Run analysis: one _analyze request per analyzer, printing the tokens it returns
for analyzer in analyzers:
    tokens = analyze(text_to_analyze, analyzer)
    print(f"--- {analyzer['analyzer']} ---")
    print(tokens)
    print()


### Standard tokenizers

In [None]:
# Text to analyze
text_to_analyze = "The quick brown fox jumps over the lazy dog!"

# Tokenizers to test (each listed once so no request or output is repeated)
tokenizers = [
    {"tokenizer": "standard"},
    {"tokenizer": "letter"},
    {"tokenizer": "whitespace"},
    {"tokenizer": "ngram"},
    {"tokenizer": "edge_ngram"},
    {"tokenizer": "keyword"},
]


# Run analysis: one _analyze request per tokenizer, printing the tokens it returns
for tokenizer in tokenizers:
    tokens = analyze(text_to_analyze, tokenizer)
    print(f"--- {tokenizer['tokenizer']} tokenizer ---")
    print(tokens)
    print()


### Token filters

In [None]:
# Text to analyze
text_to_analyze = "The Running Café in München!"

# Token filters to test (with tokenizer)
token_filters = [
    {"tokenizer": "standard", "filter": ["lowercase"]},
    {"tokenizer": "standard", "filter": ["stop"]},
    {"tokenizer": "standard", "filter": ["asciifolding"]},
    {"tokenizer": "standard", "filter": ["stemmer"]},
    {"tokenizer": "standard", "filter": ["unique"]},
]

# Run analysis
# The loop variable is not called `filter`, which would shadow the builtin for
# every cell executed afterwards.
for token_filter in token_filters:
    tokens = analyze(text_to_analyze, token_filter)
    # Label with every filter in the chain, not only the first one.
    print(f"--- Filters: {', '.join(token_filter['filter'])} ---")
    print(tokens)
    print()


### Character filters

In [None]:
# Text to analyze
text_to_analyze = "The &lt;b&gt;quick&lt;/b&gt; brown fox"

# Character filters to test
char_filters = [
    {"tokenizer": "standard", "char_filter": ["html_strip"]},
    # The mapping char filter has no predefined instance that can be referenced
    # by name; it must be defined inline together with its mappings.
    {
        "tokenizer": "standard",
        "char_filter": [{"type": "mapping", "mappings": ["quick => fast"]}],
    },
]

# Run analysis: one _analyze request per character filter configuration
for char_filter in char_filters:
    tokens = analyze(text_to_analyze, char_filter)
    print(f"--- Char Filter: {char_filter['char_filter']} ---")
    print(tokens)
    print()
