# File to see the difference in tokenization

### Helper methods

In [1]:
import requests
from requests.auth import HTTPBasicAuth
import json

# Config
es_url = "http://localhost:9201"
es_user = "elastic"
es_password = "apppw"

def analyze(text, payload):
    endpoint = f"{es_url}/_analyze"

    payload["text"] = text

    response = requests.post(endpoint, auth=HTTPBasicAuth(es_user, es_password),
                             headers={"Content-Type": "application/json"},
                             data=json.dumps(payload))
    
    if response.status_code != 200:
        print(f"Error with {payload}: {response.text}")
        return []

    tokens = [t["token"] for t in response.json().get("tokens", [])]
    return tokens



### Standard analysers

In [2]:
import requests
from requests.auth import HTTPBasicAuth
import json

# Config
es_url = "http://localhost:9201"
es_user = "elastic"
es_password = "apppw"

# Text to analyze
text_to_analyze = "The quick brown fox jumps over the lazy dog!"

# Analyzers/tokenizers to test
analyzers = [
    {"analyzer": "standard"},
    {"analyzer": "simple"},
    {"analyzer": "whitespace"},
    {"analyzer": "stop"},
    {"analyzer": "keyword"},
    {"analyzer": "pattern"},
    {"analyzer": "fingerprint"},
]

# Run analysis
for analyser in analyzers:
    tokens = analyze(text_to_analyze, analyser)
    print(f"--- {analyser['analyzer']} ---")
    print(tokens)
    print()


--- standard ---
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

--- simple ---
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

--- whitespace ---
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog!']

--- stop ---
['quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog']

--- keyword ---
['The quick brown fox jumps over the lazy dog!']

--- pattern ---
['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

--- fingerprint ---
['brown dog fox jumps lazy over quick the']



### Standard tokenizers

In [3]:
# Text to analyze
text_to_analyze = "The quick brown fox jumps over the lazy dog!"

# Tokenizers to test
tokenizers = [
    {"tokenizer": "standard"},
    {"tokenizer": "letter"},
    {"tokenizer": "standard"},
    {"tokenizer": "whitespace"},
    {"tokenizer": "letter"},
    {"tokenizer": "ngram"},
    {"tokenizer": "edge_ngram"},
    {"tokenizer": "keyword"},
]


# Run analysis
for tokenizer in tokenizers:
    tokens = analyze(text_to_analyze, tokenizer)
    print(f"--- {tokenizer['tokenizer']} tokenizer ---")
    print(tokens)
    print()


--- standard tokenizer ---
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

--- letter tokenizer ---
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

--- standard tokenizer ---
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

--- whitespace tokenizer ---
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog!']

--- letter tokenizer ---
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

--- ngram tokenizer ---
['T', 'Th', 'h', 'he', 'e', 'e ', ' ', ' q', 'q', 'qu', 'u', 'ui', 'i', 'ic', 'c', 'ck', 'k', 'k ', ' ', ' b', 'b', 'br', 'r', 'ro', 'o', 'ow', 'w', 'wn', 'n', 'n ', ' ', ' f', 'f', 'fo', 'o', 'ox', 'x', 'x ', ' ', ' j', 'j', 'ju', 'u', 'um', 'm', 'mp', 'p', 'ps', 's', 's ', ' ', ' o', 'o', 'ov', 'v', 've', 'e', 'er', 'r', 'r ', ' ', ' t', 't', 'th', 'h', 'he', 'e', 'e ', ' ', ' l', 'l', 'la', 'a', 'az', 'z', 'zy', 'y', 'y ', ' ', ' d', 'd', 'do', 'o', 'og', 'g', 'g!', '!'

### Token filters

In [6]:
# Text to analyze
text_to_analyze = "The Running Café in München!"

# Token filters to test (with tokenizer)
token_filters = [
    {"tokenizer": "standard", "filter": ["lowercase"]},
    {"tokenizer": "standard", "filter": ["stop"]},
    {"tokenizer": "standard", "filter": ["asciifolding"]},
    {"tokenizer": "standard", "filter": ["stemmer"]},
    {"tokenizer": "standard", "filter": ["unique"]},
]

# Run analysis
for filter in token_filters:
    tokens = analyze(text_to_analyze, filter)
    print(f"--- Filters: {filter['filter'][0]} ---")
    print(tokens)
    print()


--- Filters: lowercase ---
['the', 'running', 'café', 'in', 'münchen']

--- Filters: stop ---
['The', 'Running', 'Café', 'München']

--- Filters: asciifolding ---
['The', 'Running', 'Cafe', 'in', 'Munchen']

--- Filters: stemmer ---
['The', 'Run', 'Café', 'in', 'München']

--- Filters: unique ---
['The', 'Running', 'Café', 'in', 'München']



### Stemming Example

In [8]:
# Text to analyze
text_to_analyze = "The &lt;b&gt;quick&lt;/b&gt; brown fox"

# Character filters to test
char_filters = [
    {"tokenizer": "standard", "char_filter": ["html_strip"]},
    {"tokenizer": "standard", "char_filter": ["mapping"]},
]

# Run analysis
for char_filter in char_filters:
    tokens = analyze(text_to_analyze, char_filter)
    print(f"--- Char Filter: {char_filter['char_filter']} ---")
    print(tokens)
    print()


--- Char Filter: ['html_strip'] ---
['The', 'b', 'quick', 'b', 'brown', 'fox']

Error with {'tokenizer': 'standard', 'char_filter': ['mapping'], 'text': 'The &lt;b&gt;quick&lt;/b&gt; brown fox'}: {"error":{"root_cause":[{"type":"illegal_argument_exception","reason":"Analysis settings required - can't instantiate analysis factory"}],"type":"illegal_argument_exception","reason":"Analysis settings required - can't instantiate analysis factory"},"status":400}
--- Char Filter: ['mapping'] ---
[]

