## Field Data Types

In [1]:
from pprint import pprint
from elasticsearch import Elasticsearch
# Open a client against the local node, then confirm the connection by
# printing the cluster metadata returned by the info endpoint.
es = Elasticsearch("http://localhost:9200")
client_info = es.info()

print("Connected to Elastic Search")
pprint(client_info.body)

Connected to Elastic Search
{'cluster_name': 'docker-cluster',
 'cluster_uuid': 'WpOI-sfBSXe9aaVkWGIQnQ',
 'name': '61929d733ddf',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2025-12-16T10:09:08.849001802Z',
             'build_flavor': 'default',
             'build_hash': 'd8972a71dbbd64ff17f2f4dba9ca2c3fe09fb100',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '10.3.2',
             'minimum_index_compatibility_version': '8.0.0',
             'minimum_wire_compatibility_version': '8.19.0',
             'number': '9.2.3'}}


## 1. Common types
### 1.1 Binary


In [2]:
# Mapping with a single `binary` field that will hold the encoded image.
binary_mappings = {
    "properties": {
        "image_data": {"type": "binary"},
    }
}

# Drop any previous copy first so this cell can be re-run without errors.
es.indices.delete(index="binary_index", ignore_unavailable=True)
es.indices.create(index="binary_index", mappings=binary_mappings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'binary_index'})

In [3]:
import base64

image_path = "./images/field_data_types_docs.png"

# Only the read needs the open file handle; encoding happens after it closes.
with open(image_path, "rb") as image_file:
    image_bytes = image_file.read()

# Base64-encode the raw bytes and decode to a UTF-8 string for the
# `image_data` field.
image_base64 = base64.b64encode(image_bytes).decode("utf-8")

# Preview only the first 100 characters of the encoded payload.
image_base64[:100]

'iVBORw0KGgoAAAANSUhEUgAAB4AAAAJTCAYAAADpMAvgAAAABHNCSVQICAgIfAhkiAAAABl0RVh0U29mdHdhcmUAZ25vbWUtc2Ny'

In [4]:
# Number of characters in the Base64 string produced for the image.
len(image_base64)

271328

In [5]:
# Store the Base64 string under the `image_data` field of `binary_index`.
doc = {"image_data": image_base64}
es.index(index="binary_index", document=doc)

ObjectApiResponse({'_index': 'binary_index', '_id': 'I6sJ8JsBkibQBq1fh1sr', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

That's why we generally don't store images in Elasticsearch: the Base64 string for a single image is very long (271,328 characters here).

In [6]:
# One field per common scalar type: keyword, float, date and boolean.
common_type_mappings = {
    "properties": {
        "book_reference": {"type": "keyword"},
        "price": {"type": "float"},
        "publish_date": {"type": "date"},
        "is_available": {"type": "boolean"},
    }
}

# Drop any previous copy first so this cell can be re-run without errors.
es.indices.delete(
    index="other_common_data_types_index",
    ignore_unavailable=True,
)
es.indices.create(
    index="other_common_data_types_index",
    mappings=common_type_mappings,
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'other_common_data_types_index'})

In [7]:
# Sample book with one value for each field of the mapping.
document = dict(
    book_reference="978-1617294433",
    price=44.99,
    publish_date="2021-06-30",
    is_available=True,
)
response = es.index(index="other_common_data_types_index", body=document)
response.body

{'_index': 'other_common_data_types_index',
 '_id': 'JKsJ8JsBkibQBq1fi1vy',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 0,
 '_primary_term': 1}

## 2. Object datatypes

In [22]:
# `author` has no explicit "type": it is declared only through its own
# `properties`, each of which is a `text` field.
author_properties = {
    "first_name": {"type": "text"},
    "last_name": {"type": "text"},
}
object_mappings = {
    "properties": {
        "author": {"properties": author_properties},
    }
}

# Drop any previous copy first so this cell can be re-run without errors.
es.indices.delete(index="object_index", ignore_unavailable=True)
es.indices.create(index="object_index", mappings=object_mappings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'object_index'})

In [23]:
# Fetch the mapping of `object_index` for inspection.
mapping = es.indices.get_mapping(index='object_index')

In [24]:
# Show only the field definitions of the fetched mapping.
pprint(mapping['object_index']['mappings']['properties'])

{'author': {'properties': {'first_name': {'type': 'text'},
                           'last_name': {'type': 'text'}}}}


In [25]:
# A single document whose `author` field holds an array of two objects.
authors = [
    {"first_name": "John", "last_name": "Doe"},
    {"first_name": "Imad", "last_name": "Saddik"},
]
document = {"author": authors}

response = es.index(index="object_index", body=document)
response.body

{'_index': 'object_index',
 '_id': 'J6sM8JsBkibQBq1f2lsU',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 0,
 '_primary_term': 1}

In [26]:
# Search for first_name "Imad" AND last_name "Doe". These values belong to two
# different authors in the indexed document, so a hit here shows that the
# plain object mapping does not keep each author's fields together.
#
# `must` is a list of clauses: repeating the "match" key inside one dict
# literal keeps only the last entry, which would silently drop the
# first_name condition and turn this into a last_name-only search.
search_query = {
    "query": {
        "bool": {
            "must": [
                {"match": {"author.first_name": "Imad"}},
                {"match": {"author.last_name": "Doe"}},
            ]
        }
    }
}
response = es.search(index='object_index', body=search_query)
pprint(response.body['hits']['hits'])

[{'_id': 'J6sM8JsBkibQBq1f2lsU',
  '_index': 'object_index',
  '_score': 0.2876821,
  '_source': {'author': [{'first_name': 'John', 'last_name': 'Doe'},
                         {'first_name': 'Imad', 'last_name': 'Saddik'}]}}]


## Flattening of JSON Objects in Elasticsearch

Elasticsearch internally flattens JSON objects by transforming nested structures into dot-notation fields. For example, an object like:

```json
{
    "author": {
        "first_name": "John",
        "last_name": "Doe"
    }
}
```

Gets flattened to: `author.first_name: "John"` and `author.last_name: "Doe"`

### The Problem with Arrays of Objects

This flattening becomes problematic when dealing with **arrays of objects**. Consider:

```json
{
    "authors": [
        {"first_name": "John", "last_name": "Doe"},
        {"first_name": "Jane", "last_name": "Smith"}
    ]
}
```

Elasticsearch flattens this to:
- `authors.first_name: ["John", "Jane"]`
- `authors.last_name: ["Doe", "Smith"]`

**The relationship between first_name and last_name is lost!** A search for `first_name: "John" AND last_name: "Smith"` would incorrectly match this document, even though no author with that combination exists.

### Solution: Nested Datatype

The `nested` datatype preserves the independence of each object in an array by storing them as separate hidden documents, maintaining the relationship between fields within each object.

### Nested DataType

In [54]:
# Declare `author` as `nested`; no sub-fields are listed in this mapping.
nested_mappings = {
    "properties": {
        "author": {"type": "nested"},
    }
}

# Drop any previous copy first so this cell can be re-run without errors.
es.indices.delete(index="nested_object_index", ignore_unavailable=True)
es.indices.create(index="nested_object_index", mappings=nested_mappings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'nested_object_index'})

In [55]:
# Inspect the field definitions of `nested_object_index` before any document
# has been indexed.
mapping = es.indices.get_mapping(index='nested_object_index')
mapping['nested_object_index']['mappings']['properties']

{'author': {'type': 'nested'}}

In [56]:
# The same two-author document as before, now sent to the nested index.
authors = [
    {"first_name": "John", "last_name": "Doe"},
    {"first_name": "Imad", "last_name": "Saddik"},
]
document = {"author": authors}

response = es.index(index="nested_object_index", body=document)
response.body

{'_index': 'nested_object_index',
 '_id': 'LasV8JsBkibQBq1fzVsh',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 0,
 '_primary_term': 1}

In [57]:
# Fetch the mapping again now that a document has been indexed, to see the
# `author` sub-fields that were added to it.
mapping = es.indices.get_mapping(index='nested_object_index')
mapping['nested_object_index']['mappings']['properties']

{'author': {'type': 'nested',
  'properties': {'first_name': {'type': 'text',
    'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
   'last_name': {'type': 'text',
    'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}}}}

In [58]:
# Cross-object check on the nested index: "Imad" and "Doe" belong to two
# different authors, so no single author object satisfies both clauses and
# the search is expected to return no hits.
#
# `must` is a list of clauses: repeating the "match" key inside one dict
# literal keeps only the last entry and silently drops the first condition.
# The clauses are wrapped in a `nested` query on the `author` path because a
# plain bool query on these fields returns no hits for any names, which would
# make the empty result prove nothing about the nested mapping.
search_query = {
    "query": {
        "nested": {
            "path": "author",
            "query": {
                "bool": {
                    "must": [
                        {"match": {"author.first_name": "Imad"}},
                        {"match": {"author.last_name": "Doe"}},
                    ]
                }
            }
        }
    }
}
response = es.search(index='nested_object_index', body=search_query)
pprint(response.body['hits']['hits'])

[]


In [60]:
# Both clauses name the same author ("Imad Saddik"), and they are evaluated
# through a `nested` query on the `author` path.
same_author_clauses = [
    {"match": {"author.first_name": "Imad"}},
    {"match": {"author.last_name": "Saddik"}},
]
search_query = {
    "query": {
        "nested": {
            "path": "author",
            "query": {"bool": {"must": same_author_clauses}},
        }
    }
}

response = es.search(index="nested_object_index", body=search_query)
pprint(response.body["hits"]["hits"])

[{'_id': 'LasV8JsBkibQBq1fzVsh',
  '_index': 'nested_object_index',
  '_score': 1.3862942,
  '_source': {'author': [{'first_name': 'John', 'last_name': 'Doe'},
                         {'first_name': 'Imad', 'last_name': 'Saddik'}]}}]


## 3. Text Search Types

### 3.1 Text

In [62]:

# Mapping with a single `text` field for the email body.
text_mappings = {
    "properties": {
        "email_body": {"type": "text"},
    }
}

# Drop any previous copy first so this cell can be re-run without errors.
es.indices.delete(index="text_index", ignore_unavailable=True)
es.indices.create(index="text_index", mappings=text_mappings)

# Index one sample email and display the indexing response.
document = {"email_body": "Hello, this is a test email."}
response = es.index(index="text_index", body=document)
response.body

{'_index': 'text_index',
 '_id': 'L6sZ8JsBkibQBq1f6lvf',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 0,
 '_primary_term': 1}

### 3.2 Completion

In [63]:

# Mapping with a single `completion` field named `suggest`.
completion_mappings = {
    "properties": {
        "suggest": {"type": "completion"},
    }
}

# Drop any previous copy first so this cell can be re-run without errors.
es.indices.delete(index="text_completion_index", ignore_unavailable=True)
es.indices.create(index="text_completion_index", mappings=completion_mappings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'text_completion_index'})

In [64]:
# Each document lists its candidate strings under `suggest.input`.
document_1 = {"suggest": {"input": ["Mars", "Planet"]}}
document_2 = {"suggest": {"input": ["Andromeda", "Galaxy"]}}

# Index both; only the response of the last call is displayed by the cell.
es.index(index="text_completion_index", body=document_1)
es.index(index="text_completion_index", body=document_2)

ObjectApiResponse({'_index': 'text_completion_index', '_id': 'Masa8JsBkibQBq1f9lvX', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1})

In [71]:
# Run a suggester named `suggest_planets` on the `suggest` completion field
# with the input text "Ma".
search_query = {
    "suggest_planets": {
        "text": "Ma",
        "completion": {"field": "suggest"},
    }
}

# The suggester definition is sent under the top-level "suggest" key.
suggest_body = {"suggest": search_query}
response = es.search(index="text_completion_index", body=suggest_body)
pprint(response.body["suggest"])

{'suggest_planets': [{'length': 2,
                      'offset': 0,
                      'options': [{'_id': 'MKsa8JsBkibQBq1f9lvH',
                                   '_index': 'text_completion_index',
                                   '_score': 1.0,
                                   '_source': {'suggest': {'input': ['Mars',
                                                                     'Planet']}},
                                   'text': 'Mars'}],
                      'text': 'Ma'}]}


**TODO:** I didn't understand this clearly yet — do some more research on the completion suggester.