In [1]:
from pprint import pprint
from elasticsearch  import Elasticsearch, helpers
import json

# Open a client against the Elasticsearch node at localhost:9200, request the
# cluster info document, and pretty-print its body once the request returns.
es = Elasticsearch("http://localhost:9200")
client_info = es.info()
print("Connected to Elasticsearch")
pprint(client_info.body)

Connected to Elasticsearch
{'cluster_name': 'docker-cluster',
 'cluster_uuid': '4Xf3-_8JQxCX6HVz9r0yvA',
 'name': '5a0a9ef6b26a',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2024-08-05T10:05:34.233336849Z',
             'build_flavor': 'default',
             'build_hash': '1a77947f34deddb41af25e6f0ddb8e830159c179',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '9.11.1',
             'minimum_index_compatibility_version': '7.0.0',
             'minimum_wire_compatibility_version': '7.17.0',
             'number': '8.15.0'}}


In [2]:
# Create apache index
# Drop any previous "apache" index first so this cell can be re-run;
# ignore_unavailable=True keeps the delete from failing when it does not exist.
es.indices.delete(index="apache", ignore_unavailable=True)

# Shard layout requested for the index.
# NOTE(review): replica shards need additional nodes to be allocated; on a
# single-node cluster the index would presumably stay yellow — confirm.
apache_settings = {"index": {"number_of_shards": 3, "number_of_replicas": 2}}

# Explicit field types for the parsed log documents.
apache_mappings = {
    "properties": {
        "timestamp_iso": {"type": "date"},
        "timestamp_epoch": {"type": "long"},
        "log_level": {"type": "keyword"},
        "message": {"type": "text"},
        "raw_line": {"type": "text"},
    }
}

es.indices.create(index="apache", settings=apache_settings, mappings=apache_mappings)

print("Index created: apache")

Index created: apache


In [3]:
# Bulk Import the NDJSON file for the apache index
# NOTE(review): machine-specific absolute path — point this at your own copy
# of the bulk export before running the notebook elsewhere.
NDJSON_FILE = r"C:\Users\nikhi\loghub\Apache\apache_bulk.ndjson"

def generate_actions(path=None, index="apache"):
    """Yield one bulk-helper action per document in a bulk-format NDJSON file.

    The file is expected to contain line pairs:

        { "index": {} }        <- action metadata line
        { actual_document }    <- document line

    The metadata line is skipped without being parsed, because the target
    index is set on each yielded action instead. Blank lines anywhere in the
    file (including trailing ones) are ignored. The file is read lazily, one
    line at a time, rather than being loaded into memory in full.

    Args:
        path: NDJSON file to read. When None, the module-level NDJSON_FILE is
            used (looked up at call time).
        index: Name of the index written into each action's "_index".

    Yields:
        dict: {"_index": index, "_source": <parsed document>}.

    Raises:
        ValueError: If an action metadata line has no document line after it.
        json.JSONDecodeError: If a document line is not valid JSON.
    """
    source_path = NDJSON_FILE if path is None else path
    with open(source_path, "r", encoding="utf-8") as f:
        # Single shared iterator over the non-blank lines: the for-loop consumes
        # the metadata line and next() consumes the document line paired with it.
        payload_lines = (line for line in f if line.strip())
        for _meta_line in payload_lines:
            doc_line = next(payload_lines, None)
            if doc_line is None:
                raise ValueError(
                    f"{source_path}: action line without a following document line"
                )
            yield {"_index": index, "_source": json.loads(doc_line)}

# Stream every generated action to Elasticsearch through the bulk helper.
# With its default settings the helper raises when items fail to index, so the
# completion message below is only reached after a successful upload.
helpers.bulk(es, generate_actions())

# Make the new documents searchable right away. Without an explicit refresh, a
# search issued immediately after the upload (as happens under Run All) can
# miss documents that the periodic refresh has not yet made visible.
es.indices.refresh(index="apache")

print("Bulk upload completed!")


Bulk upload completed!


In [4]:
# Fetch up to 10 documents from the apache index using a match_all query.
resp = es.search(index="apache", query={"match_all": {}}, size=10)

# Leave the response as the cell's last expression so it is displayed.
resp


ObjectApiResponse({'took': 8, 'timed_out': False, '_shards': {'total': 3, 'successful': 3, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 2000, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'apache', '_id': 'W5HggJoBB2aHXSli2yZ3', '_score': 1.0, '_source': {'timestamp_epoch': 1133652069, 'timestamp_iso': '2005-12-04T04:51:09Z', 'log_level': 'notice', 'message': 'jk2_init() Found child 6726 in scoreboard slot 8', 'raw_line': '[Sun Dec 04 04:51:09 2005] [notice] jk2_init() Found child 6726 in scoreboard slot 8'}}, {'_index': 'apache', '_id': 'Y5HggJoBB2aHXSli2yZ3', '_score': 1.0, '_source': {'timestamp_epoch': 1133652097, 'timestamp_iso': '2005-12-04T04:51:37Z', 'log_level': 'notice', 'message': 'jk2_init() Found child 6736 in scoreboard slot 10', 'raw_line': '[Sun Dec 04 04:51:37 2005] [notice] jk2_init() Found child 6736 in scoreboard slot 10'}}, {'_index': 'apache', '_id': 'ZZHggJoBB2aHXSli2yZ3', '_score': 1.0, '_source': {'timestamp_epoch': 1133652098, 'timestamp_

In [5]:
# Fetch up to 20 documents whose log_level is exactly "notice" (term query).
resp = es.search(index="apache", query={"term": {"log_level": "notice"}}, size=20)

# Leave the response as the cell's last expression so it is displayed.
resp


ObjectApiResponse({'took': 47, 'timed_out': False, '_shards': {'total': 3, 'successful': 3, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 1405, 'relation': 'eq'}, 'max_score': 0.38484582, 'hits': [{'_index': 'apache', '_id': 'WJHggJoBB2aHXSli2yZ3', '_score': 0.38484582, '_source': {'timestamp_epoch': 1133651864, 'timestamp_iso': '2005-12-04T04:47:44Z', 'log_level': 'notice', 'message': 'workerEnv.init() ok /etc/httpd/conf/workers2.properties', 'raw_line': '[Sun Dec 04 04:47:44 2005] [notice] workerEnv.init() ok /etc/httpd/conf/workers2.properties'}}, {'_index': 'apache', '_id': 'WpHggJoBB2aHXSli2yZ3', '_score': 0.38484582, '_source': {'timestamp_epoch': 1133652068, 'timestamp_iso': '2005-12-04T04:51:08Z', 'log_level': 'notice', 'message': 'jk2_init() Found child 6725 in scoreboard slot 10', 'raw_line': '[Sun Dec 04 04:51:08 2005] [notice] jk2_init() Found child 6725 in scoreboard slot 10'}}, {'_index': 'apache', '_id': 'ZJHggJoBB2aHXSli2yZ3', '_score': 0.38484582, '_source': 