In [2]:
from pprint import pprint
from elasticsearch  import Elasticsearch, helpers
import json

# Open a client against the local Elasticsearch endpoint.
es = Elasticsearch(hosts="http://localhost:9200")

# info() performs a request to the cluster, so the confirmation below is only
# printed once that call has returned.
client_info = es.info()
print("Connected to Elasticsearch")
pprint(client_info.body)

Connected to Elasticsearch
{'cluster_name': 'docker-cluster',
 'cluster_uuid': '4Xf3-_8JQxCX6HVz9r0yvA',
 'name': '5a0a9ef6b26a',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2024-08-05T10:05:34.233336849Z',
             'build_flavor': 'default',
             'build_hash': '1a77947f34deddb41af25e6f0ddb8e830159c179',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '9.11.1',
             'minimum_index_compatibility_version': '7.0.0',
             'minimum_wire_compatibility_version': '7.17.0',
             'number': '8.15.0'}}


In [3]:
# (Re)create the thunderbird index.
# Any existing copy is deleted first; ignore_unavailable=True is passed so the
# delete call is also issued safely when the index is not there yet.
es.indices.delete(index="thunderbird", ignore_unavailable=True)

# NOTE(review): replica shards are not allocated on the node holding their
# primary, so with a single-node cluster 2 replicas would stay unassigned
# (yellow health) -- confirm the node count of the target cluster.
thunderbird_settings = {
    "index": {
        "number_of_shards": 3,
        "number_of_replicas": 2,
    }
}

# Explicit field mappings: keyword for exact-match fields, text for
# full-text search fields.
thunderbird_mappings = {
    "properties": {
        "timestamp_iso": {"type": "date"},
        "timestamp_epoch": {"type": "long"},
        "date_field": {"type": "keyword"},
        "host": {"type": "keyword"},
        "message": {"type": "text"},
        "raw_line": {"type": "text"},
    }
}

es.indices.create(
    index="thunderbird",
    settings=thunderbird_settings,
    mappings=thunderbird_mappings,
)

print("Index created: thunderbird")

Index created: thunderbird


In [4]:
#Bulk Import the NDJSON file for the thunderbird index
# NOTE(review): machine-specific absolute path -- adjust it, or pass `path` to
# generate_actions(), when running the notebook elsewhere.
NDJSON_FILE = r"C:\Users\nikhi\loghub\Thunderbird\thunderbird_bulk.ndjson"

def generate_actions(path=None, index="thunderbird"):
    """Yield one bulk action per document stored in an NDJSON bulk file.

    The file is expected to contain alternating lines::

        { "index": {} }        <- action metadata
        { actual_document }    <- document source

    The metadata line is parsed only to validate it; its contents are not
    used, and every document is addressed to ``index``.

    The file is streamed line by line instead of being loaded into memory,
    and blank lines (for example a trailing empty line) are skipped.

    Args:
        path: NDJSON file to read. Defaults to ``NDJSON_FILE``.
        index: Value placed in each action's ``_index`` key.

    Yields:
        dict: ``{"_index": index, "_source": <parsed document>}``.

    Raises:
        ValueError: If a metadata line has no document line after it
            (truncated file), or if a line is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        OSError: If the file cannot be opened.
    """
    if path is None:
        # Resolved at call time so a later reassignment of NDJSON_FILE is honoured.
        path = NDJSON_FILE

    with open(path, "r", encoding="utf-8") as f:
        # Lazily drop blank lines so they cannot shift the metadata/document pairing.
        content_lines = (line for line in f if line.strip())

        for meta_line in content_lines:
            json.loads(meta_line)  # validate the metadata line; value is unused

            # The document is the next non-blank line after its metadata.
            doc_line = next(content_lines, None)
            if doc_line is None:
                raise ValueError(
                    f"{path}: action metadata line without a following document line"
                )

            yield {
                "_index": index,
                "_source": json.loads(doc_line),
            }

# Feed the generated actions to Elasticsearch through the bulk helper.
helpers.bulk(client=es, actions=generate_actions())

print("Bulk upload completed!")

Bulk upload completed!


In [None]:
# Sample queries
# Fetch 10 documents; match_all applies no filtering.
res = es.search(
    index="thunderbird",
    size=10,
    query={"match_all": {}},
)
print(res["hits"]["hits"])

[{'_index': 'thunderbird', '_id': 'nzPXfJoB-KNr5FXDNoSd', '_score': 1.0, '_source': {'timestamp_epoch': '1131566461', 'timestamp_iso': '2005-11-09T20:01:01Z', 'date_field': '2005.11.09', 'host': 'dn228', 'message': 'Nov 9 12:01:01 dn228/dn228 crond[2916]: (root) CMD (run-parts /etc/cron.hourly)', 'raw_line': '- 1131566461 2005.11.09 dn228 Nov 9 12:01:01 dn228/dn228 crond[2916]: (root) CMD (run-parts /etc/cron.hourly)'}}, {'_index': 'thunderbird', '_id': 'oTPXfJoB-KNr5FXDNoSd', '_score': 1.0, '_source': {'timestamp_epoch': '1131566461', 'timestamp_iso': '2005-11-09T20:01:01Z', 'date_field': '2005.11.09', 'host': 'dn261', 'message': 'Nov 9 12:01:01 dn261/dn261 crond(pam_unix)[2907]: session opened for user root by (uid=0)', 'raw_line': '- 1131566461 2005.11.09 dn261 Nov 9 12:01:01 dn261/dn261 crond(pam_unix)[2907]: session opened for user root by (uid=0)'}}, {'_index': 'thunderbird', '_id': 'qDPXfJoB-KNr5FXDNoSd', '_score': 1.0, '_source': {'timestamp_epoch': '1131566461', 'timestamp_iso

In [7]:
# Find error messages: full-text match on the `message` field.
# The recorded output shows lower-case "error" in messages being matched by
# the upper-case search term.
res = es.search(index="thunderbird", query={"match": {"message": "ERROR"}})

# Print only the stored document of each hit, one per line.
for hit in res["hits"]["hits"]:
    print(hit["_source"])

{'timestamp_epoch': '1131567043', 'timestamp_iso': '2005-11-09T20:10:43Z', 'date_field': '2005.11.09', 'host': 'tbird-admin1', 'message': 'Nov 9 12:10:43 local@tbird-admin1 vesafb: probe of vesafb0 failed with error -6', 'raw_line': '- 1131567043 2005.11.09 tbird-admin1 Nov 9 12:10:43 local@tbird-admin1 vesafb: probe of vesafb0 failed with error -6'}
{'timestamp_epoch': '1131567055', 'timestamp_iso': '2005-11-09T20:10:55Z', 'date_field': '2005.11.09', 'host': 'tbird-admin1', 'message': 'Nov 9 12:10:55 local@tbird-admin1 sshd[1761]: error: Bind to port 22 on 0.0.0.0 failed: Address already in use.', 'raw_line': '- 1131567055 2005.11.09 tbird-admin1 Nov 9 12:10:55 local@tbird-admin1 sshd[1761]: error: Bind to port 22 on 0.0.0.0 failed: Address already in use.'}


In [8]:
# Filter by host: exact-value term query (`host` is mapped as keyword).
res = es.search(index="thunderbird", query={"term": {"host": "tbird-sm1"}})
print(res["hits"]["hits"])

[{'_index': 'thunderbird', '_id': '1DPXfJoB-KNr5FXDNoSe', '_score': 2.3912213, '_source': {'timestamp_epoch': '1131566470', 'timestamp_iso': '2005-11-09T20:01:10Z', 'date_field': '2005.11.09', 'host': 'tbird-sm1', 'message': 'Nov 9 12:01:10 src@tbird-sm1 ib_sm.x[24904]: [ib_sm_sweep.c:1831]: ********************** NEW SWEEP ********************', 'raw_line': '- 1131566470 2005.11.09 tbird-sm1 Nov 9 12:01:10 src@tbird-sm1 ib_sm.x[24904]: [ib_sm_sweep.c:1831]: ********************** NEW SWEEP ********************'}}, {'_index': 'thunderbird', '_id': '2zPXfJoB-KNr5FXDNoSe', '_score': 2.3912213, '_source': {'timestamp_epoch': '1131566474', 'timestamp_iso': '2005-11-09T20:01:14Z', 'date_field': '2005.11.09', 'host': 'tbird-sm1', 'message': 'Nov 9 12:01:14 src@tbird-sm1 ib_sm.x[24904]: [ib_sm_sweep.c:1455]: No topology change', 'raw_line': '- 1131566474 2005.11.09 tbird-sm1 Nov 9 12:01:14 src@tbird-sm1 ib_sm.x[24904]: [ib_sm_sweep.c:1455]: No topology change'}}, {'_index': 'thunderbird', '_i

In [9]:
# Search raw log text: phrase match against the `raw_line` field.
res = es.search(
    index="thunderbird",
    query={"match_phrase": {"raw_line": "Authentication failed"}},
)