In [44]:
import json
import os
import sys
import requests
from tqdm import tqdm

In [54]:
# Time window applied to every index query below (ISO-8601 timestamps, "Z" suffix).
QUERY_WINDOW_START = "2025-01-02T00:00:00.000Z"
QUERY_WINDOW_END = "2025-01-03T00:00:00.000Z"

# Free-text filter that is applied to the metricbeat indices only.
METRICBEAT_QUERY_STRING = "cee25daa-3fd9-441b-af33-8211e3649f3e"


def _timestamp_range(gte=QUERY_WINDOW_START, lte=QUERY_WINDOW_END):
    """Return a fresh ``range`` clause restricting ``@timestamp`` to [gte, lte].

    A new dict is built on every call so that the per-index queries never
    share (and accidentally co-mutate) the same nested objects.
    """
    return {
        "range": {
            "@timestamp": {
                "gte": gte,
                "lte": lte,
                "format": "strict_date_optional_time"
            }
        }
    }


def _bool_must(*clauses):
    """Wrap the given clauses, in order, in a ``bool``/``must`` query."""
    return {"bool": {"must": list(clauses)}}


# Query body per index prefix. Every index is limited to the time window;
# metricbeat additionally requires the query string to match.
queries = {
    "kong-access-": _bool_must(_timestamp_range()),
    "metricbeat-": _bool_must(
        {
            "query_string": {
                "query": METRICBEAT_QUERY_STRING,
                "default_operator": "AND"
            }
        },
        _timestamp_range(),
    ),
    "traces-apm": _bool_must(_timestamp_range()),
    "apm-": _bool_must(_timestamp_range()),
    "logs-apm": _bool_must(_timestamp_range()),
    "metrics-apm": _bool_must(_timestamp_range()),
}

In [57]:
# Search endpoint template; "{index}*" targets every index sharing the prefix.
base_url = "https://116.101.122.180:5200/{index}*/_search"

# The API key is taken from the ES_API_KEY environment variable so that it does
# not have to be stored in the notebook. When the variable is unset the header
# value is the bare "ApiKey" scheme.
headers = {
    "Authorization": f"ApiKey {os.environ.get('ES_API_KEY', '')}".strip(),
    "Content-Type": "application/json"
    }

index_arr = ["kong-access-", "metricbeat-", "traces-apm", "apm-", "logs-apm", "metrics-apm"]

# One "<index>.json" dump is written per index prefix; create the target
# directory up front so the first write cannot fail on a fresh checkout.
output_dir = "./logs"
os.makedirs(output_dir, exist_ok=True)

for index in tqdm(index_arr):
    url = base_url.format(index=index)
    # Only the first 500 hits (ascending @timestamp) are requested per index.
    search_body = {
        "from": 0,
        "size": 500,
        "query": queries[index],
        "sort": [
            {
                "@timestamp": {
                    "order": "asc"
                }
            }
        ]
    }
    # NOTE(review): verify=False disables TLS certificate verification; kept
    # as in the original, but supply a CA bundle here if one is available.
    response = requests.get(
        url, headers=headers,
        json=search_body, verify=False,
        timeout=60,  # do not hang forever on an unreachable cluster
    )
    # Fail loudly on 4xx/5xx instead of saving an error payload as if it were data.
    response.raise_for_status()
    data = response.json()
    with open(os.path.join(output_dir, f"{index}.json"), "w") as json_file:
        json.dump(data, json_file, indent=4)

100%|██████████| 6/6 [00:40<00:00,  6.74s/it]


In [19]:

# LOGS

import pandas as pd
import json

# Load the saved search response for the logs-apm indices.
# NOTE(review): absolute, machine-specific path - consider deriving it from a
# configurable base directory.
with open('/Users/longcaca/Downloads/example/ETL-Flow-DataCentric/logs/logs-apm.json', 'r') as file:
    data = json.load(file)

# One flat record per hit.
extracted_data = []

for hit in data['hits']['hits']:
    source = hit['_source']
    # "or" guards cover both a missing key and an explicit null / empty list,
    # either of which would otherwise raise on the lookups below.
    error_info = source.get('error') or {}
    exceptions = error_info.get('exception') or [{}]
    # Only the first exception entry is reported.
    first_exception = exceptions[0]

    record = {
        'Timestamp': source.get('@timestamp', None),
        'Message': source.get('message', ''),
        'Error code': first_exception.get('type', ''),      # type of the first exception
        'Error cause': first_exception.get('message', '')   # message of the first exception
    }
    extracted_data.append(record)

# Create a DataFrame from the extracted data
df = pd.DataFrame(extracted_data)

# Print the first 10 rows of the DataFrame
print(df.head(10))
df.to_csv('/Users/longcaca/Downloads/example/ETL-Flow-DataCentric/logs.csv' , index= False)

                  Timestamp  \
0  2025-01-02T00:00:10.788Z   
1  2025-01-02T00:00:14.668Z   
2  2025-01-02T00:00:16.213Z   
3  2025-01-02T00:00:21.259Z   
4  2025-01-02T00:00:33.765Z   
5  2025-01-02T00:01:15.794Z   
6  2025-01-02T00:01:19.674Z   
7  2025-01-02T00:01:21.216Z   
8  2025-01-02T00:01:26.265Z   
9  2025-01-02T00:02:20.801Z   

                                             Message  \
0  [s1|connecting...] Protocol initialization req...   
1  [s1|connecting...] Protocol initialization req...   
2  [s0|connecting...] Protocol initialization req...   
3  [s0|connecting...] Protocol initialization req...   
4  unexpected end of stream on https://open.camer...   
5  [s1|connecting...] Protocol initialization req...   
6  [s1|connecting...] Protocol initialization req...   
7  [s0|connecting...] Protocol initialization req...   
8  [s0|connecting...] Protocol initialization req...   
9  [s1|connecting...] Protocol initialization req...   

                                         

In [23]:
# TRACES
import pandas as pd
import json

# Load the saved search response for the traces-apm indices.
# NOTE(review): absolute, machine-specific path - consider deriving it from a
# configurable base directory.
with open('/Users/longcaca/Downloads/example/ETL-Flow-DataCentric/logs/traces-apm.json', 'r') as file:
    data = json.load(file)

# One flat record per hit.
extracted_data = []

for hit in data['hits']['hits']:
    source = hit['_source']

    # A hit may carry a transaction, a span, both, or neither; missing parts
    # fall back to '' / 0 in the record.
    transaction = source.get('transaction', {})
    span = source.get('span', {})

    # Value of the "us" sub-field, stored as-is (no unit conversion is done).
    duration_us = transaction.get('duration', {}).get('us', 0)  # Default to 0 if not found

    record = {
        'Timestamp': source.get('@timestamp', None),

        'transaction_name': transaction.get('name', ''),
        'transaction_duration': duration_us,
        'transaction_id': transaction.get('id', ''),
        'transaction_type': transaction.get('type', ''),

        'span_name': span.get('name', ''),
        'span_duration': span.get('duration', {}).get('us', 0),
        'span_subtype': span.get('subtype', ''),
        'span_id': span.get('id', ''),
        # Previously this entry reused the 'span_id' key, so the span type
        # silently replaced the span id and no type column was produced.
        'span_type': span.get('type', ''),
    }

    extracted_data.append(record)

# Create a DataFrame from the extracted data
df = pd.DataFrame(extracted_data)

# Print the first 10 rows of the DataFrame
print(df.head(10))
df.to_csv('/Users/longcaca/Downloads/example/ETL-Flow-DataCentric/traces.csv'  ,  index = False )



                  Timestamp         transaction_name  transaction_duration  \
0  2025-01-02T00:00:00.092Z             ping dummydb                   624   
1  2025-01-02T00:00:01.219Z             ping dummydb                   499   
2  2025-01-02T00:00:01.290Z  GET /actuator/health/**                  1173   
3  2025-01-02T00:00:01.290Z  GET /actuator/health/**                  1150   
4  2025-01-02T00:00:02.004Z                      GET                  1659   
5  2025-01-02T00:00:02.004Z                                              0   
6  2025-01-02T00:00:02.126Z                     POST                  1144   
7  2025-01-02T00:00:02.472Z             ping dummydb                   392   
8  2025-01-02T00:00:03.403Z                     POST                  1456   
9  2025-01-02T00:00:03.869Z                      GET                  1243   

     transaction_id transaction_type     span_name  span_duration  \
0  58265d8827bec8c9          unknown                            0   
1  

In [13]:
# METRICS
import pandas as pd
import json


def _get_field(doc, dotted_key, default=None):
    """Return the value stored under ``dotted_key`` in ``doc``.

    The key is tried first as a literal top-level key (e.g.
    ``doc['jvm.memory.used']``) and then as a path through nested dicts
    (``doc['jvm']['memory']['used']``). ``default`` is returned when neither
    form is present. A flat-only lookup can never match documents that store
    these fields as nested objects.
    """
    if not isinstance(doc, dict):
        return default
    if dotted_key in doc:
        return doc[dotted_key]
    node = doc
    for part in dotted_key.split('.'):
        if not isinstance(node, dict) or part not in node:
            return default
        node = node[part]
    return node


# (output column, source field) pairs, in output column order.
# NOTE(review): the field names are unverified guesses carried over from the
# original cell - confirm them against the actual documents.
METRIC_COLUMNS = [
    # CPU
    ('System CPU Usage', 'system.cpu.usage'),
    ('Process CPU Usage', 'process.cpu.usage'),
    ('System CPU Count', 'system.cpu.count'),
    ('jvm_system_cpu_load_1m', 'process.runtime.jvm.system.cpu.load_1m'),
    ('jvm_cpu_utilization', 'process.runtime.jvm.cpu.utilization'),
    ('jvm_system_cpu_utilization', 'process.runtime.jvm.system.cpu.utilization'),
    # Memory
    ('jvm.memory.committed', 'jvm.memory.committed'),
    ('jvm.memory.max', 'jvm.memory.max'),
    ('jvm.memory.used', 'jvm.memory.used'),
    ('jvm.buffer.memory.used', 'jvm.buffer.memory.used'),
    ('jvm.memory.usage.after.gc', 'jvm.memory.usage.after.gc'),
    ('jvm.gc.memory.allocated', 'jvm.gc.memory.allocated'),
    ('jvm.gc.memory.promoted', 'jvm.gc.memory.promoted'),
    ('process.runtime.jvm.memory.init', 'process.runtime.jvm.memory.init'),
    ('process.runtime.jvm.memory.limit', 'process.runtime.jvm.memory.limit'),
    ('process.runtime.jvm.memory.usage', 'process.runtime.jvm.memory.usage'),
    ('process.runtime.jvm.memory.committed', 'process.runtime.jvm.memory.committed'),
    ('process.runtime.jvm.memory.usage_after_last_gc', 'process.runtime.jvm.memory.usage_after_last_gc'),
    ('system.memory.utilization', 'system.memory.utilization'),
    ('system.memory.usage', 'system.memory.usage'),
]

# Load the saved search response for the metrics-apm indices.
# NOTE(review): absolute, machine-specific path - consider deriving it from a
# configurable base directory.
with open('/Users/longcaca/Downloads/example/ETL-Flow-DataCentric/logs/metrics-apm.json', 'r') as file:
    data = json.load(file)

hits = data['hits']['hits']

# One flat record per hit.
extracted_data = []

for hit in hits:
    source = hit['_source']
    transaction = source.get('transaction') or {}

    # "or" guards: a missing, null or empty histogram yields None rather than
    # raising IndexError / AttributeError.
    histogram = _get_field(transaction, 'duration.histogram') or {}
    histogram_values = histogram.get('values') or [None]

    record = {'Timestamp': source.get('@timestamp', None)}
    for column, field in METRIC_COLUMNS:
        record[column] = _get_field(source, field)

    # First histogram value is used as the latency figure.
    record['Latency'] = histogram_values[0]
    # NOTE(review): this stores transaction.result verbatim; it is not a rate.
    record['Error Rate'] = transaction.get('result', None)
    record['Number of Requests'] = source.get('_doc_count', 0)

    extracted_data.append(record)

# Create a DataFrame from the extracted data
df = pd.DataFrame(extracted_data)

# Display the DataFrame
print(df)
df.to_csv('/Users/longcaca/Downloads/example/ETL-Flow-DataCentric/metrics.csv'  ,  index = False)


                    Timestamp  System CPU Usage  Process CPU Usage  \
0    2025-01-02T00:00:00.000Z               NaN                NaN   
1    2025-01-02T00:00:00.000Z               NaN                NaN   
2    2025-01-02T00:00:00.000Z               NaN                NaN   
3    2025-01-02T00:00:00.000Z               NaN                NaN   
4    2025-01-02T00:00:00.000Z               NaN                NaN   
..                        ...               ...                ...   
495  2025-01-02T00:00:06.006Z               NaN                NaN   
496  2025-01-02T00:00:06.006Z               NaN                NaN   
497  2025-01-02T00:00:06.006Z               NaN                NaN   
498  2025-01-02T00:00:06.006Z               NaN                NaN   
499  2025-01-02T00:00:06.006Z               NaN                NaN   

     System CPU Count  jvm_system_cpu_load_1m  jvm_cpu_utilization  \
0                 NaN                     NaN                  NaN   
1                 N

In [27]:
from urllib import request

# Connectivity smoke test: fetch a public page and show its HTTP status code.
# The context manager closes the response, and the timeout keeps the cell from
# hanging indefinitely when the network is unreachable.
with request.urlopen('https://google.com', timeout=10) as response:
    status_code = response.getcode()
status_code







200