In [5]:
import os

class DataLoader:
    def __init__(self, file_paths, chunk_size=1000):
        """
        Initializes the DataLoader for multiple log files.

        :param file_paths: List of log file paths to process.
        :param chunk_size: The number of log lines to yield in each chunk.
        """
        self.file_paths = file_paths
        self.chunk_size = chunk_size
        self.files = [open(file_path, 'r') for file_path in file_paths]  # Open all files
        self.current_file_idx = 0
        self.current_file = self.files[self.current_file_idx]

    def __iter__(self):
        """
        Returns an iterator to process the log files in chunks.
        """
        return self

    def __next__(self):
        """
        Reads the next chunk of log lines from the current file.

        :return: A list of log lines as a chunk.
        """
        lines = []
        while len(lines) < self.chunk_size:
            line = self.current_file.readline()
            if not line:  # If end of file is reached, move to next file
                self.current_file_idx += 1
                if self.current_file_idx >= len(self.files):
                    # If we've processed all files, stop iteration
                    raise StopIteration
                self.current_file = self.files[self.current_file_idx]
                continue
            lines.append(line.strip())  # Strip newlines and whitespace from each line
        return self.current_file_idx, lines

    def close(self):
        """
        Close all open files when finished.
        """
        for file in self.files:
            file.close()



In [None]:
# Assume that DataLoader class has been defined as above.

# Step 1: Define the log file paths
log_dir = "logs"

log_file_paths = [os.path.join(log_dir, file) for file in os.listdir(log_dir) if file.endswith('.log')]
print(log_file_paths)

# Step 2: Initialize the DataLoader with the log file paths and chunk size
train_data_loader = DataLoader(log_file_paths, chunk_size=1000)
test_data_loader = DataLoader(log_file_paths, chunk_size=1000)

# # Step 3: Iterate over the data_loader to process the files in chunks
# try:
#     for current_file_idx, chunk in data_loader:
#         print(f"Processing chunk of size {len(chunk)} for the file {current_file_idx}:")
#         # Here you can process each chunk of log lines
#         for line in chunk:
#             print(line)  # Or, do some further processing on each log line
#         break  # Remove this line to process all chunks

# except StopIteration:
#     print("All log files have been processed.")

# # Step 4: Close the DataLoader after processing is complete
# data_loader.close()



In [7]:
import json
import logging
import time
import math
import sys
from collections import defaultdict
from drain3 import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig
from drain3.file_persistence import FilePersistence

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s')


class MultiLevelDrainTrainer:
    def __init__(self, train_data_loader, test_data_loader, base_config_path, levels, trained_models_dir):
        """
        Initialize the MultiLevelDrainTrainer.

        :param data_loader: An iterable DataLoader to read log lines.
        :param base_config_path: Path to the base configuration file for Drain.
        :param levels: List of dictionaries defining hyper-parameters for each level.
        """
        self.train_data_loader = train_data_loader
        self.test_data_loader = test_data_loader
        self.base_config_path = base_config_path
        self.levels = levels
        self.models = []
        self.statistics = []  # Store statistics for each level
        self.trained_models_dir = trained_models_dir

    def _load_config(self, level_params):
        """
        Load the base configuration and apply level-specific hyper-parameters.

        :param level_params: Dictionary of hyper-parameters for this level.
        :return: A configured TemplateMinerConfig instance.
        """
        config = TemplateMinerConfig()
        config.load(self.base_config_path)
        for param, value in level_params.items():
            setattr(config, param, value)
        return config

    def train_level(self, level_idx, config):
        """
        Train a single level of Drain with the given configuration.

        :param level_idx: Index of the level being trained.
        :param config: Configured TemplateMinerConfig instance.
        :return: A trained TemplateMiner instance.
        """
        logger.info(f"Training Drain model for Level {level_idx} with config: {config.__dict__}")
        persistence = FilePersistence(os.path.join(self.trained_models_dir, f"state_level_{level_idx}.bin"))
        template_miner = TemplateMiner(persistence, config)

        # for file_idx, chunk in self.train_data_loader:
        #     for line in chunk:
        #         line = line.strip()
        #         template_miner.add_log_message(line)
                # masked_message = template_miner.masker.mask(line)
                # tokens = template_miner.drain.get_content_as_tokens(masked_message)
                # print(f"Original: {line}")
                # print(f"Masked: {" ".join(tokens)}")

        return template_miner

    def collect_statistics(self, level_idx, model):
        """
        Collect frequency and rarity statistics for templates after training is complete.

        :param level_idx: Index of the level being analyzed.
        :param model: Trained TemplateMiner instance.
        :return: A list of statistics for each template.
        """
        logger.info(f"Collecting statistics for Level {level_idx}...")
        stats = {}
        total_lines = 0
        template_id_template_map = {}
        total_files = set()
        # Reprocess training data to collect statistics
        for file_idx, chunk in self.test_data_loader:
            total_files.add(file_idx)
            if file_idx not in stats:
                stats[file_idx] = {}

            for line in chunk:
                line = line.strip()
                cluster = model.match(line)
                if cluster is None:
                    continue
                else:
                    cluster_id = cluster.cluster_id
                    template = cluster.get_template()
                    if cluster_id not in template_id_template_map:
                        template_id_template_map[cluster_id] = {"template": template, "count": 0, "files": set()}
                    template_id_template_map[cluster_id]["count"] += 1
                    template_id_template_map[cluster_id]["files"].add(file_idx)
                    if template not in stats[file_idx]:
                        stats[file_idx][template] = 0
                    stats[file_idx][template] += 1
                total_lines += 1

        for template_id, template_stats in template_id_template_map.items():
            try:
                template = template_stats["template"]
                template_stats["occurrence_ratio"] = len(template_stats["files"]) / len(total_files)
                template_stats["average_frequency_per_file"] = template_stats["count"] / len(total_files)

                # Compute Log-Likelihood Ratio (LLR)
                p_obs = template_stats["count"] / total_lines
                p_exp = 1 / len(template_id_template_map)
                template_stats["log_likelihood_ratio"] = template_stats["count"] * math.log(p_obs / p_exp) if p_obs > 0 else 0

                probabilities = []
                for idx in stats:
                    if stats[idx][template]:
                        probabilities.append(stats[idx][template])
                sum_probabilities = sum(probabilities)
                probabilities = [p / sum_probabilities for p in probabilities]
                entropy = -sum(p * math.log(p) for p in probabilities if p > 0)
                template_stats["entropy"] = entropy

            except:
                continue

        self.statistics.append({"level": level_idx, "stats": template_id_template_map})
        return stats

    def train(self):
        """
        Train the Drain models for all levels and collect statistics.
        """
        for level_idx, level_params in enumerate(self.levels, start=1):
            config = self._load_config(level_params)
            model = self.train_level(level_idx, config)
            self.models.append(model)
            self.collect_statistics(level_idx, model)

    def summarize_models(self):
        """
        Summarize the clusters for all trained models.
        """
        for level_idx, model in enumerate(self.models, start=1):
            logger.info(f"--- Summary for Level {level_idx} ---")
            sorted_clusters = sorted(model.drain.clusters, key=lambda c: c.size, reverse=True)
            for cluster in sorted_clusters[:10]:  # Show top 10 clusters
                logger.info(cluster)
            print(f"Prefix Tree for Level {level_idx}:")
            model.drain.print_tree()
            model.profiler.report(0)


In [None]:
# from dataloader import DataLoader

base_config_path = "drain3.ini"
trained_models_dir = "trained_models"
os.makedirs(trained_models_dir, exist_ok=True)
levels = [
    # {"max_depth": 4, "max_children": 100},  # Coarse-grained
    # {"max_depth": 6, "max_children": 50},   # Medium-grained
    {"depth": 8, "sim_th": 0.4}    # Fine-grained
]

# Train Multi-Level Drain
trainer = MultiLevelDrainTrainer(train_data_loader, test_data_loader, base_config_path, levels, trained_models_dir)
trainer.train()

# Summarize Results
# trainer.summarize_models()


In [None]:
import csv

# Save statistics to a CSV file
csv_file_path = "statistics.csv"
with open(csv_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    # Write the header
    writer.writerow(["level", "template_id", "template_text", "occurrence_ratio", "average_frequency_per_file", "log_likelihood_ratio", "entropy"])
    
    # Write the data
    for level_stats in trainer.statistics:
        level = level_stats["level"]
        template_id_template_map = level_stats["stats"]
        for template_id, template_stats in template_id_template_map.items():
            writer.writerow([
                level,
                template_id,
                template_stats["template"],
                template_stats["occurrence_ratio"],
                template_stats["average_frequency_per_file"],
                template_stats["log_likelihood_ratio"],
                template_stats["entropy"]
            ])

In [None]:
for level_stats in trainer.statistics:
    level = level_stats["level"]
    template_id_template_map = level_stats["stats"]
    for template_id, template_stats in template_id_template_map.items():
        template_stats["files"] = list(template_stats["files"])

In [None]:
# Specify the file path to save the JSON data
file_path = "statistics.json"

# Save the statistics to a JSON file
with open(file_path, 'w') as json_file:
    json.dump(trainer.statistics, json_file, indent=4)

In [None]:
def classifier(template_stats, occurrence_threshold = 0.1, frequency_threshold = 10, log_likelihood_threshold = 0, entropy_threshold = 0.5):
    """
    Classifies based on the given parameters and thresholds.
    
    Args:
    - template_stats (dict): A dictionary containing the following keys:
        - occurrence_ratio (float)
        - average_frequency_per_file (float)
        - log_likelihood_ratio (float)
        - entropy (float)
    - occurrence_threshold (float): Threshold for occurrence ratio.
    - frequency_threshold (float): Threshold for average frequency per file.
    - log_likelihood_threshold (float): Threshold for log likelihood ratio.
    - entropy_threshold (float): Threshold for entropy.

    Returns:
    - bool: True if the conditions are met, False otherwise.
    """
    
    # Extract values from template_stats
    occurrence_ratio = template_stats["occurrence_ratio"]
    average_frequency_per_file = template_stats["average_frequency_per_file"]
    log_likelihood_ratio = template_stats["log_likelihood_ratio"]
    entropy = template_stats["entropy"]
    
    # Check the conditions: occurrence_ratio, average_frequency_per_file, log_likelihood_ratio should be lower
    # entropy should be higher
    # if (occurrence_ratio < occurrence_threshold and 
    #     average_frequency_per_file < frequency_threshold and 
    #     log_likelihood_ratio < log_likelihood_threshold and 
    #     entropy > entropy_threshold):
    #     return True
    # else:
    #     return False
    
    if (average_frequency_per_file < frequency_threshold):
        return True
    else:
        return False
    
# Define the thresholds for classification
occurrence_threshold = 0.1
frequency_threshold = 10
log_likelihood_threshold = 0
entropy_threshold = 0.5

anomaly_markings = {}

for level_stats in trainer.statistics:
    level = level_stats["level"]
    template_id_template_map = level_stats["stats"]
    for template_id, template_stats in template_id_template_map.items():
        template_stats["is_anomaly"] = classifier(template_stats, occurrence_threshold, frequency_threshold, log_likelihood_threshold, entropy_threshold)
        anomaly_markings[template_stats["template"]] = template_stats["is_anomaly"]

# Specify the file path to save the JSON data
file_path = "anomaly_markings.json"
# Save the statistics to a JSON file
with open(file_path, 'w') as json_file:
    json.dump(anomaly_markings, json_file, indent=4)

In [None]:
# file_path = "statistics.json"

# # Load the statistics from the JSON file
# with open(file_path, 'r') as json_file:
#     trainer.statistics = json.load(json_file)

In [30]:
import sqlite3
import json
import re
from datetime import datetime
import os

base_config_path = "drain3.ini"
trained_models_dir = "trained_models"
level_idx = 1
log_dir = "logs"


# Step 1: Create a DataLoader instance
log_file_paths = [os.path.join(log_dir, file) for file in os.listdir(log_dir) if file.endswith('.log')]
data_loader = DataLoader(log_file_paths, chunk_size=1000)

# Step 2: Read the anomaly_markings.json file
with open('anomaly_markings.json', 'r') as f:
    anomaly_markings = json.load(f)

# Step 3: Get the cluster templates using the Drain model
config = TemplateMinerConfig()
config.load(base_config_path)

persistence = FilePersistence(os.path.join(trained_models_dir, f"state_level_{level_idx}.bin"))
template_miner = TemplateMiner(persistence, config)

# Step 4: Check if the template is an anomaly using the anomaly markings
def is_anomaly(template):
    return anomaly_markings.get(template, False)

# Step 5: Split the log line into pieces
def split_log_line(line):
    pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}),\s+(Info|Error|Warning|Debug)\s+(\w+)\s+(.*)'
    match = re.match(pattern, line)
    if match:
        timestamp, log_level, component, message = match.groups()
        return timestamp, log_level, component, message
    else:
        return None, None, None, None

# Step 6: Store anomalies in SQLite3 database
conn = sqlite3.connect('anomalies.db')
cursor = conn.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS anomalies (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        timestamp TEXT,
        log_level TEXT,
        component TEXT,
        message TEXT,
        template TEXT,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        log_line TEXT,
        incident_time TIMESTAMP,
        cpu_usage REAL,
        memory_usage REAL,
        criticality INTEGER
    )
''')

start_time = time.time()

line_count = 0
data_to_insert = []
prev_timestamp = None
for file_idx, chunk in data_loader:
    for line in chunk:
        line_count += 1
        line = line.strip()
        cluster = template_miner.match(line)
        if cluster is not None:
            template = cluster.get_template()
            if "error" in line or "Error" in line or "ERROR" in line or is_anomaly(template):
                if is_anomaly(template):
                    critical = 2
                else:
                    critical = 1
                timestamp, log_level, component, message = split_log_line(line)
                # Convert timestamp to a datetime object
                if timestamp:
                    incident_time = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
                    prev_timestamp = timestamp
                else:
                    timestamp = prev_timestamp
                    message = line

                # Filter from the DB if needed (example query)
                # cursor.execute('SELECT * FROM anomalies WHERE incident_time > ?', (incident_time,))
                # if incident_time:
                data_to_insert.append((timestamp, log_level, component, message, template, line, incident_time, critical))

end_time = time.time()

# Insert all data at once
cursor.executemany('''
    INSERT INTO anomalies (timestamp, log_level, component, message, template, log_line, incident_time, criticality)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?)
''', data_to_insert)

print(f"Time taken to process the log files: {end_time - start_time:.2f} seconds\nNumber of lines processed: {line_count}")
print(f"Processing speed: {line_count / (end_time - start_time):.2f} lines per second")
conn.commit()
conn.close()

Starting Drain3 template miner
Checking for saved state
Restored 2584 clusters built from 9949358 messages


  cursor.executemany('''


Time taken to process the log files: 952.66 seconds
Number of lines processed: 10000000
Processing speed: 10496.94 lines per second


In [None]:
import sqlite3
import json

# Connect to the SQLite database
conn = sqlite3.connect('anomalies.db')
cursor = conn.cursor()

# Query to fetch the required data
query = '''
    SELECT timestamp, message, template, criticality, cpu_usage, memory_usage
    FROM anomalies
'''

# Execute the query
cursor.execute(query)
rows = cursor.fetchall()

# Convert the data to a list of dictionaries
data = []
for row in rows:
    data.append({
        "timestamp": row[0],
        "message": row[1],
        "template": row[2],
        "criticality": row[3],
        "cpu_usage": row[4],
        "memory_usage": row[5]
    })


filtered_data = []
template_count = {}

for entry in data:
    template = entry["template"]
    criticality = entry["criticality"]
    
    if criticality == 1:
        if template not in template_count:
            template_count[template] = 0
        if template_count[template] < 5:
            filtered_data.append(entry)
            template_count[template] += 1
    else:
        filtered_data.append(entry)

data = filtered_data

# Convert the list of dictionaries to JSON
json_data = json.dumps(data, indent=4)

# Close the database connection
conn.close()

# Print the JSON data
print(len(data))

# Specify the file path to save the JSON data
file_path = "metric_data.json"
# Save the statistics to a JSON file
with open(file_path, 'w') as json_file:
    json.dump(data, json_file, indent=4)

In [1]:
import sqlite3
import json

# Load the updated metric data from the JSON file
with open('updated_metric_data.json', 'r') as json_file:
    updated_data = json.load(json_file)

# Connect to the SQLite database
conn = sqlite3.connect('anomalies.db')
cursor = conn.cursor()

# Update the database for criticality 2 based on timestamp and message
for entry in updated_data:
    if entry["criticality"] == 2:
        cursor.execute('''
            UPDATE anomalies
            SET cpu_usage = ?, memory_usage = ?
            WHERE timestamp = ? AND message = ? AND criticality = 2
        ''', (entry["cpu_usage"], entry["memory_usage"], entry["timestamp"], entry["message"]))

# Calculate average cpu_usage and memory_usage for criticality 1 by template
template_stats = {}
for entry in updated_data:
    if entry["criticality"] == 1:
        template = entry["template"]
        if template not in template_stats:
            template_stats[template] = {"cpu_usage": 0, "memory_usage": 0, "count": 0}
        template_stats[template]["cpu_usage"] += entry["cpu_usage"]
        template_stats[template]["memory_usage"] += entry["memory_usage"]
        template_stats[template]["count"] += 1

# Update the database for criticality 1 with average values by template
for template, stats in template_stats.items():
    avg_cpu_usage = stats["cpu_usage"] / stats["count"]
    avg_memory_usage = stats["memory_usage"] / stats["count"]
    cursor.execute('''
        UPDATE anomalies
        SET cpu_usage = ?, memory_usage = ?
        WHERE template = ? AND criticality = 1
    ''', (avg_cpu_usage, avg_memory_usage, template))

# Commit the changes and close the database connection
conn.commit()
conn.close()

In [8]:
from datetime import datetime
# Read the JSON file
with open('syn_data.json', 'r') as json_file:
    syn_data = json.load(json_file)

# Extract the logs under the key "failures"
failure_logs = syn_data.get('failures', [])

# Connect to the SQLite database
conn = sqlite3.connect('anomalies.db')
cursor = conn.cursor()

# Iterate over each failure log and extract the sub logs
for failure in failure_logs:
    logs = failure.get('logs', [])
    for log in logs:
        # print(log)
        # Extract the required fields from the log
        timestamp = log.get('timestamp')
        log_level = log.get('log_level')
        message = log.get('message')
        cpu_usage = log.get('cpu_usage')
        memory_usage = log.get('memory_usage')
        template = log.get('drain_template')
        criticality = 2

        incident_time = datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')

        # Insert the log into the database
        cursor.execute('''
            INSERT INTO anomalies (timestamp, log_level, component, message, template, log_line, incident_time, cpu_usage, memory_usage, criticality)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', (timestamp, log_level, 'Error', message, template, log['whole_log'], incident_time, cpu_usage, memory_usage, criticality))

# Commit the changes and close the database connection
conn.commit()
conn.close()


  cursor.execute('''


In [None]:
from datetime import timedelta

def fetch_logs(incident_time):
    """
    Fetch log lines within a 2-hour range around the given incident time.

    :param incident_time: The incident time as a datetime object.
    :return: A list of log lines within the time range.
    """

    # Calculate the time range
    start_time = incident_time - timedelta(hours=2)
    end_time = incident_time + timedelta(hours=2)

    # Query to fetch the log lines within the time range
    query = '''
        SELECT timestamp, log_level, component, message, template, log_line
        FROM anomalies
        WHERE incident_time BETWEEN ? AND ?
        ORDER BY incident_time
    '''

    # Execute the query
    cursor.execute(query, (start_time, end_time))
    rows = cursor.fetchall()

    # Convert the data to a list of dictionaries
    log_lines = []
    for row in rows:
        log_lines.append({
            "timestamp": row[0],
            "log_level": row[1],
            "component": row[2],
            "message": row[3],
            "template": row[4],
            "log_line": row[5]
        })

    logs = [log["log_line"] for log in log_lines]

    return logs

# Example usage
incident_time = datetime.strptime('2023-07-25 14:46:05', '%Y-%m-%d %H:%M:%S')
logs = fetch_logs(incident_time)
print(logs)



  cursor.execute(query, (start_time, end_time))
