In [None]:
# ============================================================
# SINGLE DATASET: HDFS LOG ANOMALY DETECTION
# Drain3 log parser + Isolation Forest + Dashboard
# ============================================================

# --------------------------
# STEP 0: Install Libraries
# --------------------------
!pip install drain3 pandas scikit-learn plotly --quiet

# --------------------------
# STEP 1: Import Libraries
# --------------------------
import pandas as pd
import numpy as np
from drain3 import TemplateMiner
from drain3.file_persistence import FilePersistence
from drain3.template_miner_config import TemplateMinerConfig
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import plotly.express as px
from google.colab import files

# --------------------------
# STEP 2: Upload HDFS Log File
# --------------------------
print("Upload your HDFS log file (example: hdfs.log)")
# files.upload() opens the Colab file picker; the result is indexed by file name.
uploaded = files.upload()
if not uploaded:
    # Fail with an actionable message instead of a bare IndexError further down.
    raise ValueError("No file was uploaded. Re-run this cell and select an HDFS log file.")
# Only the first uploaded file is analysed by the rest of the notebook.
hdfs_file = next(iter(uploaded))
print(f"Uploaded file: {hdfs_file}")

# --------------------------
# STEP 3: Parse Logs with Drain3
# --------------------------
import os

# Drain3 restores any snapshot found in the persistence file when a
# TemplateMiner is constructed. Remove a snapshot left behind by an earlier run
# so that re-running this cell starts from a clean state instead of adding the
# same log lines on top of the previous cluster counts.
DRAIN3_STATE_FILE = "drain3_state.bin"
if os.path.exists(DRAIN3_STATE_FILE):
    os.remove(DRAIN3_STATE_FILE)

persistence = FilePersistence(DRAIN3_STATE_FILE)
config = TemplateMinerConfig()
template_miner = TemplateMiner(persistence, config)

# Feed each log line to Drain3, one message per line.
with open(hdfs_file, 'r', encoding='utf-8') as f:
    for line in f:
        message = line.strip()
        # A blank line carries no log event; skip it rather than handing an
        # empty message to the template miner.
        if message:
            template_miner.add_log_message(message)

# --------------------------
# STEP 4: Convert Logs to Structured DataFrame
# --------------------------
# One record per Drain3 cluster: its id, its mined template string and its size.
logs = [
    {
        'template_id': cluster.cluster_id,
        'template': cluster.get_template(),
        'occurrences': cluster.size,
    }
    for cluster in template_miner.drain.clusters
]

hdfs_df = pd.DataFrame(logs)

# Placeholder ground-truth column (0=normal, 1=anomaly). No labelled data is
# available here, so every template is marked as normal.
hdfs_df['label'] = 0

# Standardise template_id into a separate column used later as a model feature.
# NOTE(review): template_id is an identifier assigned by Drain3, not a measured
# quantity -- confirm that it is meant to be a model input.
scaler = StandardScaler()
hdfs_df['template_id_scaled'] = scaler.fit_transform(hdfs_df[['template_id']])

# Persist the structured table (written before any anomaly column exists).
hdfs_df.to_csv("clean_hdfs.csv", index=False)
print("Saved clean_hdfs.csv")

# --------------------------
# STEP 5: Anomaly Detection using Isolation Forest
# --------------------------
# Model inputs: the raw occurrence count and the standardised template id.
features = hdfs_df[['occurrences', 'template_id_scaled']]
iso = IsolationForest(contamination=0.01, random_state=42)
hdfs_df['anomaly'] = iso.fit_predict(features)
print("Anomaly detection complete!")

# The prediction column holds -1 for rows to be flagged; recode it so that
# 1 means anomaly and 0 means normal.
hdfs_df['anomaly'] = hdfs_df['anomaly'].eq(-1).astype(int)


# --------------------------
# STEP 6: Dashboard Visualization
# --------------------------
# Bar chart of the 20 most frequent templates, coloured by the anomaly flag.
top_templates = hdfs_df.sort_values('occurrences', ascending=False).head(20)
fig = px.bar(
    top_templates,
    x='template',
    y='occurrences',
    color='anomaly',
    title="Top 20 HDFS Templates with Anomaly Detection",
)
fig.show()

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for drain3 (setup.py) ... [?25l[?25hdone
Upload your HDFS log file (example: hdfs.log)


Saving HDFS_2k.log to HDFS_2k.log
Uploaded file: HDFS_2k.log
Saved clean_hdfs.csv
Anomaly detection complete!


In [None]:
from google.colab import files

# Send the structured CSV written by the parsing cell to the browser as a download.
files.download('clean_hdfs.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Select the flagged templates here so this cell does not depend on a later
# cell having been executed first (it previously failed with a NameError).
anomalous_logs = hdfs_df[hdfs_df['anomaly'] == 1]

print("Characteristics of Anomalous Logs:")
# Per (template_id, template): total occurrences and the number of flagged rows.
anomaly_summary = (
    anomalous_logs
    .groupby(['template_id', 'template'])
    .agg({'occurrences': 'sum', 'anomaly': 'count'})
    .rename(columns={'anomaly': 'num_anomalous_occurrences'})
    .sort_values(by='occurrences', ascending=False)
)
display(anomaly_summary)

Characteristics of Anomalous Logs:


NameError: name 'anomalous_logs' is not defined

In [None]:
# Keep only the templates whose anomaly flag is 1 and render them.
anomalous_logs = hdfs_df.loc[hdfs_df['anomaly'].eq(1)]
display(anomalous_logs)

In [None]:
# Scatter of every template: occurrence count against the scaled template id,
# coloured by the anomaly flag, with the template text shown on hover.
axis_labels = {'occurrences': 'Occurrences', 'template_id_scaled': 'Scaled Template ID'}
fig = px.scatter(
    hdfs_df,
    x='occurrences',
    y='template_id_scaled',
    color='anomaly',
    title='Anomaly Detection Results (Scatter Plot)',
    labels=axis_labels,
    hover_data=['template'],
)
fig.show()

In [None]:
# Write the current table (including the 'anomaly' column) without the index.
hdfs_df.to_csv(path_or_buf="hdfs_anomalies.csv", index=False)
print("Saved hdfs_anomalies.csv with anomaly labels.")

# Task
Compare the original HDFS log file with the processed data in "/content/clean_hdfs.csv", describing the changes made during processing.

## Load the original log file

### Subtask:
Read the original HDFS log file (the one uploaded by the user) line by line.


**Reasoning**:
Read the original HDFS log file line by line to inspect its content.



In [None]:
# Collect every line of the uploaded log, trimmed of surrounding whitespace.
original_logs = []
with open(hdfs_file, 'r', encoding='utf-8') as f:
    original_logs.extend(raw_line.strip() for raw_line in f)

# Preview at most the first five entries.
print("First 5 lines of the original HDFS log file:")
for entry in original_logs[:5]:
    print(entry)

**Reasoning**:
The previous attempt failed because the variable `hdfs_file` was not defined in the current execution context. I need to re-execute the cell where `hdfs_file` was defined and the file was uploaded.



In [None]:
# --------------------------
# STEP 0: Install Libraries
# --------------------------
!pip install drain3 pandas scikit-learn plotly --quiet

# --------------------------
# STEP 1: Import Libraries
# --------------------------
import pandas as pd
import numpy as np
from drain3 import TemplateMiner
from drain3.file_persistence import FilePersistence
from drain3.template_miner_config import TemplateMinerConfig
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import plotly.express as px
from google.colab import files

# --------------------------
# STEP 2: Upload HDFS Log File
# --------------------------
print("Upload your HDFS log file (example: hdfs.log)")
# files.upload() opens the Colab file picker; the result is indexed by file name.
uploaded = files.upload()
if not uploaded:
    # Fail with an actionable message instead of a bare IndexError further down.
    raise ValueError("No file was uploaded. Re-run this cell and select an HDFS log file.")
# Only the first uploaded file is analysed by the rest of the notebook.
hdfs_file = next(iter(uploaded))
print(f"Uploaded file: {hdfs_file}")

# --------------------------
# STEP 3: Parse Logs with Drain3
# --------------------------
import os

# Drain3 restores any snapshot found in the persistence file when a
# TemplateMiner is constructed. This cell repeats the parsing step, so remove
# the snapshot left behind by the earlier run; otherwise the same log lines are
# added on top of the previous cluster counts and 'occurrences' is inflated.
DRAIN3_STATE_FILE = "drain3_state.bin"
if os.path.exists(DRAIN3_STATE_FILE):
    os.remove(DRAIN3_STATE_FILE)

persistence = FilePersistence(DRAIN3_STATE_FILE)
config = TemplateMinerConfig()
template_miner = TemplateMiner(persistence, config)

# Feed each log line to Drain3, one message per line.
with open(hdfs_file, 'r', encoding='utf-8') as f:
    for line in f:
        message = line.strip()
        # A blank line carries no log event; skip it rather than handing an
        # empty message to the template miner.
        if message:
            template_miner.add_log_message(message)

# --------------------------
# STEP 4: Convert Logs to Structured DataFrame
# --------------------------
# One record per Drain3 cluster: its id, its mined template string and its size.
logs = [
    {
        'template_id': cluster.cluster_id,
        'template': cluster.get_template(),
        'occurrences': cluster.size,
    }
    for cluster in template_miner.drain.clusters
]

hdfs_df = pd.DataFrame(logs)

# Placeholder ground-truth column (0=normal, 1=anomaly). No labelled data is
# available here, so every template is marked as normal.
hdfs_df['label'] = 0

# Standardise template_id into a separate column used later as a model feature.
# NOTE(review): template_id is an identifier assigned by Drain3, not a measured
# quantity -- confirm that it is meant to be a model input.
scaler = StandardScaler()
hdfs_df['template_id_scaled'] = scaler.fit_transform(hdfs_df[['template_id']])

# Persist the structured table (written before any anomaly column exists).
hdfs_df.to_csv("clean_hdfs.csv", index=False)
print("Saved clean_hdfs.csv")

# --------------------------
# STEP 5: Anomaly Detection using Isolation Forest
# --------------------------
# Model inputs: the raw occurrence count and the standardised template id.
features = hdfs_df[['occurrences', 'template_id_scaled']]
iso = IsolationForest(contamination=0.01, random_state=42)
hdfs_df['anomaly'] = iso.fit_predict(features)
print("Anomaly detection complete!")

# The prediction column holds -1 for rows to be flagged; recode it so that
# 1 means anomaly and 0 means normal.
hdfs_df['anomaly'] = hdfs_df['anomaly'].eq(-1).astype(int)

**Reasoning**:
Now that the `hdfs_file` variable is defined after re-uploading the file, I can proceed with reading the original log file line by line as per the subtask instruction.



In [None]:
# Collect every line of the uploaded log, trimmed of surrounding whitespace.
original_logs = []
with open(hdfs_file, 'r', encoding='utf-8') as f:
    original_logs.extend(raw_line.strip() for raw_line in f)

# Preview at most the first five entries.
print("First 5 lines of the original HDFS log file:")
for entry in original_logs[:5]:
    print(entry)

## Load the processed csv

### Subtask:
Load the `clean_hdfs.csv` file into a pandas DataFrame.


**Reasoning**:
Load the `clean_hdfs.csv` file into a pandas DataFrame and display its head and info to verify.



In [None]:
# Read back the CSV written by the parsing step. The relative path is the same
# one used when the file was saved, so both refer to the same file whatever the
# working directory is.
clean_hdfs_df = pd.read_csv('clean_hdfs.csv')
print("First 5 rows of clean_hdfs.csv:")
display(clean_hdfs_df.head())
print("\nInfo of clean_hdfs.csv:")
# DataFrame.info() prints its report and returns None, so call it directly
# instead of wrapping it in display(), which would render a stray "None".
clean_hdfs_df.info()

## Compare data structures

### Subtask:
Describe the differences in structure between the raw log data and the structured DataFrame.


**Reasoning**:
Describe the structure of the raw log data, the processed DataFrame, and highlight the differences.



In [None]:
# Describe the raw log data
print("Raw log data format:")
print("The raw log data is stored in a list called 'original_logs'.")
print(f"It contains {len(original_logs)} elements, where each element is a string representing a single line from the original log file.")
print("Example of a raw log line:")
# Guard against an empty log file: indexing [0] on an empty list would raise.
if original_logs:
    print(original_logs[0])  # Display the first line as an example
else:
    print("Original logs not available.")

print("\n" + "="*50 + "\n")

# Describe the processed DataFrame
print("Processed DataFrame format:")
print("The processed data is stored in a pandas DataFrame called 'clean_hdfs_df'.")
print(f"The DataFrame has {clean_hdfs_df.shape[0]} rows and {clean_hdfs_df.shape[1]} columns.")
print("The columns are: ", clean_hdfs_df.columns.tolist())
print("Example of the processed DataFrame structure:")
display(clean_hdfs_df.head())

print("\n" + "="*50 + "\n")

# Highlight key structural differences
print("Key structural differences:")
print("- The raw log data is unstructured text, a sequence of lines.")
print("- The processed data is structured in a tabular format (DataFrame) with named columns.")
print("- Each row in the DataFrame represents a summary of a log template, not an individual log line.")
print("- The DataFrame extracts specific information like 'template_id', 'template', and 'occurrences' from the log patterns.")
print("- The DataFrame includes additional computed features like 'template_id_scaled'.")

## Describe transformations

### Subtask:
Explain the steps taken in the code to transform the raw log data into the structured DataFrame, highlighting how Drain3 was used to extract templates and count occurrences.


**Reasoning**:
Describe the process of transforming raw log data into a structured DataFrame using Drain3 based on the provided instructions.



In [None]:
# Narrated walkthrough of how the raw log becomes the structured DataFrame.
# Each numbered section is emitted with a single print call, one line per argument.
print("Explanation of the transformation from raw logs to structured DataFrame:")

# Section 1: the starting point, with the first raw line as an example
print(
    "\n1. Initial State: Raw Log Data",
    "The initial data is the raw HDFS log file, which consists of unstructured text lines.",
    "Each line is a separate log entry, containing information like timestamp, process ID, log level, component, and the log message itself.",
    "Example raw line:",
    sep="\n",
)
print(original_logs[0] if original_logs else "Original logs not available.")

# Section 2: why Drain3 is used
print(
    "\n2. Purpose of Drain3",
    "The Drain3 library is used for log parsing and template extraction.",
    "Its main purpose is to group similar log messages together and identify a common template or pattern that represents the structure of those messages.",
    "This reduces the volume of data and allows for analysis based on log event types rather than individual log lines.",
    sep="\n",
)

# Section 3: how the TemplateMiner is created and fed
print(
    "\n3. Drain3 TemplateMiner Initialization and Usage",
    "A `TemplateMiner` instance is initialized with a persistence layer (`FilePersistence`) and configuration.",
    "The code then iterates through each line of the raw log file.",
    "For each `line.strip()`, the `template_miner.add_log_message()` method is called.",
    "Drain3 processes each message, comparing it against existing templates and either adding it to a matching cluster or creating a new cluster and template if no match is found.",
    sep="\n",
)

# Section 4: grouping and template generation
print(
    "\n4. Log Grouping and Template Generation",
    "As Drain3 processes log messages, it uses an algorithm (like the Drain algorithm) to identify variable parts of the log messages (like timestamps, IDs, file paths) and replace them with wildcards (`<*>`).",
    "Log messages that have the same structure after replacing variables are grouped into the same cluster.",
    "Each cluster is assigned a unique `template_id` and has a representative `template` string.",
    sep="\n",
)

# Section 5: walking the resulting clusters
print(
    "\n5. Iterating through Drain3 Clusters",
    "After processing all log lines, the `template_miner.drain.clusters` attribute contains a list of all identified log clusters.",
    "The code iterates through this list of `cluster` objects.",
    sep="\n",
)

# Section 6: which fields are read from each cluster
print(
    "\n6. Extracting Cluster Information",
    "For each `cluster` in the list, the code extracts:",
    "- `cluster.cluster_id`: The unique ID assigned to the template/cluster.",
    "- `cluster.get_template()`: The representative template string for the cluster.",
    "- `cluster.size`: The number of original log messages that belong to this cluster (occurrences).",
    "This information is stored as a dictionary with keys 'template_id', 'template', and 'occurrences'.",
    "These dictionaries are collected in a list called `logs`.",
    sep="\n",
)

# Section 7: the final DataFrame, followed by a preview of its first rows
print(
    "\n7. Conversion to Pandas DataFrame",
    "Finally, the list of dictionaries (`logs`) is converted into a pandas DataFrame using `pd.DataFrame(logs)`.",
    "Each dictionary in the list becomes a row in the DataFrame.",
    "The keys of the dictionaries ('template_id', 'template', 'occurrences') become the column names.",
    "This results in the `hdfs_df` DataFrame, where each row summarizes a unique log template found in the original data, along with its frequency.",
    sep="\n",
)
display(hdfs_df.head())

## Compare content

### Subtask:
Illustrate the mapping between original log lines and the corresponding entries in the `clean_hdfs.csv`, showing examples of how templates were generated and how occurrences were counted.


**Reasoning**:
Select a few templates and find corresponding original log lines to illustrate the mapping and explain the process of template generation and occurrence counting.



In [None]:
import re

def _template_to_regex(template):
    """Compile a mined template into a regex that matches one whole raw log line.

    The template is split on whitespace and rebuilt token by token: every
    '<*>' wildcard becomes ``\\S+`` (one run of non-space characters), all other
    text is escaped literally, and tokens are joined with ``\\s+`` so repeated
    spaces or tabs in the raw line still match.

    NOTE(review): this assumes each '<*>' stands for exactly one
    whitespace-delimited token, which is how the default TemplateMinerConfig
    used in this notebook is expected to behave -- confirm if extra delimiters
    or masking are ever configured.
    """
    escaped_wildcard = re.escape('<*>')
    token_patterns = [
        re.escape(token).replace(escaped_wildcard, r'\S+')
        for token in template.split()
    ]
    return re.compile(r'\s+'.join(token_patterns))


# Select the three most frequent templates for the illustration
selected_templates = hdfs_df.sort_values('occurrences', ascending=False).head(3)
print("Selected Templates for Illustration:")
display(selected_templates[['template_id', 'template', 'occurrences']])

# Find and show original log lines for each selected template
print("\nIllustration of Template Generation and Occurrence Counting:")
for row in selected_templates.itertuples(index=False):
    template_id = row.template_id
    template_str = row.template
    occurrences = row.occurrences

    print(f"\n--- Template ID: {template_id} ---")
    print(f"Template: {template_str}")
    print(f"Occurrences in DataFrame: {occurrences}")

    # Compile once per template. A token-wise pattern avoids the over-matching
    # of a greedy '.*' (which can span several tokens and pull in lines that
    # belong to other templates).
    regex_pattern = _template_to_regex(template_str)

    # Find original log lines matching the template (show up to 3 examples)
    matching_logs = [line for line in original_logs if regex_pattern.fullmatch(line)]

    print(f"Examples of Original Log Lines Matching this Template ({min(3, len(matching_logs))} shown):")
    for example in matching_logs[:3]:
        print(f"- {example}")

    # Explanation of template generation
    print("\nExplanation:")
    print(f"Drain3 generated this template by identifying the common structure in {occurrences} original log lines.")
    print("Variable parts of the log messages (like timestamps, process IDs, block IDs, IP addresses, file paths, etc.) were replaced by the wildcard '<*>'.")
    print("For example, in the original lines shown above, the parts that differ are replaced by '<*>' in the template.")

    # Explanation of occurrence counting
    print(f"The 'occurrences' count ({occurrences}) in the DataFrame for this template represents the total number of original log lines that were found to match this specific template by Drain3.")


## Highlight added columns

### Subtask:
Explain the purpose of the `label`, `template_id_scaled`, and `anomaly` columns added to the DataFrame.


**Reasoning**:
Provide an explanation for the purpose of the `label`, `template_id_scaled`, and `anomaly` columns, following the specified instructions.



In [None]:
# Narrated explanation of the three columns added after Drain3 parsing.
print("Explanation of additional columns in the DataFrame (`label`, `template_id_scaled`, `anomaly`):")

# Explain 'label' column
print("\n- 'label' column:")
print("  This column was added as a placeholder (currently filled with 0s) to represent ground truth labels.")
print("  In a real-world anomaly detection scenario with labeled data, this column would contain the true status of each log template (e.g., 0 for normal, 1 for anomaly), which could be used to train or evaluate a supervised learning model.")
print("  For this demonstration using unsupervised anomaly detection (Isolation Forest), the 'label' column is not used in the anomaly detection process itself but is included to show where ground truth labels would typically fit.")

# Explain 'template_id_scaled' column
# The wording below was corrected: only 'template_id' is standardised, while
# 'occurrences' is fed to the model as raw counts, so the two features are not
# on a common scale, and a tree-based model does not require scaling.
print("\n- 'template_id_scaled' column:")
print("  This column contains a scaled version of the 'template_id' column using StandardScaler.")
print("  Scaling is a common preprocessing step in machine learning.")
print("  Many algorithms, particularly distance-based or gradient-based methods, perform better when numerical input features are scaled to have a mean of 0 and a standard deviation of 1.")
print("  Isolation Forest is tree-based and splits on one feature at a time, so it is largely insensitive to feature scale and does not need this step.")
print("  Note that 'occurrences' is passed to the model unscaled, so scaling 'template_id' does not put the two features on a common scale.")

# Explain 'anomaly' column
print("\n- 'anomaly' column:")
print("  This column contains the output of the Isolation Forest anomaly detection model.")
print("  The Isolation Forest model was trained on the 'occurrences' and 'template_id_scaled' features.")
print("  The model predicts whether each log template (represented by a row) is an outlier or anomaly.")
print("  A value of 1 in this column indicates that the Isolation Forest model classified the log template as anomalous (an outlier).")
print("  A value of 0 indicates that the model classified it as normal (inlier).")

print("\nThese columns were added *after* the initial log parsing and structuring steps performed by Drain3.")

## Summarize changes

### Subtask:
Provide a summary of the key changes made to the data, from the raw log file to the final DataFrame with anomaly labels.


**Reasoning**:
Provide a summary of the key changes made to the data, from the raw log file to the final DataFrame with anomaly labels, following the instructions.



In [None]:
# Printed recap of the whole pipeline, from the raw log to the labelled DataFrame.
print("Summary of Data Transformation from Raw Logs to Final DataFrame:")

# 1. Summarize the initial state
print("\n1. Initial State: Raw HDFS Log File")
print("The process starts with the raw HDFS log file, which is a collection of unstructured text lines.")
print("Each line is a single log entry, containing varying information like timestamps, process IDs, components, and the log message content in a free-text format.")
print("This format is difficult to analyze directly for patterns and anomalies.")

# 2. Describe the Drain3 transformation
print("\n2. Log Parsing and Structuring with Drain3")
print("The Drain3 log parsing library is applied to the raw log data.")
print("Drain3 identifies recurring patterns (templates) within the log messages and groups similar log lines together into clusters.")
print("This step transforms the unstructured text into a structured format where each unique log template is identified.")
print("The output of this stage, initially, is information about each template: a unique `template_id`, the generalized `template` string (with wildcards like '<*>'), and the `occurrences` count, representing how many raw log lines matched that template.")
print("This structured information is then organized into a pandas DataFrame.")

# 3. Explain the addition of the 'label' column
print("\n3. Addition of 'label' Column")
print("A 'label' column is added to the DataFrame.")
print("In this specific case (unsupervised anomaly detection), it is initialized with a default value (0, representing 'normal') as a placeholder.")
print("In a supervised learning scenario, this column would contain actual ground truth labels indicating whether a log template is known to be normal or anomalous.")

# 4. Summarize the creation of 'template_id_scaled'
# The last sentence was corrected: 'occurrences' is never scaled in this
# notebook, so scaling 'template_id' alone does not align the two features.
print("\n4. Creation of 'template_id_scaled'")
print("The 'template_id' column, which is a discrete numerical identifier, is scaled using StandardScaler.")
print("This creates the 'template_id_scaled' column, which has a mean of 0 and a standard deviation of 1.")
print("Scaling is performed to prepare this feature for use in machine learning models. Note that 'occurrences' is used unscaled, so the two features are not on a common scale; Isolation Forest, being tree-based, is largely insensitive to this.")

# 5. Describe the addition of the 'anomaly' column
print("\n5. Anomaly Detection and Addition of 'anomaly' Column")
print("The Isolation Forest model, an unsupervised anomaly detection algorithm, is applied.")
print("The model uses the 'occurrences' and 'template_id_scaled' features to identify outliers among the log templates.")
print("Based on the model's predictions, an 'anomaly' column is added to the DataFrame.")
print("A value of 1 in this column indicates that the model identified the corresponding log template as anomalous (an outlier), while a value of 0 indicates it was identified as normal (an inlier).")

# 6. Conclude on the overall transformation
print("\nOverall Transformation:")
print("The entire process transforms raw, unstructured HDFS log text into a structured DataFrame where each row represents a log template.")
print("This DataFrame contains the template details, occurrence counts, a placeholder for ground truth labels, a scaled feature for machine learning, and importantly, a binary flag indicating whether the template was classified as an anomaly by the Isolation Forest model.")
print("This structured and enriched format is suitable for further analysis, visualization, and understanding of log patterns and anomalies.")

## Summary:

### Data Analysis Key Findings

*   The original HDFS log data is in an unstructured text format, consisting of a sequence of individual log lines (e.g., `20151007 215958 672 INFO FSNamesystem: BLOCK* NameSystem.allocateBlock: /hdfs/kX7j62M10 <*><*> 10.250.14.12:50010`).
*   The processed data is structured into a pandas DataFrame (`clean_hdfs.csv`) with 18 rows and 5 columns: `template_id`, `template`, `occurrences`, `label`, and `template_id_scaled`.
*   The transformation from raw logs to the DataFrame was primarily performed by the Drain3 library, which parsed the log lines, grouped similar messages, and extracted a generalized `template` for each group.
*   The `template_id` column is a unique numerical identifier assigned to each log template by Drain3.
*   The `template` column contains the generalized pattern of the log messages in a group, with variable parts replaced by the wildcard `<*>`.
*   The `occurrences` column represents the count of original log lines that matched a specific template identified by Drain3.
*   The `label` column was added as a placeholder for ground truth labels (currently all 0s) for potential future supervised learning.
*   The `template_id_scaled` column contains a scaled version of `template_id` using StandardScaler, prepared for machine learning algorithms.
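*   An `anomaly` column was added to the in-memory DataFrame (`hdfs_df`) based on the output of an Isolation Forest model trained on `occurrences` and `template_id_scaled`, indicating whether a log template was classified as anomalous (1) or normal (0). This column is not part of `clean_hdfs.csv`, which is written before the model runs; the table with anomaly labels is saved separately as `hdfs_anomalies.csv`.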

### Insights or Next Steps

*   The transformation process effectively converts unstructured log text into a structured format based on recurring patterns, significantly reducing the volume of data and making it suitable for quantitative analysis.
*   The resulting DataFrame is prepared for anomaly detection, with key features (`occurrences`, `template_id_scaled`) and an anomaly prediction (`anomaly`) column. The next steps could involve analyzing the detected anomalies, visualizing the results, or using the `label` column (if ground truth data becomes available) to evaluate the anomaly detection model.
