In [1]:
import os
import json
import pandas as pd
from dotenv import load_dotenv
from tqdm.notebook import tqdm

# LangChain components for RAG
from langchain_community.document_loaders import PyPDFLoader, TextLoader, UnstructuredFileLoader, UnstructuredMarkdownLoader 
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import FAISS # Using FAISS for persistent vector store
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# --- Configuration ---
load_dotenv() # Load your API key from .env

# Paths for data (relative to project root, as notebook is in 'notebooks/' folder)
# Input for sequence retrieval
INPUT_PARSED_JSONL_FULL = "../data/parsed_logs/hdfs_v2_parsed_hybrid_final.jsonl" # The 43GB parsed log file
INPUT_OFFSET_INDEX_CSV = "../data/parsed_logs/hdfs_v2_offset_index.csv" # The generated byte-offset index

# Input for RAG knowledge base
SOLUTION_DOCS_DIR = "../data/solution_docs/"

# Path where FAISS index will persist its database (relative to project root)
FAISS_INDEX_PATH = "../faiss_index/"

# General parameters
NUM_PRECEDING_LOGS_FOR_SEQUENCE = 10 # Number of lines before problematic one to retrieve for sequence
PROBLEMATIC_TEMPLATES_LIST_PATH = "../data/templates/problematic_templates_list.txt" # For sample problems

# --- LLM and Embedding Model Initialization ---
# Surface a missing key here, once, instead of letting it show up later as a
# generic failure on every single LLM / embedding call.
if not os.getenv("GOOGLE_API_KEY"):
    print("WARNING: GOOGLE_API_KEY is not set. Add it to your .env file; Gemini calls will fail without it.")

llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0.0, # Deterministic output is preferred for structured (JSON) responses
    max_output_tokens=4096, # Increased maximum output tokens
    google_api_key=os.getenv("GOOGLE_API_KEY")
)
print("Gemini LLM initialized.")

embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=os.getenv("GOOGLE_API_KEY")
)
print("Gemini Embedding model initialized.")

# Create necessary directories (relative to project root).
# exist_ok=True makes repeated / duplicate entries harmless.
for required_dir in (
    os.path.dirname(INPUT_PARSED_JSONL_FULL), # data/parsed_logs
    os.path.dirname(INPUT_OFFSET_INDEX_CSV),  # data/parsed_logs
    SOLUTION_DOCS_DIR,                        # data/solution_docs
    FAISS_INDEX_PATH,                         # faiss_index/
):
    os.makedirs(required_dir, exist_ok=True)

# Ensure input files for byte index and parsed logs exist.
# These are reported, not raised, so the rest of the setup can still be inspected.
if not os.path.exists(INPUT_PARSED_JSONL_FULL):
    print(f"ERROR: Full parsed JSONL file NOT FOUND at {INPUT_PARSED_JSONL_FULL}.")
    print("Please ensure you have run '01_hdfs_log_parsing_hybrid.ipynb' and copied the output.")
if not os.path.exists(INPUT_OFFSET_INDEX_CSV):
    print(f"ERROR: Byte-offset index CSV NOT FOUND at {INPUT_OFFSET_INDEX_CSV}.")
    print("Please run '05_byte_offset_indexing.ipynb' first to create it.")

print("\nFull setup complete. Paths and models configured.")

Gemini LLM initialized.
Gemini Embedding model initialized.

Full setup complete. Paths and models configured.


In [None]:
print("--- Loading Byte-Offset Index ---")
# Maps (source_file, line_id_in_file_header) -> byte offset of that line in the parsed JSONL file.
offset_map = {}

if os.path.exists(INPUT_OFFSET_INDEX_CSV):
    try:
        offset_df = pd.read_csv(INPUT_OFFSET_INDEX_CSV)
        # itertuples() yields plain tuples lazily; iterrows() would build a full Series
        # object for every row, which is far slower on an index with tens of millions of rows.
        offset_rows = offset_df[['source_file', 'line_id_in_file_header', 'byte_offset']].itertuples(
            index=False, name=None
        )
        for source_file, line_id, byte_offset in tqdm(offset_rows, total=len(offset_df), desc="Loading offset map"):
            offset_map[(str(source_file), int(line_id))] = int(byte_offset)
        print(f"Loaded byte-offset index with {len(offset_map)} entries.")
    except Exception as e:
        # Best-effort: report the problem and continue with whatever was loaded so far.
        print(f"ERROR: Could not load offset index CSV: {e}")
        print("Please ensure '05_byte_offset_indexing.ipynb' ran successfully and generated a valid CSV.")
else:
    print("WARNING: Byte-offset index CSV not found. Sequence retrieval will not work.")


print("\n--- Building RAG Knowledge Base ---")

# Check if FAISS index already exists on disk.
# A bare (empty) directory does not count as an index; bool() keeps the flag a real boolean.
faiss_index_exists = os.path.isdir(FAISS_INDEX_PATH) and bool(os.listdir(FAISS_INDEX_PATH))

if faiss_index_exists:
    print(f"Loading existing FAISS index from {FAISS_INDEX_PATH}...")
    try:
        # SECURITY NOTE: allow_dangerous_deserialization=True permits unpickling of the stored
        # index metadata. Only load index directories that this notebook (or another trusted
        # source) produced; never point FAISS_INDEX_PATH at untrusted files.
        vectorstore = FAISS.load_local(FAISS_INDEX_PATH, embedding_model, allow_dangerous_deserialization=True)
        print("FAISS index loaded successfully and contains data.")
    except Exception as e:
        # Any load failure (corrupt / foreign files in the directory) falls back to a rebuild.
        print(f"Error loading FAISS index: {e}. Rebuilding from scratch.")
        faiss_index_exists = False

if not faiss_index_exists:
    print("No existing valid FAISS index found or loading failed. Building from scratch...")

    # 1. Load Solution Documents
    solution_doc_files_to_load = []
    if not os.path.exists(SOLUTION_DOCS_DIR):
        print(f"ERROR: Solution documents directory not found at {SOLUTION_DOCS_DIR}. RAG will not have context.")
    else:
        for root, _, files in os.walk(SOLUTION_DOCS_DIR):
            for file in files:
                file_path = os.path.join(root, file)
                # Match extensions case-insensitively so e.g. 'GUIDE.PDF' is not silently skipped.
                normalized_path = file_path.lower()
                if normalized_path.endswith('.pdf'):
                    loader = PyPDFLoader(file_path)
                elif normalized_path.endswith(('.txt', '.md', '.html', '.docx', '.xlsx')):
                    loader = UnstructuredFileLoader(file_path) # Requires unstructured[all-extra]
                else:
                    print(f"Skipping unsupported file type: {file_path}")
                    continue

                try:
                    docs = loader.load()
                    solution_doc_files_to_load.extend(docs)
                    print(f"Loaded {len(docs)} pages/chunks from {os.path.basename(file_path)}")
                except Exception as e:
                    # One unreadable document should not prevent the others from loading.
                    print(f"ERROR: Could not load {file_path}: {e}")

    print(f"\nTotal raw documents loaded for RAG: {len(solution_doc_files_to_load)}")
    if not solution_doc_files_to_load:
        print("WARNING: No solution documents were loaded. RAG will not have context.")

    # 2. Split Documents into Chunks
    print("Splitting documents into smaller chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,     # Size of each chunk
        chunk_overlap=200,   # Overlap between chunks to maintain context
        length_function=len  # Use character length
    )
    rag_chunks = text_splitter.split_documents(solution_doc_files_to_load)
    print(f"Split into {len(rag_chunks)} chunks for RAG.")

    if not rag_chunks:
        # Building a FAISS index from zero documents fails deep inside the library with an
        # unhelpful error; stop here with an actionable message instead.
        raise RuntimeError(
            f"No chunks created for RAG. Add supported documents (.pdf, .txt, .md, .html, .docx, .xlsx) "
            f"to {SOLUTION_DOCS_DIR} and re-run this cell."
        )

    # 3. Create Embeddings and Store in FAISS
    print(f"Creating embeddings and storing in FAISS (in-memory and saving to disk at {FAISS_INDEX_PATH})...")
    vectorstore = FAISS.from_documents(
        documents=rag_chunks,
        embedding=embedding_model
    )
    vectorstore.save_local(FAISS_INDEX_PATH) # Save FAISS index to disk
    print("Embeddings created and stored in FAISS, and saved to disk.")

# 4. Create Retriever
retriever = vectorstore.as_retriever()
print("RAG retriever initialized.")

print("\n--- RAG Knowledge Base Setup Complete ---")

--- Loading Byte-Offset Index ---


Loading offset map:   0%|          | 0/58095583 [00:00<?, ?it/s]

Loaded byte-offset index with 58095583 entries.

--- Building RAG Knowledge Base ---
No existing valid FAISS index found or loading failed. Building from scratch...


  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


Loaded 52 pages/chunks from ITM_Hadoop_troubleshooting_guide.pdf

Total raw documents loaded for RAG: 52
Splitting documents into smaller chunks...
Split into 128 chunks for RAG.
Creating embeddings and storing in FAISS (in-memory and saving to disk at ../faiss_index/)...
Embeddings created and stored in FAISS, and saved to disk.
RAG retriever initialized.

--- RAG Knowledge Base Setup Complete ---


In [16]:
# --- Define Sequence Retrieval Function ---
def get_contextual_log_sequence_from_disk(
    all_parsed_jsonl_path: str, # Path to the 43GB JSONL file
    target_entry_metadata: dict, # Problematic entry's metadata: {'source_file': ..., 'line_id_in_file_header': ...}
    num_lines_before: int,
    offset_map: dict # The loaded (source_file, line_id) -> byte_offset map
) -> list:
    """
    Retrieve the log entries that immediately precede a target entry in the parsed JSONL file.

    The byte-offset map is used to seek straight to the first line of the window instead of
    scanning the file from the beginning.

    Args:
        all_parsed_jsonl_path: Path of the JSONL file holding one parsed log entry per line.
        target_entry_metadata: Mapping with 'source_file' and an integer-like
            'line_id_in_file_header' identifying the target entry.
        num_lines_before: Maximum number of preceding entries to return.
        offset_map: Mapping of (source_file, line_id) to the byte offset of that line in
            ``all_parsed_jsonl_path``.

    Returns:
        Up to ``num_lines_before`` parsed entries (dicts) from the same source file whose line
        id is lower than the target's, sorted by ascending line id. An empty list is returned
        (after printing a message) when the metadata is incomplete, the start offset is not in
        ``offset_map``, or the file cannot be read; it is also returned silently when
        ``num_lines_before`` is not positive.
    """
    target_source_file = target_entry_metadata.get('source_file')
    raw_target_line_id = target_entry_metadata.get('line_id_in_file_header')

    # Accept any integer-like value (plain int, numpy integers coming out of pandas, ...).
    # Floats, strings and None do not implement __index__ and are rejected.
    if not target_source_file or not hasattr(raw_target_line_id, '__index__'):
        print(f"Error in get_contextual_log_sequence_from_disk: Missing 'source_file' or 'line_id_in_file_header' in target_entry_metadata.")
        return []
    target_line_id = int(raw_target_line_id)

    if num_lines_before <= 0:
        # Nothing was requested. This also avoids `sequence[-0:]`, which returns the whole list.
        return []

    # First line id of the window; line ids start at 1.
    start_line_id_in_file = max(1, target_line_id - num_lines_before)

    # Offset map key is (source_file, line_id_in_file_header)
    start_byte_offset = offset_map.get((target_source_file, start_line_id_in_file))
    if start_byte_offset is None:
        print(f"Warning: Exact start offset for line {start_line_id_in_file} in {target_source_file} not found in offset map. Returning empty sequence.")
        return []

    # Parse at most N lines plus a small safety buffer after the seek position.
    max_parsed_lines = num_lines_before + 5
    sequence = []

    try:
        # Binary mode: seeking to an arbitrary byte offset is only well-defined on binary
        # streams. Each line is decoded individually afterwards.
        with open(all_parsed_jsonl_path, 'rb') as f:
            f.seek(start_byte_offset)

            parsed_line_count = 0
            for raw_line in f:
                if parsed_line_count >= max_parsed_lines:
                    break

                try:
                    parsed_seq_entry = json.loads(raw_line.decode('utf-8', errors='ignore'))
                except json.JSONDecodeError:
                    continue # Malformed line: skip it without consuming the line budget
                parsed_line_count += 1

                if not isinstance(parsed_seq_entry, dict):
                    continue # Valid JSON but not a log record

                entry_line_id = parsed_seq_entry.get('line_id_in_file_header', 0)
                if not isinstance(entry_line_id, int):
                    continue # A null / non-integer id cannot be ordered; skip just this entry

                # Keep only logs from the same source file and BEFORE the target line_id
                if (parsed_seq_entry.get('source_file') == target_source_file and
                        entry_line_id < target_line_id):
                    sequence.append(parsed_seq_entry)
    except Exception as e:
        # Deliberate best-effort: callers run this inside long batch loops, so a retrieval
        # problem is reported and turned into "no sequence" rather than raised.
        print(f"Error retrieving sequence from disk for {target_source_file} line {target_line_id}: {e}")
        return []

    # Chronological order, limited to the N entries closest to the target.
    sequence.sort(key=lambda entry: entry.get('line_id_in_file_header', 0))
    return sequence[-num_lines_before:]

print("Sequence retrieval function defined.")


print("\n--- Loading Sample Problematic Templates ---")
# Parser state shared with the template-list loading loop that follows.
problematic_templates_for_test = list()
current_template_data, current_section = dict(), None

def clean_value(line): # Helper function from previous notebook
    """Return the stripped text after the first ':' in ``line``, or the stripped line if it has no ':'."""
    _, separator, remainder = line.partition(':')
    if separator:
        return remainder.strip()
    return line.strip()

def _finalize_template(template_data):
    """Ensure 'original_log_full' is a single string (joining collected sample lines) and return the dict."""
    sample = template_data.get("original_log_full", "")
    if isinstance(sample, list):
        sample = "\n".join(sample).strip()
    template_data["original_log_full"] = sample
    return template_data

# Parse the templates list file into one dict per template. The parser recognises:
#   "--- Template ID: <id> ---"  -> starts a new template record
#   "Level:" / "Component:" / "Template:" -> single-line fields
#   "Original Message Sample:"   -> starts a multi-line sample that runs until a blank line,
#                                   a "--- End of" marker, or the next template header
if os.path.exists(PROBLEMATIC_TEMPLATES_LIST_PATH):
    with open(PROBLEMATIC_TEMPLATES_LIST_PATH, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            line = line.strip()
            if line.startswith("--- Template ID:"):
                if current_template_data: # Save previous template if exists
                    # Finalize first: a header may arrive while the previous sample is still
                    # being collected, in which case its sample is still a list of lines.
                    problematic_templates_for_test.append(_finalize_template(current_template_data))

                template_id = line.split(":", 1)[1].strip().replace(" ---", "")
                current_template_data = {"event_id": template_id, "original_log_full": "", "metadata": {}}
                current_section = None
            elif line.startswith("Level:"):
                current_template_data["level"] = clean_value(line)
            elif line.startswith("Component:"):
                current_template_data["component"] = clean_value(line)
            elif line.startswith("Template:"):
                current_template_data["event_template"] = clean_value(line)
            elif line.startswith("Original Message Sample:"):
                current_section = "Original Message Sample"
                current_template_data["original_log_full"] = []
            elif current_section == "Original Message Sample":
                if line == "" or line.startswith("--- End of"):
                    # End of the multi-line sample
                    _finalize_template(current_template_data)
                    current_section = None
                else:
                    current_template_data["original_log_full"].append(line)
        if current_template_data: # Add the last template
            problematic_templates_for_test.append(_finalize_template(current_template_data))

    print(f"Loaded {len(problematic_templates_for_test)} problematic templates for testing.")
else:
    print(f"ERROR: Problematic templates list file NOT FOUND at {PROBLEMATIC_TEMPLATES_LIST_PATH}.")

if not problematic_templates_for_test:
    print("WARNING: No problematic templates loaded. Sequence retrieval test will be skipped.")

Sequence retrieval function defined.

--- Loading Sample Problematic Templates ---
Loaded 47 problematic templates for testing.


In [21]:
print("--- Defining LLM Prompt Template ---")

# Define the Prompt Template for the LLM.
# Doubled braces ({{ }}) are literal braces in the rendered prompt; single braces are template variables.
prompt_template = """
You are an expert HDFS (Hadoop Distributed File System) Site Reliability Engineer (SRE) and incident responder.
Your task is to analyze a given problematic HDFS log entry, understand the underlying problem, and generate a comprehensive incident response plan.

Here is the problematic HDFS log entry for analysis:
{log_entry_full}

Here is the sequence of preceding log events that led up to this problem:
{sequence_of_events_json}

Here is relevant troubleshooting and solution context from our knowledge base:
{context}

Based on the HDFS log entry, the sequence of events, and the provided context, please perform the following:

1.  **Problem Summary:** Provide a concise, clear summary of the issue, referencing insights from the sequence if possible.
2.  **Severity Assessment:** Assign a severity level (Critical, High, Medium, Low) based on the log's impact.
3.  **Root Cause Hypothesis:** Suggest the most probable root cause(s), specifically mentioning any causal links identified in the sequence.
4.  **Affected Components:** List the HDFS components or services that are most likely affected.
5.  **Actionable Response Plan (Role-Specific):**
    * **DevOps/SRE Actions:** Detailed, step-by-step commands/checks for immediate diagnosis and mitigation.
    * **Developer Actions:** Specific areas to check in code, potential data issues, or configurations.
    * **Security Actions (if applicable):** Steps to verify/address potential security implications.

Format your entire response as a single JSON object with the following keys:
"summary": "...",
"severity": "...",
"root_cause_hypothesis": "...",
"affected_components": ["...", "..."],
"response_plan": {{
    "devops_sre_actions": [
        {{"step_description": "...", "command": "...", "type": "..."}},
        {{"step_description": "...", "command": "...", "type": "..."}}
    ],
    "developer_actions": [
        {{"step_description": "...", "command": "...", "type": "..."}},
        {{"step_description": "...", "command": "...", "type": "..."}}
    ],
    "security_actions": [
        {{"step_description": "...", "command": "...", "type": "..."}},
        {{"step_description": "...", "command": "...", "type": "..."}}
    ]
}}

**CRITICAL INSTRUCTION:** Ensure your response is ONLY the complete and valid JSON object. Do NOT include any text before or after the JSON. Do NOT truncate the JSON object. It must be perfectly parsable JSON.
"""

# Create the PromptTemplate object
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["log_entry_full", "sequence_of_events_json", "context"]
)

print("LLM Prompt Template defined.")
print("\n--- LLM Analysis & Solution Generation ---")

if problematic_templates_for_test: # Iterate through samples loaded in Cell 3
    # --- IMPORTANT: entries parsed from 'problematic_templates_list.txt' do NOT contain
    # 'source_file' / 'line_id_in_file_header', so the real preceding sequence cannot be looked
    # up for them. For this notebook's scope a fixed, MOCKED target is used for every template.
    # For a full test of sequence retrieval you'd need problematic entries taken directly from
    # the 43GB JSONL.
    mock_target_entry_metadata = {
        "source_file": "hadoop-hdfs-datanode-mesos-01.log", # Example, please adjust for your data
        "line_id_in_file_header": 80939 # Example, please adjust for your data
    }

    # The mocked target is identical for every template, so retrieve its sequence once
    # instead of re-reading it from disk on every loop iteration.
    log_sequence = get_contextual_log_sequence_from_disk(
        all_parsed_jsonl_path=INPUT_PARSED_JSONL_FULL, # From Cell 1
        target_entry_metadata=mock_target_entry_metadata, # MOCKED
        num_lines_before=NUM_PRECEDING_LOGS_FOR_SEQUENCE,
        offset_map=offset_map # From Cell 2
    )
    if log_sequence:
        sequence_json_str = json.dumps(log_sequence, indent=2)
    else:
        sequence_json_str = "No preceding log events retrieved (mocked or actual retrieval failed)."

    # IMPORTANT: For quick testing, you might want to slice this list (e.g., problematic_templates_for_test[:5])
    for i, sample_entry in enumerate(tqdm(problematic_templates_for_test, desc="Analyzing problematic templates")):
        original_log_full = sample_entry.get('original_log_full', '')

        print(f"\n--- Analyzing Template {i+1} (ID: {sample_entry.get('event_id', 'N/A')}, Level: {sample_entry.get('level', 'N/A')}) ---")
        print(f"Event Template: {sample_entry.get('event_template', 'N/A')}")
        print(f"Original Full Log Snippet:\n{original_log_full[:500]}...") # Show a snippet of the log

        # 1. Report the (mocked) contextual sequence used for this template
        print("\nNOTE: Sequence retrieval in this test uses a MOCKED target entry metadata.")
        print("For a real sequence, 'sample_entry' needs 'source_file'/'line_id_in_file_header'.")
        if not log_sequence:
            print("Sequence analysis result: No preceding events retrieved.")
        else:
            print(f"Retrieved {len(log_sequence)} preceding log entries for sequence analysis.")

        # 2. Prepare query for RAG retriever
        retriever_query = f"HDFS troubleshooting for {sample_entry.get('level', 'N/A')} log: {sample_entry.get('event_template', '')}. Provide solution for original message: {original_log_full[:200]}"

        # 3. Retrieve relevant context from knowledge base
        print("Retrieving relevant context from knowledge base...")
        retrieved_docs = retriever.invoke(retriever_query)

        context_text = "\n\n".join([doc.page_content for doc in retrieved_docs])
        if not context_text:
            context_text = "No specific context found in the knowledge base. The LLM will generate a general solution."
            print("WARNING: No specific context retrieved from RAG. Solution may be general.")
        else:
            print(f"Retrieved {len(retrieved_docs)} document chunks.")

        # 4. Prepare the final prompt for the LLM
        final_prompt_for_llm = PROMPT.format(
            log_entry_full=original_log_full,
            sequence_of_events_json=sequence_json_str,
            context=context_text
        )

        # 5. Invoke the LLM directly with the prepared prompt
        print("Sending final prompt to Gemini LLM for solution generation...")
        try:
            llm_response = llm.invoke(final_prompt_for_llm)
            solution_json_str = llm_response.content.strip()

            # Remove a surrounding Markdown code fence (``` or ```json) if the model added one.
            # str.lstrip/rstrip take a *set of characters*, not a prefix/suffix, so the fence
            # is removed by explicit slicing instead.
            if solution_json_str.startswith("```"):
                solution_json_str = solution_json_str[3:]
                if solution_json_str[:4].lower() == "json":
                    solution_json_str = solution_json_str[4:]
                if solution_json_str.endswith("```"):
                    solution_json_str = solution_json_str[:-3]
                solution_json_str = solution_json_str.strip()

            try:
                generated_solution = json.loads(solution_json_str)
                print("\n--- GENERATED INCIDENT RESPONSE PLAN (JSON) ---")
                print(json.dumps(generated_solution, indent=2))
                print("-----------------------------------------------")
            except json.JSONDecodeError as e:
                print(f"ERROR: LLM response was not valid JSON for template {sample_entry.get('event_id', 'N/A')}: {e}")
                print(f"Raw LLM response content (first 500 chars):\n {solution_json_str[:500]}")

        except Exception as e:
            # Best-effort batch: report the failure for this template and move on to the next.
            print(f"ERROR: Failed to invoke LLM for template {sample_entry.get('event_id', 'N/A')}: {e}")
            print("Please check your API key, internet connection, and Gemini API quotas.")

else:
    print("No problematic templates found to analyze. Please ensure Cell 3 loaded some.")

print("\n--- LLM Analysis & Solution Generation Complete for Samples ---")

--- Defining LLM Prompt Template ---
LLM Prompt Template defined.

--- LLM Analysis & Solution Generation ---


Analyzing problematic templates:   0%|          | 0/47 [00:00<?, ?it/s]


--- Analyzing Template 1 (ID: HDFS_6C708D31, Level: WARN) ---
Event Template: Slow BlockReceiver write packet to mirror took <NUM>ms (threshold=<NUM>ms)
Original Full Log Snippet:
2016-04-13 21:56:12,682 WARN org.apache.hadoop.hdfs.server.datanode.DataNode: Slow BlockReceiver write packet to mirror took 340ms (threshold=300ms)...

NOTE: Sequence retrieval in this test uses a MOCKED target entry metadata.
For a real sequence, 'sample_entry' needs 'source_file'/'line_id_in_file_header'.
Retrieved 10 preceding log entries for sequence analysis.
Retrieving relevant context from knowledge base...
Retrieved 4 document chunks.
Sending final prompt to Gemini LLM for solution generation...

--- GENERATED INCIDENT RESPONSE PLAN (JSON) ---
{
  "summary": "The DataNode is experiencing slow write operations to its mirror DataNode, exceeding the configured threshold of 300ms. The log entry 'Slow BlockReceiver write packet to mirror took 340ms (threshold=300ms)' indicates a performance bottleneck du

KeyboardInterrupt: 

In [None]:
print("--- Defining LLM Prompt Template ---")

# Define the Prompt Template for the LLM (concise variant: response plan items are plain strings).
# NOTE: this re-binds `prompt_template` and `PROMPT`, replacing any definition made by an earlier cell.
# Doubled braces ({{ }}) are literal braces in the rendered prompt; single braces are template variables.
prompt_template = """
You are an expert HDFS (Hadoop Distributed File System) Site Reliability Engineer (SRE) and incident responder.
Your task is to concisely analyze a given problematic HDFS log entry, understand the specific underlying problem, and generate a brief, actionable incident response plan.

Here is the problematic HDFS log entry for analysis:
{log_entry_full}

Here is the sequence of preceding log events that led up to this problem:
{sequence_of_events_json}

Here is relevant troubleshooting and solution context from our knowledge base:
{context}

Based on the HDFS log entry, the sequence of events, and the provided context, please perform the following:

1.  **Problem Summary:** Provide a concise, clear summary of what specifically went wrong, referencing insights from the sequence if possible.
2.  **Severity Assessment:** Assign a severity level (Critical, High, Medium, Low) based on the log's impact.
3.  **Root Cause Hypothesis:** Suggest the most probable root cause(s). **Explain the flow or chain of events that likely led to this problem, explicitly referencing specific events or patterns from the provided sequence.**
4.  **Affected Components:** List the HDFS components or services that are most likely affected.
5.  **Actionable Response Plan (Role-Specific):**
    * **DevOps/SRE Actions:** Detailed, step-by-step actions.
    * **Developer Actions:** Specific areas to check in code, potential data issues, or configurations.
    * **Security Actions (if applicable):** Steps to verify/address potential security implications.

Format your entire response as a single JSON object with the following keys:
"summary": "...",
"severity": "...",
"root_cause_hypothesis": "...",
"affected_components": ["...", "..."],
"response_plan": {{
    "devops_sre_actions": ["...", "..."],
    "developer_actions": ["...", "..."],
    "security_actions": ["...", "..."]
}}

**CRITICAL INSTRUCTION:** Ensure your response is ONLY the complete and valid JSON object. Do NOT include any text before or after the JSON. Do NOT truncate the JSON object. It must be perfectly parsable JSON.
"""

# Create the PromptTemplate object
PROMPT = PromptTemplate(
    template=prompt_template,
    input_variables=["log_entry_full", "sequence_of_events_json", "context"]
)

print("LLM Prompt Template defined.")
print("\n--- LLM Analysis & Solution Generation ---")

if problematic_templates_for_test:
    # Iterate through each loaded problematic template.
    # IMPORTANT: For full runs, this loop can take a very long time and consume many API tokens.
    # For initial testing, you might want to slice this list (e.g., problematic_templates_for_test[:5])
    for i, entry_to_analyze in enumerate(tqdm(problematic_templates_for_test, desc="Analyzing problematic entries")):
        original_log_full = entry_to_analyze.get('original_log_full', '')

        print(f"\n--- Analyzing Entry {i+1} (ID: {entry_to_analyze.get('event_id', 'N/A')}, Level: {entry_to_analyze.get('level', 'N/A')}) ---")
        print(f"Original Full Log Snippet:\n{original_log_full[:500]}...")

        # 1. Retrieve Contextual Sequence
        # The entry itself is passed as the target metadata, so it must carry 'source_file' and
        # 'line_id_in_file_header' for a sequence to be found. Entries that lack them (such as
        # template records parsed from the templates list file) yield an empty sequence.
        log_sequence = get_contextual_log_sequence_from_disk(
            all_parsed_jsonl_path=INPUT_PARSED_JSONL_FULL,
            target_entry_metadata=entry_to_analyze,
            num_lines_before=NUM_PRECEDING_LOGS_FOR_SEQUENCE,
            offset_map=offset_map
        )
        if not log_sequence:
            sequence_json_str = "No preceding log events retrieved or sequence unavailable."
            print("Sequence analysis result: No preceding events retrieved.")
        else:
            sequence_json_str = json.dumps(log_sequence, indent=2)
            print(f"Retrieved {len(log_sequence)} preceding log entries for sequence analysis.")

        # 2. Prepare query for RAG retriever
        retriever_query = f"HDFS troubleshooting for {entry_to_analyze.get('level', 'N/A')} log: {entry_to_analyze.get('event_template', '')}. Provide solution for original message: {original_log_full[:200]}"

        # 3. Retrieve relevant context from knowledge base
        print("Retrieving relevant context from knowledge base...")
        retrieved_docs = retriever.invoke(retriever_query)

        context_text = "\n\n".join([doc.page_content for doc in retrieved_docs])
        if not context_text:
            context_text = "No specific context found in the knowledge base. The LLM will generate a general solution."
            print("WARNING: No specific context retrieved from RAG. Solution may be general.")
        else:
            print(f"Retrieved {len(retrieved_docs)} document chunks.")

        # 4. Prepare the final prompt for the LLM
        final_prompt_for_llm = PROMPT.format(
            log_entry_full=original_log_full,
            sequence_of_events_json=sequence_json_str,
            context=context_text
        )

        # 5. Invoke the LLM directly with the prepared prompt
        print("Sending final prompt to Gemini LLM for solution generation...")
        try:
            llm_response = llm.invoke(final_prompt_for_llm)
            solution_json_str = llm_response.content.strip()

            # Remove a surrounding Markdown code fence (``` or ```json) if the model added one.
            # str.lstrip/rstrip take a *set of characters*, not a prefix/suffix, so the fence
            # is removed by explicit slicing instead.
            if solution_json_str.startswith("```"):
                solution_json_str = solution_json_str[3:]
                if solution_json_str[:4].lower() == "json":
                    solution_json_str = solution_json_str[4:]
                if solution_json_str.endswith("```"):
                    solution_json_str = solution_json_str[:-3]
                solution_json_str = solution_json_str.strip()

            try:
                generated_solution = json.loads(solution_json_str)
                print("\n--- GENERATED INCIDENT RESPONSE PLAN (JSON) ---")
                print(json.dumps(generated_solution, indent=2))
                print("-----------------------------------------------")
            except json.JSONDecodeError as e:
                print(f"ERROR: LLM response was not valid JSON for entry {entry_to_analyze.get('event_id', 'N/A')}: {e}")
                print(f"Raw LLM response content (first 500 chars):\n {solution_json_str[:500]}")

        except Exception as e:
            # Best-effort batch: report the failure for this entry and move on to the next.
            print(f"ERROR: Failed to invoke LLM for entry {entry_to_analyze.get('event_id', 'N/A')}: {e}")
            print("Please check your API key, internet connection, and Gemini API quotas.")

else:
    # NOTE(review): the message below refers to "Cell 6"; confirm it names the cell that
    # actually populates problematic_templates_for_test.
    print("No problematic log entries found to analyze. Please ensure Cell 6 loaded some.")

print("\n--- Full LLM Analysis & Solution Generation Complete ---")

--- Defining LLM Prompt Template ---
LLM Prompt Template defined.

--- LLM Analysis & Solution Generation ---


Analyzing problematic entries:   0%|          | 0/47 [00:00<?, ?it/s]


--- Analyzing Entry 1 (ID: HDFS_6C708D31, Level: WARN) ---
Original Full Log Snippet:
2016-04-13 21:56:12,682 WARN org.apache.hadoop.hdfs.server.datanode.DataNode: Slow BlockReceiver write packet to mirror took 340ms (threshold=300ms)...


NameError: name 'OUTPUT_PARSED_JSONL' is not defined