In [16]:
import os
import json
import pandas as pd
from dotenv import load_dotenv
from tqdm.notebook import tqdm

# LangChain components for RAG
from langchain_community.document_loaders import PyPDFLoader, TextLoader, UnstructuredFileLoader, UnstructuredMarkdownLoader 
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import FAISS # Using FAISS for persistent vector store
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# --- Configuration ---
load_dotenv()  # Reads variables such as GOOGLE_API_KEY from a local .env file

# Data locations, written relative to the notebook's working directory
# (the notebook is expected to sit in 'notebooks/' below the project root).
INPUT_PARSED_JSONL_FULL = "../data/parsed_logs/hdfs_v2_parsed_hybrid_final.jsonl"  # Full parsed log file (~43GB)
INPUT_OFFSET_INDEX_CSV = "../data/parsed_logs/hdfs_v2_offset_index.csv"  # Byte-offset index for the file above
SOLUTION_DOCS_DIR = "../data/solution_docs/"  # Source documents for the RAG knowledge base
FAISS_INDEX_PATH = "../faiss_index/"  # Directory in which the FAISS index is persisted

# Tunable limits
NUM_PRECEDING_LOGS_FOR_SEQUENCE = 20  # Lines of history retrieved before each problematic entry
MAX_PARSED_LOGS_TO_LOAD = 100_000  # Cap on parsed log lines read from the JSONL file
MAX_OFFSET_INDEX_LOAD = 100_000  # Cap on rows read from the byte-offset index CSV

# --- LLM and Embedding Model Initialization ---
# Both clients authenticate with the same key, so it is looked up once.
_google_api_key = os.getenv("GOOGLE_API_KEY")

llm = ChatGoogleGenerativeAI(
    google_api_key=_google_api_key,
    model="gemini-2.0-flash",
    temperature=0.0,
    max_output_tokens=4096,
)
print("Gemini LLM initialized.")

embedding_model = GoogleGenerativeAIEmbeddings(
    google_api_key=_google_api_key,
    model="models/embedding-001",
)
print("Gemini Embedding model initialized.")

# Make sure every directory this notebook reads from or writes to exists.
for _required_dir in (
    os.path.dirname(INPUT_PARSED_JSONL_FULL),  # data/parsed_logs
    os.path.dirname(INPUT_OFFSET_INDEX_CSV),   # data/parsed_logs
    SOLUTION_DOCS_DIR,                         # data/solution_docs
    FAISS_INDEX_PATH,                          # faiss_index/
):
    os.makedirs(_required_dir, exist_ok=True)

# Report (without aborting) any input file produced by an earlier notebook that is missing.
_required_inputs = (
    (
        INPUT_PARSED_JSONL_FULL,
        f"ERROR: Full parsed JSONL file NOT FOUND at {INPUT_PARSED_JSONL_FULL}.",
        "Please ensure you have run '01_hdfs_log_parsing_hybrid.ipynb' and copied the output.",
    ),
    (
        INPUT_OFFSET_INDEX_CSV,
        f"ERROR: Byte-offset index CSV NOT FOUND at {INPUT_OFFSET_INDEX_CSV}.",
        "Please run '05_byte_offset_indexing.ipynb' first to create it.",
    ),
)
for _input_path, _problem, _remedy in _required_inputs:
    if not os.path.exists(_input_path):
        print(_problem)
        print(_remedy)

print("\nFull setup complete. Paths and models configured.")

Gemini LLM initialized.
Gemini Embedding model initialized.

Full setup complete. Paths and models configured.


In [17]:
print("--- Loading Byte-Offset Index ---")
# Maps (source_file, line_id_in_file_header) -> byte_offset, as listed in the index CSV.
offset_map = {}

if os.path.exists(INPUT_OFFSET_INDEX_CSV):
    try:
        # Read at most one row beyond the cap: that is enough to tell whether the
        # index is larger than the cap, without parsing the entire CSV into memory
        # only to discard most of it afterwards.
        offset_df = pd.read_csv(INPUT_OFFSET_INDEX_CSV, nrows=MAX_OFFSET_INDEX_LOAD + 1)

        if len(offset_df) > MAX_OFFSET_INDEX_LOAD:
            print(f"Limiting byte-offset index loading to first {MAX_OFFSET_INDEX_LOAD} entries for testing.")
            offset_df = offset_df.head(MAX_OFFSET_INDEX_LOAD)

        # Iterate the three columns in lockstep instead of using iterrows(), which
        # builds a Series object for every row.
        offset_rows = zip(
            offset_df['source_file'],
            offset_df['line_id_in_file_header'],
            offset_df['byte_offset'],
        )
        for src_name, line_no, offset_value in tqdm(offset_rows, total=len(offset_df), desc="Loading offset map"):
            offset_map[(str(src_name), int(line_no))] = int(offset_value)
        print(f"Loaded byte-offset index with {len(offset_map)} entries.")
    except Exception as e:
        # Covers unreadable/malformed CSVs, missing columns and non-numeric values.
        print(f"ERROR: Could not load offset index CSV: {e}")
        print("Please ensure '05_byte_offset_indexing.ipynb' ran successfully and generated a valid CSV.")
else:
    print("WARNING: Byte-offset index CSV not found. Sequence retrieval will not work.")

print("\n--- Building RAG Knowledge Base ---")

# A non-empty index directory is taken to mean that an index was saved there earlier.
faiss_index_exists = os.path.exists(FAISS_INDEX_PATH) and bool(os.listdir(FAISS_INDEX_PATH))

if faiss_index_exists:
    print(f"Loading existing FAISS index from {FAISS_INDEX_PATH}...")
    try:
        # SECURITY NOTE: allow_dangerous_deserialization=True permits unpickling of the
        # stored index metadata. Only point FAISS_INDEX_PATH at a directory that this
        # project wrote itself; never load an index obtained from an untrusted source.
        vectorstore = FAISS.load_local(FAISS_INDEX_PATH, embedding_model, allow_dangerous_deserialization=True)
        print("FAISS index loaded successfully and contains data.")
    except Exception as e:
        print(f"Error loading FAISS index: {e}. Rebuilding from scratch.")
        faiss_index_exists = False

if not faiss_index_exists:
    print("No existing valid FAISS index found or loading failed. Building from scratch...")

    # 1. Load Solution Documents
    solution_doc_files_to_load = []
    if not os.path.exists(SOLUTION_DOCS_DIR):
        print(f"ERROR: Solution documents directory not found at {SOLUTION_DOCS_DIR}. RAG will not have context.")
    else:
        for root, _, files in os.walk(SOLUTION_DOCS_DIR):
            for file in files:
                file_path = os.path.join(root, file)
                # Compare extensions case-insensitively so 'GUIDE.PDF' is not skipped.
                lowered_path = file_path.lower()
                if lowered_path.endswith('.pdf'):
                    loader = PyPDFLoader(file_path)
                elif lowered_path.endswith(('.txt', '.md', '.html', '.docx', '.xlsx')):
                    loader = UnstructuredFileLoader(file_path)
                else:
                    print(f"Skipping unsupported file type: {file_path}")
                    continue

                # One unreadable document must not abort the whole knowledge-base build.
                try:
                    docs = loader.load()
                    solution_doc_files_to_load.extend(docs)
                    print(f"Loaded {len(docs)} pages/chunks from {os.path.basename(file_path)}")
                except Exception as e:
                    print(f"ERROR: Could not load {file_path}: {e}")

    print(f"\nTotal raw documents loaded for RAG: {len(solution_doc_files_to_load)}")
    if not solution_doc_files_to_load:
        print("WARNING: No solution documents were loaded. RAG will not have context.")

    # 2. Split Documents into Chunks
    print("Splitting documents into smaller chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,     # Size of each chunk
        chunk_overlap=200,   # Overlap between chunks to maintain context
        length_function=len  # Use character length
    )
    rag_chunks = text_splitter.split_documents(solution_doc_files_to_load)

    print(f"Split into {len(rag_chunks)} chunks for RAG.")

    if not rag_chunks:
        print("WARNING: No chunks created for RAG. Check document loading or text splitter settings.")
        # A FAISS index cannot be built from zero documents; without this guard the
        # build below fails with an opaque error and 'vectorstore' is left undefined.
        raise RuntimeError(
            f"Cannot build the RAG knowledge base: no document chunks were produced from "
            f"{SOLUTION_DOCS_DIR}. Add supported documents (.pdf, .txt, .md, .html, .docx, .xlsx) "
            f"and re-run this cell."
        )

    # 3. Create Embeddings and Store in FAISS
    print(f"Creating embeddings and storing in FAISS (in-memory and saving to disk at {FAISS_INDEX_PATH})...")
    vectorstore = FAISS.from_documents(
        documents=rag_chunks,
        embedding=embedding_model
    )
    vectorstore.save_local(FAISS_INDEX_PATH)  # Persist so later runs can load instead of re-embedding
    print("Embeddings created and stored in FAISS, and saved to disk.")

# 4. Create Retriever
retriever = vectorstore.as_retriever()
print("RAG retriever initialized.")

print("\n--- RAG Knowledge Base Setup Complete ---")

--- Loading Byte-Offset Index ---
Limiting byte-offset index loading to first 100000 entries for testing.


Loading offset map:   0%|          | 0/100000 [00:00<?, ?it/s]

Loaded byte-offset index with 100000 entries.

--- Building RAG Knowledge Base ---
Loading existing FAISS index from ../faiss_index/...
FAISS index loaded successfully and contains data.
RAG retriever initialized.

--- RAG Knowledge Base Setup Complete ---


In [18]:
# --- Define Sequence Retrieval Function ---
def get_contextual_log_sequence_from_disk(
    all_parsed_jsonl_path: str, # Path to the 43GB JSONL file
    target_entry_metadata: dict, # Problematic entry's metadata: {'source_file': ..., 'line_id_in_file_header': ...}
    num_lines_before: int, 
    offset_map: dict # The loaded (source_file, line_id) -> byte_offset map
) -> list:
    """
    Retrieve the log entries that precede a target entry, reading straight from the JSONL file on disk.

    The byte-offset map is used to seek directly to the first line of the window, so the
    file is never scanned from its beginning.

    Args:
        all_parsed_jsonl_path: Path of the parsed JSONL file (one JSON object per line).
        target_entry_metadata: Dict describing the target entry; 'source_file' and an
            integer 'line_id_in_file_header' are required.
        num_lines_before: Maximum number of preceding entries to return.
        offset_map: Mapping of (source_file, line_id_in_file_header) -> byte offset of
            that entry within the JSONL file.

    Returns:
        Up to ``num_lines_before`` entries (dicts) from the same source file whose line id
        is lower than the target's, ordered by ascending line id. An empty list is returned
        when the metadata is incomplete, the window start is absent from ``offset_map``,
        ``num_lines_before`` is not positive, or the file cannot be read. Problems are
        reported with ``print`` rather than raised.
    """
    target_source_file = target_entry_metadata.get('source_file')
    target_line_id = target_entry_metadata.get('line_id_in_file_header')

    if not target_source_file or not isinstance(target_line_id, int):
        print("Error in get_contextual_log_sequence_from_disk: Missing 'source_file' or 'line_id_in_file_header' in target_entry_metadata.")
        return []

    # Nothing to retrieve. This also avoids 'sequence[-0:]', which would return the
    # whole list instead of an empty one.
    if num_lines_before <= 0:
        return []

    # First line of the window; line ids start at 1.
    start_line_id_in_file = max(1, target_line_id - num_lines_before)
    start_byte_offset = offset_map.get((target_source_file, start_line_id_in_file))

    if start_byte_offset is None:
        print(f"Warning: Exact start offset for line {start_line_id_in_file} in {target_source_file} not found in offset map. Returning empty sequence.")
        return [] 

    # Scan up to 2*N lines plus a small buffer. Every physical line counts towards the
    # limit (including undecodable ones) so a corrupt region cannot cause an unbounded scan.
    max_lines_to_scan = num_lines_before * 2 + 5
    sequence = []

    try:
        # Binary mode: the offset map holds byte offsets, and seeking a text-mode file to
        # anything other than a value returned by tell() is not defined behaviour.
        with open(all_parsed_jsonl_path, 'rb') as f:
            f.seek(start_byte_offset)

            for lines_scanned, raw_line in enumerate(f):
                if lines_scanned >= max_lines_to_scan:
                    break

                try:
                    parsed_seq_entry = json.loads(raw_line.decode('utf-8', errors='ignore'))
                except json.JSONDecodeError:
                    continue  # Skip lines that are not valid JSON

                if not isinstance(parsed_seq_entry, dict):
                    continue  # A log entry must be a JSON object

                entry_line_id = parsed_seq_entry.get('line_id_in_file_header', 0)
                if isinstance(entry_line_id, bool) or not isinstance(entry_line_id, (int, float)):
                    # A null / non-numeric id cannot be ordered against the target; skipping
                    # it keeps one bad entry from discarding the whole sequence.
                    continue

                # Keep only entries of the same source file that come BEFORE the target.
                if parsed_seq_entry.get('source_file') == target_source_file and entry_line_id < target_line_id:
                    sequence.append(parsed_seq_entry)

    except Exception as e:
        # Best-effort contract: callers receive an empty sequence instead of an exception.
        print(f"Error retrieving sequence from disk for {target_source_file} line {target_line_id}: {e}")
        return []

    # Chronological order, trimmed to the N entries closest to the target.
    sequence.sort(key=lambda entry: entry.get('line_id_in_file_header', 0))
    return sequence[-num_lines_before:]

print("Sequence retrieval function defined.")


print("\n--- Loading & Filtering Parsed HDFS Logs for LLM Analysis ---")

# Only the first MAX_PARSED_LOGS_TO_LOAD lines of the file are read (testing sample).
problematic_log_entries_for_llm = []  # Entries whose level is in PROBLEMATIC_LEVELS_TO_ANALYZE
all_parsed_logs_for_sequence_lookup = []  # Every successfully parsed entry of the sample
PROBLEMATIC_LEVELS_TO_ANALYZE = ['ERROR', 'WARN', 'FATAL']

if os.path.exists(INPUT_PARSED_JSONL_FULL):
    print(f"Loading first {MAX_PARSED_LOGS_TO_LOAD} lines from {INPUT_PARSED_JSONL_FULL} for testing...")
    with open(INPUT_PARSED_JSONL_FULL, 'r', encoding='utf-8', errors='ignore') as f:
        for i, line in enumerate(tqdm(f, desc=f"Loading up to {MAX_PARSED_LOGS_TO_LOAD} parsed logs")):
            if i >= MAX_PARSED_LOGS_TO_LOAD:  # Stop after MAX_PARSED_LOGS_TO_LOAD lines
                break

            try:
                parsed_entry = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Warning: Could not decode JSON on line {i+1} of parsed logs: {e} - Line: {line.strip()[:100]}...")
                continue

            if not isinstance(parsed_entry, dict):
                # A valid JSON value that is not an object is not a log entry.
                print(f"Warning: Skipping non-object JSON value on line {i+1} of parsed logs.")
                continue

            # Position this entry will occupy in the loaded list. It differs from the file
            # line index 'i' as soon as an earlier line has been skipped.
            index_in_loaded_list = len(all_parsed_logs_for_sequence_lookup)
            all_parsed_logs_for_sequence_lookup.append(parsed_entry)

            # A missing or non-string level is treated as "not problematic" instead of
            # aborting the whole load with an AttributeError.
            level = parsed_entry.get('level')
            if isinstance(level, str) and level.strip().upper() in PROBLEMATIC_LEVELS_TO_ANALYZE:
                parsed_entry['original_index_in_loaded_list'] = index_in_loaded_list
                problematic_log_entries_for_llm.append(parsed_entry)

    print(f"Successfully loaded {len(all_parsed_logs_for_sequence_lookup)} total parsed HDFS log entries into memory (for sequence lookup).")
    print(f"Found {len(problematic_log_entries_for_llm)} problematic log entries for LLM analysis.")
else:
    print(f"ERROR: Full parsed JSONL file NOT FOUND at {INPUT_PARSED_JSONL_FULL}.")

if not problematic_log_entries_for_llm:
    print("WARNING: No problematic log entries loaded in the sample. LLM analysis will be skipped.")

print("\n--- Parsed HDFS Logs Loaded & Problematic Entries Filtered (Sample) ---")

Sequence retrieval function defined.

--- Loading & Filtering Parsed HDFS Logs for LLM Analysis ---
Loading first 100000 lines from ../data/parsed_logs/hdfs_v2_parsed_hybrid_final.jsonl for testing...


Loading up to 100000 parsed logs: 0it [00:00, ?it/s]

Successfully loaded 100000 total parsed HDFS log entries into memory (for sequence lookup).
Found 11 problematic log entries for LLM analysis.

--- Parsed HDFS Logs Loaded & Problematic Entries Filtered (Sample) ---


In [19]:
print("--- Defining LLM Prompt Template ---")

# Prompt sent to the LLM for every problematic entry. Literal JSON braces are doubled
# ({{ and }}) so that only the three named variables are substituted when formatting.
# It asks for a self-evaluation, a causal root-cause explanation and role-specific actions.
prompt_template = """
You are an expert HDFS (Hadoop Distributed File System) Site Reliability Engineer (SRE) and incident responder.
Your task is to concisely analyze a given problematic HDFS log entry, understand the specific underlying problem, and generate a brief, actionable incident response plan.

Here is the problematic HDFS log entry for analysis:
{log_entry_full}

Here is the sequence of preceding log events that led up to this problem:
{sequence_of_events_json}

Here is relevant troubleshooting and solution context from our knowledge base:
{context}

Based on the HDFS log entry, the sequence of events, and the provided context, please perform the following:

**LLM Analysis Feedback (Self-Evaluation):**
* **Confidence Level (High/Medium/Low):** How confident are you in this analysis based on the provided information?
* **Context Sufficiency (Sufficient/Partially Sufficient/Insufficient):** Is the provided `log_entry_full`, `sequence_of_events_json`, and `context` enough for a definitive root cause and solution?
* **Needed Additional Info (If Insufficient):** Explain what specific additional log entries (e.g., timestamps, components, error messages), metrics, or external context would improve accuracy.

1.  **Problem Summary (Concise):** Provide a brief, clear summary of what specifically went wrong, referencing insights from the sequence if possible.
2.  **Severity Assessment:** Assign a severity level (Critical, High, Medium, Low) based on the log's impact.
3.  **Root Cause Hypothesis:** Suggest the most probable root cause(s). **Explain the flow or chain of events that likely led to this problem, explicitly referencing specific events or patterns from the provided sequence.** Be specific about the modules or components involved in the causal chain.
4.  **Affected Components:** List the HDFS components or services that are most likely affected.
5.  **Actionable Response Plan (Brief & Role-Specific):**
    * For each action, provide:
        * `step_description`: A concise description of the action.
        * `responsible_team`: Specify WHICH TEAM (DevOps/SRE, Developer, Security).
        * `responsible_module_or_component`: Specify WHICH MODULE/COMPONENT should perform the action (e.g., DataNode Storage, NameNode IPC Layer, HDFS Client Library).
        * `specific_effect_on_problem`: What this action will specifically do and how it directly addresses the error or root cause.
        * `expected_outcome_or_status`: What specific result to look for if the action is successful (e.g., "logs show 'service started'", "metric returns to baseline").
    * **Limit the number of actions to the top 3-5 most impactful per role.**
    * **Do NOT include direct commands in the `step_description`.** Focus on the action and its specific impact.

Format your entire response as a single JSON object with the following keys:
"llm_analysis_feedback": {{
    "confidence_level": "...",
    "context_sufficiency": "...",
    "needed_additional_info": "..."
}},
"summary": "...",
"severity": "...",
"root_cause_hypothesis": "...",
"affected_components": ["...", "..."],
"response_plan": {{
    "devops_sre_actions": [
        {{"step_description": "...", "responsible_team": "...", "responsible_module_or_component": "...", "specific_effect_on_problem": "...", "expected_outcome_or_status": "..."}},
        {{"step_description": "...", "responsible_team": "...", "responsible_module_or_component": "...", "specific_effect_on_problem": "...", "expected_outcome_or_status": "..."}}
    ],
    "developer_actions": [
        {{"step_description": "...", "responsible_team": "...", "responsible_module_or_component": "...", "specific_effect_on_problem": "...", "expected_outcome_or_status": "..."}},
        {{"step_description": "...", "responsible_team": "...", "responsible_module_or_component": "...", "specific_effect_on_problem": "...", "expected_outcome_or_status": "..."}}
    ],
    "security_actions": [
        {{"step_description": "...", "responsible_team": "...", "responsible_module_or_component": "...", "specific_effect_on_problem": "...", "expected_outcome_or_status": "..."}},
        {{"step_description": "...", "responsible_team": "...", "responsible_module_or_component": "...", "specific_effect_on_problem": "...", "expected_outcome_or_status": "..."}}
    ]
}}

**CRITICAL INSTRUCTION:** Ensure your response is ONLY the complete and valid JSON object. Do NOT include any text before or after the JSON. Do NOT truncate the JSON object. It must be perfectly parsable JSON.
"""

# Create the PromptTemplate object
PROMPT_TEMPLATE_LLM = PromptTemplate( # Renamed to avoid clash if 'PROMPT' is used by other LangChain components
    template=prompt_template,
    input_variables=["log_entry_full", "sequence_of_events_json", "context"]
)

print("LLM Prompt Template defined.")
print("\n--- LLM Analysis & Solution Generation ---")

if problematic_log_entries_for_llm:
    # IMPORTANT: For quick testing, you might want to slice this list (e.g., problematic_log_entries_for_llm[:5])
    for i, entry_to_analyze in enumerate(tqdm(problematic_log_entries_for_llm, desc="Analyzing problematic entries")):
        entry_id = entry_to_analyze.get('event_id', 'N/A')
        entry_level = entry_to_analyze.get('level', 'N/A')
        # Read the raw log text once, tolerating a missing/None value, so a single
        # incomplete entry cannot abort the whole loop with a KeyError/TypeError.
        original_log_text = entry_to_analyze.get('original_log_full') or ''

        print(f"\n--- Analyzing Entry {i+1} (ID: {entry_id}, Level: {entry_level}) ---")
        print(f"Event Template: {entry_to_analyze.get('event_template', 'N/A')}")
        print(f"Original Full Log Snippet:\n{original_log_text[:500]}...")

        # 1. Retrieve the preceding log lines. The helper reads them from the JSONL file
        #    on disk, using 'offset_map' to seek to the start of the window.
        log_sequence = get_contextual_log_sequence_from_disk(
            all_parsed_jsonl_path=INPUT_PARSED_JSONL_FULL,
            target_entry_metadata=entry_to_analyze,
            num_lines_before=NUM_PRECEDING_LOGS_FOR_SEQUENCE,
            offset_map=offset_map
        )
        if log_sequence:
            sequence_json_str = json.dumps(log_sequence, indent=2)
            print(f"Retrieved {len(log_sequence)} preceding log entries for sequence analysis.")
        else:
            sequence_json_str = "No preceding log events retrieved or sequence unavailable."
            print("Sequence analysis result: No preceding events retrieved.")

        # 2. Prepare query for RAG retriever
        retriever_query = f"HDFS troubleshooting for {entry_level} log: {entry_to_analyze.get('event_template', '')}. Original log snippet: {original_log_text[:200]}"

        # 3. Retrieve relevant context from knowledge base
        print("Retrieving relevant context from knowledge base...")
        retrieved_docs = retriever.invoke(retriever_query)

        context_text = "\n\n".join([doc.page_content for doc in retrieved_docs])
        if not context_text:
            context_text = "No specific context found in the knowledge base. The LLM will generate a general solution."
            print("WARNING: No specific context retrieved from RAG. Solution may be general.")
        else:
            print(f"Retrieved {len(retrieved_docs)} document chunks.")

        # 4. Prepare the final prompt for the LLM
        final_prompt_for_llm = PROMPT_TEMPLATE_LLM.format(
            log_entry_full=original_log_text,
            sequence_of_events_json=sequence_json_str,
            context=context_text
        )

        # 5. Invoke the LLM directly with the prepared prompt
        print("Sending final prompt to Gemini LLM for solution generation...")
        try:
            llm_response = llm.invoke(final_prompt_for_llm)
            solution_json_str = llm_response.content.strip()
        except Exception as e:
            print(f"ERROR: Failed to invoke LLM for entry {entry_id}: {e}")
            print("Please check your API key, internet connection, and Gemini API quotas.")
            continue

        # 6. Remove a surrounding markdown code fence, if any. Exact prefix/suffix slicing
        #    is used because str.lstrip/str.rstrip strip *sets of characters*, not
        #    substrings; this also accepts a bare ``` fence and an upper-case JSON tag.
        if solution_json_str.startswith("```"):
            solution_json_str = solution_json_str[3:]
            if solution_json_str[:4].lower() == "json":
                solution_json_str = solution_json_str[4:]
            solution_json_str = solution_json_str.rstrip()
            if solution_json_str.endswith("```"):
                solution_json_str = solution_json_str[:-3]
            solution_json_str = solution_json_str.strip()

        # 7. Parse and display the plan
        try:
            generated_solution = json.loads(solution_json_str)
        except json.JSONDecodeError as e:
            print(f"ERROR: LLM response was not valid JSON for entry {entry_id}: {e}")
            print(f"Raw LLM response content (first 500 chars):\n {solution_json_str[:500]}")
            continue

        print("\n--- GENERATED INCIDENT RESPONSE PLAN (JSON) ---")
        print(json.dumps(generated_solution, indent=2))
        print("-----------------------------------------------")

else:
    print("No problematic templates found to analyze. Please ensure Cell 3 loaded some.")

print("\n--- Full LLM Analysis & Solution Generation Complete ---")

--- Defining LLM Prompt Template ---
LLM Prompt Template defined.

--- LLM Analysis & Solution Generation ---


Analyzing problematic entries:   0%|          | 0/11 [00:00<?, ?it/s]


--- Analyzing Entry 1 (ID: HDFS_6C708D31, Level: WARN) ---
Event Template: Slow BlockReceiver write packet to mirror took <NUM>ms (threshold=<NUM>ms)
Original Full Log Snippet:
2016-04-13 21:56:12,682 WARN org.apache.hadoop.hdfs.server.datanode.DataNode: Slow BlockReceiver write packet to mirror took 340ms (threshold=300ms)...
Retrieved 20 preceding log entries for sequence analysis.
Retrieving relevant context from knowledge base...
Retrieved 4 document chunks.
Sending final prompt to Gemini LLM for solution generation...


KeyboardInterrupt: 

In [None]:
print("--- Defining LLM Prompt Template ---")

# Extended prompt: on top of the summary, severity, root cause and response plan it also
# requests an impact assessment, a per-action 'type' and temporary mitigations.
# Literal JSON braces are doubled ({{ and }}) so that only the three named variables are
# substituted. The text below is sent to the model verbatim, so it must not contain
# developer notes: '#' comments inside the example would invite commented (invalid) JSON.
prompt_template = """
You are an expert HDFS (Hadoop Distributed File System) Site Reliability Engineer (SRE) and incident responder.
Your task is to concisely analyze a given problematic HDFS log entry, understand the specific underlying problem, and generate a brief, actionable incident response plan.

Here is the problematic HDFS log entry for analysis:
{log_entry_full}

Here is the sequence of preceding log events that led up to this problem:
{sequence_of_events_json}

Here is relevant troubleshooting and solution context from our knowledge base:
{context}

Based on the HDFS log entry, the sequence of events, and the provided context, please perform the following:

**LLM Analysis Feedback (Self-Evaluation):**
* **Confidence Level (High/Medium/Low):** How confident are you in this analysis based on the provided information?
* **Context Sufficiency (Sufficient/Partially Sufficient/Insufficient):** Is the provided `log_entry_full`, `sequence_of_events_json`, and `context` enough for a definitive root cause and solution?
* **Needed Additional Info (If Insufficient):** Explain what specific additional log entries (e.g., timestamps, components, error messages), metrics, or external context would improve accuracy.

1.  **Problem Summary (Concise):** Provide a brief, clear summary of what specifically went wrong, referencing insights from the sequence if possible.
2.  **Severity Assessment:** Assign a severity level (Critical, High, Medium, Low) based on the log's impact.
3.  **Impact Assessment (Brief):** Briefly describe the potential impact on HDFS services or users (e.g., data durability, performance, availability).
4.  **Root Cause Hypothesis:** Suggest the most probable root cause(s). **Explain the flow or chain of events that likely led to this problem, explicitly referencing specific events or patterns from the provided sequence.** Be specific about the modules or components involved in the causal chain.
5.  **Affected Components:** List the HDFS components or services that are most likely affected.
6.  **Actionable Response Plan (Brief & Role-Specific):**
    * For each action, provide:
        * `step_description`: A concise description of the action.
        * `responsible_team`: Specify WHICH TEAM (DevOps/SRE, Developer, Security).
        * `responsible_module_or_component`: Specify WHICH MODULE/COMPONENT should perform the action (e.g., DataNode Storage, NameNode IPC Layer, HDFS Client Library).
        * `specific_effect_on_problem`: What this action will specifically do and how it directly addresses the error or root cause.
        * `expected_outcome_or_status`: What specific result to look for if the action is successful (e.g., "logs show 'service started'", "metric returns to baseline").
        * `type`: Classify as 'DIAGNOSTIC_ONLY' or 'POTENTIALLY_MODIFIES_STATE'.
    * **Prioritize the top 3-5 most impactful actions per role.**
    * **Do NOT include direct commands as the 'command' field is not part of this structure.** Focus on the action and its specific impact.

7.  **Known Workarounds/Temporary Mitigations (If Applicable):**
    * Provide a brief list of any quick, temporary fixes that could alleviate the immediate problem while a permanent solution is being investigated. If none, state "None identified".

Format your entire response as a single JSON object with the following keys:
{{
"llm_analysis_feedback": {{
    "confidence_level": "...",
    "context_sufficiency": "...",
    "needed_additional_info": "..."
}},
"summary": "...",
"severity": "...",
"impact_assessment": "...",
"root_cause_hypothesis": "...",
"affected_components": ["...", "..."],
"response_plan": {{
    "devops_sre_actions": [
        {{"step_description": "...", "responsible_team": "...", "responsible_module_or_component": "...", "specific_effect_on_problem": "...", "expected_outcome_or_status": "...", "type": "..."}},
        {{"step_description": "...", "responsible_team": "...", "responsible_module_or_component": "...", "specific_effect_on_problem": "...", "expected_outcome_or_status": "...", "type": "..."}}
    ],
    "developer_actions": [
        {{"step_description": "...", "responsible_team": "...", "responsible_module_or_component": "...", "specific_effect_on_problem": "...", "expected_outcome_or_status": "...", "type": "..."}},
        {{"step_description": "...", "responsible_team": "...", "responsible_module_or_component": "...", "specific_effect_on_problem": "...", "expected_outcome_or_status": "...", "type": "..."}}
    ],
    "security_actions": [
        {{"step_description": "...", "responsible_team": "...", "responsible_module_or_component": "...", "specific_effect_on_problem": "...", "expected_outcome_or_status": "...", "type": "..."}},
        {{"step_description": "...", "responsible_team": "...", "responsible_module_or_component": "...", "specific_effect_on_problem": "...", "expected_outcome_or_status": "...", "type": "..."}}
    ]
}},
"temporary_mitigations": ["...", "..."]
}}

**CRITICAL INSTRUCTION:** Ensure your response is ONLY the complete and valid JSON object. Do NOT include any text before or after the JSON. Do NOT truncate the JSON object. It must be perfectly parsable JSON.
"""

# Create the PromptTemplate object
PROMPT_TEMPLATE_LLM = PromptTemplate(
    template=prompt_template,
    input_variables=["log_entry_full", "sequence_of_events_json", "context"]
)

print("LLM Prompt Template defined.")
print("\n--- LLM Analysis & Solution Generation ---")

if problematic_log_entries_for_llm:
    # IMPORTANT: For quick testing, you might want to slice this list (e.g., problematic_log_entries_for_llm[:5])
    for i, entry_to_analyze in enumerate(tqdm(problematic_log_entries_for_llm, desc="Analyzing problematic entries")):
        entry_id = entry_to_analyze.get('event_id', 'N/A')
        entry_level = entry_to_analyze.get('level', 'N/A')
        # Read the raw log text once, tolerating a missing/None value, so a single
        # incomplete entry cannot abort the whole loop with a KeyError/TypeError.
        original_log_text = entry_to_analyze.get('original_log_full') or ''

        print(f"\n--- Analyzing Entry {i+1} (ID: {entry_id}, Level: {entry_level}) ---")
        print(f"Event Template: {entry_to_analyze.get('event_template', 'N/A')}")
        print(f"Original Full Log Snippet:\n{original_log_text[:500]}...")

        # 1. Retrieve Contextual Sequence (read from the JSONL file on disk via offset_map)
        log_sequence = get_contextual_log_sequence_from_disk(
            all_parsed_jsonl_path=INPUT_PARSED_JSONL_FULL,
            target_entry_metadata=entry_to_analyze,
            num_lines_before=NUM_PRECEDING_LOGS_FOR_SEQUENCE,
            offset_map=offset_map
        )
        if log_sequence:
            sequence_json_str = json.dumps(log_sequence, indent=2)
            print(f"Retrieved {len(log_sequence)} preceding log entries for sequence analysis.")
        else:
            sequence_json_str = "No preceding log events retrieved or sequence unavailable."
            print("Sequence analysis result: No preceding events retrieved.")

        # 2. Prepare query for RAG retriever
        retriever_query = f"HDFS troubleshooting for {entry_level} log: {entry_to_analyze.get('event_template', '')}. Provide solution for original message: {original_log_text[:200]}"

        # 3. Retrieve relevant context from knowledge base
        print("Retrieving relevant context from knowledge base...")
        retrieved_docs = retriever.invoke(retriever_query)

        context_text = "\n\n".join([doc.page_content for doc in retrieved_docs])
        if not context_text:
            context_text = "No specific context found in the knowledge base. The LLM will generate a general solution."
            print("WARNING: No specific context retrieved from RAG. Solution may be general.")
        else:
            print(f"Retrieved {len(retrieved_docs)} document chunks.")

        # 4. Prepare the final prompt for the LLM
        final_prompt_for_llm = PROMPT_TEMPLATE_LLM.format(
            log_entry_full=original_log_text,
            sequence_of_events_json=sequence_json_str,
            context=context_text
        )

        # 5. Invoke the LLM directly with the prepared prompt
        print("Sending final prompt to Gemini LLM for solution generation...")
        try:
            llm_response = llm.invoke(final_prompt_for_llm)
            solution_json_str = llm_response.content.strip()
        except Exception as e:
            print(f"ERROR: Failed to invoke LLM for entry {entry_id}: {e}")
            print("Please check your API key, internet connection, and Gemini API quotas.")
            continue

        # 6. Remove a surrounding markdown code fence, if any. Exact prefix/suffix slicing
        #    is used because str.lstrip/str.rstrip strip *sets of characters*, not
        #    substrings; this also accepts a bare ``` fence and an upper-case JSON tag.
        if solution_json_str.startswith("```"):
            solution_json_str = solution_json_str[3:]
            if solution_json_str[:4].lower() == "json":
                solution_json_str = solution_json_str[4:]
            solution_json_str = solution_json_str.rstrip()
            if solution_json_str.endswith("```"):
                solution_json_str = solution_json_str[:-3]
            solution_json_str = solution_json_str.strip()

        # 7. Parse and display the plan
        try:
            generated_solution = json.loads(solution_json_str)
        except json.JSONDecodeError as e:
            print(f"ERROR: LLM response was not valid JSON for entry {entry_id}: {e}")
            print(f"Raw LLM response content (first 500 chars):\n {solution_json_str[:500]}")
            continue

        print("\n--- GENERATED INCIDENT RESPONSE PLAN (JSON) ---")
        print(json.dumps(generated_solution, indent=2))
        print("-----------------------------------------------")

else:
    print("No problematic log entries found to analyze. Please ensure Cell 3 loaded some.")

print("\n--- Full LLM Analysis & Solution Generation Complete ---")

--- Defining LLM Prompt Template ---
LLM Prompt Template defined.

--- LLM Analysis & Solution Generation ---


NameError: name 'problematic_log_entries_full' is not defined