In [None]:
import os
import json
import pandas as pd
from tqdm.notebook import tqdm

# --- Configuration ---
# Toggle for the indexing input:
#   True  -> index the small, manually created sample file (quick testing)
#   False -> index the full 43GB parsed log file
USE_SAMPLE_PARSED_FILE_FOR_INDEX = False

# Candidate input files. Paths are relative to the project root's 'notebooks/'
# folder, which is where this notebook lives.
FULL_PARSED_JSONL_PATH = "../data/parsed_logs/hdfs_v2_parsed_hybrid_final.jsonl"
SAMPLE_PARSED_JSONL_PATH = "../data/parsed_logs/sample_parsed_logs.jsonl"

# The file that will actually be indexed, selected by the toggle above.
INPUT_PARSED_JSONL_FOR_INDEXING = (
    SAMPLE_PARSED_JSONL_PATH
    if USE_SAMPLE_PARSED_FILE_FOR_INDEX
    else FULL_PARSED_JSONL_PATH
)

# Destination CSV for the generated byte-offset index.
OUTPUT_OFFSET_INDEX_FILE = "../data/parsed_logs/hdfs_v2_offset_index.csv"

# Create the index's parent directory up front so the later save cannot fail on it.
os.makedirs(os.path.dirname(OUTPUT_OFFSET_INDEX_FILE), exist_ok=True)

# --- Status report ---
print("Setup complete. Paths configured for Byte-Offset Indexing.")
print(f"Using input file for indexing: {INPUT_PARSED_JSONL_FOR_INDEXING}")
if not os.path.exists(INPUT_PARSED_JSONL_FOR_INDEXING):
    # Report the problem and a hint matching the selected mode; this cell only
    # warns and does not stop execution.
    print(f"ERROR: Input JSONL file NOT FOUND at {INPUT_PARSED_JSONL_FOR_INDEXING}.")
    print(
        "Please ensure 'sample_parsed_logs.jsonl' is created and placed in 'data/parsed_logs/'."
        if USE_SAMPLE_PARSED_FILE_FOR_INDEX
        else "Please ensure 'hdfs_v2_parsed_hybrid_final.jsonl' exists or set USE_SAMPLE_PARSED_FILE_FOR_INDEX=True for testing."
    )

print("-----------------------------------")

In [None]:
print(f"Building byte-offset index for {INPUT_PARSED_JSONL_FOR_INDEXING}...")

# One record per indexable line: its source file, its line id from the parsed
# entry, and the byte position in the JSONL file at which the line starts.
offset_map_data = []
current_byte_offset = 0  # Byte position of the start of the line being processed

try:
    # Binary mode is deliberate: offsets must count the bytes actually on disk.
    # Text mode with errors='ignore' silently drops undecodable bytes and
    # translates '\r\n' to '\n', so re-encoding the decoded string can
    # under-count and shift every subsequent offset.
    with open(INPUT_PARSED_JSONL_FOR_INDEXING, 'rb') as f_in:
        # tqdm shows progress while streaming through the large JSONL file.
        for i, raw_line in tqdm(enumerate(f_in), desc="Generating byte offset index"):
            line_start_offset = current_byte_offset
            # Advance the running offset immediately (raw line length includes
            # its newline) so every `continue` below leaves it correct.
            current_byte_offset += len(raw_line)

            # Parse the JSON line to read its metadata, tolerating stray
            # non-UTF-8 bytes the same way the text-mode reader did.
            try:
                parsed_entry = json.loads(raw_line.decode('utf-8', errors='ignore'))
            except json.JSONDecodeError as e:
                print(f"Warning: Error decoding JSON in offset index build (Line {i+1}): {e}. Skipping line.")
                continue  # Skip malformed JSON lines

            if isinstance(parsed_entry, dict):
                source_file_name = parsed_entry.get('source_file')
                line_id_in_original_file = parsed_entry.get('line_id_in_file_header')
            else:
                # Valid JSON that is not an object carries no metadata to index;
                # treat it as missing fields instead of aborting the whole run.
                source_file_name = None
                line_id_in_original_file = None

            if source_file_name is None or line_id_in_original_file is None:
                print(f"Warning: Missing 'source_file' or 'line_id_in_file_header' in parsed entry (Line {i+1}). Skipping line for index.")
                continue  # Skip if essential metadata is missing

            # Record where this line starts, keyed by its original file and line id.
            offset_map_data.append({
                'source_file': source_file_name,
                'line_id_in_file_header': line_id_in_original_file,
                'byte_offset': line_start_offset
            })

    print(f"\nSuccessfully created index for {len(offset_map_data)} lines.")

except FileNotFoundError:
    print(f"ERROR: Parsed JSONL file not found at {INPUT_PARSED_JSONL_FOR_INDEXING}. Please check path and ensure it's generated.")
    offset_map_data = []  # Clear data to prevent further errors
except Exception as e:
    # Any other failure is reported; entries collected so far are left in place.
    print(f"An error occurred during index generation: {e}")

In [None]:
print(f"Saving byte-offset index to {OUTPUT_OFFSET_INDEX_FILE}...")

if offset_map_data:
    # Build the frame once from the collected records and write it as CSV.
    offset_df = pd.DataFrame(offset_map_data)
    offset_df.to_csv(OUTPUT_OFFSET_INDEX_FILE, index=False)
    print(f"Byte-offset index saved successfully with {len(offset_df)} entries.")
    # Preview only in this branch: `offset_df` exists only when there was data,
    # so previewing unconditionally would raise NameError on a fresh kernel
    # (or show a stale frame from an earlier run).
    print(f"\nExample of saved offset index (first 5 rows):\n{offset_df.head()}")
else:
    print("WARNING: No offset data to save. Index was not generated.")