In [9]:
import re
import json
from collections import Counter
import pandas as pd

In [10]:
# Per-line extraction results. The three lists are filled in lockstep by the
# extraction loop (one entry per processed line) and later become the columns
# of the exported DataFrame.
inputs, targets, sources = [], [], []

# Tally of how often each extraction method (or "error") was recorded.
source_counter = Counter()

# Run statistics.
failed = both_present = 0   # lines that raised / lines matched by the INPUT+TARGET marker regex
failed_lines = []           # 1-based numbers of the lines that raised

In [11]:
# Number of lines read from `response_file`; incremented by the extraction loop.
total = 0

# --- File locations (relative to the notebook's working directory) ---
# Input: JSONL file that is parsed line by line. Replace with your actual input path.
response_file = "outputs/batch_0_25_Jun_25_output.jsonl"
# Output: spreadsheet holding the extracted input/target pairs.
output_excel = "outputs/batch_0_25_Jun_25_output.xlsx"
# Output: plain-text log with the per-line extraction details and the summary.
detailed_log = "outputs/detailed_log.txt"

In [12]:

def _split_input_target(content):
    """Split one normalised response text into input and target parts.

    Strategies are tried in order:

    1. ``regex_markers`` - the text contains ``## INPUT ##`` followed by
       ``## TARGET ##``; what lies between the markers is the input and
       everything after the second marker is the target.
    2. ``double_newline`` - the text contains exactly one blank-line
       separator (``"\\n\\n"``); the part before it is the input and the part
       after it is the target.
    3. ``fallback_full_content`` - neither applies; the input is empty and
       the whole text is the target.

    Args:
        content: Response text with ``\\r\\n`` already converted to ``\\n``
            and surrounding whitespace stripped.

    Returns:
        A ``(input_part, target_part, source_method)`` tuple of strings.
    """
    match = re.search(r'\n*## INPUT ##\n*(.+?)\n*## TARGET ##\n*(.+)', content, re.DOTALL)
    if match:
        return match.group(1), match.group(2), "regex_markers"

    # Exactly one "\n\n" guarantees that the split yields two pieces.
    if content.count("\n\n") == 1:
        input_part, target_part = content.split("\n\n", 1)
        return input_part, target_part, "double_newline"

    return "", content, "fallback_full_content"


# Reset every accumulator here so that re-running this cell on its own starts
# from a clean slate instead of appending duplicate rows and inflating counts.
inputs = []
targets = []
sources = []
source_counter = Counter()
failed = 0
both_present = 0
failed_lines = []
total = 0

with open(response_file, "r", encoding="utf-8") as f, open(detailed_log, "w", encoding="utf-8") as log_file:
    for idx, line in enumerate(f, 1):
        total += 1

        try:
            data = json.loads(line)
            original_content = data["response"]["body"]["choices"][0]["message"]["content"]
            # Normalise line endings and trim before matching; the raw text is
            # kept separately so the log shows exactly what was received.
            content = original_content.replace('\r\n', '\n').strip()

            log_file.write(f"\n--- Line {idx} ---\n")
            log_file.write(f"RAW Content: {repr(original_content)}\n")

            input_part, target_part, source_method = _split_input_target(content)
            if source_method == "regex_markers":
                both_present += 1

            log_file.write(f"Extracted INPUT: {repr(input_part)}\n")
            log_file.write(f"Extracted TARGET: {repr(target_part)}\n")
            log_file.write(f"Method Used: {source_method}\n")

        except Exception as e:
            # Best effort: a line that cannot be parsed (invalid JSON, missing
            # keys, non-string content, ...) becomes a placeholder row so the
            # output keeps one row per input line.
            input_part, target_part, source_method = "", "n/a", "error"
            failed += 1
            failed_lines.append(idx)
            log_file.write(f"\nLine {idx}: EXCEPTION - {e}\n")
            log_file.write(f"Line Content: {repr(line)}\n")

        # Single recording point: exactly one row per line, whichever path ran,
        # so the three lists can never drift out of step with `total`.
        inputs.append(input_part)
        targets.append(target_part)
        sources.append(source_method)
        source_counter[source_method] += 1

In [13]:
# Export the extraction results: one spreadsheet row per processed line,
# with the method that produced each row alongside the extracted text.
columns_by_name = {
    "input": inputs,
    "target": targets,
    "source_method": sources,
}
df = pd.DataFrame(columns_by_name)

# index=False keeps the DataFrame's row index out of the sheet.
df.to_excel(output_excel, index=False)

In [14]:
# Build the run summary, show it in the notebook and append it to the detailed log.
# The leading and trailing empty entries give the text its surrounding newlines.
summary_lines = [
    "",
    "========== SUMMARY ==========",
    f"Total lines processed: {total}",
    f"Successfully extracted (regex_markers): {source_counter['regex_markers']}",
    f"Fallback to double_newline: {source_counter['double_newline']}",
    f"Fallback to full content: {source_counter['fallback_full_content']}",
    f"Failed (exceptions): {source_counter['error']}",
    "=============================",
    "",
]
summary = "\n".join(summary_lines)

print(summary)

# Append mode: the per-line details already written to the log are kept.
with open(detailed_log, "a", encoding="utf-8") as log_file:
    log_file.write(summary)


Total lines processed: 500
Successfully extracted (regex_markers): 76
Fallback to double_newline: 138
Fallback to full content: 286
Failed (exceptions): 0

