In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

In [2]:
from preprocessing.text_agent import get_or_process_text

# Load the parsed pages from a JSON Lines file (one record per line) and
# preview the first rows to confirm the filename / filetype / content columns.
parsed_text_path = "data/parsed_text_data.jsonl"
text_df = pd.read_json(parsed_text_path, lines=True)
text_df.head()

Unnamed: 0,filename,filetype,content
0,https_saw.galois.com_intro_IntroToSAW.html.txt,txt,First Example: Counting Set Bits\n\nMost devel...
1,https_saw.galois.com_intro_Pointers.html.txt,txt,Specifying Memory Layout\n\nPrograms are about...
2,https_saw.galois.com_intro_Salsa20.html.txt,txt,Compositional Verification and Salsa20\n\nFirs...
3,https_saw.galois.com_intro_HMACProblem.html.txt,txt,Proof Maintenance Exercises: s2n HMAC\n\nThe e...
4,https_saw.galois.com_intro_HMACSolution.html.txt,txt,Proof Maintenance Exercises: Solutions\n\nThis...


In [4]:
# Load environment variables via python-dotenv (typically from a local .env
# file), then read the API key from the environment. OPEN_AI_KEY is None when
# the variable is not set.
load_dotenv()
OPEN_AI_KEY = os.environ.get("OPEN_AI_KEY")

In [4]:
# Run every document through get_or_process_text and keep an audit record
# (filename, hash, original input, processed output) for each one.
#
# NOTE: this cell overwrites text_df["content"] with the processed text, so
# re-running it without reloading text_df feeds already-processed text back in.
text_processing = []
total_rows = len(text_df)
for position, (_, row) in enumerate(text_df.iterrows(), start=1):
    raw = row["content"]
    print(f"Processing row {position} / {total_rows}: {row['filename']}")
    result = get_or_process_text(raw, model="gpt-4.1-mini", key=OPEN_AI_KEY)
    # The helper may return either (hash, processed) or just the processed text.
    if isinstance(result, tuple) and len(result) == 2:
        content_hash, processed = result
    else:
        content_hash, processed = None, result
    text_processing.append({
        "filename": row["filename"],
        "hash": content_hash,
        "input": raw,
        "processed": processed,
    })

# Write the processed text back with an explicit column assignment. Rows
# yielded by iterrows() may be copies, so assigning to row["content"] inside
# the loop is not guaranteed to update text_df. The list is in iteration order,
# so it lines up positionally with text_df's rows.
text_df["content"] = [record["processed"] for record in text_processing]

text_processing_df = pd.DataFrame(text_processing)
text_processing_df.head()

Processing row 1 / 30: https_saw.galois.com_intro_IntroToSAW.html.txt
Processing row 2 / 30: https_saw.galois.com_intro_Pointers.html.txt
Processing row 3 / 30: https_saw.galois.com_intro_Salsa20.html.txt
Processing row 4 / 30: https_saw.galois.com_intro_HMACProblem.html.txt
Processing row 5 / 30: https_saw.galois.com_intro_HMACSolution.html.txt
Processing row 6 / 30: https_galoisinc.github.io_cryptol_master_BasicSyntax.html.txt
Processing row 7 / 30: https_galoisinc.github.io_cryptol_master_Expressions.html.txt
Processing row 8 / 30: https_galoisinc.github.io_cryptol_master_BasicTypes.html.txt
Processing row 9 / 30: https_galoisinc.github.io_cryptol_master_OverloadedOperations.html.txt
Processing row 10 / 30: https_galoisinc.github.io_cryptol_master_TypeDeclarations.html.txt
Processing row 11 / 30: https_galoisinc.github.io_cryptol_master_Modules.html.txt
Processing row 12 / 30: https_galoisinc.github.io_cryptol_master_FFI.html.txt
Processing row 13 / 30: https_galoisinc.github.io_cry

Unnamed: 0,filename,hash,input,processed
0,https_saw.galois.com_intro_IntroToSAW.html.txt,c1c0ea89741609032f2be284d23cdfbfa6a984fb42642b...,First Example: Counting Set Bits\n\nMost devel...,```markdown\n## First Example: Counting Set Bi...
1,https_saw.galois.com_intro_Pointers.html.txt,dc3e7d9ee52338bb41902d67a67e1e8ce7db549aebc5fd...,Specifying Memory Layout\n\nPrograms are about...,```markdown\n## Specifying Memory Layout\n\nPr...
2,https_saw.galois.com_intro_Salsa20.html.txt,6ad48d3e7d1b0b4c60a585567246ec7ef960671963c177...,Compositional Verification and Salsa20\n\nFirs...,```markdown\n## Compositional Verification and...
3,https_saw.galois.com_intro_HMACProblem.html.txt,34b8dc6564472d2cb9f1cc2ab31939734f5ad17c3ac3e6...,Proof Maintenance Exercises: s2n HMAC\n\nThe e...,```markdown\n## Proof Maintenance Exercises: s...
4,https_saw.galois.com_intro_HMACSolution.html.txt,7165c5da44ba90f705c2b3ec9a157ecfb058ea515bcc5d...,Proof Maintenance Exercises: Solutions\n\nThis...,```markdown\n## Proof Maintenance Exercises: S...


In [5]:
# Sanity check: at this point the content column should show the processed
# text rather than the raw page text.
text_df.head(n=5)

Unnamed: 0,filename,filetype,content
0,https_saw.galois.com_intro_IntroToSAW.html.txt,txt,```markdown\n## First Example: Counting Set Bi...
1,https_saw.galois.com_intro_Pointers.html.txt,txt,```markdown\n## Specifying Memory Layout\n\nPr...
2,https_saw.galois.com_intro_Salsa20.html.txt,txt,```markdown\n## Compositional Verification and...
3,https_saw.galois.com_intro_HMACProblem.html.txt,txt,```markdown\n## Proof Maintenance Exercises: s...
4,https_saw.galois.com_intro_HMACSolution.html.txt,txt,```markdown\n## Proof Maintenance Exercises: S...


In [6]:
# Persist the cleaned frame as JSON Lines (one record per line).
# to_json does not create missing parent directories, so make sure the output
# folder exists first; otherwise a fresh checkout fails on this cell.
cleaned_text_path = "data/training_datasets/cleaned_text_data.jsonl"
os.makedirs(os.path.dirname(cleaned_text_path), exist_ok=True)
text_df.to_json(cleaned_text_path, lines=True, orient="records")