In [9]:
from pathlib import Path 
import os, dotenv
# Load variables from the local .env file into the process environment.
dotenv.load_dotenv()

# Later cells use paths relative to the working directory (e.g. "data/..."),
# so move into the directory named by PYTHONPATH. Fail with a readable message
# instead of the opaque TypeError that Path(None) raises when it is unset.
project_root = os.getenv("PYTHONPATH")
if project_root is None:
    raise RuntimeError(
        "PYTHONPATH is not set; define it in the environment or in .env "
        "so the notebook can change into the project root."
    )
os.chdir(Path(project_root).expanduser())

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

In [11]:
from src.preprocessing.text_agent import get_or_process_text

# Load the parsed documents (JSON Lines: one record per line) and preview them.
text_df = pd.read_json(path_or_buf="data/parsed_text_data.jsonl", lines=True)
text_df.head(n=5)

Unnamed: 0,filename,filetype,content
0,https://saw.galois.com/intro/IntroToSAW.html,txt,First Example: Counting Set Bits\n\nMost devel...
1,https://saw.galois.com/intro/Pointers.html,txt,Specifying Memory Layout\n\nPrograms are about...
2,https://saw.galois.com/intro/Salsa20.html,txt,Compositional Verification and Salsa20\n\nFirs...
3,https://saw.galois.com/intro/HMACProblem.html,txt,Proof Maintenance Exercises: s2n HMAC\n\nThe e...
4,https://saw.galois.com/intro/HMACSolution.html,txt,Proof Maintenance Exercises: Solutions\n\nThis...


In [12]:
# Populate the environment from .env, then read the API key used by the
# text-processing step; the value is None when the variable is absent.
load_dotenv()
OPEN_AI_KEY = os.environ.get("OPEN_AI_KEY")

In [13]:
# Files that the processing loop passes through unchanged instead of sending
# them to get_or_process_text. Each entry must match the "filename" column
# exactly. Every entry ends with a comma: a missing one makes Python silently
# concatenate two adjacent paths into a single entry that matches nothing.
exclusion_list = [
    "cryptol-specs/Primitive/Keyless/Hash/MD5.md",
    "cryptol-specs/Primitive/Symmetric/Cipher/Block/PRINCE.md",
    "cryptol-specs/Primitive/Symmetric/Cipher/Block/TripleDES.md",
    "cryptol-specs/Primitive/Symmetric/Cipher/Block/DES.md",
    "cryptol-specs/Primitive/Symmetric/Cipher/Block/LED.md",
    "cryptol-specs/Primitive/Symmetric/Cipher/Authenticated/ChaChaPolyCryptolIETF.md",
    "cryptol-specs/Primitive/Symmetric/Cipher/Authenticated/SIV_rfc5297.md",
    "cryptol-specs/Primitive/Symmetric/Cipher/Stream/ZUC1_6_Tests.md",
    "cryptol-specs/Primitive/Symmetric/Cipher/Stream/Salsa20.md",
    "cryptol-specs/Primitive/Symmetric/Cipher/Stream/ZUC1_6.md",
    "cryptol-specs/Primitive/Symmetric/KDF/HKDF.md",
    "cryptol-specs/Primitive/Symmetric/KDF/HKDF256Tests.md",
]

In [14]:
# Run every document through get_or_process_text, except the files named in
# exclusion_list, which are kept verbatim. One record per row is collected so
# the raw input and the processed output can be compared side by side.
text_processing = []
total_rows = len(text_df)
for position, (_, row) in enumerate(text_df.iterrows(), start=1):
    filename = row["filename"]
    raw = row["content"]
    print(f"Processing row {position} / {total_rows}: {filename}")
    if filename in exclusion_list:
        print(f"  Skipping excluded file: {filename}")
        content_hash, processed = None, raw
    else:
        result = get_or_process_text(raw, model="gpt-4.1-mini", key=OPEN_AI_KEY)
        # The result is handled as either a (hash, text) pair or bare text.
        if isinstance(result, tuple) and len(result) == 2:
            content_hash, processed = result
        else:
            content_hash, processed = None, result
    text_processing.append({
        "filename": filename,
        "hash": content_hash,
        "input": raw,
        "processed": processed,
    })

# Write the processed text back into text_df in one explicit assignment.
# Assigning to the `row` yielded by iterrows() is not guaranteed to reach the
# frame (the row may be a copy), so the update is done on the frame itself.
text_df["content"] = [record["processed"] for record in text_processing]

text_processing_df = pd.DataFrame(text_processing)
text_processing_df.head()

Processing row 1 / 59: https://saw.galois.com/intro/IntroToSAW.html
Processing row 2 / 59: https://saw.galois.com/intro/Pointers.html
Processing row 3 / 59: https://saw.galois.com/intro/Salsa20.html
Processing row 4 / 59: https://saw.galois.com/intro/HMACProblem.html
Processing row 5 / 59: https://saw.galois.com/intro/HMACSolution.html
Processing row 6 / 59: https://galoisinc.github.io/cryptol/master/BasicSyntax.html
Processing row 7 / 59: https://galoisinc.github.io/cryptol/master/Expressions.html
Processing row 8 / 59: https://galoisinc.github.io/cryptol/master/BasicTypes.html
Processing row 9 / 59: https://galoisinc.github.io/cryptol/master/OverloadedOperations.html
Processing row 10 / 59: https://galoisinc.github.io/cryptol/master/TypeDeclarations.html
Processing row 11 / 59: https://galoisinc.github.io/cryptol/master/Modules.html
Processing row 12 / 59: https://galoisinc.github.io/cryptol/master/FFI.html
Processing row 13 / 59: https://galoisinc.github.io/cryptol/master/Project.ht

Unnamed: 0,filename,hash,input,processed
0,https://saw.galois.com/intro/IntroToSAW.html,c1c0ea89741609032f2be284d23cdfbfa6a984fb42642b...,First Example: Counting Set Bits\n\nMost devel...,```markdown\n## First Example: Counting Set Bi...
1,https://saw.galois.com/intro/Pointers.html,dc3e7d9ee52338bb41902d67a67e1e8ce7db549aebc5fd...,Specifying Memory Layout\n\nPrograms are about...,```markdown\n## Specifying Memory Layout\n\nPr...
2,https://saw.galois.com/intro/Salsa20.html,6ad48d3e7d1b0b4c60a585567246ec7ef960671963c177...,Compositional Verification and Salsa20\n\nFirs...,```markdown\n## Compositional Verification and...
3,https://saw.galois.com/intro/HMACProblem.html,34b8dc6564472d2cb9f1cc2ab31939734f5ad17c3ac3e6...,Proof Maintenance Exercises: s2n HMAC\n\nThe e...,```markdown\n## Proof Maintenance Exercises: s...
4,https://saw.galois.com/intro/HMACSolution.html,7165c5da44ba90f705c2b3ec9a157ecfb058ea515bcc5d...,Proof Maintenance Exercises: Solutions\n\nThis...,```markdown\n## Proof Maintenance Exercises: S...


In [15]:
# Preview the frame after the processing step has rewritten its "content" column.
text_df.head(n=5)

Unnamed: 0,filename,filetype,content
0,https://saw.galois.com/intro/IntroToSAW.html,txt,```markdown\n## First Example: Counting Set Bi...
1,https://saw.galois.com/intro/Pointers.html,txt,```markdown\n## Specifying Memory Layout\n\nPr...
2,https://saw.galois.com/intro/Salsa20.html,txt,```markdown\n## Compositional Verification and...
3,https://saw.galois.com/intro/HMACProblem.html,txt,```markdown\n## Proof Maintenance Exercises: s...
4,https://saw.galois.com/intro/HMACSolution.html,txt,```markdown\n## Proof Maintenance Exercises: S...


In [16]:
# Persist the frame as JSON Lines: one record per line.
text_df.to_json(
    path_or_buf="data/training_datasets/cleaned_text_data.jsonl",
    orient="records",
    lines=True,
)