In [1]:
from pathlib import Path 
import os, dotenv
# Load variables from the .env file, then make the directory named by
# PYTHONPATH the working directory so that the relative paths used in later
# cells ("data/...") and the `src.` imports resolve from there.
dotenv.load_dotenv()
project_root = os.getenv("PYTHONPATH")
if project_root is None:
    # Without this guard, Path(None) fails with an opaque TypeError.
    raise RuntimeError(
        "PYTHONPATH is not set; define it in the environment or in .env "
        "so the notebook can change into the project root."
    )
# NOTE(review): assumes PYTHONPATH holds a single directory, not an
# os.pathsep-separated list -- confirm against the project's .env.
os.chdir(Path(project_root).expanduser())

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

In [3]:
from src.preprocessing.text_agent import get_or_process_text

# Read the parsed corpus (JSON Lines: one record per line) into a frame and
# preview the first rows.
parsed_text_path = "data/parsed_text_data.jsonl"
text_df = pd.read_json(parsed_text_path, lines=True)
text_df.head(5)

Unnamed: 0,filename,filetype,content
0,https://galoisinc.github.io/cryptol/master/Bas...,text,Basic Syntax\n============\n\n\nDeclarations\n...
1,https://galoisinc.github.io/cryptol/master/Exp...,text,Expressions\n===========\n\nThis section provi...
2,https://galoisinc.github.io/cryptol/master/Bas...,text,Basic Types\n===========\n\n\nTuples and Recor...
3,https://galoisinc.github.io/cryptol/master/Ove...,text,Overloaded Operations\n=====================\n...
4,https://galoisinc.github.io/cryptol/master/Typ...,text,.. _type-declarations:\n\nType Declarations\n=...


In [4]:
# Refresh environment variables from .env, then pick up the API key that is
# handed to the text-processing helper; the value is None when the variable
# is not defined.
load_dotenv()
OPEN_AI_KEY = os.environ.get("OPEN_AI_KEY")

In [5]:
# Filenames that must bypass the text-processing step; rows whose "filename"
# appears here keep their raw content unchanged.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated by Python, which previously fused two paths into a
# single entry that could never match. Duplicate entries were also removed.
exclusion_list = [
    "cryptol-specs/Primitive/Keyless/Hash/MD5.md",
    "cryptol-specs/Primitive/Symmetric/Cipher/Block/PRINCE.md",
    "cryptol-specs/Primitive/Symmetric/Cipher/Block/TripleDES.md",
    "cryptol-specs/Primitive/Symmetric/Cipher/Block/DES.md",
    "cryptol-specs/Primitive/Symmetric/Cipher/Block/LED.md",
    "cryptol-specs/Primitive/Symmetric/Cipher/Authenticated/ChaChaPolyCryptolIETF.md",
    "cryptol-specs/Primitive/Symmetric/Cipher/Authenticated/SIV_rfc5297.md",
    "cryptol-specs/Primitive/Symmetric/Cipher/Stream/ZUC1_6_Tests.md",
    "cryptol-specs/Primitive/Symmetric/Cipher/Stream/Salsa20.md",
    "cryptol-specs/Primitive/Symmetric/Cipher/Stream/ZUC1_6.md",
    "cryptol-specs/Primitive/Symmetric/KDF/HKDF.md",
    "cryptol-specs/Primitive/Symmetric/KDF/HKDF256Tests.md",
]

In [6]:
# Run every document through the text-processing helper and keep an audit
# record (filename, hash, raw input, processed output) for each row.
# Files named in `exclusion_list` are passed through unchanged with no hash.
#
# NOTE: this cell overwrites text_df["content"] with the processed text, so
# re-running it without reloading text_df feeds already-processed text back
# into the helper.
# NOTE(review): the processed values displayed for this cell start with a
# "```markdown" fence -- confirm whether that wrapper should be stripped
# before the data is saved.
text_processing = []
total_rows = len(text_df)
for position, (_, row) in enumerate(text_df.iterrows(), start=1):
    filename = row["filename"]
    raw = row["content"]
    print(f"Processing row {position} / {total_rows}: {filename}")
    if filename in exclusion_list:
        print(f"  Skipping excluded file: {filename}")
        content_hash, processed = None, raw
    else:
        result = get_or_process_text(raw, model="gpt-5.1", key=OPEN_AI_KEY)
        # The helper may return either (hash, text) or just the text.
        if isinstance(result, tuple) and len(result) == 2:
            content_hash, processed = result
        else:
            content_hash, processed = None, result
    text_processing.append({
        "filename": filename,
        "hash": content_hash,
        "input": raw,
        "processed": processed,
    })

# Fixing the column order keeps the frame well-formed even when text_df is empty.
text_processing_df = pd.DataFrame(
    text_processing, columns=["filename", "hash", "input", "processed"]
)

# Write the processed text back with an explicit column assignment. Assigning
# to the `row` yielded by iterrows() is not guaranteed to reach the frame
# (the row can be a copy), so the write-back must not rely on it.
text_df["content"] = [record["processed"] for record in text_processing]

text_processing_df.head()

Processing row 1 / 9: https://galoisinc.github.io/cryptol/master/BasicSyntax.html
Processing row 2 / 9: https://galoisinc.github.io/cryptol/master/Expressions.html
Processing row 3 / 9: https://galoisinc.github.io/cryptol/master/BasicTypes.html
Processing row 4 / 9: https://galoisinc.github.io/cryptol/master/OverloadedOperations.html
Processing row 5 / 9: https://galoisinc.github.io/cryptol/master/TypeDeclarations.html
Processing row 6 / 9: https://galoisinc.github.io/cryptol/master/Modules.html
Processing row 7 / 9: https://galoisinc.github.io/cryptol/master/FFI.html
Processing row 8 / 9: https://galoisinc.github.io/cryptol/master/Project.html
Processing row 9 / 9: https://galoisinc.github.io/cryptol/master/REPLCommands.html


Unnamed: 0,filename,hash,input,processed
0,https://galoisinc.github.io/cryptol/master/Bas...,7ccdbadd606469fa25d4ecd8d284e71b757f61d248aece...,Basic Syntax\n============\n\n\nDeclarations\n...,```markdown\n## Basic Syntax\n\n## Declaration...
1,https://galoisinc.github.io/cryptol/master/Exp...,d6ba93c8ac721d5f7e87be92d39ea0f434996d4c3cfade...,Expressions\n===========\n\nThis section provi...,```markdown\n## Expressions\n\nThis section pr...
2,https://galoisinc.github.io/cryptol/master/Bas...,da284a3edd0f3c74a53af187bb48f74a488538a5677fa2...,Basic Types\n===========\n\n\nTuples and Recor...,```markdown\n## Basic Types\n\n## Tuples and R...
3,https://galoisinc.github.io/cryptol/master/Ove...,1026afe5d7f6548b36516f55a03b7f856120c66e0f26ef...,Overloaded Operations\n=====================\n...,```markdown\n## Overloaded Operations\n\nMany ...
4,https://galoisinc.github.io/cryptol/master/Typ...,f68da7591444610eb2fa1fd58f4001de66b88715d02631...,.. _type-declarations:\n\nType Declarations\n=...,```markdown\n## Type Declarations\n\n### Type ...


In [7]:
# Preview text_df after the processing cell: its "content" column now holds
# the processed text rather than the raw text that was loaded.
text_df.head()

Unnamed: 0,filename,filetype,content
0,https://galoisinc.github.io/cryptol/master/Bas...,text,```markdown\n## Basic Syntax\n\n## Declaration...
1,https://galoisinc.github.io/cryptol/master/Exp...,text,```markdown\n## Expressions\n\nThis section pr...
2,https://galoisinc.github.io/cryptol/master/Bas...,text,```markdown\n## Basic Types\n\n## Tuples and R...
3,https://galoisinc.github.io/cryptol/master/Ove...,text,```markdown\n## Overloaded Operations\n\nMany ...
4,https://galoisinc.github.io/cryptol/master/Typ...,text,```markdown\n## Type Declarations\n\n### Type ...


In [8]:
# Save the frame as JSON Lines (one record per line). The output directory
# is created first because to_json does not create missing parent folders.
cleaned_text_path = Path("data/training_datasets/cleaned_text_data.jsonl")
cleaned_text_path.parent.mkdir(parents=True, exist_ok=True)
text_df.to_json(cleaned_text_path, lines=True, orient="records")