In [1]:
from pathlib import Path 
import datetime
import os, dotenv
# Load variables from a local .env file into the process environment.
dotenv.load_dotenv()

# Later cells open files via relative paths ("data/...", "cache/..."), so the
# working directory is switched to the directory named by PYTHONPATH.
# NOTE(review): this treats PYTHONPATH as a single directory, not an
# os.pathsep-separated list -- confirm that matches the .env file.
project_root = os.getenv("PYTHONPATH")
if project_root is None:
    # Without this check Path(None) fails with an opaque TypeError.
    raise RuntimeError(
        "PYTHONPATH is not set; define it in .env or the environment "
        "before running this notebook."
    )
os.chdir(Path(project_root).expanduser())

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

In [3]:
# Dataset selectors; both are interpolated into the file paths below.
VARIANT = "nocomments"
VERSION = "v1.2s"

split_path = f"data/training_datasets/train_test_split/all_{VARIANT}_{VERSION}.jsonl"
metrics_path = f"data/{VARIANT}_file_metrics_{VERSION}.csv"


def _label_filetype(extension):
    """Map a raw filetype value to 'cryptol', 'saw', or the fallback 'text'."""
    if extension == 'cry':
        return 'cryptol'
    if extension == 'saw':
        return 'saw'
    return 'text'


# Read the JSON Lines split file and the per-file metrics CSV.
preformat_data_df = pd.read_json(split_path, lines=True)
token_count_df = pd.read_csv(metrics_path)

# Attach the token count to each file; the left join keeps files that have no metrics row.
token_columns = token_count_df[['filename', 'num_tokens_model']]
preformat_data_df = preformat_data_df.merge(token_columns, on='filename', how='left')

# Normalise the filetype labels, then discard everything labelled "text".
preformat_data_df['filetype'] = preformat_data_df['filetype'].apply(_label_filetype)
is_source_code = preformat_data_df["filetype"] != "text"
preformat_data_df = preformat_data_df[is_source_code].reset_index(drop=True)

preformat_data_df.head()


Unnamed: 0,filename,filetype,content,variant,set,num_tokens_model
0,AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry,cryptol,module AES where\n\nimport `Common::AES\n\ntyp...,without_comments,supervised,173.0
1,AES-GCM-SIV-proof/proof/cryptol-specs/AES128.cry,cryptol,module AES128 where\n\nimport `Common::AES\nim...,without_comments,supervised,214.0
2,AES-GCM-SIV-proof/proof/cryptol-specs/AES256.cry,cryptol,module AES256 where\n\nimport `Common::AES\nim...,without_comments,supervised,214.0
3,AES-GCM-SIV-proof/proof/cryptol-specs/TBox.cry,cryptol,type Nb = 4\ntype State = [4][Nb]...,without_comments,supervised,3400.0
4,AES-GCM-SIV-proof/proof/asm/deps/saw-script/do...,cryptol,"all : {n, a} (fin n) => (a -> Bit, [n]a) -> Bi...",without_comments,supervised,443.0


In [4]:
from src.preprocessing.sft_instruct_preprocess import iter_call_openai_structured, alpaca_df_to_qwen_messages, build_prompt_call_openai_structured

# Location of the instruction cache file. The string form handed to
# iter_call_openai_structured is derived from this Path so the two can never
# drift apart (previously the same f-string was written out twice).
out_path = Path(f"cache/alpaca_instruct_cache/SFT_{VARIANT}_source_code_{VERSION}.jsonl")
out_path.parent.mkdir(parents=True, exist_ok=True)  # create dirs if missing
file_cache_path = out_path.as_posix()  # same forward-slash string as the literal above

# One-row sample, drawn before the cryptol-only filter below.
# NOTE(review): test_df is not referenced by any later visible cell.
test_df = preformat_data_df.sample(1, random_state=42).reset_index(drop=True)
input_mode = "full"

# From here on only Cryptol files are kept.
preformat_data_df = preformat_data_df[preformat_data_df["filetype"] == "cryptol"].reset_index(drop=True)

# NOTE(review): presumably this calls the OpenAI API per row and reuses
# file_cache_path as a cache -- confirm in src.preprocessing.sft_instruct_preprocess.
result = iter_call_openai_structured(preformat_data_df, "gpt-5.1", input_mode, file_cache_path)
result.head()

Key  not found in CRYPTOL_VECTOR_STORE_ID.


Unnamed: 0,filename,filetype,set,instruction,input,output,content
0,AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry,cryptol,supervised,Write a Cryptol specification that defines the...,,,module AES where\n\nimport `Common::AES\n\ntyp...
1,AES-GCM-SIV-proof/proof/cryptol-specs/AES128.cry,cryptol,supervised,Write a Cryptol module AES128 that defines the...,,,module AES128 where\n\nimport `Common::AES\nim...
2,AES-GCM-SIV-proof/proof/cryptol-specs/AES256.cry,cryptol,supervised,Write a Cryptol module AES256 that provides 25...,,,module AES256 where\n\nimport `Common::AES\nim...
3,AES-GCM-SIV-proof/proof/cryptol-specs/TBox.cry,cryptol,supervised,Define a Cryptol module that specifies the AES...,,,type Nb = 4\ntype State = [4][Nb]...
4,AES-GCM-SIV-proof/proof/asm/deps/saw-script/do...,cryptol,supervised,"In Cryptol, write or refine a specification of...",,,"all : {n, a} (fin n) => (a -> Bit, [n]a) -> Bi..."


In [5]:
# Dump instruction/input/output for every SAW row, separated by a rule.
# Prints nothing when `result` contains no rows with filetype 'saw'.
saw_rows = result.loc[result['filetype'] == 'saw']
labelled_columns = (
    ("Instruction:", 'instruction'),
    ("Input:", 'input'),
    ("Output:", 'output'),
)
for _, record in saw_rows.iterrows():
    for label, column in labelled_columns:
        print(label)
        print(record[column])
    print("=" * 50)

In [6]:
# Options for the Alpaca -> chat-message conversion.
# No system_prompt is passed; the previously tried override is kept for reference:
#   system_prompt="You are a meticulous assistant that writes formal specifications and verification code for Cryptol programs."
qwen_format_options = dict(
    output="content",
    drop_input=True,
    include_filename_in_user=False,
)

# Rebinds `result`: afterwards it holds the converted frame (the display below
# shows the columns messages, filename, filetype, set).
result = alpaca_df_to_qwen_messages(result, **qwen_format_options)

result.head()

Unnamed: 0,messages,filename,filetype,set
0,"[{'role': 'system', 'content': 'Return exactly...",AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry,cryptol,supervised
1,"[{'role': 'system', 'content': 'Return exactly...",AES-GCM-SIV-proof/proof/cryptol-specs/AES128.cry,cryptol,supervised
2,"[{'role': 'system', 'content': 'Return exactly...",AES-GCM-SIV-proof/proof/cryptol-specs/AES256.cry,cryptol,supervised
3,"[{'role': 'system', 'content': 'Return exactly...",AES-GCM-SIV-proof/proof/cryptol-specs/TBox.cry,cryptol,supervised
4,"[{'role': 'system', 'content': 'Return exactly...",AES-GCM-SIV-proof/proof/asm/deps/saw-script/do...,cryptol,supervised


In [7]:

# Left-join `result` against the filenames of the preformatted data. Only the
# key column is selected on the right, so the join contributes no new columns.
# The right-hand keys are de-duplicated first: with repeated filenames a left
# merge would emit one output row per matching pair and silently multiply rows.
unique_filenames = preformat_data_df[["filename"]].drop_duplicates()
result = result.merge(
    unique_filenames,
    on="filename",
    how="left",
)

# Fix the column order used by the exported dataset; copy so later edits to
# reorder_df do not touch `result`.
reorder_df = result[["filename", "filetype", "set", "messages"]].copy()
reorder_df.head()


Unnamed: 0,filename,filetype,set,messages
0,AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry,cryptol,supervised,"[{'role': 'system', 'content': 'Return exactly..."
1,AES-GCM-SIV-proof/proof/cryptol-specs/AES128.cry,cryptol,supervised,"[{'role': 'system', 'content': 'Return exactly..."
2,AES-GCM-SIV-proof/proof/cryptol-specs/AES256.cry,cryptol,supervised,"[{'role': 'system', 'content': 'Return exactly..."
3,AES-GCM-SIV-proof/proof/cryptol-specs/TBox.cry,cryptol,supervised,"[{'role': 'system', 'content': 'Return exactly..."
4,AES-GCM-SIV-proof/proof/asm/deps/saw-script/do...,cryptol,supervised,"[{'role': 'system', 'content': 'Return exactly..."


In [8]:
# Write reorder_df as JSON Lines (one record per line), with no filtering on `set`.
sft_messages_path = f"data/training_datasets/SFT_message_format_{VARIANT}_{VERSION}.jsonl"
reorder_df.to_json(sft_messages_path, orient="records", lines=True)

In [9]:
# Training subset: every row whose `set` is not "holdout", with the `set`
# column removed once it has served as the filter.
not_holdout = reorder_df["set"] != "holdout"
training_df = (
    reorder_df[not_holdout]
    .reset_index(drop=True)
    .drop(columns=['set'])
)
training_df.head()

Unnamed: 0,filename,filetype,messages
0,AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry,cryptol,"[{'role': 'system', 'content': 'Return exactly..."
1,AES-GCM-SIV-proof/proof/cryptol-specs/AES128.cry,cryptol,"[{'role': 'system', 'content': 'Return exactly..."
2,AES-GCM-SIV-proof/proof/cryptol-specs/AES256.cry,cryptol,"[{'role': 'system', 'content': 'Return exactly..."
3,AES-GCM-SIV-proof/proof/cryptol-specs/TBox.cry,cryptol,"[{'role': 'system', 'content': 'Return exactly..."
4,AES-GCM-SIV-proof/proof/asm/deps/saw-script/do...,cryptol,"[{'role': 'system', 'content': 'Return exactly..."


In [10]:
# Write the holdout-free training frame as JSON Lines (one record per line).
training_messages_path = f"data/training_datasets/{VARIANT}_message_format_{VERSION}.jsonl"
training_df.to_json(training_messages_path, orient="records", lines=True)