In [1]:
from pathlib import Path 
import datetime
import os, dotenv
# Load variables from a .env file (if any) into the process environment, then
# switch the working directory to the project root so that the relative
# "data/..." and "cache/..." paths used later in the notebook resolve.
dotenv.load_dotenv()

# The project root is read from PYTHONPATH. Check it explicitly: when the
# variable is unset, Path(None) would raise an unhelpful TypeError.
# NOTE(review): this treats PYTHONPATH as a single directory, not an
# os.pathsep-separated list — confirm that matches the .env file.
project_root = os.getenv("PYTHONPATH")
if not project_root:
    raise RuntimeError(
        "PYTHONPATH is not set; define it in .env or the environment so the "
        "notebook can change into the project root."
    )
os.chdir(Path(project_root).expanduser())

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

In [3]:
# Dataset variant; it is interpolated into the input path here and into the
# cache/output paths later in the notebook.
VARIANT = "nocomments"


def _label_filetype(extension):
    """Translate a raw `filetype` value into the label used downstream.

    'cry' becomes 'cryptol', 'saw' stays 'saw', and anything else is 'text'.
    """
    if extension == 'cry':
        return 'cryptol'
    if extension == 'saw':
        return 'saw'
    return 'text'


# Read the line-delimited JSON dataset for this variant.
preformat_data_df = pd.read_json(
    f"data/training_datasets/train_test_split/all_{VARIANT}.jsonl",
    lines=True,
)
preformat_data_df['filetype'] = preformat_data_df['filetype'].apply(_label_filetype)

# Keep only the Cryptol and SAW rows; everything labelled 'text' is dropped.
is_source_row = preformat_data_df["filetype"] != "text"
preformat_data_df = preformat_data_df[is_source_row].reset_index(drop=True)
preformat_data_df.head()


Unnamed: 0,filename,filetype,content,variant,set
0,cryptol/examples/splitAt.cry,cryptol,"x = [1,2,3,4] : [_][8]\n\ny = (splitAt x) : ([...",without_comments,unsupervised
1,cryptol/examples/AE.cry,cryptol,module AE where\n\nparameter\n type A : * ...,without_comments,unsupervised
2,cryptol/examples/xor_cipher.cry,cryptol,encrypt : {a}(fin a) => [8] -> [a][8] -> [a][8...,without_comments,supervised
3,cryptol/examples/zero_weird.cry,cryptol,x : {a}() => a -> [16]\nx v = zero v \n\nprope...,without_comments,unsupervised
4,cryptol/examples/builtin_lifting.cry,cryptol,"x = [True,False]\ny = [False,True]\n\nproperty...",without_comments,unsupervised


In [4]:
from src.preprocessing.sft_instruct_preprocess import iter_call_openai_structured, alpaca_df_to_qwen_messages, build_prompt_call_openai_structured

# Path handed to iter_call_openai_structured as its last argument.
# NOTE(review): presumably a cache of earlier model responses so re-runs do
# not repeat API calls — confirm against the helper's implementation.
file_cache_path = f"cache/alpaca_instruct_cache/SFT_{VARIANT}_source_code_V1.jsonl"

# One-row, fixed-seed sample. It is drawn BEFORE the Cryptol-only filter below,
# so it may come from any filetype still present in the frame.
test_df = preformat_data_df.sample(n=1, random_state=42).reset_index(drop=True)
input_mode = "full"

# From here on only Cryptol rows are kept in preformat_data_df.
cryptol_mask = preformat_data_df["filetype"] == "cryptol"
preformat_data_df = preformat_data_df.loc[cryptol_mask].reset_index(drop=True)

# Run the structured-output model call over the Cryptol rows.
result = iter_call_openai_structured(
    preformat_data_df,
    "gpt-5-2025-08-07",
    input_mode,
    file_cache_path,
)
result.head()

Response:

ParsedResponse[AlpacaRow](id='resp_09e4c07d64df281c0069188fbdb0388190804865a25dbbdcec', created_at=1763217341.0, error=None, incomplete_details=None, instructions=None, metadata={}, model='gpt-5-2025-08-07', object='response', output=[ResponseReasoningItem(id='rs_09e4c07d64df281c0069188fbe2a8c81909c8e394998ac980b', summary=[], type='reasoning', content=None, encrypted_content=None, status=None), ParsedResponseOutputMessage[AlpacaRow](id='msg_09e4c07d64df281c0069188fe5c9a48190b55a453d75dbfd35', content=[ParsedResponseOutputText[AlpacaRow](annotations=[], text='{"instruction":"In Cryptol, define the following with explicit types and a verification: (1) a constant x as a 4-element sequence of 8-bit values containing 1, 2, 3, and 4 in order; (2) a constant y as the pair produced by splitting x at index 2 into two sequences, each of length 2 and element width 8; (3) constants a, b, c, and d as 8-bit values equal to the first and second elements of the first component of y, and th

Unnamed: 0,filename,filetype,set,instruction,input,output,content
0,cryptol/examples/splitAt.cry,cryptol,unsupervised,"In Cryptol, define the following with explicit...",,,"x = [1,2,3,4] : [_][8]\n\ny = (splitAt x) : ([..."
1,cryptol/examples/AE.cry,cryptol,unsupervised,Define a Cryptol module named AE that contains...,,,module AE where\n\nparameter\n type A : * ...
2,cryptol/examples/xor_cipher.cry,cryptol,supervised,"In Cryptol, define type signatures for encrypt...",,,encrypt : {a}(fin a) => [8] -> [a][8] -> [a][8...
3,cryptol/examples/zero_weird.cry,cryptol,unsupervised,Write a Cryptol specification that defines: (1...,,,x : {a}() => a -> [16]\nx v = zero v \n\nprope...
4,cryptol/examples/builtin_lifting.cry,cryptol,unsupervised,Write a Cryptol specification that demonstrate...,,,"x = [True,False]\ny = [False,True]\n\nproperty..."


In [5]:
# Print the instruction / input / output triple of every SAW row in `result`
# for manual inspection.
saw_rows = result[result['filetype'] == 'saw']

if saw_rows.empty:
    # Without this notice the cell emits nothing at all when no row matches,
    # which is indistinguishable from the cell never having been executed.
    print("No rows with filetype 'saw' in result; nothing to preview.")

for idx, row in saw_rows.iterrows():
    print("Instruction:")
    print(row['instruction'])
    print("Input:")
    print(row['input'])
    print("Output:")
    print(row['output'])
    print("="*50)

In [6]:
# Convert the instruction rows into chat-style `messages` records.
# `system_prompt` is intentionally not passed, so the helper's own default
# system prompt is used.
# NOTE(review): `result` is rebound to the converted frame, so this cell
# presumably cannot be re-run without first re-running the cell that created
# `result` — confirm.
qwen_conversion_options = {
    "output": "content",
    "drop_input": True,
    "include_filename_in_user": False,
}
result = alpaca_df_to_qwen_messages(result, **qwen_conversion_options)
result.head()

Unnamed: 0,messages,filename,filetype,set
0,"[{'role': 'system', 'content': 'Return exactly...",cryptol/examples/splitAt.cry,cryptol,unsupervised
1,"[{'role': 'system', 'content': 'Return exactly...",cryptol/examples/AE.cry,cryptol,unsupervised
2,"[{'role': 'system', 'content': 'Return exactly...",cryptol/examples/xor_cipher.cry,cryptol,supervised
3,"[{'role': 'system', 'content': 'Return exactly...",cryptol/examples/zero_weird.cry,cryptol,unsupervised
4,"[{'role': 'system', 'content': 'Return exactly...",cryptol/examples/builtin_lifting.cry,cryptol,unsupervised


In [7]:
# Attach each file's `variant` to `result` by looking it up on `filename` in
# the source frame, then fix the column order for export.

# One row per (filename, variant) pair: exact duplicates in the source frame
# must not multiply rows in the merged result.
variant_by_filename = preformat_data_df[["filename", "variant"]].drop_duplicates()

result = (
    result
    # Drop a `variant` column left over from an earlier run of this cell;
    # merging again would otherwise produce `variant_x` / `variant_y` and the
    # column selection below would raise KeyError.
    .drop(columns=["variant"], errors="ignore")
    .merge(
        variant_by_filename,
        on="filename",
        how="left",
        # Raise instead of silently duplicating rows if a filename maps to
        # more than one variant.
        validate="many_to_one",
    )
)

# .copy() so later edits to reorder_df do not touch `result`.
reorder_df = result[["filename", "filetype", "set", "variant", "messages"]].copy()
reorder_df.head()


Unnamed: 0,filename,filetype,set,variant,messages
0,cryptol/examples/splitAt.cry,cryptol,unsupervised,without_comments,"[{'role': 'system', 'content': 'Return exactly..."
1,cryptol/examples/AE.cry,cryptol,unsupervised,without_comments,"[{'role': 'system', 'content': 'Return exactly..."
2,cryptol/examples/xor_cipher.cry,cryptol,supervised,without_comments,"[{'role': 'system', 'content': 'Return exactly..."
3,cryptol/examples/zero_weird.cry,cryptol,unsupervised,without_comments,"[{'role': 'system', 'content': 'Return exactly..."
4,cryptol/examples/builtin_lifting.cry,cryptol,unsupervised,without_comments,"[{'role': 'system', 'content': 'Return exactly..."


In [8]:
# Write the final frame as line-delimited JSON, one record per row.
sft_output_path = f"data/training_datasets/SFT_message_format_{VARIANT}_source_code_V2.jsonl"
reorder_df.to_json(sft_output_path, lines=True, orient="records")