In [1]:
from pathlib import Path 
import os, dotenv
# Load variables from a .env file (if present) into the process environment.
dotenv.load_dotenv()

# Later cells open files through relative paths ("data/..."), so the working
# directory is switched to the location named by PYTHONPATH.
_pythonpath = os.getenv("PYTHONPATH")
if _pythonpath is None:
    raise RuntimeError(
        "PYTHONPATH is not set; define it in the environment or in .env so the "
        "notebook can change into the project directory."
    )
# PYTHONPATH may hold several os.pathsep-separated entries; chdir needs a
# single directory, so the first entry is used.
os.chdir(Path(_pythonpath.split(os.pathsep)[0]).expanduser())

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

In [3]:
# Dataset variant to analyse; it selects which verified JSONL file is read.
VARIATION = "nocomments"
jsonl_path = f"data/training_datasets/verified_{VARIATION}.jsonl"

# lines=True: the file is parsed as one JSON object per line.
df = pd.read_json(path_or_buf=jsonl_path, lines=True)

In [4]:
# Preview the first five rows of the loaded dataset.
df.head(n=5)

Unnamed: 0,filename,filetype,content,variant
0,AES-GCM-SIV-proof/proof/ref-128/clmul_emulator...,saw,let {{\nmul_result : [64] -> [64] -> [2][64]\n...,without_comments
1,AES-GCM-SIV-proof/proof/ref-128/common.saw,saw,let allocArg name ty = crucible_fresh_var name...,without_comments
2,AES-GCM-SIV-proof/proof/ref-128/aes_emulation.saw,saw,let {{\ntoState : [4][32] -> State\ntoState co...,without_comments
3,AES-GCM-SIV-proof/proof/ref-128/proof.saw,saw,"include ""common.saw"";\n\nc_code <- llvm_load_m...",without_comments
4,AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry,cry,module AES where\n\nimport `Common::AES\n\ntyp...,without_comments


In [5]:
from src.preprocessing.interpreter_process import verify_df_row_with_cryptol
from src.preprocessing.saw_subprocess import run_saw_script, load_saw_results

# Build one result row per dataset file:
#   * 'cry' files are checked through verify_df_row_with_cryptol;
#   * every other file is looked up in the previously saved results, and only
#     run through SAW when RUN_MISSING_SAW is enabled.

# When False, files missing from the saved results are recorded as failed
# without invoking SAW (this is what the loop did before the flag existed,
# via an unconditional `continue`). Set to True to actually run SAW on them.
RUN_MISSING_SAW = False

RESULTS_JSONL = "data/training_datasets/syntax_check/cry_saw_syntax_verification_results.jsonl"
SERVER_URL = "http://localhost:8080"

saw_results = []  # not used in this cell; kept in case other cells reference it

MOUNT_DIR = os.getenv("MOUNT_DIR")
if MOUNT_DIR is None:
    raise RuntimeError("MOUNT_DIR is not set; define it in the environment or in .env.")
dir_path = Path(MOUNT_DIR)
dir_path.mkdir(parents=True, exist_ok=True)

# Read once, outside the f-string: nesting double quotes inside a double-quoted
# f-string is a SyntaxError on Python versions before 3.12.
REPO_ROOT = os.getenv("REPO_ROOT")
if RUN_MISSING_SAW and REPO_ROOT is None:
    raise RuntimeError("REPO_ROOT must be set when RUN_MISSING_SAW is enabled.")

out_path = Path(RESULTS_JSONL)
out_path.parent.mkdir(parents=True, exist_ok=True)  # create dirs if missing
results_df = load_saw_results(RESULTS_JSONL)


def _saw_row(filename, load_ok, error):
    """Return a result row for a file that is not checked through Cryptol."""
    return {"filename": filename, "load_ok": load_ok, "error": error, "file_deps": ["SAW"]}


rows = []
new_saw_rows = []  # fresh SAW runs, appended to results_df once after the loop
for i in range(len(df)):
    if i % 10 == 0 and i > 0:
        print(f"Processed {i} of {len(df)} files")

    record = df.iloc[i]
    filename = record["filename"]

    if record["filetype"] == 'cry':
        repl_result = verify_df_row_with_cryptol(
            df, i, host_mount_dir=MOUNT_DIR, server_url=SERVER_URL)
        rows.append({"filename": filename, **repl_result["load_info"]})
        continue

    # Reuse a previously saved result when one exists for this file.
    cached = results_df[results_df["filename"] == filename]
    if not cached.empty:
        load_ok = cached["load_ok"].values[0]
        # The stored error is only reported for files that failed to load.
        rows.append(_saw_row(filename, load_ok, None if load_ok else cached["error"].values[0]))
        continue

    if not RUN_MISSING_SAW:
        rows.append(_saw_row(filename, False, "saw"))
        continue

    saw_result = run_saw_script(f"{REPO_ROOT}/{filename}")
    new_saw_rows.append({"filename": filename, **saw_result})
    if saw_result["returncode"] == 0:
        rows.append(_saw_row(filename, True, None))
    else:
        rows.append(_saw_row(filename, False, "saw"))

# Single concat instead of growing the frame row by row inside the loop.
if new_saw_rows:
    results_df = pd.concat([results_df, pd.DataFrame(new_saw_rows)], ignore_index=True)

Processed 10 of 1330 files
Processed 20 of 1330 files
Processed 30 of 1330 files
Processed 40 of 1330 files
Processed 50 of 1330 files
Processed 60 of 1330 files
Processed 70 of 1330 files
Processed 80 of 1330 files
Processed 90 of 1330 files
Processed 100 of 1330 files
Processed 110 of 1330 files
Processed 120 of 1330 files
Processed 130 of 1330 files
Processed 140 of 1330 files
Processed 150 of 1330 files
Processed 160 of 1330 files
Processed 170 of 1330 files
Processed 180 of 1330 files
Processed 190 of 1330 files
Processed 200 of 1330 files
Processed 210 of 1330 files
Processed 220 of 1330 files
Processed 230 of 1330 files
Processed 240 of 1330 files
Processed 250 of 1330 files
Processed 260 of 1330 files
Processed 270 of 1330 files
Processed 280 of 1330 files
Processed 290 of 1330 files
Processed 300 of 1330 files
Processed 310 of 1330 files
Processed 320 of 1330 files
Processed 330 of 1330 files
Processed 340 of 1330 files
Processed 350 of 1330 files
Processed 360 of 1330 files
P

In [6]:
# Preview the first five rows of the saved verification results.
results_df.head(n=5)

Unnamed: 0,filename,load_ok,file,error,file_deps
0,cryptol/examples/splitAt.cry,True,files/splitAt_euokmxbs.cry,,[Cryptol]
1,cryptol/examples/AE.cry,True,files/AE_9g0jgbb4.cry,,error
2,cryptol/examples/xor_cipher.cry,True,files/xor_cipher_l2cd41g9.cry,,[Cryptol]
3,cryptol/examples/zero_weird.cry,True,files/zero_weird_1uwzhu5l.cry,,[Cryptol]
4,cryptol/examples/builtin_lifting.cry,True,files/builtin_lifting_mvvk9w8q.cry,,[Cryptol]


In [7]:
# Lookup frame indexed by filename, carrying each file's type from the dataset.
filetype_by_filename = df[["filename", "filetype"]].set_index('filename')

# One row per verified file, with the file type attached via a left join on
# the filename column.
repl_results_df = pd.DataFrame(rows).join(
    filetype_by_filename,
    on='filename',
    how='left',
    rsuffix='_orig',
)
repl_results_df.head()

Unnamed: 0,filename,load_ok,error,file_deps,file,filetype
0,AES-GCM-SIV-proof/proof/ref-128/clmul_emulator...,False,saw,[SAW],,saw
1,AES-GCM-SIV-proof/proof/ref-128/common.saw,False,saw,[SAW],,saw
2,AES-GCM-SIV-proof/proof/ref-128/aes_emulation.saw,False,saw,[SAW],,saw
3,AES-GCM-SIV-proof/proof/ref-128/proof.saw,False,saw,[SAW],,saw
4,AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry,True,,"[Common::AES, Cryptol]",files/AES_ftmqkao3.cry,cry


In [8]:
# 'cry' files: number with load_ok == True, then number with load_ok == False.
is_cry = repl_results_df["filetype"] == "cry"
load_succeeded = repl_results_df["load_ok"] == True
load_failed = repl_results_df["load_ok"] == False
print(int((load_succeeded & is_cry).sum()))
print(int((load_failed & is_cry).sum()))

683
0


In [9]:
# Non-'cry' files: number with load_ok == True, then number with load_ok == False.
is_not_cry = repl_results_df["filetype"] != "cry"
load_succeeded = repl_results_df["load_ok"] == True
load_failed = repl_results_df["load_ok"] == False
print(int((load_succeeded & is_not_cry).sum()))
print(int((load_failed & is_not_cry).sum()))

262
385


In [10]:
# All files: number with load_ok == True, then number with load_ok == False.
load_succeeded = repl_results_df["load_ok"] == True
load_failed = repl_results_df["load_ok"] == False
print(int(load_succeeded.sum()))
print(int(load_failed.sum()))

945
385
