In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from src.preprocessing.comment_process import normalize_blanklines, strip_cryptol_comments_all

In [2]:
# Input dataset, stored as JSON Lines (one record per line).
jsonl_path = "data/training_datasets/verified_nomods.jsonl"

# `lines=True` tells pandas to parse each line as its own JSON record.
nomod_df = pd.read_json(path_or_buf=jsonl_path, lines=True)

In [3]:
# Preview the first five rows of the freshly loaded frame.
nomod_df.head(n=5)

Unnamed: 0,filename,filetype,content
0,cryptol/examples/splitAt.cry,cry,"x = [1,2,3,4] : [_][8]\n\ny = (splitAt x) : ([..."
1,cryptol/examples/AE.cry,cry,// WORK IN PROGRESS\n\n/*\nImplementation of t...
2,cryptol/examples/xor_cipher.cry,cry,encrypt : {a}(fin a) => [8] -> [a][8] -> [a][8...
3,cryptol/examples/zero_weird.cry,cry,x : {a}() => a -> [16]\nx v = zero v \n\nprope...
4,cryptol/examples/builtin_lifting.cry,cry,"//builtins lift over tuples, seqs, and records..."


In [8]:
# Build the comment-free variant on a copy so `nomod_df` stays untouched.
# Step 1: remove comments from every file's content.
content_without_comments = nomod_df["content"].apply(strip_cryptol_comments_all)

# Step 2: normalise blank lines in the stripped content, then store the
# result as the copy's "content" column.
nocomment_df = nomod_df.copy()
nocomment_df["content"] = content_without_comments.apply(normalize_blanklines)

In [9]:
# Inspect the first 20 rows after comment stripping.
nocomment_df.head(n=20)

Unnamed: 0,filename,filetype,content
0,cryptol/examples/splitAt.cry,cry,"x = [1,2,3,4] : [_][8]\n\ny = (splitAt x) : ([..."
1,cryptol/examples/AE.cry,cry,module AE where\n\nparameter\n type A : * ...
2,cryptol/examples/xor_cipher.cry,cry,encrypt : {a}(fin a) => [8] -> [a][8] -> [a][8...
3,cryptol/examples/zero_weird.cry,cry,x : {a}() => a -> [16]\nx v = zero v \n\nprope...
4,cryptol/examples/builtin_lifting.cry,cry,"x = [True,False]\ny = [False,True]\n\nproperty..."
5,cryptol/examples/inflist.cry,cry,"a = [1 ... ]\nb = [1,2 ... ]\nc = [1 .. 5]\nd ..."
6,cryptol/examples/Karatsuba.cry,cry,"module Karatsuba where\n\nkmult : {limb,n} (fi..."
7,cryptol/examples/comp.cry,cry,"x : [_]([2],[3],[3],[4])\nx = [(a,b,c,d) | a <..."
8,cryptol/examples/mini.cry,cry,id : [32] -> [32]\nid x = rec x\n where rec k...
9,cryptol/examples/Test.cry,cry,module Test where\nimport Cipher\n\nevktest : ...


In [10]:
# Inspect the last 20 rows after comment stripping.
nocomment_df.tail(n=20)

Unnamed: 0,filename,filetype,content
755,saw-script/deps/what4/what4/doc/bvdomain.cry,cry,module bvdomain where\n\nimport arithdomain as...
756,saw-script/deps/what4/what4/doc/bitsdomain.cry,cry,module bitsdomain where\n\ntype Dom n = { loma...
757,saw-script/deps/what4/what4/doc/arithdomain.cry,cry,"module arithdomain where\n\nbit : {i, n} (fin ..."
758,saw-script/doc/llvm-java-verification-with-saw...,saw,"cls <- java_load_class ""FFS"";\nbc <- llvm_load..."
759,saw-script/doc/llvm-java-verification-with-saw...,saw,"let add_spec = do {\n x <- jvm_fresh_var ""x..."
760,saw-script/doc/llvm-java-verification-with-saw...,saw,"set_base 16;\n\nprint ""Extracting reference te..."
761,saw-script/doc/llvm-java-verification-with-saw...,cry,"all : {n, a} (fin n) => (a -> Bit, [n]a) -> Bi..."
762,saw-script/doc/llvm-java-verification-with-saw...,saw,"java_ffs_ref <- read_aig ""java_ffs_ref.aig"";\n..."
763,saw-script/doc/llvm-java-verification-with-saw...,saw,"import ""NQueens.cry"";\n\nprint {{ nQueens [0,1..."
764,saw-script/doc/llvm-java-verification-with-saw...,cry,module Cipher where\n\ntype Cipher KeySize Blo...


In [11]:
# Label every row with the dataset variant it belongs to, then preview the result.
nocomment_df = nocomment_df.assign(variant="without_comments")
nocomment_df.head(n=5)

Unnamed: 0,filename,filetype,content,variant
0,cryptol/examples/splitAt.cry,cry,"x = [1,2,3,4] : [_][8]\n\ny = (splitAt x) : ([...",without_comments
1,cryptol/examples/AE.cry,cry,module AE where\n\nparameter\n type A : * ...,without_comments
2,cryptol/examples/xor_cipher.cry,cry,encrypt : {a}(fin a) => [8] -> [a][8] -> [a][8...,without_comments
3,cryptol/examples/zero_weird.cry,cry,x : {a}() => a -> [16]\nx v = zero v \n\nprope...,without_comments
4,cryptol/examples/builtin_lifting.cry,cry,"x = [True,False]\ny = [False,True]\n\nproperty...",without_comments


In [12]:
# Write the comment-free variant as JSON Lines (one record per line).
# `force_ascii=False` keeps non-ASCII characters as-is instead of \uXXXX escapes.
nocomment_df.to_json(
    path_or_buf="data/training_datasets/verified_nocomments.jsonl",
    lines=True,
    orient="records",
    force_ascii=False,
)


In [13]:
from preprocessing.comment_extractor import extract_strip_cry_comments
# NOTE(review): the first cell imports from `src.preprocessing...` while this
# import uses the bare `preprocessing` root. Confirm both resolve to the same
# package, and consider moving this import into the top import cell.

# Create Hybrid Dataset
# Each file in `nomod_df` is passed to `extract_strip_cry_comments`, which
# returns (a) a list of per-comment records and (b) a per-file record whose
# "filename" and "content" become the hybrid dataset row.
DECISION_CACHE_PATH = "GPT_comment_decisions_cache.jsonl"
LLM_MODEL_NAME = "gpt-oss:20b"

comment_rows = []
dataset_rows = []

# The original guard was `row.filetype == "cry" or True`, which is always true,
# so the pass-through branch for other file types could never execute. The dead
# branch is removed here and the effective behaviour (process every file,
# whatever its filetype) is kept. Re-introduce a filetype check if only `.cry`
# files should be processed.
for row in nomod_df.itertuples(index=False):
    comments, processed_file = extract_strip_cry_comments(
        filename=row.filename,
        content=row.content,
        llm_model_name=LLM_MODEL_NAME,
        decision_cache_path=DECISION_CACHE_PATH,
    )
    comment_rows.extend(comments)

    # The filetype is carried over from the input row; filename and content come
    # from the extractor's per-file record.
    file_record = {
        "filename": processed_file["filename"],
        "filetype": row.filetype,
        "content": processed_file["content"],
        "variant": "hybrid",
    }
    dataset_rows.append(file_record)

# Build both frames once from the accumulated records.
comment_df = pd.DataFrame(comment_rows)
hybrid_df = pd.DataFrame(dataset_rows)



In [14]:
# Preview the first five per-comment records collected during the hybrid build.
comment_df.head(n=5)

Unnamed: 0,filename,sha1,comment,keep,snippet
0,cryptol/examples/AE.cry,5e86684a393aa148fd0a454958fa967d95065be8,// WORK IN PROGRESS,False,
1,cryptol/examples/AE.cry,259850075ed51d4fd666df2fe0a2063d600374ce,/*\nImplementation of the algorithms from the ...,True,module AE where\n\nparameter\n type A : * ...
2,cryptol/examples/AE.cry,150ca06f0e9d2946cbce4dc93eae2277e1402f91,// State type,False,type K : *
3,cryptol/examples/AE.cry,84c8479e2cda530b361b5d04e7c090204713d38e,// Key type,False,type n : #
4,cryptol/examples/AE.cry,d552c54a82a17f171563b8dd15d500d5212539b2,// Block size,False,type p : #


In [15]:
# Preview the first five rows of the hybrid dataset.
hybrid_df.head(n=5)

Unnamed: 0,filename,filetype,content,variant
0,cryptol/examples/splitAt.cry,cry,"x = [1,2,3,4] : [_][8]\n\ny = (splitAt x) : ([...",hybrid
1,cryptol/examples/AE.cry,cry,/*\nImplementation of the algorithms from the ...,hybrid
2,cryptol/examples/xor_cipher.cry,cry,encrypt : {a}(fin a) => [8] -> [a][8] -> [a][8...,hybrid
3,cryptol/examples/zero_weird.cry,cry,x : {a}() => a -> [16]\nx v = zero v \n\nprope...,hybrid
4,cryptol/examples/builtin_lifting.cry,cry,"//builtins lift over tuples, seqs, and records...",hybrid


In [None]:
# Output locations for the hybrid dataset and the per-comment records.
HYBRID_PATH = "data/training_datasets/GPTverified_hybrid.jsonl"
COMMENT_STAT_PATH = "data/GPTcomment_stats.jsonl"

# Write both frames as JSON Lines (one record per line).
# `force_ascii=False` matches the `verified_nocomments.jsonl` export: without it
# pandas escapes every non-ASCII character as \uXXXX, so the raw text of this
# variant would be encoded differently from the sibling dataset.
hybrid_df.to_json(HYBRID_PATH, lines=True, orient="records", force_ascii=False)
comment_df.to_json(COMMENT_STAT_PATH, lines=True, orient="records", force_ascii=False)

