In [1]:
from pathlib import Path 
import os, dotenv
# Load environment variables via python-dotenv, then make the project root the
# working directory so the relative "data/..." paths and the `src` package
# used by later cells resolve.
dotenv.load_dotenv()

# PYTHONPATH doubles as the project-root setting here. It may be unset
# (Path(None) would raise an opaque TypeError) or hold several entries joined
# by os.pathsep, so validate it and use the first non-empty entry.
_pythonpath_entries = [p for p in os.getenv("PYTHONPATH", "").split(os.pathsep) if p]
if not _pythonpath_entries:
    raise RuntimeError(
        "PYTHONPATH is not set; define it (e.g. in .env) as the project root directory."
    )
os.chdir(Path(_pythonpath_entries[0]).expanduser())

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from src.preprocessing.quality_process import compute_file_metrics

In [3]:
# Source code location
# /Users/josh/SecurityAnalytics/development/cryptol
# /Users/josh/SecurityAnalytics/development/cryptol-specs
# /Users/josh/SecurityAnalytics/development/saw-script
from transformers import AutoTokenizer


# --- run configuration ---
# Preprocessing variant and dataset release tag; both are interpolated into
# the input path below and into the output file names of later cells.
VARIATION = "nocomments"
VERSION = "1.2s"

# --- paths ---
# JSONL input, one JSON record per line.
# NOTE(review): the final export cell writes its filtered result to this same
# path, so re-running the notebook starts from already-filtered data.
DATA = f"data/training_datasets/verified_{VARIATION}_{VERSION}.jsonl"


# --- load dataset ---
# Parse the JSONL file into a list of records, one per non-empty line.
# Whitespace-only lines (e.g. a stray blank line at the end of the file) are
# skipped; json.loads would raise JSONDecodeError on them.
with open(DATA, "r", encoding="utf-8") as f:
    rows = [json.loads(line) for line in f if line.strip()]

print(f"Loaded {len(rows)} rows")
# Tokenizer passed to compute_file_metrics below as `model_tokenizer`.
# NOTE(review): trust_remote_code=True allows the hub repository to run its
# own Python code at load time — confirm it is actually needed for this model.
qwen_model_id = "Qwen/Qwen2.5-Coder-7B-Instruct"  # or your exact Qwen2.5-Coder ID
qwen_tokenizer = AutoTokenizer.from_pretrained(qwen_model_id, trust_remote_code=True)

# --- run quality_process.py for each row ---
# One compute_file_metrics result per dataset record, in dataset order.
results = [
    compute_file_metrics(
        record["filename"],
        record["content"],
        model_tokenizer=qwen_tokenizer,
    )
    for record in rows
]

# --- collect the per-file metrics into a DataFrame ---
# (nothing is written to disk here; the CSV export happens in the last cell)
df = pd.DataFrame(results)

  from .autonotebook import tqdm as notebook_tqdm


Loaded 2761 rows


In [4]:
# Preview the first five rows of the per-file metrics table.
df.head(n=5)

Unnamed: 0,filename,sha1,bytes,lines,avg_line_len,max_line_len,non_ascii_ratio,binary_like,enc_total_matched,enc_max_run,enc_fraction,enc_hits_base64,enc_hits_hexbytes,enc_hits_unicode,num_tokens_lang,k_shingle,num_shingles,hexnum_ratio,num_tokens_model,junk_path
0,AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry,6502af5ae03bf06118116731814e2701b6ceefd6,518,19,26.32,75,0.0,False,0,0,0.0,0,0,0,138,5,134,0.0,173,False
1,AES-GCM-SIV-proof/proof/cryptol-specs/AES128.cry,524b0175819d4abbc5cfe239a06cf31f633a177a,651,28,22.29,47,0.0,False,0,0,0.0,0,0,0,124,5,120,0.0,214,False
2,AES-GCM-SIV-proof/proof/cryptol-specs/AES256.cry,f13fdcd8487b90dd5282df8321131ef2693b39bc,649,28,22.21,47,0.0,False,0,0,0.0,0,0,0,124,5,120,0.0,214,False
3,AES-GCM-SIV-proof/proof/cryptol-specs/TBox.cry,e94ed6cd78a859e6a40f101204bb39d91770e1a4,4548,105,42.32,76,0.0,False,0,0,0.0,0,0,0,1397,5,1393,0.0,3400,False
4,AES-GCM-SIV-proof/proof/asm/deps/saw-script/do...,fa6af617c773221e56025a26a9de5ffd6f02acb8,1103,36,29.67,80,0.0,False,0,0,0.0,0,0,0,391,5,387,0.0,443,False


In [5]:
# StarCoder-like thresholds (tune if needed)

# Whole-file size and character make-up
MAX_BYTES = 200_000
MAX_NONASCII = 0.20       # upper bound on non_ascii_ratio
MAX_HEXNUM_RATIO = 0.20   # upper bound on hexnum_ratio

# Encoded-data detection
ENC_MAX_RUN_CHARS = 1024  # upper bound on enc_max_run
ENC_MAX_FRACTION = 0.50   # upper bound on enc_fraction

# Line-shape limits
MAX_LINES_TOTAL = 100_000
MAX_LINE_AVG_LEN = 100
MAX_LINE_MAX_LEN = 1_000

# Token-count gates
MIN_TOKENS_LANG = 40       # language-token gate (Cryptol tokenizer)
MAX_TOKENS_LANG = 10_000   # optional upper bound
MIN_TOKENS_MODEL = 40      # only if you've populated num_tokens_model


# Each mask below is a Series aligned with df; a True entry marks a row that
# the corresponding rule would drop. Missing metric values are filled so that
# they never trip a rule on their own, except num_shingles (missing counts as 0
# and is therefore dropped).

# --- exact dedup: every repeat of a sha1 after its first occurrence ---
dup_mask = df.duplicated(subset=["sha1"], keep="first")

# --- encoded data (StarCoder): a long encoded run, or a mostly-encoded file ---
enc_mask = df["enc_max_run"].gt(ENC_MAX_RUN_CHARS) | df["enc_fraction"].gt(ENC_MAX_FRACTION)

# --- long-line filters (StarCoder) ---
longline_mask = (
    df["lines"].gt(MAX_LINES_TOTAL)
    | df["avg_line_len"].gt(MAX_LINE_AVG_LEN)
    | df["max_line_len"].gt(MAX_LINE_MAX_LEN)
)

# --- binary-like content ---
binary_mask = df["binary_like"].fillna(False)

# --- non-ascii density ---
nonascii_mask = df["non_ascii_ratio"].fillna(0).gt(MAX_NONASCII)

# --- size guardrail (bytes) ---
bytes_mask = df["bytes"].fillna(0).gt(MAX_BYTES)

# --- language-token bounds ---
token_counts_lang = df["num_tokens_lang"].fillna(0)
lang_small_mask = token_counts_lang.lt(MIN_TOKENS_LANG)
lang_large_mask = token_counts_lang.gt(MAX_TOKENS_LANG)

# --- shingles exist (needed for Jaccard) ---
no_shingles_mask = df["num_shingles"].fillna(0).le(0)

# --- numeric/hex blob concentration ---
hexnum_mask = df["hexnum_ratio"].fillna(0).gt(MAX_HEXNUM_RATIO)

# --- model-token gate (only apply where available) ---
# A missing count is filled with +inf so it can never fall below the minimum;
# if the column is absent altogether the rule is disabled for every row.
model_small_mask = (
    df["num_tokens_model"].fillna(np.inf).lt(MIN_TOKENS_MODEL)
    if "num_tokens_model" in df.columns
    else pd.Series(False, index=df.index)
)


In [6]:
# Combine all hard-drop reasons: a row is dropped as soon as any rule mask is
# True for it. The masks are OR-ed left to right.
rule_masks = [
    dup_mask,
    enc_mask,
    longline_mask,
    binary_mask,
    nonascii_mask,
    bytes_mask,
    lang_small_mask,
    lang_large_mask,
    no_shingles_mask,
    hexnum_mask,
    model_small_mask,
]
drop_mask = rule_masks[0]
for rule_mask in rule_masks[1:]:
    drop_mask = drop_mask | rule_mask

# Optional: compute a human-readable fail reason (first rule that tripped)
def first_reason(i):
    """Return the label of the first drop rule that fires for row position ``i``.

    The module-level rule masks are consulted positionally (``.iat[i]``) in a
    fixed priority order, so a row that trips several rules is reported under
    the earliest one only.

    Args:
        i: Zero-based row position into the rule masks.

    Returns:
        The label of the first rule whose mask is truthy at ``i``, or ``"ok"``
        when no rule fires.
    """
    # Built at call time so the masks are looked up when the function runs,
    # not when it is defined.
    ordered_rules = (
        ("exact_duplicate", dup_mask),
        ("encoded_data", enc_mask),
        ("long_lines", longline_mask),
        ("binary_like", binary_mask),
        ("too_much_nonascii", nonascii_mask),
        ("too_large_bytes", bytes_mask),
        ("too_few_lang_tokens", lang_small_mask),
        ("too_many_lang_tokens", lang_large_mask),
        ("no_shingles", no_shingles_mask),
        ("hexnum_blob", hexnum_mask),
        ("too_few_model_tokens", model_small_mask),
    )
    for label, mask in ordered_rules:
        if mask.iat[i]:
            return label
    return "ok"

# Attach the verdict columns on a fresh copy of the metrics table:
#   quality_ok  - True when no drop rule fired for the row
#   fail_reason - label of the first rule that fired, or "ok"
df = df.assign(
    quality_ok=~drop_mask,
    fail_reason=[first_reason(pos) for pos in range(len(df))],
)


In [7]:
# Columns carried into the candidate table, grouped by what they describe.
dedup_cols = (
    # identity
    ["filename", "sha1"]
    # size/lines
    + ["bytes", "lines", "avg_line_len", "max_line_len"]
    # content/encoding
    + ["non_ascii_ratio", "binary_like"]
    + ["enc_total_matched", "enc_max_run", "enc_fraction"]
    + ["enc_hits_base64", "enc_hits_hexbytes", "enc_hits_unicode"]
    # tokens/shingles
    + ["num_tokens_lang", "k_shingle", "num_shingles", "hexnum_ratio"]
    # model tokens (optional)
    + ["num_tokens_model"]
    # filter status
    + ["quality_ok", "fail_reason"]
)

# Rows that passed every quality rule, restricted to the metric/status columns.
candidate_df = df.loc[df["quality_ok"], dedup_cols].reset_index(drop=True)

# Reload the raw records from DATA and keep only the candidates; this is the
# frame handed to the near-duplicate detection step.
similar_process_df = pd.read_json(DATA, lines=True)
is_candidate = similar_process_df["filename"].isin(candidate_df["filename"])
similar_process_df = similar_process_df[is_candidate].reset_index(drop=True)

put_back_path = Path("data/dropped/files_to_put_back.csv")

# Disabled: manual "put back" override that re-admits files listed in
# put_back_path. Kept as comments (it was previously parked in a bare
# triple-quoted string, which Python still evaluates as a no-op expression).
# TODO: either re-enable this step or delete it together with put_back_path.
#
# if put_back_path.exists():
#     put_back_set = pd.read_csv(put_back_path)
#     put_back_filenames = set(put_back_set["filename"].dropna().tolist())
# else:
#     put_back_filenames = set()
#
# for fname in put_back_filenames:
#     if fname in df['filename'].values:
#         candidate_df = pd.concat([candidate_df, df[df['filename'] == fname][dedup_cols]], ignore_index=True)
#         candidate_df.loc[candidate_df['filename'] == fname, 'quality_ok'] = True

similar_process_df.head()

Unnamed: 0,filename,filetype,content,variant
0,AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry,cry,module AES where\n\nimport `Common::AES\n\ntyp...,without_comments
1,AES-GCM-SIV-proof/proof/cryptol-specs/AES128.cry,cry,module AES128 where\n\nimport `Common::AES\nim...,without_comments
2,AES-GCM-SIV-proof/proof/cryptol-specs/AES256.cry,cry,module AES256 where\n\nimport `Common::AES\nim...,without_comments
3,AES-GCM-SIV-proof/proof/cryptol-specs/TBox.cry,cry,type Nb = 4\ntype State = [4][Nb]...,without_comments
4,AES-GCM-SIV-proof/proof/asm/deps/saw-script/do...,cry,"all : {n, a} (fin n) => (a -> Bit, [n]a) -> Bi...",without_comments


In [8]:
# Report how many files passed or failed the quality rules, followed by a
# per-reason count of the failures.
passed = df["quality_ok"]
print("[summary] total:", len(df))
print("[summary] kept :", int(passed.sum()))
print("[summary] dropped:", int((~passed).sum()))
print("[summary] drop reasons:")
print(df.loc[~passed, "fail_reason"].value_counts())


[summary] total: 2761
[summary] kept : 1236
[summary] dropped: 1525
[summary] drop reasons:
fail_reason
exact_duplicate         847
too_few_lang_tokens     636
encoded_data             27
hexnum_blob              13
too_few_model_tokens      1
long_lines                1
Name: count, dtype: int64


In [9]:
# Independent snapshot (fresh 0..n-1 index) of every row that failed a quality
# rule, followed by a dtype / non-null overview of the full metrics table.
dropped = df.loc[~df["quality_ok"]].reset_index(drop=True).copy()
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2761 entries, 0 to 2760
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   filename           2761 non-null   object 
 1   sha1               2761 non-null   object 
 2   bytes              2761 non-null   int64  
 3   lines              2761 non-null   int64  
 4   avg_line_len       2761 non-null   float64
 5   max_line_len       2761 non-null   int64  
 6   non_ascii_ratio    2761 non-null   float64
 7   binary_like        2761 non-null   bool   
 8   enc_total_matched  2761 non-null   int64  
 9   enc_max_run        2761 non-null   int64  
 10  enc_fraction       2761 non-null   float64
 11  enc_hits_base64    2761 non-null   int64  
 12  enc_hits_hexbytes  2761 non-null   int64  
 13  enc_hits_unicode   2761 non-null   int64  
 14  num_tokens_lang    2761 non-null   int64  
 15  k_shingle          2761 non-null   int64  
 16  num_shingles       2761 

In [10]:
# Export, for manual review, the files rejected for any reason other than
# exact duplication.
needs_review = ~df["fail_reason"].isin(["ok", "exact_duplicate"])
review_data_set = df[needs_review].copy().reset_index(drop=True)

out_path = Path(f"data/dropped/review_files_{VARIATION}_{VERSION}.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)  # create dirs if missing
review_data_set.to_csv(out_path, index=False)


In [11]:
from src.preprocessing.similiar_process import run_from_dataframe

# Near-duplicate detection (MinHash/LSH, per the run log) over the
# quality-filtered records. The frame passed in is similar_process_df, which
# supplies the 'filename' and 'content' columns named below.
# NOTE(review): filenames in this frame are repository-relative, not absolute —
# confirm run_from_dataframe only uses them as identifiers.
df_files, df_pairs, similar_files = run_from_dataframe(
    similar_process_df,
    out_dir="minhash_outputs",
    filename_col="filename",
    content_col="content",
)


[info] ==== Starting MinHash/LSH over DataFrame ====
[info] params: K_SHINGLE=5, NUM_PERM=512, LSH_THRESHOLD=0.7
[info] loaded 1236 files from candidate_df
[info] files indexed   : 1236
[diag] total candidate pairs: 1075
[diag] pairs with jaccard >= 0.7: 849
[info] wrote CSV and Parquet to minhash_outputs/

[info] ==== MinHash/LSH run summary ====
[info] files loaded  : 1236
[info] files indexed : 1236
[info] files with 0 shingles (tokens < 5): 0
[info] candidate pairs (from LSH) : 1075
[info] pairs with Jaccard >= 0.60: 911
[info] pairs with Jaccard >= 0.70: 849
[info] pairs with Jaccard >= 0.80: 634
[info] pairs with Jaccard >= 0.85: 541
[info] pairs with Jaccard >= 0.90: 398
[info] avg Jaccard (candidates)  : 0.8071
[info] max Jaccard               : 1.0000
[info] min Jaccard               : 0.5280

[info] top pairs:
                                                                                                        a                                                               

In [12]:
# Preview the first five rows of the per-file MinHash table.
df_files.head(n=5)

Unnamed: 0,filename,num_tokens,num_shingles,num_perm,k_shingle,minhash_hashvalues
0,AES-GCM-SIV-proof/proof/asm/cryptol/AES128_GCM...,742,637,512,5,"[6651975, 871518, 16710139, 8870568, 924959, 2..."
1,AES-GCM-SIV-proof/proof/asm/cryptol/AES128_GCM...,75,70,512,5,"[10853965, 46774336, 32514154, 10692928, 48889..."
2,AES-GCM-SIV-proof/proof/asm/cryptol/AES128_GCM...,70,65,512,5,"[265750243, 46774336, 32514154, 10692928, 4888..."
3,AES-GCM-SIV-proof/proof/asm/cryptol/AES128_GCM...,209,187,512,5,"[6651975, 13681742, 16710139, 10692928, 457368..."
4,AES-GCM-SIV-proof/proof/asm/cryptol/AES128_GCM...,184,171,512,5,"[6651975, 46774336, 31424797, 10692928, 457368..."


In [13]:
from src.preprocessing.cluster_process import run_clustering

# Cluster the near-duplicate pairs using the in-memory results of the
# similarity step and split the files into keep / drop sets.
# content_lookup may stay None, or be {filename: raw_text} if you want
# text-derived penalties.
df_keep, df_drop, df_clusters = run_clustering(
    df_files=df_files,          # from similiar_process
    df_pairs=df_pairs,          # from similiar_process
    out_dir="minhash_outputs",
    jaccard_keep_threshold=0.70,
    content_lookup=None,
    save_outputs=True,
)

[info] clusters formed   : 957
[info] kept files        : 957
[info] dropped files     : 279
[info] wrote keep/drop/cluster CSVs to minhash_outputs/


In [14]:
# Final dataset: metric rows whose filename survived clustering. The copy is
# taken before df is relabelled below.
kept_names = df_keep["filename"].tolist()
dataset = df[df["filename"].isin(kept_names)].copy()

# Relabel the files that clustering discarded in the metrics table.
is_near_dup = df["filename"].isin(df_drop["filename"].tolist())
df.loc[is_near_dup, "quality_ok"] = False
df.loc[is_near_dup, "fail_reason"] = "similiar_file_exists"

# Everything that did not make it into the final dataset, whatever the reason.
not_in_dataset = ~df["filename"].isin(dataset["filename"].tolist())
dropped = df[not_in_dataset].copy().reset_index(drop=True)


In [15]:
# Persist every dropped file together with its metrics and fail_reason.
dropped_csv_path = f"data/dropped/{VARIATION}_dropped_files_{VERSION}.csv"
dropped.to_csv(dropped_csv_path, index=False)

In [16]:
import shutil

# Keep only the raw records whose filename survived both the quality rules
# and the near-duplicate clustering.
verified_files = set(dataset["filename"].unique())

all_files_df = pd.read_json(DATA, lines=True)
all_files_filtered_df = all_files_df[all_files_df["filename"].isin(verified_files)].reset_index(drop=True)

out_path = Path(f"data/training_datasets/verified_{VARIATION}_{VERSION}.jsonl")
out_path.parent.mkdir(parents=True, exist_ok=True)  # create dirs if missing

# out_path is the very file DATA was read from, so writing here replaces the
# unfiltered input. Preserve the input once as "<name>.orig" before the first
# overwrite so the original records are not lost; an existing backup is never
# replaced.
# NOTE(review): DATA still points at the (now filtered) file, so a re-run
# starts from filtered data — consider a distinct output name.
if out_path.resolve() == Path(DATA).resolve():
    backup_path = out_path.with_name(out_path.name + ".orig")
    if not backup_path.exists():
        shutil.copy2(out_path, backup_path)

all_files_filtered_df.to_json(out_path, lines=True, orient="records")

In [17]:
# Save the full per-file metrics table, including the quality_ok / fail_reason
# verdicts.
metrics_csv_path = f"data/{VARIATION}_file_metrics_{VERSION}.csv"
df.to_csv(metrics_csv_path, index=False)