In [1]:
from pathlib import Path 
import os 

# Work from the DataPreprocess project directory under the user's home so the
# relative "data/..." and "minhash_outputs" paths used below resolve.
os.chdir(Path.home() / "SecurityAnalytics" / "DataPreprocess")

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from src.preprocessing.quality_process import compute_file_metrics

In [3]:
# Source code location
# /Users/josh/SecurityAnalytics/development/cryptol
# /Users/josh/SecurityAnalytics/development/cryptol-specs
# /Users/josh/SecurityAnalytics/development/saw-script

# --- paths ---
jsonl_path = "data/all_sources_raw.jsonl"      # your input dataset

# --- load dataset ---
# One JSON object per line. Blank / whitespace-only lines (for example a
# trailing newline at end of file) are skipped instead of crashing json.loads.
rows = []
with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():
            rows.append(json.loads(line))

print(f"Loaded {len(rows)} rows")

# --- run quality_process.py for each row ---
# Every row must provide "filename" and "content"; a missing key raises KeyError.
results = [
    compute_file_metrics(row["filename"], row["content"])
    for row in rows
]

# --- collect per-file metrics into a DataFrame (nothing is written to disk here) ---
df = pd.DataFrame(results)

Loaded 3126 rows


In [4]:
# StarCoder-like thresholds (tune if needed)
MAX_BYTES         = 200_000
MAX_NONASCII      = 0.20
ENC_MAX_RUN_CHARS = 1024
ENC_MAX_FRACTION  = 0.50
MAX_LINES_TOTAL   = 100_000
MAX_LINE_AVG_LEN  = 100
MAX_LINE_MAX_LEN  = 1_000
MIN_TOKENS_LANG   = 40      # language-token gate (Cryptol tokenizer)
MAX_TOKENS_LANG   = 10_000  # optional upper bound
MIN_TOKENS_MODEL  = 32      # only if you’ve populated num_tokens_model
MAX_HEXNUM_RATIO  = 0.20

# Every mask below is a boolean Series aligned with df.index;
# True means "this row trips the rule and should be dropped".

# --- exact dedup (keep first occurrence of each sha1) ---
# mark duplicates (True means "is duplicate" => drop later)
dup_mask = df.duplicated(subset=["sha1"], keep="first")

# --- encoded data (StarCoder) ---
# A missing (NaN) metric compares False against the threshold, so it never trips the rule.
enc_mask = (df["enc_max_run"] > ENC_MAX_RUN_CHARS) | (df["enc_fraction"] > ENC_MAX_FRACTION)

# --- long-line filters (StarCoder) ---
longline_mask = (
    (df["lines"] > MAX_LINES_TOTAL) |
    (df["avg_line_len"] > MAX_LINE_AVG_LEN) |
    (df["max_line_len"] > MAX_LINE_MAX_LEN)
)

# --- binary-like content ---
# Force a real bool dtype: if this column were object-dtype, the combined drop
# mask would be too, and `~mask` on Python bools is bitwise, not logical, NOT.
binary_mask = df["binary_like"].fillna(False).astype(bool)

# --- non-ascii density ---
nonascii_mask = df["non_ascii_ratio"].fillna(0) > MAX_NONASCII

# --- size guardrail (bytes) ---
bytes_mask = df["bytes"].fillna(0) > MAX_BYTES

# --- language-token bounds ---
lang_small_mask = df["num_tokens_lang"].fillna(0) < MIN_TOKENS_LANG
lang_large_mask = df["num_tokens_lang"].fillna(0) > MAX_TOKENS_LANG

# --- shingles exist (needed for Jaccard) ---
no_shingles_mask = df["num_shingles"].fillna(0) <= 0

# --- numeric/hex blob concentration ---
hexnum_mask = df["hexnum_ratio"].fillna(0) > MAX_HEXNUM_RATIO

# --- model-token gate (only apply where available) ---
if "num_tokens_model" in df.columns:
    # Coerce to a numeric dtype before filling. The saved run emitted a warning
    # on this fillna, presumably pandas' object-dtype downcasting warning for a
    # column holding only None -- TODO confirm. Missing counts become +inf so
    # rows without a model-token count are never dropped by this gate.
    model_tokens = pd.to_numeric(df["num_tokens_model"], errors="coerce")
    model_small_mask = model_tokens.fillna(np.inf) < MIN_TOKENS_MODEL
else:
    model_small_mask = pd.Series(False, index=df.index)


  model_small_mask = df["num_tokens_model"].fillna(np.inf) < MIN_TOKENS_MODEL


In [5]:
# Hard-drop rules as an ordered (label, mask) table. The order is the priority
# used when a row trips several rules at once.
fail_rules = [
    ("exact_duplicate",      dup_mask),
    ("encoded_data",         enc_mask),
    ("long_lines",           longline_mask),
    ("binary_like",          binary_mask),
    ("too_much_nonascii",    nonascii_mask),
    ("too_large_bytes",      bytes_mask),
    ("too_few_lang_tokens",  lang_small_mask),
    ("too_many_lang_tokens", lang_large_mask),
    ("no_shingles",          no_shingles_mask),
    ("hexnum_blob",          hexnum_mask),
    ("too_few_model_tokens", model_small_mask),
]

# A row is dropped when any rule fires: OR all masks together.
drop_mask = pd.Series(False, index=df.index)
for _label, _mask in fail_rules:
    drop_mask = drop_mask | _mask


def first_reason(i):
    """Return the label of the first rule tripped by the row at position ``i``.

    ``i`` is a positional (iloc-style) index into the masks. Rules are checked
    in the order of ``fail_rules``; "ok" is returned when none fires.
    """
    for label, mask in fail_rules:
        if mask.iat[i]:
            return label
    return "ok"


# Work on a copy, then attach the pass/fail flag and the readable reason.
df = df.copy()
df["quality_ok"] = ~drop_mask
df["fail_reason"] = list(map(first_reason, range(len(df))))


In [6]:
# Columns carried forward into the near-duplicate (MinHash) stage.
dedup_cols = [
    "filename", "sha1",
    # size/lines
    "bytes", "lines", "avg_line_len", "max_line_len",
    # content/encoding
    "non_ascii_ratio", "binary_like",
    "enc_total_matched", "enc_max_run", "enc_fraction",
    "enc_hits_base64", "enc_hits_hexbytes", "enc_hits_unicode",
    # tokens/shingles
    "num_tokens_lang", "k_shingle", "num_shingles", "hexnum_ratio",
    # model tokens (optional)
    "num_tokens_model",
    # path heuristic & status
    "quality_ok", "fail_reason",
]
# The model-token column is optional (the quality gate only applies it when
# present), so do not require it here either.
if "num_tokens_model" not in df.columns:
    dedup_cols.remove("num_tokens_model")

# Files listed in this CSV are restored into the candidate set even though a
# hard-drop rule tripped for them.
put_back_set = pd.read_csv("data/dropped/files_to_put_back.csv")
put_back_filenames = set(put_back_set['filename'].tolist())

# Select passing rows and put-back rows in ONE pass, in df order. This keeps the
# candidate order reproducible (no dependence on set iteration order), avoids
# growing the frame with concat inside a loop, and cannot add a file twice when
# a put-back entry already passed the filters. Put-back names absent from df
# are ignored.
keep_mask = df["quality_ok"] | df["filename"].isin(put_back_filenames)
candidate_df = df.loc[keep_mask, dedup_cols].reset_index(drop=True)

# Restored rows count as passing from here on; their original fail_reason is
# left in place as a record of which rule they tripped.
candidate_df["quality_ok"] = True

In [7]:
# Report how many rows passed or failed the hard-drop rules, plus a per-reason
# breakdown of the failures.
passed_flags = df["quality_ok"]
failed_rows = df.loc[~passed_flags]

print("[summary] total:", len(df))
print("[summary] kept :", int(passed_flags.sum()))
print("[summary] dropped:", len(failed_rows))
print("[summary] drop reasons:")
print(failed_rows["fail_reason"].value_counts())


[summary] total: 3126
[summary] kept : 1447
[summary] dropped: 1679
[summary] drop reasons:
fail_reason
exact_duplicate         1010
too_few_lang_tokens      555
encoded_data              74
hexnum_blob               19
too_many_lang_tokens       9
long_lines                 7
too_large_bytes            4
too_much_nonascii          1
Name: count, dtype: int64


In [8]:
# Rows that failed at least one hard-drop rule, re-indexed from 0.
dropped = df.loc[~df["quality_ok"]].copy().reset_index(drop=True)


In [9]:
# Export every row dropped for a reason other than exact duplication so the
# drops can be reviewed by hand.
no_review_needed = ['ok', 'exact_duplicate']
needs_review = ~df['fail_reason'].isin(no_review_needed)
review_data_set = df[needs_review].copy().reset_index(drop=True)
review_data_set.to_csv("data/dropped/review_files.csv", index=False)

In [13]:
from src.preprocessing.similiar_process import run_from_dataframe

# Run MinHash/LSH near-duplicate detection over the candidate files.
# root_dir is prepended to each 'filename' value when the file is opened.
# NOTE(review): an earlier comment said 'filename' must be absolute, which
# conflicts with root_dir being prepended -- confirm against similiar_process.
# The root is derived from the user's home (same base as the os.chdir at the
# top of the notebook) instead of a hard-coded /Users/<name> path.
df_files, df_pairs, similar_files = run_from_dataframe(
    candidate_df,
    filename_col="filename",
    root_dir=str(Path("~/SecurityAnalytics").expanduser()),  # prepended to filename when opening
    out_dir="minhash_outputs",
)


[info] ==== Starting MinHash/LSH over DataFrame ====
[info] params: K_SHINGLE=5, NUM_PERM=512, LSH_THRESHOLD=0.7
[info] loaded 1504 files from candidate_df
[info] files indexed   : 1504
[diag] total candidate pairs: 262
[diag] pairs with jaccard >= 0.7: 242
[info] wrote CSV and Parquet to minhash_outputs/

[info] ==== MinHash/LSH run summary ====
[info] files loaded  : 1504
[info] files indexed : 1504
[info] files with 0 shingles (tokens < 5): 0
[info] candidate pairs (from LSH) : 262
[info] pairs with Jaccard >= 0.60: 256
[info] pairs with Jaccard >= 0.70: 242
[info] pairs with Jaccard >= 0.80: 191
[info] pairs with Jaccard >= 0.85: 162
[info] pairs with Jaccard >= 0.90: 133
[info] avg Jaccard (candidates)  : 0.8763
[info] max Jaccard               : 1.0000
[info] min Jaccard               : 0.5200

[info] top pairs:
                                                                                   a                                                                                      

In [14]:
from src.preprocessing.cluster_process import run_clustering

# Cluster near-duplicates using the in-memory results of the MinHash step
# (df_files / df_pairs come from similiar_process).
clustering_kwargs = dict(
    df_files=df_files,
    df_pairs=df_pairs,
    jaccard_keep_threshold=0.70,
    out_dir="minhash_outputs",
    # None here; pass {filename: raw_text} if you want text-derived penalties
    content_lookup=None,
    save_outputs=True,
)
df_keep, df_drop, df_clusters = run_clustering(**clustering_kwargs)

[info] clusters formed   : 1312
[info] kept files        : 1312
[info] dropped files     : 192
[info] wrote keep/drop/cluster CSVs to minhash_outputs/


In [15]:
# Split df by the clustering outcome: `dataset` holds the rows whose filename
# clustering kept; rows whose filename clustering dropped are flagged in df.
kept_names = df_keep['filename'].tolist()
near_dup_rows = df['filename'].isin(df_drop['filename'].tolist())

dataset = df[df['filename'].isin(kept_names)].copy()

# NOTE(review): files restored via files_to_put_back.csv keep quality_ok=False
# in df (only candidate_df was updated), so `dataset` and file_metrics.csv can
# show quality_ok=False for files that are in the final set -- confirm intended.
df.loc[near_dup_rows, 'quality_ok'] = False
df.loc[near_dup_rows, 'fail_reason'] = 'similiar_file_exists'

# Everything that did not make it into `dataset`, for any reason.
not_in_dataset = ~df['filename'].isin(dataset['filename'].tolist())
dropped = df[not_in_dataset].copy().reset_index(drop=True)


In [16]:
# Persist the rows that are not part of the final dataset (index omitted).
dropped.to_csv(path_or_buf="data/dropped/dropped_files.csv", index=False)

In [18]:
# Re-read the raw JSONL and keep only the records whose filename survived
# filtering and clustering, then write them out as JSON Lines.
all_files_df = pd.read_json(jsonl_path, lines=True)

verified_files = set(dataset["filename"].unique())
is_verified = all_files_df["filename"].isin(verified_files)
all_files_filtered_df = all_files_df.loc[is_verified].reset_index(drop=True)

all_files_filtered_df.to_json(
    "data/training_datasets/verified_nomods.jsonl",
    orient="records",
    lines=True,
)

In [19]:
# Persist the per-file metrics table, including the quality_ok / fail_reason
# columns (index omitted).
df.to_csv(path_or_buf="data/file_metrics.csv", index=False)