In [1]:
import pandas as pd
import pickle
import os

In [2]:
import json

# Path to your IF results
res_path = "/kaggle/input/isolation-forest-results-and-testing-datasets/blockchain_anomaly_results_encoded.json"

# Try reading as a single JSON array
try:
    with open(res_path, "r") as f:
        data = json.load(f)
    res_df = pd.DataFrame(data)
    print(f"Loaded {len(res_df)} records from JSON array.")
except ValueError:
    # Fallback to line-by-line (NDJSON)
    print("JSON array load failed; falling back to line-by-line parsing.")
    records = []
    with open(res_path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            records.append(json.loads(line))
    res_df = pd.DataFrame(records)
    print(f"Loaded {len(res_df)} records from NDJSON.")

# Now do the same for your mock transactions if needed:
test_path = "/kaggle/input/isolation-forest-results-and-testing-datasets/mock_blockchain_transactions_large.json"

# Assuming mock file *is* an array of objects, you can just do:
with open(test_path, "r") as f:
    mock_data = json.load(f)
test_df = pd.DataFrame(mock_data)
print(f"Loaded {len(test_df)} mock transactions.")

# …and then proceed with the merge, filter, feature extraction, and predictions as before.


Loaded 945 records from JSON array.
Loaded 945 mock transactions.


In [3]:
# 2. Merge on tx_hash to align IF prediction with ground truth
merged = pd.merge(
    res_df, test_df,
    left_on="tx_hash", right_on="tx_hash",
    suffixes=("_if", "")
)

In [4]:
# 3. Filter for IF false negatives:
#    predicted auth (“authentic”) but actual type == “anomaly”
fn = merged[
    (merged["anomaly"] == "authentic") &
    (merged["type"]    != "authentic")
].copy()

print(f"Found {len(fn)} false-negative records.")

Found 251 false-negative records.


In [5]:
# 4. Build feature DataFrame exactly matching your pipelines’ X
#    pipelines expect columns:
#      ['BlockHeight','Value','hour','day','month','year','From','To']
fn["TimeStamp"] = pd.to_datetime(fn["timestamp"], unit="s")
fn["hour"]  = fn["TimeStamp"].dt.hour
fn["day"]   = fn["TimeStamp"].dt.day
fn["month"] = fn["TimeStamp"].dt.month
fn["year"]  = fn["TimeStamp"].dt.year

X_fn = fn[[
    "block_number", "value",
    "hour","day","month","year",
    "from", "to"
]].rename(columns={
    "block_number": "BlockHeight",
    "value":        "Value",
    "from":         "From",
    "to":           "To"
})

In [6]:
# 5. Load your saved pipelines from the Kaggle input dir
model_dir = "/kaggle/input/comp517_models/scikitlearn/default/1"
pipelines = {}
for fname in os.listdir(model_dir):
    if fname.startswith("pipeline_") and fname.endswith(".pkl"):
        name = fname.replace("pipeline_","").replace(".pkl","").replace("_","+")
        with open(os.path.join(model_dir, fname),"rb") as f:
            pipelines[name] = pickle.load(f)

print("Models to test:", list(pipelines.keys()))

Models to test: ['RF+XGB', 'MLP', 'RF+MLP', 'XGB', 'RF+MLP+XGB', 'MLP+XGB', 'RF']


In [7]:
# 6. Predict & summarize
summary = []
for name, pipe in pipelines.items():
    preds = pipe.predict(X_fn)
    valid_count = int((preds == 0).sum())
    error_count = int((preds == 1).sum())
    summary.append({
        "Model":     name,
        "Valid (0)": valid_count,
        "Error (1)": error_count,
        "Total":     len(preds)
    })

summary_df = pd.DataFrame(summary)
print("\nFalse-Negative Subset Prediction Summary:")
print(summary_df)


False-Negative Subset Prediction Summary:
        Model  Valid (0)  Error (1)  Total
0      RF+XGB        251          0    251
1         MLP          0        251    251
2      RF+MLP          0        251    251
3         XGB        251          0    251
4  RF+MLP+XGB        251          0    251
5     MLP+XGB        250          1    251
6          RF        251          0    251


In [8]:
# 7. Save to CSV if you like
summary_df.to_csv("fn_prediction_summary.csv", index=False)
print("\nSaved summary to fn_prediction_summary.csv")


Saved summary to fn_prediction_summary.csv
