In [None]:
import pandas as pd

# Build the benchmarking template CSV: the address list plus a ground-truth
# "scam" column and one empty prediction column per tool.
input_path = "OctopusGuard/data/experiment_address.csv"
output_path = "OctopusGuard/evaluations/competitor_benchmarking/competitor_benchmarking_template.csv"

models = ["smartinv", "SmarTest", "verismart", "honeypot_is", "slither", "ours"]
tasks = ["scam"]

df = pd.read_csv(input_path)

# Every row starts out labelled scam (1); the final 30 rows are then relabelled 0.
df["scam"] = 1
tail_rows = df.index[-30:]
df.loc[tail_rows, "scam"] = 0

# Placeholder columns are named "<model>_<task>" and stay empty until the
# per-tool cells fill them in.
placeholder_columns = [f"{model}_{task}" for model in models for task in tasks]
for column in placeholder_columns:
    df[column] = None

# utf-8-sig: the file is written with a leading byte-order mark.
df.to_csv(output_path, index=False, encoding='utf-8-sig')

print(f"success: {output_path}")


In [None]:
import os
import json
import re
import pandas as pd
from openai import OpenAI

# OpenAI SDK client pointed at the DeepSeek endpoint.
# The key is taken from the DEEPSEEK_API_KEY environment variable so it never
# has to be pasted into the notebook; the original "sk-" placeholder is kept
# as the fallback when the variable is not set.
client = OpenAI(
    api_key=os.environ.get("DEEPSEEK_API_KEY", "sk-"),
    base_url="https://api.deepseek.com"
)

def deepseek_judge_match(gt: str, slither_output: str) -> str:
    """Ask the DeepSeek chat model how well a Slither report matches the ground truth.

    Args:
        gt: Ground-truth vulnerability description.
        slither_output: Text of the Slither analysis for the same contract.

    Returns:
        The stripped text content of the first completion choice. The prompt
        requests a one-sentence verdict and a line of the form ``Score: XX``.
    """
    question = f"""
The following is the Ground Truth (human-labeled vulnerability description):
{gt}

And here is the vulnerability analysis output by the Slither tool:
{slither_output}

As a blockchain security expert, please evaluate the following:
1. Does the Slither output cover all the core issues in the Ground Truth? Summarize your judgment in one sentence (Match, Partial Match, No Match).
2. Give a similarity score between 0 and 100 indicating how well Slither's output matches the Ground Truth. Please use the format: `Score: XX`
""".strip()

    # Single-turn, non-streaming request with the whole question as the user message.
    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": question}],
        max_tokens=256,
        stream=False
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def extract_ground_truths(json_path, num_samples=300, group_size=6, gt_index=4):
    """Collect the ground-truth text of each sample group in a JSON file.

    The file is parsed as one flat list of records. Consecutive runs of
    ``group_size`` records form one sample, and the record at offset
    ``gt_index`` inside each group supplies the ground truth through its
    ``'completion'`` field.

    Args:
        json_path: Path to the JSON file.
        num_samples: Maximum number of groups to read (default 300).
        group_size: Number of records per group (default 6).
        gt_index: Offset of the ground-truth record inside a group (default 4).

    Returns:
        A list of stripped ground-truth strings. It holds ``num_samples``
        entries, or fewer when the file ends earlier; the original version
        raised ``IndexError`` in that case.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    ground_truths = []
    for start in range(0, num_samples * group_size, group_size):
        gt_position = start + gt_index
        # Stop at the first group whose ground-truth record is missing.
        if gt_position >= len(data):
            break
        ground_truths.append(data[gt_position]['completion'].strip())
    return ground_truths

def extract_slither_reports(log_path):
    """Split a combined Slither log into per-contract report texts.

    The log is cut at every ``======= Analyzing contract: ... =======`` header.
    Text before the first header is discarded, and at most the first 300
    sections are returned, each stripped of surrounding whitespace.

    Args:
        log_path: Path to the combined Slither report file.

    Returns:
        A list of report strings, one per header found (capped at 300).
    """
    header = re.compile(r"======= Analyzing contract: .+? =======\s*")
    with open(log_path, 'r', encoding='utf-8') as log_file:
        sections = header.split(log_file.read())

    # sections[0] is whatever precedes the first header, so start at 1.
    per_contract = sections[1:301]
    return list(map(str.strip, per_contract))

def parse_score(response_text: str) -> int:
    """Extract the similarity score from a judge reply.

    The first ``Score: <number>`` occurrence is used. Matching is
    case-insensitive and tolerates Markdown decoration around the label or the
    number (``**Score:** 85``, ``**Score**: 85``, ``Score: `85```), which the
    plain ``Score:\\s*\\d`` pattern would miss and silently score as 0.

    Args:
        response_text: Raw reply text from the judge model.

    Returns:
        The score clamped to the range 0-100, or 0 when no score is found.
    """
    match = re.search(
        r"score\s*[*_`]*\s*:\s*[*_`]*\s*(\d{1,3})",
        response_text or "",
        re.IGNORECASE,
    )
    if match is None:
        return 0
    # Clamp so a reply such as "Score: 250" cannot exceed the 0-100 scale.
    return max(0, min(100, int(match.group(1))))

def main():
    """Judge every Slither report against its ground truth and store the verdicts.

    Each (ground truth, Slither report) pair is sent to the DeepSeek judge. The
    reply and parsed score are echoed to stdout and to
    ``deepseek_eval_log_slither.txt``, and the resulting 0/1 verdicts are
    written to the ``slither_scam`` column of the benchmarking CSV.
    """
    json_path = "OctopusGuard/data/test_multimodal_data.json"
    log_path = "OctopusGuard/evaluations/competitor_benchmarking/combined_slither_report.txt"
    csv_path = "OctopusGuard/evaluations/competitor_benchmarking/competitor_benchmarking_template.csv"
    output_log_path = "deepseek_eval_log_slither.txt"

    ground_truths = extract_ground_truths(json_path)
    slither_outputs = extract_slither_reports(log_path)

    df = pd.read_csv(csv_path)
    slither_scam_results = []

    with open(output_log_path, "w", encoding="utf-8") as logfile:
        for idx, (gt, slither_report) in enumerate(zip(ground_truths, slither_outputs)):
            header = f"\n===== Contract {idx+1} ====="
            print(header)
            logfile.write(header + "\n")
            logfile.flush()
            try:
                reply = deepseek_judge_match(gt, slither_report)
                print(reply)
                logfile.write(reply + "\n")
                logfile.flush()

                score = parse_score(reply)
                print(f"[Score Parsed]: {score}")
                logfile.write(f"[Score Parsed]: {score}\n")

                # Samples 0-269 are treated as scam-labelled: a score >= 50
                # yields prediction 1. For later samples the mapping is
                # inverted, so a score >= 50 yields prediction 0.
                # NOTE(review): assumes the 270/30 split of the template; confirm.
                matched = score >= 50
                if idx < 270:
                    scam = 1 if matched else 0
                else:
                    scam = 0 if matched else 1

                slither_scam_results.append(scam)

            except Exception as e:
                error_msg = f"[ERROR] Contract {idx+1} failed: {e}"
                print(error_msg)
                logfile.write(error_msg + "\n")
                logfile.flush()
                # A failed judgement is recorded as prediction 0.
                slither_scam_results.append(0)

    # zip() stops at the shorter input, so there may be fewer verdicts than CSV
    # rows. Align by position and leave the rest empty instead of raising
    # after every API call has already been spent.
    if len(slither_scam_results) != len(df):
        print(f"[WARN] {len(slither_scam_results)} verdicts for {len(df)} rows; unmatched rows are left empty")
    verdicts = slither_scam_results[:len(df)]
    df['slither_scam'] = pd.Series(verdicts, index=df.index[:len(verdicts)], dtype="Int64")
    df.to_csv(csv_path, index=False)


if __name__ == "__main__":
    main()


In [None]:
import os
import json
import re
import csv
import pandas as pd
from openai import OpenAI

# OpenAI SDK client pointed at the DeepSeek endpoint.
# The key is taken from the DEEPSEEK_API_KEY environment variable so it never
# has to be pasted into the notebook; the original "sk-" placeholder is kept
# as the fallback when the variable is not set.
client = OpenAI(
    api_key=os.environ.get("DEEPSEEK_API_KEY", "sk-"),
    base_url="https://api.deepseek.com"
)

def deepseek_judge_match(gt: str, slither_output: str) -> str:
    """Ask the DeepSeek chat model how well a VeriSmart log matches the ground truth.

    The prompt now names VeriSmart throughout. The evaluation questions
    previously still referred to "Slither", so the judge was asked about a tool
    other than the one whose output it was shown.

    Args:
        gt: Ground-truth vulnerability description.
        slither_output: Text of the VeriSmart output for the same contract
            (the parameter keeps its historical name for compatibility).

    Returns:
        The stripped text content of the first completion choice. The prompt
        requests a one-sentence verdict and a line of the form ``Score: XX``.
    """
    prompt = f"""
The following is the Ground Truth (human-labeled vulnerability description):
{gt}

And here is the vulnerability analysis output by the VeriSmart tool:
{slither_output}

As a blockchain security expert, please evaluate the following:
1. Does the VeriSmart output cover all the core issues in the Ground Truth? Summarize your judgment in one sentence (Match, Partial Match, No Match).
2. Give a similarity score between 0 and 100 indicating how well VeriSmart's output matches the Ground Truth. Please use the format: `Score: XX`
"""
    # Single-turn, non-streaming request with the whole prompt as the user message.
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": prompt.strip()}],
        max_tokens=256,
        stream=False
    )
    return response.choices[0].message.content.strip()

def parse_score(response_text: str) -> int:
    """Extract the similarity score from a judge reply.

    The first ``Score: <number>`` occurrence is used. Matching is
    case-insensitive and tolerates Markdown decoration around the label or the
    number (``**Score:** 85``, ``**Score**: 85``, ``Score: `85```), which the
    plain ``Score:\\s*\\d`` pattern would miss and silently score as 0.

    Args:
        response_text: Raw reply text from the judge model.

    Returns:
        The score clamped to the range 0-100, or 0 when no score is found.
    """
    match = re.search(
        r"score\s*[*_`]*\s*:\s*[*_`]*\s*(\d{1,3})",
        response_text or "",
        re.IGNORECASE,
    )
    if match is None:
        return 0
    # Clamp so a reply such as "Score: 250" cannot exceed the 0-100 scale.
    return max(0, min(100, int(match.group(1))))

def extract_ground_truths(json_path, num_samples=300, group_size=6, gt_index=4):
    """Collect the ground-truth text of each sample group in a JSON file.

    The file is parsed as one flat list of records. Consecutive runs of
    ``group_size`` records form one sample, and the record at offset
    ``gt_index`` inside each group supplies the ground truth through its
    ``'completion'`` field.

    Args:
        json_path: Path to the JSON file.
        num_samples: Maximum number of groups to read (default 300).
        group_size: Number of records per group (default 6).
        gt_index: Offset of the ground-truth record inside a group (default 4).

    Returns:
        A list of stripped ground-truth strings. It holds ``num_samples``
        entries, or fewer when the file ends earlier; the original version
        raised ``IndexError`` in that case.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    ground_truths = []
    for start in range(0, num_samples * group_size, group_size):
        gt_position = start + gt_index
        # Stop at the first group whose ground-truth record is missing.
        if gt_position >= len(data):
            break
        ground_truths.append(data[gt_position]['completion'].strip())
    return ground_truths

def load_contract_addresses(csv_path):
    """Read the ``contract_address`` column of a CSV file.

    Args:
        csv_path: Path to a CSV file with a ``contract_address`` header.

    Returns:
        Up to the first 300 non-empty addresses, each stripped of surrounding
        whitespace, in file order.
    """
    # This notebook writes the benchmarking CSV with encoding 'utf-8-sig'.
    # Opened as plain 'utf-8', the byte-order mark would stay glued to the
    # first header name, so "contract_address" would never be found.
    # 'utf-8-sig' drops a leading BOM and reads BOM-less files unchanged.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        reader = csv.DictReader(f)
        addresses = [
            addr.strip()
            for addr in (row.get("contract_address") for row in reader)
            if addr
        ]
    return addresses[:300]

def load_verismart_logs(log_dir, addresses):
    """Load the ``<address>_verify.log`` file of every address.

    Args:
        log_dir: Directory that holds the log files.
        addresses: Iterable of contract addresses.

    Returns:
        A list with one entry per address, in the same order: the stripped
        file contents, or the placeholder ``"[LOG NOT FOUND]"`` when the file
        does not exist.
    """
    def read_or_placeholder(address):
        log_file = os.path.join(log_dir, f"{address}_verify.log")
        if not os.path.exists(log_file):
            return "[LOG NOT FOUND]"
        with open(log_file, "r", encoding="utf-8") as handle:
            return handle.read().strip()

    return [read_or_placeholder(address) for address in addresses]

def main():
    """Judge every VeriSmart log against its ground truth and store the verdicts.

    Addresses are read from the benchmarking CSV and the matching
    ``<address>_verify.log`` files are loaded. Each (ground truth, log) pair is
    sent to the DeepSeek judge, and the reply and parsed score are echoed to
    stdout and to ``deepseek_eval_log_verismart.txt``. The resulting 0/1
    verdicts are written to the ``verismart_scam`` column of the first 300 CSV
    rows, which are then saved back to the same file.
    """
    json_path = "OctopusGuard/data/test_multimodal_data.json"
    log_dir = "OctopusGuard/evaluations/competitor_benchmarking/versmart_logs"
    addr_csv_path = "OctopusGuard/evaluations/competitor_benchmarking/competitor_benchmarking_template.csv"
    output_log_path = "deepseek_eval_log_verismart.txt"

    ground_truths = extract_ground_truths(json_path)
    addresses = load_contract_addresses(addr_csv_path)
    verismart_logs = load_verismart_logs(log_dir, addresses)

    df = pd.read_csv(addr_csv_path)
    scam_results = []

    with open(output_log_path, "w", encoding="utf-8") as logfile:
        for idx, (gt, verilog) in enumerate(zip(ground_truths, verismart_logs)):
            header = f"\n===== Contract {idx+1} Deepseek Response (VeriSmart) ====="
            print(header)
            logfile.write(header + "\n")
            logfile.flush()
            try:
                reply = deepseek_judge_match(gt, verilog)
                print(reply)
                logfile.write(reply + "\n")
                logfile.flush()

                score = parse_score(reply)
                print(f"[Score Parsed]: {score}")
                logfile.write(f"[Score Parsed]: {score}\n")

                # Samples 0-269 are treated as scam-labelled: a score >= 50
                # yields prediction 1. For later samples the mapping is
                # inverted, so a score >= 50 yields prediction 0.
                # NOTE(review): assumes the 270/30 split of the template; confirm.
                matched = score >= 50
                if idx < 270:
                    scam = 1 if matched else 0
                else:
                    scam = 0 if matched else 1

                scam_results.append(scam)

            except Exception as e:
                error_msg = f"[ERROR] Contract {idx+1} failed: {e}"
                print(error_msg)
                logfile.write(error_msg + "\n")
                logfile.flush()
                # A failed judgement is recorded as prediction 0.
                scam_results.append(0)

    # Take an explicit copy of the first 300 rows: assigning a column to a
    # plain slice of the frame triggers pandas' SettingWithCopy warning.
    df = df.iloc[:300].copy()

    # zip() stops at the shorter input, so there may be fewer verdicts than
    # rows. Align by position and leave the rest empty instead of raising
    # after every API call has already been spent.
    if len(scam_results) != len(df):
        print(f"[WARN] {len(scam_results)} verdicts for {len(df)} rows; unmatched rows are left empty")
    verdicts = scam_results[:len(df)]
    df['verismart_scam'] = pd.Series(verdicts, index=df.index[:len(verdicts)], dtype="Int64")
    df.to_csv(addr_csv_path, index=False)


if __name__ == "__main__":
    main()


In [None]:
import os
import json
import re
import csv
import pandas as pd
from openai import OpenAI

# OpenAI SDK client pointed at the DeepSeek endpoint.
# Supply your DeepSeek API key through the DEEPSEEK_API_KEY environment
# variable so it never has to be pasted into the notebook; the original "sk-"
# placeholder is kept as the fallback when the variable is not set.
client = OpenAI(
    api_key=os.environ.get("DEEPSEEK_API_KEY", "sk-"),
    base_url="https://api.deepseek.com"
)

def deepseek_judge_match(gt: str, slither_output: str) -> str:
    """Ask the DeepSeek chat model how well a SmarTest log matches the ground truth.

    The prompt now names SmarTest throughout. The evaluation questions
    previously still referred to "Slither", so the judge was asked about a tool
    other than the one whose output it was shown.

    Args:
        gt: Ground-truth vulnerability description.
        slither_output: Text of the SmarTest output for the same contract
            (the parameter keeps its historical name for compatibility).

    Returns:
        The stripped text content of the first completion choice. The prompt
        requests a one-sentence verdict and a line of the form ``Score: XX``.
    """
    prompt = f"""
The following is the Ground Truth (human-labeled vulnerability description):
{gt}

And here is the vulnerability analysis output by the SmarTest tool:
{slither_output}

As a blockchain security expert, please evaluate the following:
1. Does the SmarTest output cover all the core issues in the Ground Truth? Summarize your judgment in one sentence (Match, Partial Match, No Match).
2. Give a similarity score between 0 and 100 indicating how well SmarTest's output matches the Ground Truth. Please use the format: `Score: XX`
"""
    # Single-turn, non-streaming request with the whole prompt as the user message.
    response = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": prompt.strip()}],
        max_tokens=256,
        stream=False
    )
    return response.choices[0].message.content.strip()

def parse_score(response_text: str) -> int:
    """Extract the similarity score from a judge reply.

    The first ``Score: <number>`` occurrence is used. Matching is
    case-insensitive and tolerates Markdown decoration around the label or the
    number (``**Score:** 85``, ``**Score**: 85``, ``Score: `85```), which the
    plain ``Score:\\s*\\d`` pattern would miss and silently score as 0.

    Args:
        response_text: Raw reply text from the judge model.

    Returns:
        The score clamped to the range 0-100, or 0 when no score is found.
    """
    match = re.search(
        r"score\s*[*_`]*\s*:\s*[*_`]*\s*(\d{1,3})",
        response_text or "",
        re.IGNORECASE,
    )
    if match is None:
        return 0
    # Clamp so a reply such as "Score: 250" cannot exceed the 0-100 scale.
    return max(0, min(100, int(match.group(1))))

def extract_ground_truths(json_path, num_samples=300, group_size=6, gt_index=4):
    """Collect the ground-truth text of each sample group in a JSON file.

    The file is parsed as one flat list of records. Consecutive runs of
    ``group_size`` records form one sample, and the record at offset
    ``gt_index`` inside each group supplies the ground truth through its
    ``'completion'`` field.

    Args:
        json_path: Path to the JSON file.
        num_samples: Maximum number of groups to read (default 300).
        group_size: Number of records per group (default 6).
        gt_index: Offset of the ground-truth record inside a group (default 4).

    Returns:
        A list of stripped ground-truth strings. It holds ``num_samples``
        entries, or fewer when the file ends earlier; the original version
        raised ``IndexError`` in that case.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    ground_truths = []
    for start in range(0, num_samples * group_size, group_size):
        gt_position = start + gt_index
        # Stop at the first group whose ground-truth record is missing.
        if gt_position >= len(data):
            break
        ground_truths.append(data[gt_position]['completion'].strip())
    return ground_truths

def load_contract_addresses(csv_path):
    """Read the ``contract_address`` column of a CSV file.

    Args:
        csv_path: Path to a CSV file with a ``contract_address`` header.

    Returns:
        Up to the first 300 non-empty addresses, each stripped of surrounding
        whitespace, in file order.
    """
    # This notebook writes the benchmarking CSV with encoding 'utf-8-sig'.
    # Opened as plain 'utf-8', the byte-order mark would stay glued to the
    # first header name, so "contract_address" would never be found.
    # 'utf-8-sig' drops a leading BOM and reads BOM-less files unchanged.
    with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
        reader = csv.DictReader(f)
        addresses = [
            addr.strip()
            for addr in (row.get("contract_address") for row in reader)
            if addr
        ]
    return addresses[:300]

def load_verismart_logs(log_dir, addresses):
    """Load the ``<address>_exploit.log`` file of every address.

    Args:
        log_dir: Directory that holds the log files.
        addresses: Iterable of contract addresses.

    Returns:
        A list with one entry per address, in the same order: the stripped
        file contents, or the placeholder ``"[LOG NOT FOUND]"`` when the file
        does not exist.
    """
    def read_or_placeholder(address):
        log_file = os.path.join(log_dir, f"{address}_exploit.log")
        if not os.path.exists(log_file):
            return "[LOG NOT FOUND]"
        with open(log_file, "r", encoding="utf-8") as handle:
            return handle.read().strip()

    return [read_or_placeholder(address) for address in addresses]

def main():
    """Judge every SmarTest log against its ground truth and store the verdicts.

    Addresses are read from the benchmarking CSV and the matching
    ``<address>_exploit.log`` files are loaded. Each (ground truth, log) pair
    is sent to the DeepSeek judge, and the reply and parsed score are echoed to
    stdout and to ``deepseek_eval_log_smartest.txt``. The resulting 0/1
    verdicts are written to the ``SmarTest_scam`` column of the first 300 CSV
    rows, which are then saved back to the same file.
    """
    json_path = "OctopusGuard/data/test_multimodal_data.json"
    log_dir = "OctopusGuard/evaluations/competitor_benchmarking/versmart_logs"
    addr_csv_path = "OctopusGuard/evaluations/competitor_benchmarking/competitor_benchmarking_template.csv"
    output_log_path = "deepseek_eval_log_smartest.txt"

    ground_truths = extract_ground_truths(json_path)
    addresses = load_contract_addresses(addr_csv_path)
    verismart_logs = load_verismart_logs(log_dir, addresses)

    df = pd.read_csv(addr_csv_path)
    scam_results = []

    with open(output_log_path, "w", encoding="utf-8") as logfile:
        for idx, (gt, verilog) in enumerate(zip(ground_truths, verismart_logs)):
            header = f"\n===== Contract {idx+1} Deepseek Response (SmarTest) ====="
            print(header)
            logfile.write(header + "\n")
            logfile.flush()
            try:
                reply = deepseek_judge_match(gt, verilog)
                print(reply)
                logfile.write(reply + "\n")
                logfile.flush()

                score = parse_score(reply)
                print(f"[Score Parsed]: {score}")
                logfile.write(f"[Score Parsed]: {score}\n")

                # Samples 0-269 are treated as scam-labelled: a score >= 50
                # yields prediction 1. For later samples the mapping is
                # inverted, so a score >= 50 yields prediction 0.
                # NOTE(review): assumes the 270/30 split of the template; confirm.
                matched = score >= 50
                if idx < 270:
                    scam = 1 if matched else 0
                else:
                    scam = 0 if matched else 1

                scam_results.append(scam)

            except Exception as e:
                error_msg = f"[ERROR] Contract {idx+1} failed: {e}"
                print(error_msg)
                logfile.write(error_msg + "\n")
                logfile.flush()
                # A failed judgement is recorded as prediction 0.
                scam_results.append(0)

    # Take an explicit copy of the first 300 rows: assigning a column to a
    # plain slice of the frame triggers pandas' SettingWithCopy warning.
    df = df.iloc[:300].copy()

    # zip() stops at the shorter input, so there may be fewer verdicts than
    # rows. Align by position and leave the rest empty instead of raising
    # after every API call has already been spent.
    if len(scam_results) != len(df):
        print(f"[WARN] {len(scam_results)} verdicts for {len(df)} rows; unmatched rows are left empty")
    verdicts = scam_results[:len(df)]
    # The template cell names this tool's column "SmarTest_scam". Writing to
    # "smartest_scam" left the template column empty and added a second one.
    df['SmarTest_scam'] = pd.Series(verdicts, index=df.index[:len(verdicts)], dtype="Int64")
    df.to_csv(addr_csv_path, index=False)


if __name__ == "__main__":
    main()


In [None]:
import os
import json
import re
import pandas as pd
from openai import OpenAI

# OpenAI SDK client pointed at the DeepSeek endpoint.
# The key is taken from the DEEPSEEK_API_KEY environment variable so it never
# has to be pasted into the notebook; the original "sk-" placeholder is kept
# as the fallback when the variable is not set.
client = OpenAI(
    api_key=os.environ.get("DEEPSEEK_API_KEY", "sk-"),
    base_url="https://api.deepseek.com"
)

def deepseek_judge_match(gt: str, slither_output: str) -> str:
    """Ask the DeepSeek chat model how well a smartInv report matches the ground truth.

    Args:
        gt: Ground-truth vulnerability description.
        slither_output: Text of the smartInv analysis for the same contract.

    Returns:
        The stripped text content of the first completion choice. The prompt
        requests a one-sentence verdict and a line of the form ``Score: XX``.
    """
    question = f"""
The following is the Ground Truth (human-labeled vulnerability description):
{gt}

And here is the vulnerability analysis output by the smartInv tool:
{slither_output}

As a blockchain security expert, please evaluate the following:
1. Does the smartInv output cover all the core issues in the Ground Truth? Summarize your judgment in one sentence (Match, Partial Match, No Match).
2. Give a similarity score between 0 and 100 indicating how well smartInv's output matches the Ground Truth. Please use the format: `Score: XX`
""".strip()

    # Single-turn, non-streaming request with the whole question as the user message.
    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=[{"role": "user", "content": question}],
        max_tokens=256,
        stream=False
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def extract_ground_truths(json_path, num_samples=300, group_size=6, gt_index=4):
    """Collect the ground-truth text of each sample group in a JSON file.

    The file is parsed as one flat list of records. Consecutive runs of
    ``group_size`` records form one sample, and the record at offset
    ``gt_index`` inside each group supplies the ground truth through its
    ``'completion'`` field.

    Args:
        json_path: Path to the JSON file.
        num_samples: Maximum number of groups to read (default 300).
        group_size: Number of records per group (default 6).
        gt_index: Offset of the ground-truth record inside a group (default 4).

    Returns:
        A list of stripped ground-truth strings. It holds ``num_samples``
        entries, or fewer when the file ends earlier; the original version
        raised ``IndexError`` in that case.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    ground_truths = []
    for start in range(0, num_samples * group_size, group_size):
        gt_position = start + gt_index
        # Stop at the first group whose ground-truth record is missing.
        if gt_position >= len(data):
            break
        ground_truths.append(data[gt_position]['completion'].strip())
    return ground_truths

def extract_slither_reports(log_path):
    """Pull the per-contract report bodies out of a combined analysis log.

    A report is the text between the end of a ``Contract Address:`` header line
    and the line break preceding the next ``ANALYSIS COMPLETE`` marker.

    Args:
        log_path: Path to the combined analysis log.

    Returns:
        A list with at most the first 781 report bodies, each stripped of
        surrounding whitespace.
    """
    # DOTALL lets the lazy groups span several lines of a single report.
    section_re = re.compile(
        r"==================== Contract Address:.*?\n(.*?)\n====================== ANALYSIS COMPLETE =====================",
        re.DOTALL
    )
    with open(log_path, 'r', encoding='utf-8') as log_file:
        bodies = section_re.findall(log_file.read())

    return list(map(str.strip, bodies[:781]))

def parse_score(response_text: str) -> int:
    """Extract the similarity score from a judge reply.

    The first ``Score: <number>`` occurrence is used. Matching is
    case-insensitive and tolerates Markdown decoration around the label or the
    number (``**Score:** 85``, ``**Score**: 85``, ``Score: `85```), which the
    plain ``Score:\\s*\\d`` pattern would miss and silently score as 0.

    Args:
        response_text: Raw reply text from the judge model.

    Returns:
        The score clamped to the range 0-100, or 0 when no score is found.
    """
    match = re.search(
        r"score\s*[*_`]*\s*:\s*[*_`]*\s*(\d{1,3})",
        response_text or "",
        re.IGNORECASE,
    )
    if match is None:
        return 0
    # Clamp so a reply such as "Score: 250" cannot exceed the 0-100 scale.
    return max(0, min(100, int(match.group(1))))

def main():
    """Judge every smartInv report against its ground truth and store the verdicts.

    Each (ground truth, smartInv report) pair is sent to the DeepSeek judge.
    The reply and parsed score are echoed to stdout and to
    ``deepseek_eval_log_smartInv.txt``, and the resulting 0/1 verdicts are
    written to the ``smartinv_scam`` column of the benchmarking CSV.
    """
    json_path = "OctopusGuard/data/test_multimodal_data.json"
    log_path = "OctopusGuard/evaluations/competitor_benchmarking/analysis_log_samrtInv.txt"
    csv_path = "OctopusGuard/evaluations/competitor_benchmarking/competitor_benchmarking_template.csv"
    output_log_path = "deepseek_eval_log_smartInv.txt"

    ground_truths = extract_ground_truths(json_path)
    slither_outputs = extract_slither_reports(log_path)

    df = pd.read_csv(csv_path)
    slither_scam_results = []

    with open(output_log_path, "w", encoding="utf-8") as logfile:
        for idx, (gt, slither_report) in enumerate(zip(ground_truths, slither_outputs)):
            header = f"\n===== Contract {idx+1} ====="
            print(header)
            logfile.write(header + "\n")
            logfile.flush()
            try:
                reply = deepseek_judge_match(gt, slither_report)
                print(reply)
                logfile.write(reply + "\n")
                logfile.flush()

                score = parse_score(reply)
                print(f"[Score Parsed]: {score}")
                logfile.write(f"[Score Parsed]: {score}\n")

                # Samples 0-269 are treated as scam-labelled: a score >= 50
                # yields prediction 1. For later samples the mapping is
                # inverted, so a score >= 50 yields prediction 0.
                # NOTE(review): assumes the 270/30 split of the template; confirm.
                matched = score >= 50
                if idx < 270:
                    scam = 1 if matched else 0
                else:
                    scam = 0 if matched else 1

                slither_scam_results.append(scam)

            except Exception as e:
                error_msg = f"[ERROR] Contract {idx+1} failed: {e}"
                print(error_msg)
                logfile.write(error_msg + "\n")
                logfile.flush()
                # A failed judgement is recorded as prediction 0.
                slither_scam_results.append(0)

    # zip() stops at the shorter input, so there may be fewer verdicts than CSV
    # rows. Align by position and leave the rest empty instead of raising
    # after every API call has already been spent.
    if len(slither_scam_results) != len(df):
        print(f"[WARN] {len(slither_scam_results)} verdicts for {len(df)} rows; unmatched rows are left empty")
    verdicts = slither_scam_results[:len(df)]
    # The template cell names this tool's column "smartinv_scam". Writing to
    # "smartInv_scam" left the template column empty and added a second one.
    df['smartinv_scam'] = pd.Series(verdicts, index=df.index[:len(verdicts)], dtype="Int64")
    # index=False: writing the index would add an unnamed extra column to the
    # shared CSV, which every later reader would pick up as data.
    df.to_csv(csv_path, index=False)


if __name__ == "__main__":
    main()


In [None]:
import requests
import pandas as pd
import time
import json
import os

# Query the honeypot.is API for every contract in the benchmarking CSV and
# record its verdict in the "honeypot_is_scam" column (1 = honeypot,
# 0 = not a honeypot, empty = no verdict in the response).
input_csv_path = "OctopusGuard/evaluations/competitor_benchmarking/competitor_benchmarking_template.csv"
log_file_path = "OctopusGuard/evaluations/competitor_benchmarking/honeypotIs_api_log.txt"

df = pd.read_csv(input_csv_path)

if 'honeypot_is_scam' not in df.columns:
    df['honeypot_is_scam'] = float('nan')

# Keep the column in a nullable-integer dtype from the start. When any value is
# missing, read_csv yields floats, so the old string check against '0'/'1' saw
# '1.0'/'0.0' and re-queried rows that already had a verdict.
df['honeypot_is_scam'] = pd.to_numeric(df['honeypot_is_scam'], errors='coerce').astype('Int64')

url = "https://api.honeypot.is/v2/IsHoneypot"
max_retries = 20

# The log file is opened in append mode, so earlier runs' entries are kept.
with open(log_file_path, 'a', encoding='utf-8') as log_file:

    for idx, row in df.iterrows():
        address = row['contract_address']

        # Resume support: skip rows that already hold a 0/1 verdict.
        current_value = row['honeypot_is_scam']
        if pd.notna(current_value) and int(current_value) in (0, 1):
            continue

        print(f" {address}...")

        params = {'address': address}
        retry_count = 0

        while retry_count < max_retries:
            try:
                response = requests.get(url, params=params, timeout=30)
                response.raise_for_status()
                data = response.json()

                is_honeypot = data.get('honeypotResult', {}).get('isHoneypot', None)

                if is_honeypot is True:
                    df.at[idx, 'honeypot_is_scam'] = 1
                elif is_honeypot is False:
                    df.at[idx, 'honeypot_is_scam'] = 0
                else:
                    # No boolean verdict in the response: leave the cell empty.
                    df.at[idx, 'honeypot_is_scam'] = pd.NA

                log_file.write(f"address: {address}\n")
                log_file.write(json.dumps(data, ensure_ascii=False, indent=4))
                log_file.write("\n" + "="*80 + "\n")
                log_file.flush()

                break

            except Exception as e:
                retry_count += 1
                print(f"failed ({retry_count}/{max_retries}): {str(e)}, retrying...")
                time.sleep(3)

        # Every attempt failed: the contract is recorded as 0.
        # NOTE(review): the message presumes the failure means "no liquidity
        # pool"; a persistent network error ends up here as well. Confirm.
        if retry_count >= max_retries:
            print(f"{address} no liquidity pool")
            df.at[idx, 'honeypot_is_scam'] = 0

        time.sleep(1)

# The column is already Int64; missing verdicts are written as empty cells.
df.to_csv(input_csv_path, index=False, encoding='utf-8-sig')


In [None]:

import pandas as pd
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix

# Compute confusion-matrix counts and precision / recall / accuracy / F1 for
# every tool's prediction column against the "scam" ground-truth column.
df = pd.read_csv("OctopusGuard/evaluations/competitor_benchmarking/competitor_benchmarking_template.csv")

y_true = df["scam"]

# Prediction columns follow the "<tool>_scam" naming used by the template.
# Filtering on the suffix keeps unrelated columns (for example a stray index
# column) from being scored as if they were tools.
tools = [
    col for col in df.columns
    if col not in ["contract_address", "scam"] and col.endswith("_scam")
]

results = []

for tool in tools:
    predictions = pd.to_numeric(df[tool], errors="coerce")

    # sklearn rejects NaN, so only rows that carry a prediction are scored.
    # Columns that were never filled in are skipped with a message.
    scored = predictions.notna() & y_true.notna()
    if not scored.any():
        print(f"result - {tool}: no predictions recorded, skipped")
        print("-" * 40)
        continue

    y_ref = y_true[scored].astype(int)
    y_pred = predictions[scored].astype(int)
    evaluated = int(scored.sum())

    tn, fp, fn, tp = confusion_matrix(y_ref, y_pred, labels=[0,1]).ravel()

    # zero_division=0 reports 0 instead of warning when a tool never predicts
    # the positive class (or no positive sample is among the scored rows).
    precision = precision_score(y_ref, y_pred, zero_division=0)
    recall = recall_score(y_ref, y_pred, zero_division=0)
    accuracy = accuracy_score(y_ref, y_pred)
    f1 = f1_score(y_ref, y_pred, zero_division=0)

    print(f"result - {tool}")
    print(f"Evaluated rows: {evaluated}/{len(df)}")
    print(f"TP: {tp}")
    print(f"FN: {fn}")
    print(f"FP: {fp}")
    print(f"TN: {tn}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("-" * 40)

    results.append({
        "Tool": tool,
        "Evaluated": evaluated,
        "TP": tp,
        "FN": fn,
        "FP": fp,
        "TN": tn,
        "Precision": round(precision, 4),
        "Recall": round(recall, 4),
        "Accuracy": round(accuracy, 4),
        "F1 Score": round(f1, 4)
    })

results_df = pd.DataFrame(results)
results_df.to_csv("competitor_benchmarking_results.csv", index=False)
