In [18]:
import pandas as pd

# IoC behavior trends: each observed trend phrase is paired with a short
# label naming the aspect of the attack it describes.
ioc_trends = dict(
    [
        ("setup", "location"),
        ("spyware and information stealing", "function"),
        ("dropper/malware", "attack_type"),
        ("execute automatically upon download", "trigger_mechanism"),
        ("base64-encoded", "hide"),
    ]
)

# Keyword mapping of malicious behavior: behavior category -> list of
# substrings searched for in a feature name. Each space-separated word below
# becomes one keyword of its category.
behavior_keywords = {
    category: words.split()
    for category, words in [
        ("file", "write read delete"),
        ("cmd", "execute args env total"),
        ("dns", "query host type"),
        ("entropy", "entropy"),
        ("setup", "setup install"),
        ("dropper", "download write"),
        ("obfuscation", "base64 encode obfuscation"),
    ]
}

def match_feature_to_ioc_trends(feature_name: str) -> dict:
    """Flag which IoC behavior trends a feature name relates to.

    The name is lower-cased and tested for keyword substrings taken from the
    module-level ``behavior_keywords`` table.

    Args:
        feature_name: Feature name to classify.

    Returns:
        dict with five boolean flags (``setup_related``, ``spyware_related``,
        ``dropper_related``, ``auto_exec_related``, ``obfuscation_related``)
        plus ``total_match_score``, the number of flags that are True.

    NOTE(review): matching is plain substring search, so a keyword fires
    anywhere inside the name (e.g. "total" also hits "dns_total_queries" and
    "read" hits "readline") -- confirm this looseness is intended.
    """
    lowered = feature_name.lower()

    def contains_any(keywords) -> bool:
        # True as soon as one keyword occurs anywhere in the lower-cased name.
        return any(keyword in lowered for keyword in keywords)

    cmd_words = behavior_keywords["cmd"]
    flags = {
        "setup_related": contains_any(behavior_keywords["setup"]),
        "spyware_related": contains_any(behavior_keywords["file"] + cmd_words),
        "dropper_related": contains_any(behavior_keywords["dropper"]),
        "auto_exec_related": contains_any(cmd_words + ["auto", "exec"]),
        "obfuscation_related": contains_any(behavior_keywords["obfuscation"]),
    }
    # Booleans sum as 0/1, so this counts the flags that are set.
    flags["total_match_score"] = sum(flags.values())
    return flags

def evaluate_group(features, name):
    """Average the trend-match score over a group of feature names.

    Args:
        features: Iterable of feature-name strings; each one is scored with
            ``match_feature_to_ioc_trends``.
        name: Label stored under the ``"group"`` key of the result.

    Returns:
        dict with ``"group"`` (the given name) and ``"avg_trend_match_score"``
        (mean of the per-feature ``total_match_score`` values, or NaN when
        ``features`` is empty).
    """
    scores = [match_feature_to_ioc_trends(f)["total_match_score"] for f in features]
    # The mean of an empty group is undefined; report NaN instead of raising
    # ZeroDivisionError so one empty group cannot abort a multi-group run.
    avg_score = sum(scores) / len(scores) if scores else float("nan")
    return {"group": name, "avg_trend_match_score": avg_score}


In [19]:
# Feature names of the correlation group, in the order they appear as rows
# of the table printed below.
correlation_features = [
    "socket_unique_ips", "socket_unique_hostnames",
    "file_write_count", "file_unique_paths",
    "dns_unique_types", "dns_total_queries", "dns_unique_hosts",
    "file_read_count",
    "cmd_total_count", "cmd_unique_commands",
]

# Generate feature trend matching table: one record per feature holding its
# trend flags, the total score, and the feature name as the final column.
results = []
for feat in correlation_features:
    match_result = {**match_feature_to_ioc_trends(feat), "feature": feat}
    results += [match_result]

match = pd.DataFrame(results)
print(match)

   setup_related  spyware_related  dropper_related  auto_exec_related  \
0          False            False            False              False   
1          False            False            False              False   
2          False             True             True              False   
3          False            False            False              False   
4          False            False            False              False   
5          False             True            False               True   
6          False            False            False              False   
7          False             True            False              False   
8          False             True            False               True   
9          False            False            False              False   

   obfuscation_related  total_match_score                  feature  
0                False                  0        socket_unique_ips  
1                False                  0  socket_unique_h

In [20]:
# Feature names of the SHAP group, one per line.
shap_features = [
    "file_write_count",
    "cmd_total_envs",
    "cmd_total_count",
    "file_delete_count",
    "file_unique_paths",
    "cmd_total_args",
    "cmd_unique_commands",
    "file_read_count",
    "socket_unique_ips",
    "dns_unique_hosts",
]

In [21]:
# Edge-level feature names of the entropy group.
# NOTE(review): "Action:Read", "CMD:execute" and "socket_ip:access" each
# appear twice, so any per-entry aggregate counts them twice -- confirm the
# repeats are intended.
entropy_edge_features = [
    "DNS:AAAA_A",
    "Action:Write",
    "DNS:A_AAAA",
    "CMD:execute",
    "Action:Read",
    "socket_ip:access",
    "Action:Read",
    "socket_host:access",
    "CMD:execute",
    "socket_ip:access",
]

# Node-level feature names of the entropy group.
entropy_node_features = [
    "Readline/readline-i.ri",
    "libxml/xmlstring.h",
    "ClassMethods/commands-i.ri",
    "bundler/plugin",
    "Color/set_color-i.ri",
    "HiddenCommand/cdesc-HiddenCommand.ri",
    "Thor/Base",
    "templates/newgem",
    "Actions/inject_into_class-i.ri",
    "source/git",
]

In [22]:
# === Step 5: process all groups ===
# Group label -> feature names; insertion order sets the row order of the
# resulting table.
groups = dict(
    [
        ("SHAP + XGBoost", shap_features),
        ("Correlation", correlation_features),
        ("Entropy-Edge", entropy_edge_features),
        ("Entropy-Node", entropy_node_features),
    ]
)

# One summary record per group (this rebinds `results` to per-group records).
results = [
    evaluate_group(features=group_features, name=group_label)
    for group_label, group_features in groups.items()
]
score_df = pd.DataFrame(results)

# === Step 6: output result ===
print(score_df)

            group  avg_trend_match_score
0  SHAP + XGBoost                    1.0
1     Correlation                    0.7
2    Entropy-Edge                    0.8
3    Entropy-Node                    0.1
