In [None]:
import networkx as nx
import pickle
import copy
from cdlib import algorithms
import os
import time
from cdlib import algorithms

# Load the pickled knowledge graph from disk.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# open a pickle file that comes from a trusted source.
with open('./SECCG generated Knowledge Graph.pkl', 'rb') as f:
    G_Value = pickle.load(f)
# Work on a deep copy so the conversion below cannot touch G_Value itself.
undirected_G_Value=copy.deepcopy(G_Value)
# Undirected view of the graph; a later cell partitions this object.
G_Value_undirected=undirected_G_Value.to_undirected()

In [None]:
import os
import time
import logging
import pickle
from copy import copy
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import math
import tqdm
import numpy as np
from math import log, e
from multiprocessing import Pool
from functools import partial

def filter_graph_by_behave_conf(graph):
    """Build an undirected graph holding only the confident edges of ``graph``.

    An edge is kept when its ``behave_conf`` attribute is strictly greater
    than 0.5; an edge without that attribute is treated as having 0. Every
    attribute of a kept edge is copied over. Node attributes are not copied.
    """
    confident = nx.Graph()
    for endpoints in graph.edges:
        source, target = endpoints[0], endpoints[1]
        attrs = graph[source][target]
        if attrs.get("behave_conf", 0) > 0.5:
            confident.add_edge(source, target, **attrs)

    # Drop any node that ended up without an edge.
    lonely = [node for node in nx.isolates(confident)]
    confident.remove_nodes_from(lonely)
    return confident

def get_graph_tactics(graph):
    """Return, for every edge, the indices of its active tactics.

    Each edge's ``tactic_conf`` attribute is read as a sequence of flags; the
    positions whose value equals 1 are collected. An edge without a
    ``tactic_conf`` attribute contributes an empty list instead of raising
    ``TypeError``, so the result always has one entry per edge.

    Returns:
        list[list[int]]: one list of active indices per edge, in edge order.
    """
    tactic_list = []
    for edge in graph.edges:
        one_hot = graph[edge[0]][edge[1]].get("tactic_conf")
        if one_hot is None:
            # No tactic information on this edge.
            tactic_list.append([])
            continue
        tactic_list.append([i for i, x in enumerate(one_hot) if x == 1])
    return tactic_list

def check_unique_tactic(tactic_list):
    """Count the distinct tactic indices found across all per-edge lists."""
    distinct = {tactic for per_edge in tactic_list for tactic in per_edge}
    return len(distinct)

def check_graph_by_unique_article_id(graph):
    """Tell whether the edges of ``graph`` reference at least two articles.

    Collects the non-``None`` ``article_id`` attribute of every edge and
    returns ``True`` when two or more distinct ids were seen.
    """
    seen_ids = set()
    for endpoints in graph.edges:
        attrs = graph[endpoints[0]][endpoints[1]]
        if attrs.get("article_id") is None:
            continue
        seen_ids.add(attrs["article_id"])
    return len(seen_ids) >= 2

def check_graph_has_tactic(graph):
    """Tell whether any edge of ``graph`` has at least one active tactic.

    An edge has an active tactic when its ``tactic_conf`` attribute contains
    a flag equal to 1, the same test the other tactic helpers in this file
    use. The previous substring check on ``str(tactic_conf)`` also fired on
    values such as ``0.1`` or ``10``.

    A string-valued ``tactic_conf`` is still checked for the character
    ``"1"``, and a scalar value is compared to 1 directly.
    """
    for edge in graph.edges:
        tactic_conf = graph[edge[0]][edge[1]].get("tactic_conf")
        if tactic_conf is None:
            continue
        if isinstance(tactic_conf, str):
            # Keep supporting string encodings such as "0100000000".
            if "1" in tactic_conf:
                return True
            continue
        try:
            flags = iter(tactic_conf)
        except TypeError:
            # Scalar attribute: treat it as a single flag.
            flags = iter((tactic_conf,))
        if any(flag == 1 for flag in flags):
            return True
    return False

def new_draw_graph(
    graph, draw_edges=True, save_folder="defaultgraph", saveorshow="show"
):
    """Draw ``graph`` on a circular layout and show or save the figure.

    Args:
        graph: networkx graph to draw.
        draw_edges: when true, each edge that has a ``relation`` attribute is
            labelled with the relation, a per-figure article tag
            ("from article0", "from article1", ... in order of first
            appearance of the ``article_id`` values) and the names of its
            active tactics (``tactic_conf`` flags equal to 1).
        save_folder: directory for the PNG when ``saveorshow == "save"``;
            created if missing.
        saveorshow: ``"show"`` displays the figure, ``"save"`` writes it to
            ``save_folder`` under a random numeric file name and closes it.
            Any other value draws without showing or saving.

    Tactic names are looked up in the module-level ``big_label_list``, which
    must be defined before an edge with an active tactic is drawn.
    """
    # Imported locally: this cell does not import ``random`` itself, so save
    # mode would otherwise depend on a later cell having been run first.
    import random

    pos = nx.circular_layout(graph)
    plt.figure(num=None, figsize=(20, 20), dpi=100)
    nx.draw_networkx(
        graph,
        pos,
        with_labels=True,
        node_size=20,
        arrowsize=90,
        linewidths=1.5,
        arrowstyle="->",
        edge_color="red",
        node_shape="o",
        bbox=dict(facecolor="black", edgecolor="black", boxstyle="round,pad=0.3"),
        node_color="black",
        font_size=15,
        font_color="white",
        # labels=nx.get_node_attributes(graph, 'label'),
    )
    if draw_edges:
        relations = nx.get_edge_attributes(graph, "relation")
        articles_labels = nx.get_edge_attributes(graph, "article_id")
        one_hot = nx.get_edge_attributes(graph, "tactic_conf")

        # Give every distinct article id a short tag, numbered by first appearance.
        article_dict = {}
        for article in articles_labels.values():
            if article not in article_dict:
                article_dict[article] = "from article" + str(len(article_dict))

        edge_labels = {}
        for key, value in relations.items():
            new_value = "relation:" + value
            # An edge may carry a relation but no article id; skip the tag then
            # instead of failing with KeyError.
            if key in articles_labels:
                new_value += f"\n{article_dict[articles_labels[key]]}"
            active = [i for i, x in enumerate(one_hot.get(key, ())) if x == 1]
            if active:
                new_value = new_value + "\nattack tactic:\n"
                for tactic in active:
                    new_value += f"{big_label_list[tactic]},\n"
            edge_labels[key] = new_value
        nx.draw_networkx_edge_labels(
            graph,
            pos,
            edge_labels=edge_labels,
            font_color="red",
            font_size=12,
        )
    if saveorshow == "show":
        plt.show()
    if saveorshow == "save":
        os.makedirs(save_folder, exist_ok=True)
        # The file name is random; NOTE(review): two figures can draw the same
        # number and overwrite each other.
        plt.savefig(save_folder + "/" + str(random.randint(0, 1000000)) + ".png")
        plt.close()

def calculate_percentage(folder_path):
    """Return the percentage of PNG files in ``folder_path`` marked meaningful.

    A PNG counts as meaningful when the character just before ``.png`` is
    ``"y"`` and as not meaningful when it is ``"n"``. Any other PNG (including
    a file named exactly ``.png``, which used to raise ``IndexError``) is
    counted in the total and reported with a warning.

    Returns:
        0 when the folder holds no PNG file, otherwise a float in [0, 100].
    """
    total_count = 0
    meaningful_count = 0
    for filename in os.listdir(folder_path):
        if not filename.endswith(".png"):
            continue
        total_count += 1
        # Last character of the stem; "" when the stem itself is empty.
        marker = filename[:-4][-1:]
        if marker == "y":
            meaningful_count += 1
        elif marker != "n":
            print(f"Warning: unexpected filename {filename}")
    if total_count == 0:
        return 0
    return meaningful_count / total_count * 100

def entropy(lst):
    """Shannon entropy, in bits, of the value distribution in ``lst``.

    Values are counted as dictionary keys, so they must be hashable. An
    empty list yields 0.
    """
    total = len(lst)
    tally = {}
    for item in lst:
        if item in tally:
            tally[item] += 1
        else:
            tally[item] = 1
    shares = [occurrences / total for occurrences in tally.values()]
    return -sum(share * math.log2(share) for share in shares)

def entropy_one_hot(list):
    """Sum of per-column binary entropies (natural log) of a 0/1 matrix.

    ``list`` is converted to a 2-D NumPy array with one row per sample. For
    each column the share of non-zero entries and its complement are
    computed; a column adds its binary entropy only when both shares are
    positive. An empty input yields 0.
    """
    if len(list) == 0:
        return 0
    matrix = np.array(list)
    # Unpacking also enforces that the input is two-dimensional.
    n_rows, n_cols = matrix.shape
    ent = 0
    for col_idx in range(n_cols):
        ones = np.count_nonzero(matrix[:, col_idx])
        p_ones = ones / n_rows
        p_zeros = (n_rows - ones) / n_rows
        if not (p_ones > 0 and p_zeros > 0):
            # A constant column carries no information.
            continue
        ent += -p_ones * log(p_ones, e) - p_zeros * log(p_zeros, e)
    return ent

def calculate_graph_stats(graph):
    """Compute coverage ratios and entropies for one (sub)graph.

    Counts are taken directly from the attribute dictionaries; the previous
    version built three DataFrames (two of them never used) and summed raw
    truthy values.

    Returns a 6-tuple:
        avg_precent: mean of the three ratios below (0 for a graph without nodes).
        entity_percent: share of nodes whose ``entity_conf`` equals 1.
        behave_percent: share of edges whose ``behave_conf`` is above 0.5.
        tactic_percent: share of edges with a truthy ``tactic_conf`` that
            differs from ``[0] * 10``.
        article_entropy: ``entropy`` of the edges' ``article_id`` values.
        tactic_entropy: ``entropy_one_hot`` of the edges' ``tactic_conf`` values.
    """
    node_attrs = [attrs for _, attrs in graph.nodes(data=True)]
    edge_attrs = [attrs for _, _, attrs in graph.edges(data=True)]
    n_nodes = len(node_attrs)
    n_edges = len(edge_attrs)

    entity_count = sum(1 for attrs in node_attrs if attrs.get("entity_conf") == 1)
    # bool() keeps a missing or zero confidence from being summed as a raw value.
    behave_count = sum(
        1
        for attrs in edge_attrs
        if bool(attrs.get("behave_conf") and attrs.get("behave_conf") > 0.5)
    )
    tactic_count = sum(
        1
        for attrs in edge_attrs
        if bool(attrs.get("tactic_conf") and attrs.get("tactic_conf") != [0] * 10)
    )

    entity_percent = entity_count / n_nodes if n_nodes > 0 else 0
    behave_percent = behave_count / n_edges if n_edges > 0 else 0
    tactic_percent = tactic_count / n_edges if n_edges > 0 else 0
    avg_precent = (
        (entity_percent + behave_percent + tactic_percent) / 3 if n_nodes > 0 else 0
    )

    all_article_id = [attrs.get("article_id") for attrs in edge_attrs]
    all_tactic = [attrs.get("tactic_conf") for attrs in edge_attrs]
    article_entropy = entropy(all_article_id)
    tactic_entropy = entropy_one_hot(all_tactic)
    return (
        avg_precent,
        entity_percent,
        behave_percent,
        tactic_percent,
        article_entropy,
        tactic_entropy,
    )

def _min_max_normalize(values):
    """Scale a numeric Series to [0, 1]; an empty or constant Series maps to 0."""
    if len(values) == 0:
        return pd.Series(0.0, index=values.index)
    low = values.min()
    high = values.max()
    if high == low:
        return pd.Series(0.0, index=values.index)
    return (values - low) / (high - low)


def calculate_community_scores(listofcommunities, inputG):
    """Score every community and return the scores sorted best-first.

    For each community (processed in a shuffled order) the induced subgraph
    is built with ``create_new_graph`` and measured with
    ``calculate_graph_stats``. Both entropies are min-max normalised across
    communities, ``avg_score`` is the mean of the three ratios and the two
    normalised entropies, and ``normalized_avg_score`` is the min-max
    normalised ``avg_score``.

    All three normalisations now share the same guard: when every community
    has the same value the normalised column is 0. Previously the article
    entropy was divided by zero in that case and produced NaN scores.

    Args:
        listofcommunities: list of node collections; it is copied, so the
            caller's list is not shuffled.
        inputG: graph the communities are taken from.

    Returns:
        pandas.DataFrame with one row per community, sorted by ``avg_score``
        in descending order (original row labels are kept).
    """
    listofcommunities = listofcommunities.copy()
    np.random.shuffle(listofcommunities)

    # Collect plain records and build the frame once instead of
    # concatenating a one-row frame per community.
    records = []
    for one_community in listofcommunities:
        graph_one_community = create_new_graph(one_community, inputG)
        (
            avg_precent,
            entity_percent,
            behave_percent,
            tactic_percent,
            article_entropy,
            tactic_entropy,
        ) = calculate_graph_stats(graph_one_community)
        records.append(
            {
                "community_nodes": one_community,
                "entity_percent": entity_percent,
                "behave_percent": behave_percent,
                "tactic_percent": tactic_percent,
                "avg_precent": avg_precent,
                "article_entropy": article_entropy,
                "tactic_entropy": tactic_entropy,
            }
        )

    df_community_and_scores = pd.DataFrame(
        records,
        columns=[
            "community_nodes",
            "entity_percent",
            "behave_percent",
            "tactic_percent",
            "avg_precent",
            "article_entropy",
            "tactic_entropy",
        ],
    )

    df_community_and_scores["normalized_article_entropy"] = _min_max_normalize(
        df_community_and_scores["article_entropy"]
    )
    df_community_and_scores["normalized_tactic_entropy"] = _min_max_normalize(
        df_community_and_scores["tactic_entropy"]
    )

    df_community_and_scores["avg_score"] = (
        df_community_and_scores["entity_percent"]
        + df_community_and_scores["behave_percent"]
        + df_community_and_scores["tactic_percent"]
        + df_community_and_scores["normalized_article_entropy"]
        + df_community_and_scores["normalized_tactic_entropy"]
    ) / 5

    df_community_and_scores = df_community_and_scores.sort_values(
        by=["avg_score"], ascending=False
    )

    df_community_and_scores["normalized_avg_score"] = _min_max_normalize(
        df_community_and_scores["avg_score"]
    )

    return df_community_and_scores

def create_new_graph(node_list, graph):
    """Return an independent copy of the subgraph induced by ``node_list``.

    Nodes left without any edge inside the induced subgraph are removed.
    """
    induced = graph.subgraph(node_list).copy()
    induced.remove_nodes_from([node for node in nx.isolates(induced)])
    return induced

def OLD_create_new_graph(node_list, graph):
    """Older variant of ``create_new_graph``.

    Copies the subgraph induced by ``node_list``, returns ``None`` when that
    subgraph has no nodes, removes any edge with an endpoint outside
    ``node_list`` and finally drops nodes left without edges.
    """
    candidate = graph.subgraph(node_list).copy()
    if len(candidate.nodes) == 0:
        return None
    stray_edges = [
        edge
        for edge in candidate.edges
        if edge[0] not in node_list or edge[1] not in node_list
    ]
    candidate.remove_edges_from(stray_edges)
    candidate.remove_nodes_from(list(nx.isolates(candidate)))
    return candidate

def OLD_calculate_graph_stats(graph):
    """Older variant of ``calculate_graph_stats``.

    Differs from the current version in how entities are counted: a node
    counts when its ``entity`` attribute is a non-blank string.

    ``entity_percent`` is now 0 for a graph without nodes; it used to raise
    ``ZeroDivisionError`` even though the following lines already guard
    against an empty graph. Unused locals were removed.

    Returns a 6-tuple ``(avg_precent, entity_percent, behave_percent,
    tactic_percent, article_entropy, tactic_entropy)``.
    """
    entity_count = 0
    behave_count = 0
    tactic_count = 0
    all_article_id = []
    all_tactic = []
    for _, attrs in graph.nodes(data=True):
        if attrs.get("entity") and attrs.get("entity").strip() != "":
            entity_count += 1
    for _, _, attrs in graph.edges(data=True):
        all_tactic.append(attrs.get("tactic_conf"))
        all_article_id.append(attrs.get("article_id"))
        if attrs.get("behave_conf") and attrs.get("behave_conf") > 0.5:
            behave_count += 1
        if attrs.get("tactic_conf") and attrs.get("tactic_conf") != [0] * 10:
            tactic_count += 1

    n_nodes = len(graph.nodes())
    n_edges = len(graph.edges())
    entity_percent = entity_count / n_nodes if n_nodes > 0 else 0
    behave_percent = behave_count / n_edges if n_edges > 0 else 0
    tactic_percent = tactic_count / n_edges if n_edges > 0 else 0
    avg_precent = (
        (entity_percent + behave_percent + tactic_percent) / 3 if n_nodes > 0 else 0
    )
    article_entropy = entropy(all_article_id)
    tactic_entropy = entropy_one_hot(all_tactic)
    return (
        avg_precent,
        entity_percent,
        behave_percent,
        tactic_percent,
        article_entropy,
        tactic_entropy,
    )

def get_community_size(folder_name):
    """Parse the integer that follows the last underscore of ``folder_name``."""
    _, _, tail = folder_name.rpartition("_")
    return int(tail)

def own_score_erdos_renyi(in_graph, in_comms_list):
    """Erdos-Renyi modularity of a set of communities.

    For each community the number of internal edges is compared with the
    number expected if the graph's edges were spread uniformly over all node
    pairs; the summed difference is divided by the total edge count.

    ``in_comms_list`` is either a plain list of node collections or an
    object exposing them through a ``communities`` attribute.
    """
    if type(in_comms_list) is not list:
        in_comms_list = in_comms_list.communities
    total_edges = in_graph.number_of_edges()
    total_nodes = in_graph.number_of_nodes()
    ordered_pairs = total_nodes * (total_nodes - 1)

    score = 0
    for members in in_comms_list:
        sub = nx.subgraph(in_graph, members)
        inner_edges = sub.number_of_edges()
        inner_nodes = sub.number_of_nodes()
        expected = (total_edges * inner_nodes * (inner_nodes - 1)) / ordered_pairs
        score += inner_edges - expected

    return (1 / total_edges) * score

def own_score_z(graph,comms_list):
    """Z-modularity of a set of communities.

    ``mmc`` accumulates each community's share of the graph's edges and
    ``dc2m`` the squared share of total degree; the score is
    ``(mmc - dc2m) / sqrt(dc2m * (1 - dc2m))``.

    Returns 0 when the score is undefined: an edgeless graph, or a
    non-positive value under the square root. The old
    ``except ZeroDivisionError`` never triggered for the latter because
    dividing by a NumPy zero yields inf/nan with a warning rather than an
    exception, and the per-community division by the edge count sat outside
    the ``try``.
    """
    m = graph.number_of_edges()
    if m == 0:
        return 0
    mmc = 0
    dc2m = 0
    for community in comms_list:
        c = nx.subgraph(graph, community)
        mc = c.number_of_edges()
        # Total degree of the community's nodes, measured in the full graph.
        dc = sum(graph.degree(node) for node in c)
        mmc += mc / m
        dc2m += (dc / (2 * m)) ** 2
    spread = dc2m * (1 - dc2m)
    if spread <= 0:
        return 0
    return (mmc - dc2m) / np.sqrt(spread)

def own_modularity_overlap(graph: nx.Graph, comms_list,weight=None):
    """Overlapping-modularity score of a (possibly overlapping) cover.

    For every community with more than one node, each member contributes
    ``(inward - outward) / (degree * number_of_communities_it_belongs_to)``
    computed over the edges returned by ``graph.edges(node)``. The mean
    contribution is multiplied by the community's internal edge density and
    the products are averaged over all communities.

    Args:
        graph: graph the communities live in.
        comms_list: sequence of node collections.
        weight: edge attribute used as weight; edges without it count as 1.

    Returns:
        The average score, or 0 for an empty ``comms_list`` (previously a
        ``ZeroDivisionError``).
    """
    from collections import defaultdict

    if len(comms_list) == 0:
        return 0

    affiliation_dict = defaultdict(list)
    for cid, coms in enumerate(comms_list):
        for n in coms:
            affiliation_dict[n].append(cid)

    mOvTotal = 0
    for nodes in comms_list:
        nCommNodes = len(nodes)
        # the contribution of communities with 1 node is 0
        if nCommNodes <= 1:
            continue
        # Set lookup keeps the membership test below O(1) per edge.
        members = set(nodes)
        nInwardEdges = 0
        commStrength = 0
        for node in nodes:
            degree, inwardEdges, outwardEdges = 0, 0, 0
            for u, v, data in graph.edges(node, data=True):
                w = data.get(weight, 1)
                degree += w
                if v in members:
                    inwardEdges += w
                    nInwardEdges += 1
                else:
                    outwardEdges += w
            affiliationCount = len(affiliation_dict[node])
            denom = degree * affiliationCount
            if denom > 0:
                commStrength += (inwardEdges - outwardEdges) / denom
        # Number of ordered node pairs inside the community.
        binomC = nCommNodes * (nCommNodes - 1)
        v1 = commStrength / nCommNodes
        v2 = nInwardEdges / binomC
        mOvTotal += v1 * v2
    return mOvTotal / len(comms_list)

#print(own_modularity_overlap(G_Value_All_tactic,communities['ownAlgorithm_threshold_0.2.pkl'])


def own_modularity_density(
    graph: nx.Graph, comms_list: object, lmbd: float = 0.5, **kwargs: dict
):
    """Modularity-density score of a set of communities.

    Each community adds ``(2 * lmbd * internal - 2 * (1 - lmbd) * external)
    / size``, where ``internal`` sums the members' degrees inside the
    community and ``external`` sums their remaining degree in the full
    graph. A community whose subgraph has no nodes is skipped.
    """
    total = 0

    for members in comms_list:
        sub = nx.subgraph(graph, members)
        size = sub.number_of_nodes()
        internal = [sub.degree(vertex) for vertex in sub]
        external = [graph.degree(vertex) - sub.degree(vertex) for vertex in sub]

        try:
            total += (1 / size) * (
                (2 * lmbd * np.sum(internal)) - (2 * (1 - lmbd) * np.sum(external))
            )
        except ZeroDivisionError:
            # Empty community: nothing to add.
            pass

    return total

def own_newman_girvan_modularity(
    graph: nx.Graph, comms_list: object, **kwargs: dict
) -> object:
    """Newman-Girvan modularity of a partition, using the ``weight`` edge attribute.

    Every node is mapped to the index of the last community that lists it.
    Nodes that belong to no community are ignored, and so are edges whose
    other endpoint belongs to no community.

    The previous version wrapped the per-node work in a bare ``except``. A
    neighbour outside every community therefore aborted the rest of that
    node's neighbour loop, so the result depended on neighbour iteration
    order, and unrelated errors were hidden. Membership is now tested
    explicitly.

    Raises:
        ValueError: if the graph has no edges (total weight 0).
    """
    coms = {}
    for cid, com in enumerate(comms_list):
        for node in com:
            coms[node] = cid

    inc = {}
    deg = {}
    links = graph.size(weight="weight")
    if links == 0:
        raise ValueError("A graph without link has an undefined modularity")

    for node in graph:
        if node not in coms:
            continue
        com = coms[node]
        deg[com] = deg.get(com, 0.0) + graph.degree(node, weight="weight")
        for neighbor, dt in graph[node].items():
            if neighbor not in coms or coms[neighbor] != com:
                continue
            weight = dt.get("weight", 1)
            if neighbor == node:
                # A self-loop is seen once, so it counts in full.
                inc[com] = inc.get(com, 0.0) + float(weight)
            else:
                # An internal edge is seen from both endpoints.
                inc[com] = inc.get(com, 0.0) + float(weight) / 2.0

    res = 0.0
    for com in set(coms.values()):
        res += (inc.get(com, 0.0) / links) - (deg.get(com, 0.0) / (2.0 * links)) ** 2

    return res

In [None]:
import community
import random
from collections import deque
import networkx as nx
import os
def clear_file(file_path):
    """Leave an empty file at ``file_path``.

    An existing file is deleted first and a new empty one is created. The
    parent directory must already exist.
    """
    already_there = os.path.exists(file_path)
    if already_there:
        os.remove(file_path)
    open(file_path, 'w').close()

def NodeCTI_GetEdgesIntoDict(inputG):
    """Turn the edges of ``inputG`` into a symmetric adjacency dictionary.

    Returns a dict mapping each node to the set of its neighbours. Both
    directions are recorded for every edge and self-loops are skipped.
    """
    adjacency = {}
    for pair in inputG.edges():
        if len(pair) == 0:
            continue
        left, right = pair[0], pair[1]
        if left == right:
            continue
        adjacency.setdefault(left, set()).add(right)
        adjacency.setdefault(right, set()).add(left)
    return adjacency

def NodeCTI_FirstPartition(edges, first_part_file):
    """Write the local communities around every node to ``first_part_file``.

    For each node ``n`` the subgraph spanned by its neighbours (without
    ``n`` itself) is partitioned with ``community.best_partition``. Every
    resulting group is written as one line: the group's members separated by
    spaces, followed by ``n``. Nodes whose neighbours share no edge produce
    no line.

    Compared with the previous version the output file is closed even when
    an error occurs, and the unreachable ``elif`` branch, the always-true
    ``node_count > 0`` test and the unused ``adj`` set are gone. The lines
    written are unchanged.

    Args:
        edges: symmetric adjacency dict ``{node: set_of_neighbours}``.
        first_part_file: path of the text file to (over)write.
    """
    with open(first_part_file, "w") as out:
        for n in edges:
            # Relabel the neighbours of n with consecutive integers.
            index = {}
            reverse_index = {}
            for count, neighbor in enumerate(edges[n]):
                index[count] = neighbor
                reverse_index[neighbor] = count

            # Edges between neighbours of n, each recorded once.
            to_add_edges = []
            for m in reverse_index:
                for k in edges[m]:
                    if k in reverse_index and reverse_index[k] < reverse_index[m]:
                        to_add_edges.append((reverse_index[m], reverse_index[k]))
            if len(to_add_edges) == 0:
                continue

            G = nx.Graph()
            G.add_nodes_from(index)
            G.add_edges_from(to_add_edges)
            dict_H = community.best_partition(G)

            # Group the relabelled nodes by the community id they received.
            H = {}
            for node, cid in dict_H.items():
                H.setdefault(cid, set()).add(node)

            for comm in H.values():
                for c in comm:
                    out.write(str(index[int(c)]) + " ")
                out.write(str(n))
                out.write("\n")

def NodeCTI_Jaccard(set1, set2):
    """Jaccard similarity of two collections: shared items over all items.

    Both arguments are converted to sets first. Two empty collections raise
    ``ZeroDivisionError``.
    """
    left, right = set(set1), set(set2)
    shared = len(left & right)
    combined = len(left | right)
    return float(shared) / float(combined)

def NodeCTI_GetMembership(first_part_file, membership_file, min_size=None):
    """Write, for every node, the line numbers of the communities it is in.

    ``first_part_file`` holds one community per line as whitespace-separated
    node names. Only lines with at least ``min_size`` names are considered.
    Each output line is a node name followed by the zero-based numbers of
    the qualifying lines that mention it.

    The input is now read in text mode. It used to be opened in binary mode,
    so node names were written to the membership file as ``b'...'`` reprs.
    Both files are closed even when an error occurs.

    Args:
        first_part_file: path of the community file to read.
        membership_file: path of the membership file to (over)write.
        min_size: minimum number of names on a line; defaults to the
            module-level ``min_comm_size``.
    """
    if min_size is None:
        min_size = min_comm_size

    node_membership = {}
    with open(first_part_file, "r") as part_in:
        for count, read_line in enumerate(part_in):
            t = read_line.rstrip().split()
            if len(t) >= min_size:
                for mem in t:
                    node_membership.setdefault(mem, set()).add(count)

    with open(membership_file, "w") as out:
        for n, in_comms in node_membership.items():
            out.write(str(n) + " ")
            for c in in_comms:
                out.write(str(c) + " ")
            out.write("\n")

def get_subset_graph(in_undirected_G, percent_to_keep):
    """Build a random sub-sample of a graph.

    Nodes and edges are sampled independently, each keeping
    ``int(count * percent_to_keep)`` items, and placed in a new
    ``nx.Graph``. Node and edge attributes are not carried over, and a
    sampled edge adds its endpoints even if they were not among the sampled
    nodes.
    """
    all_nodes = list(in_undirected_G.nodes())
    all_edges = list(in_undirected_G.edges())
    kept_nodes = random.sample(all_nodes, int(len(all_nodes) * percent_to_keep))
    kept_edges = random.sample(all_edges, int(len(all_edges) * percent_to_keep))

    subset = nx.Graph()
    subset.add_nodes_from(kept_nodes)
    subset.add_edges_from(kept_edges)
    return subset

###2.1
# Step 2.1: configure the scratch files and run the first (per-node) partition.
import random
import networkx as nx
# Scratch files shared by the partition steps. open(..., 'w') does not create
# directories, so ./tmp must already exist.
first_part_file = "./tmp/part1_.txt"
membership_file = "./tmp/membership_.txt"
sim_file = "./tmp/simfile_.txt"
# Start from empty files.
clear_file(first_part_file)
clear_file(membership_file)
clear_file(sim_file)
# Alias (not a copy) of the full undirected graph.
G_Value_undirected_subset=G_Value_undirected
edges = NodeCTI_GetEdgesIntoDict(G_Value_undirected_subset)
# min_comm_size is read as a global by NodeCTI_GetMembership.
min_comm_size=3
# NOTE(review): sim_threshold is not referenced anywhere else in the visible code.
sim_threshold=0.25
global_overlap_threshold=0.25
NodeCTI_FirstPartition(edges,first_part_file)

def NodeCTI_Filter_only_MalwareCVEActor(file_path, in_entity_set):
    """Keep only the lines of ``file_path`` whose last token is in ``in_entity_set``.

    Lines are stripped and split on single spaces. The untouched original
    content is saved to ``file_path + '.bak'`` (replacing an older backup),
    the file itself is rewritten with the kept lines, and a summary of the
    line counts is printed.
    """
    with open(file_path, 'r') as source:
        original_rows = source.readlines()

    kept_rows = []
    for row in original_rows:
        tokens = row.strip().split(' ')
        if tokens[-1] in in_entity_set:
            kept_rows.append(tokens)

    # Replace any previous backup with the current original content.
    backup_path = file_path + '.bak'
    if os.path.exists(backup_path):
        os.remove(backup_path)
    with open(backup_path, 'w') as backup:
        backup.writelines(original_rows)

    # Overwrite the original file with the filtered lines.
    with open(file_path, 'w') as target:
        target.writelines(' '.join(tokens) + '\n' for tokens in kept_rows)

    print('The length of original file:', len(original_rows), 'The length of filtered file:', len(kept_rows), 'The difference:', len(original_rows) - len(kept_rows))

# Collect every node whose 'entity' attribute is not the empty string.
# A node without an 'entity' attribute raises KeyError here.
entity_conf_set = set()

for node in G_Value.nodes():
    #if G_Value.nodes[node]['entity']=='malware' or G_Value.nodes[node]['entity']=='cve' or G_Value.nodes[node]['entity']=='CVE':
    if G_Value.nodes[node]['entity']!='':
        entity_conf_set.add(node)

entity_conf_list = list(entity_conf_set)

# Keep only the first-partition lines whose last token (the node the line was
# built around, written last by NodeCTI_FirstPartition) is such an entity node.
# NOTE(review): tokens read from the file are strings, so this only matches
# when graph nodes are strings too - confirm.
NodeCTI_Filter_only_MalwareCVEActor(first_part_file, entity_conf_set)
NodeCTI_GetMembership(first_part_file, membership_file)


In [None]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens ``string`` yields under the named tiktoken encoding."""
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

# Smoke call; its result is discarded because it is not the last expression of the cell.
num_tokens_from_string("tiktoken is great!", "cl100k_base")

# Count the tokens of every line of the first-partition file.
with open('./tmp/part1_.txt', 'r') as f:
    lines = f.readlines()
num_tokens = 0
for line in lines:
    num_tokens += num_tokens_from_string(line, "cl100k_base")
    
print(num_tokens)
# NOTE(review): 0.0004 looks like a price per 1000 tokens for the embedding call - confirm.
print(num_tokens/1000*0.0004)

import pandas as pd
df = pd.DataFrame(lines, columns=['data'])
from langchain.embeddings import OpenAIEmbeddings
import os

# NOTE(review): replace the placeholder with a real key before running.
# Prefer setting it outside the notebook so the key is never committed.
os.environ['OPENAI_API_KEY'] = "<APIKeyHere>"

# Embed the lines of the first-partition file. The path now matches the one
# every other cell uses (./tmp/part1_.txt); this cell used to read
# ./part1_.txt, a different file from the one written by the partition step.
with open('./tmp/part1_.txt', 'r') as f:
    lines = f.readlines()

embeddings = OpenAIEmbeddings()
lines_result=embeddings.embed_documents(lines)
#save the result as dict then save it as pickle file
# Keyed by the raw line text, so identical lines share one entry.
embeddings_dict = {}
for i in range(len(lines_result)):
    embeddings_dict[lines[i]] = lines_result[i]
import pickle
with open('embedding_BY_OPENAI.pickle', 'wb') as handle:
    pickle.dump(embeddings_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Reload lines and embeddings from disk so the steps below can be re-run
# without calling the embedding service again.
import pickle
with open('./tmp/part1_.txt', 'r') as f:
    lines = f.readlines()
lines_result=[]
# NOTE(review): pickle.load can execute arbitrary code; only load a trusted file.
with open('embedding_BY_OPENAI.pickle', 'rb') as handle:
    embeddings_dict = pickle.load(handle)
for i in lines:
    lines_result.append(embeddings_dict[i])

# Keep only lines containing at least two spaces, together with their embeddings.
lines_result_possible=[]
lines_result_possible_truevalue=[]
for i in range(len(lines_result)):
    if lines[i].count(' ')>=2:
        lines_result_possible_truevalue.append(lines[i])
        lines_result_possible.append(lines_result[i])
        
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform
from numpy.linalg import norm
np.random.seed(42)

def cosine_similarity(a, b):
    """Cosine of the angle between vectors ``a`` and ``b``.

    Computed as the dot product divided by the product of the two norms.
    """
    numerator = np.dot(a, b)
    denominator = norm(a) * norm(b)
    return numerator / denominator

# pdist calls cosine_similarity on every pair of rows, so despite its name
# `dist` holds pairwise cosine similarities, not distances.
dist = pdist(lines_result_possible, metric=cosine_similarity)

# Expand the condensed vector into a square symmetric matrix.
dist_mat = squareform(dist)

# Mean similarity per column.
# NOTE(review): squareform leaves the diagonal at 0, so each mean includes one
# zero entry - confirm that is intended.
sims = np.mean(dist_mat, axis=0)
plt.hist(sims, bins=50, color='blue', edgecolor='black')
plt.xlabel('Cosine similarity')
plt.ylabel('Frequency')
plt.title('Similarity distribution')
plt.show()

In [None]:
def getmap(setvalue):
    """Run the second partition step for one threshold setting and return the communities.

    Steps, in order:
      1. Derive a similarity cutoff from the upper triangle (diagonal
         included) of the global ``dist_mat`` via ``threshold_percentile``.
      2. Write every pair of first-partition lines whose similarity exceeds
         the cutoff, that share at least one token and that each have fewer
         than 10 tokens, to ``tmp/simfile_.txt``.
      3. Cluster those pairs with ``community.best_partition`` and merge the
         node names of the lines in each cluster.
      4. Drop duplicate communities, pickle the result to
         ``ownAlgorithm_threshold_<setvalue>.pkl`` and return it.

    Reads the globals ``dist_mat``, ``lines`` and
    ``lines_result_possible_truevalue``.

    Args:
        setvalue: percentile passed to ``threshold_percentile``; also used in
            the name of the output pickle.

    Returns:
        list of communities, each a list of node-name strings.
    """
    threshold_V=setvalue
    # The bare string below is disabled code kept as a string literal; it has no effect.
    '''
    import pandas as pd
    df_sims=pd.DataFrame(sims, columns=['sims'])
    # assuming df_sims is already defined

    df_threshold=pd.DataFrame(columns=['top N%','minimum sims threshold'])
    df_threshold['top N%']=[10,20,30,40,50]
    df_threshold['minimum sims threshold']=[threshold_percentile(sims, 10),threshold_percentile(sims, 20),threshold_percentile(sims, 30),threshold_percentile(sims, 40),threshold_percentile(sims, 50)]
    '''
    def threshold_percentile(sims, percentile):
        """Return the value at position ``int(len(sims) * percentile / 100)`` of ``sims`` sorted in descending order."""
        threshold = sorted(sims, reverse=True)[int(len(sims) * percentile / 100)]
        return threshold
    #if 'tmp/simfile_.txt' exists, delete it
    import os
    if os.path.exists('tmp/simfile_.txt'):
        os.remove('tmp/simfile_.txt')

    # Map each kept line to its position in the full `lines` list.
    # list.index returns the first match, so duplicate lines share one position.
    value_p1file_index={}
    for line_this in lines_result_possible_truevalue:
        index_loc=lines.index(line_this)
        value_p1file_index[line_this]=index_loc
        
    result_newp1=[]
    #add tqdm for i in range(len(dist_mat)):
    # Flatten the upper triangle (including the diagonal) of the similarity matrix.
    total_sim=[]
    for i in range(len(dist_mat)):
        for j in range(i,len(dist_mat)):
            total_sim.append(dist_mat[i][j])

    threshold_this=threshold_percentile(total_sim, threshold_V)
    #print threshold_this
    print('threshold_this:',threshold_this)
    import tqdm
    for i in tqdm.tqdm(range(len(dist_mat))):
        for j in range(i,len(dist_mat)):
            if i!=j:
                #if dist_mat[i][j]>0.85:
                if dist_mat[i][j]>threshold_this:
                    aindex=value_p1file_index[lines_result_possible_truevalue[i]]
                    bindex=value_p1file_index[lines_result_possible_truevalue[j]]
                    #split a by space, b by space
                    aindex_split=lines_result_possible_truevalue[i].split(' ')
                    bindex_split=lines_result_possible_truevalue[j].split(' ')
                    #remove \n in last element of a and b
                    aindex_split[-1]=aindex_split[-1].replace('\n','')
                    bindex_split[-1]=bindex_split[-1].replace('\n','')
                    # Despite the name, union_len is the size of the intersection.
                    union_len=len(set(aindex_split).intersection(set(bindex_split)))
                    a_len=len(aindex_split)
                    b_len=len(bindex_split)
                    if a_len <10 and b_len <10 and union_len>=1:
                        # Row layout: line_a line_b similarity shared_tokens len_a len_b similarity
                        result_newp1.append(str(aindex)+' '+str(bindex)+' '+str(dist_mat[i][j])+' '+str(union_len)+' '+str(a_len)+' '+str(b_len)+' '+str(dist_mat[i][j]))
    #save the result as txt file as tmp/simfile_.txt
    with open('tmp/simfile_.txt', 'w') as f:
        for item in result_newp1:
            f.write("%s\n" % item)
            
    def NodeCTI_SecondPartition(overlap_threshold, sim_file, first_part_file):
            """Thin wrapper that forwards to ``NodeCTI_ModClusteringSingleBig``."""
            return_vals =NodeCTI_ModClusteringSingleBig(
                sim_file, first_part_file, overlap_threshold
            )
            return return_vals

    def NodeCTI_ModClusteringSingleBig(sim_file, first_part_file, overlap_threshold=0):
        """Cluster first-partition lines by similarity and merge their node names.

        Builds a weighted graph from the rows of ``sim_file`` (first two
        columns are line numbers, third is the weight), partitions it with
        ``community.best_partition`` and, for each cluster, collects the
        tokens of the referenced lines of ``first_part_file``. Returns a dict
        mapping a running cluster number to a set of tokens (bytes, because
        the file is read in binary mode).
        """
        # The argument is overwritten with the variable of the same name from
        # the enclosing getmap scope, and is not used afterwards.
        overlap_threshold=global_overlap_threshold
        IN = open(sim_file, "r")
        read_line = IN.readline()
        num_lines = 0
        comm_edges = {}
        to_add_edges = []
        while read_line:
            t = read_line.rstrip().split()
            num_lines += 1
            if len(t) > 0:
                node1 = int(t[0])
                node2 = int(t[1])
                sim = t[2]
                overlap = t[3]
                if node1 not in comm_edges:
                    comm_edges[node1] = set([])
                if node2 not in comm_edges:
                    comm_edges[node2] = set([])
                if node2 not in comm_edges[node1]:
                    comm_edges[node1].add(node2)
                    comm_edges[node2].add(node1)
                    weight = sim
                    to_add_edges.append((node1, node2, {"weight": float(weight)}))

            read_line = IN.readline()
        IN.close()
        G = nx.Graph()
        # NOTE(review): this adds nodes 0..len(comm_edges)-1 whether or not
        # those line numbers appear in sim_file; such extra nodes become
        # single-line clusters below - confirm that is intended.
        G.add_nodes_from(range(len(comm_edges)))
        G.add_edges_from(to_add_edges)
        dict_H = community.best_partition(G)
        H1 = {}
        for e in dict_H:
            if dict_H[e] not in H1:
                H1[dict_H[e]] = set([])
            H1[dict_H[e]].add(e)
        H = []
        for e in H1:
            H.append(H1[e])

        # Record the byte offset of every line so lines can be fetched with seek().
        IN = open(first_part_file, "rb")
        line_offset = {}
        offset = 0
        count = 0
        for line in IN:
            line_offset[count] = offset
            count = count + 1
            offset += len(line)
        IN.close()

        IN = open(first_part_file, "rb")
        all_comms = {}
        i = 0
        for big_comm in H:
            comm_members = {}
            for comm in big_comm:
                IN.seek(line_offset[int(comm)])
                read_line = IN.readline()
                t = read_line.rstrip().split()
                if len(t) > 0:
                    for t1 in t:
                        if t1 not in comm_members:
                            comm_members[t1] = 0
                        comm_members[t1] += 1
            all_comms[i] = set([])
            for t1 in comm_members:
                # Counts start at 1, so this test is always true.
                if comm_members[t1] >= 0:
                    all_comms[i].add(t1)

            i += 1

        # NOTE(review): this second file handle is never closed.
        return all_comms

    def NodeCTI_GetModComms(G):
        """Partition ``G`` with ``community.best_partition`` and return a list of node sets.

        Not called anywhere inside ``getmap``.
        """
        dict_H = community.best_partition(G)
        H1 = {}
        for e in dict_H:
            if dict_H[e] not in H1:
                H1[dict_H[e]] = set([])
            H1[dict_H[e]].add(e)
        H = []
        for e in H1:
            H.append(H1[e])
        return H

    def NodeCTI_CleanComms(to_clean):
        """Drop empty and duplicate communities and decode node names to ``str``.

        ``to_clean`` maps cluster numbers to sets of byte strings. A community
        is dropped when an earlier one has exactly the same members. Returns
        a list of lists of UTF-8 decoded names.
        """
        comms = {}
        count = 0
        idx = {}
        for t in to_clean.values():
            if len(t) > 0:
                comms[count] = set(t)
                for i in t:
                    if i not in idx:
                        idx[i] = set([])

                    idx[i].add(count)
                count += 1
            # Same condition as the branch above, so this branch is unreachable.
            elif len(t) > 0:
                comms[count] = set(t)
                count += 1
        coms = []
        for i in range(count):
            C = comms[i]
            if len(C) > 0:
                # Only earlier communities sharing at least one member can be duplicates.
                poss = set([])
                found = 0
                for n in C:
                    poss = poss.union(idx[n])
                for j in poss:
                    if j < i:
                        if (
                            len(comms[j]) == len(comms[i])
                            and len(comms[j].difference(comms[i])) == 0
                        ):
                            found = 1
                if found != 1:
                    coms.append([t.decode("utf-8") for t in C])
            else:
                coms.append([t.decode("utf-8") for t in C])
        return coms

    import networkx as nx
    import community
    sim_file='tmp/simfile_.txt'
    first_part_file='tmp/part1_.txt'
    global_overlap_threshold=0
    return_vals = NodeCTI_SecondPartition(0, sim_file, first_part_file)
    coms = NodeCTI_CleanComms(return_vals)
    print('Find communities number:',len(coms))

    #save as pkl file name "ownAlgorithm_threshold_"+str(threshold_V)+".pkl"
    import pickle
    fianlname='ownAlgorithm_threshold_'+str(threshold_V)+'.pkl'
    with open(fianlname, 'wb') as f:
        pickle.dump(coms, f)
    print('save as pkl file name:',fianlname)

    # NOTE(review): G_Value and the two undirected variables below are assigned
    # inside this function, so they are locals. The reload and the attribute
    # copies that follow do not change the module-level G_Value and are
    # discarded on return - confirm whether a `global` declaration was intended.
    import networkx as nx
    import pickle
    import copy
    from cdlib import algorithms
    import os
    import time
    from cdlib import algorithms
    with open('./SECCG generated Knowledge Graph.pkl', 'rb') as f:
        G_Value = pickle.load(f)
    undirected_G_Value=copy.deepcopy(G_Value)
    undirected_G_Value=G_Value.to_undirected()
    G_Value_undirected=undirected_G_Value.to_undirected()

    import networkx as nx


    # Copy the edge attribute "tactic" to "tactic_conf" on the local graph.
    nx.set_edge_attributes(G_Value, name="tactic_conf", values=nx.get_edge_attributes(G_Value, "tactic"))

    import networkx as nx
    # Copy the edge attribute "rel" to "relation" on the local graph.
    nx.set_edge_attributes(G_Value, name="relation", values=nx.get_edge_attributes(G_Value, "rel"))
    
    return coms

In [None]:
#coms_sizein5to10=[i for i in coms if len(i)>=5 and len(i)<=10]
# Sweep the similarity threshold passed to getmap() and summarise the detected
# communities (size, edge count, distinct articles, CTI-linked edges) per threshold.
import math
import random  # not used in this cell; kept because other cells may rely on `random` being imported here
import networkx as nx
import tqdm

big_label_list=['Initial Access', 'Execution', 'Defense Evasion', 'Command and Control', 'Privilege Escalation', 'Persistence','Lateral Movement','DataLeak','Exfiltration','Impact']

THRESHOLD_VALUES = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
SWEEP_COLUMNS = ['value', 'Average Size', 'Average Edge', 'Average Articles', 'Average Linked CTI Node']


def count_edges_with_nonempty_entity(graph):
    """Count the edges whose two endpoints both carry a non-empty 'entity' attribute."""
    return sum(
        1
        for node1, node2 in graph.edges()
        if graph.nodes[node1]['entity'] != '' and graph.nodes[node2]['entity'] != ''
    )


def _sweep_mean(values):
    """Return the arithmetic mean of ``values``, or NaN when no community qualified."""
    return sum(values) / len(values) if values else math.nan


# Collect one record per threshold and build the DataFrame once at the end
# instead of growing it with pd.concat inside the loop.
sweep_rows = []
for threshold in THRESHOLD_VALUES:
    coms_sizein5to10 = getmap(threshold)
    average_size = []
    average_edge = []
    average_linked_cti_edge = []
    average_article = []

    for community in tqdm.tqdm(coms_sizein5to10):
        graph_com = create_new_graph(community, G_Value)
        # Only communities with more than two edges contribute to the averages.
        if graph_com.number_of_edges() > 2 and graph_com.number_of_nodes() >= 1:
            edge_attributes = nx.get_edge_attributes(graph_com, 'article_id')
            average_edge.append(graph_com.number_of_edges())
            average_article.append(len(set(edge_attributes.values())))
            average_size.append(graph_com.number_of_nodes())
            average_linked_cti_edge.append(count_edges_with_nonempty_entity(graph_com))

    new_row = {
        'value': threshold,
        'Average Size': _sweep_mean(average_size),
        'Average Edge': _sweep_mean(average_edge),
        'Average Articles': _sweep_mean(average_article),
        'Average Linked CTI Node': _sweep_mean(average_linked_cti_edge),
    }
    print('average_size:', new_row['Average Size'])
    print('average_edge:', new_row['Average Edge'])
    print('average_article:', new_row['Average Articles'])
    print('average_linked_cti_edge:', new_row['Average Linked CTI Node'])
    sweep_rows.append(new_row)

df = pd.DataFrame(sweep_rows, columns=SWEEP_COLUMNS)
df


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

# Plot every metric column of the threshold-sweep table `df` against the threshold.
# A larger base font keeps the labels legible at this figure size.
mpl.rcParams['font.size'] = 20

fig, ax = plt.subplots(figsize=(14, 10))

# The first column ('value') holds the thresholds; every other column is one metric line.
for column in df.columns[1:]:
    ax.plot(df['value'], df[column], marker='o', label=column)

ax.set_title('Community Metrics with Different Thresholds')
ax.set_xlabel('Threshold Value')
ax.set_ylabel('Metric Value')  # was misspelled as 'Metric Valve'
ax.set_xticks(df['value'])
ax.grid(True, which='both', linestyle='--', linewidth=0.5)
ax.legend()

plt.show()

In [None]:
# Run the baseline community-detection algorithms on the full knowledge graph and
# store each result in dict_sol_com, next to our own method at threshold 0.5.
import copy  # unused below; kept so `copy` stays bound to the module (an earlier cell does `from copy import copy`)
import random  # imported here so the cell does not depend on an earlier cell's import

# NOTE(security): pickle.load can execute arbitrary code; only load graph files from a trusted source.
with open('./SECCG generated Knowledge Graph.pkl', 'rb') as f:
    G_full = pickle.load(f)

# to_undirected() already returns a deep copy of the node/edge/graph data, so no
# separate copy.deepcopy is needed and G_full itself is left untouched.
undirected_G_Value_full = G_full.to_undirected()

dict_sol_com = {}
dict_sol_com['CTIKG0.5'] = getmap(0.5)

# Build a simple undirected graph for umstmo by keeping one randomly chosen edge
# per connected node pair. A dedicated seeded generator keeps the choice
# reproducible across kernel restarts without touching the global random state.
EDGE_SAMPLING_SEED = 0
edge_rng = random.Random(EDGE_SAMPLING_SEED)

undirected_G_Value_full_foUmstmo = nx.Graph()
for u, v in undirected_G_Value_full.edges():
    # Parallel edges repeat the same (u, v) pair; sample each pair only once.
    if undirected_G_Value_full_foUmstmo.has_edge(u, v):
        continue
    # get_edge_data maps each parallel-edge key to that edge's attribute dict.
    parallel_edges = list(undirected_G_Value_full.get_edge_data(u, v).values())
    undirected_G_Value_full_foUmstmo.add_edge(u, v, **edge_rng.choice(parallel_edges))

# Each algorithm is run exactly once (the original cell repeated the last three calls).
dict_sol_com['Core'] = algorithms.core_expansion(undirected_G_Value_full).communities
dict_sol_com['leiden'] = algorithms.leiden(undirected_G_Value_full).communities
print('leiden done')
dict_sol_com['Umstmo'] = algorithms.umstmo(undirected_G_Value_full_foUmstmo).communities
dict_sol_com['angel'] = algorithms.angel(undirected_G_Value_full, threshold=0.25).communities
dict_sol_com['coach'] = algorithms.coach(undirected_G_Value_full).communities


In [None]:
#coms_sizein5to10 = [i for i in coms if len(i) >= 5 and len(i) <= 10]
def get_table_on_names(coms_this, name):
    """Summarise the communities produced by one algorithm as a one-row DataFrame.

    Parameters
    ----------
    coms_this : sized iterable
        The communities to evaluate; each item is passed to ``create_new_graph``.
    name : str
        Label stored in the 'name' column. When it contains 'CTIKG' the
        communities are materialised on the global ``G_Value`` graph,
        otherwise on the global ``G_full`` graph.

    Returns
    -------
    pandas.DataFrame
        One row with the columns 'name', 'Average Size', 'Average Edge',
        'Average Articles', 'Average Linked CTI Node' and
        'Number of Communities'. Averages only cover communities with more
        than two edges; they are NaN when no community qualifies (the
        original code raised ZeroDivisionError in that case).
    """
    import math
    import networkx as nx
    import tqdm

    columns = ['name', 'Average Size', 'Average Edge', 'Average Articles', 'Average Linked CTI Node', 'Number of Communities']

    def count_edges_with_nonempty_entity(graph):
        """Count the edges whose two endpoints both have a non-empty 'entity' attribute."""
        return sum(
            1
            for node1, node2 in graph.edges()
            if graph.nodes[node1]['entity'] != '' and graph.nodes[node2]['entity'] != ''
        )

    def mean_or_nan(values):
        """Arithmetic mean of ``values``; NaN for an empty list."""
        return sum(values) / len(values) if values else math.nan

    # The source graph depends only on `name`, so choose (and report) it once
    # instead of once per community.
    if 'CTIKG' in name:
        print('Use CTIKG remove graph')
        source_graph = G_Value
    else:
        print('Use full graph')
        source_graph = G_full

    average_size = []
    average_edge = []
    average_linked_cti_edge = []
    average_article = []

    for community in tqdm.tqdm(coms_this):
        graph_com = create_new_graph(community, source_graph)

        # Only communities with more than two edges contribute to the averages.
        if graph_com.number_of_edges() > 2 and graph_com.number_of_nodes() >= 1:
            edge_attributes = nx.get_edge_attributes(graph_com, 'article_id')
            average_edge.append(graph_com.number_of_edges())
            average_article.append(len(set(edge_attributes.values())))
            average_size.append(graph_com.number_of_nodes())
            average_linked_cti_edge.append(count_edges_with_nonempty_entity(graph_com))

    new_row = {
        'name': name,
        'Average Size': mean_or_nan(average_size),
        'Average Edge': mean_or_nan(average_edge),
        'Average Articles': mean_or_nan(average_article),
        'Average Linked CTI Node': mean_or_nan(average_linked_cti_edge),
        'Number of Communities': len(coms_this)
    }

    print('average_size:', new_row['Average Size'])
    print('average_edge:', new_row['Average Edge'])
    print('average_article:', new_row['Average Articles'])
    print('average_linked_cti_edge:', new_row['Average Linked CTI Node'])

    return pd.DataFrame([new_row], columns=columns)

# Build one summary table per algorithm and stack them into df_overall.
# Distinct names (`algorithm_tables`, `algorithm_table`) avoid overwriting the
# threshold-sweep `df` that the plotting cell reads.
algorithm_tables = []
for key, value in dict_sol_com.items():
    print(key)
    try:
        algorithm_table = get_table_on_names(value, key)
    except Exception as exc:
        # Best effort: keep going with the other algorithms, but report the
        # failure instead of hiding it (and let KeyboardInterrupt through).
        print('Skipping', key, 'because of', repr(exc))
        continue
    algorithm_tables.append(algorithm_table)
    print(algorithm_table)

# pd.concat raises on an empty list, so fall back to an empty frame if every algorithm failed.
df_overall = pd.concat(algorithm_tables) if algorithm_tables else pd.DataFrame()
df_overall


In [None]:
# Display the combined per-algorithm comparison table as this cell's output.
df_overall