In [None]:
import networkx as nx
from collections import defaultdict
from csv import DictReader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import adjusted_mutual_info_score, homogeneity_score, rand_score, adjusted_rand_score, pair_confusion_matrix
import random
import time
from igraph import *
import numpy as np
from datetime import datetime
from pathlib import Path
from scipy.stats import entropy
import tqdm
import matplotlib.pyplot as plt




In [None]:
p = Path("./data/edgeData").iterdir()

############# OUR CODE BEGINS ##############

# Edge lists to process: regular files whose stem mentions one of the two samples.
# The parentheses are required: `and` binds tighter than `or`, so without them any
# directory entry (file or not) containing "edges_2014_2" would be selected.
files = [
    x for x in p
    if x.is_file() and ("edges_2012_2" in x.stem or "edges_2014_2" in x.stem)
]

# Columns of the edge files; h0 is the baseline, the others are combined with it one at a time.
heuristics = ["h0","h1", "h2", "h3", "h4", "h5", "h6"]

############# OUR CODE ENDS ##############

for file_name in files:

        print('file_name: ',file_name)

        df = pd.read_csv(file_name)

        ############# OUR CODE BEGINS ##############

        for h in heuristics:

            # h0 is evaluated on its own; every other heuristic is added on top of h0.
            is_baseline = h == "h0"

            # `name` tags the result files; "sum" becomes the edge weight of the graph.
            name = "h0" if is_baseline else f"h0_{h}"
            df["sum"] = df[["h0"]] if is_baseline else df[["h0", h]].sum(axis=1)
            
            
            ############# OUR CODE ENDS ##############

            # NOTE(review): nothing in this ground-truth preparation reads `h` or the "sum"
            # column, so it is recomputed unchanged for every heuristic; consider hoisting it.
            df_gt = pd.read_csv('data/ground_truth_id.csv.zip')

            # Keep only rows that have both an address and an entity.
            df_gt = df_gt.dropna(axis=0, how='any', subset=["address", "entity"])

            # keep=False drops *every* copy of a repeated address, so each remaining
            # address appears exactly once.
            df_gt = df_gt.drop_duplicates(subset="address", keep=False)

            # Discard entities that are left with fewer than 10 addresses.
            entity_counts = df_gt["entity"].value_counts()
            rare_entities = entity_counts[entity_counts < 10].index
            df_gt = df_gt.loc[~df_gt["entity"].isin(rare_entities), :]

            # Addresses of the current edge file that also have a ground-truth entity.
            gt_addr = set(df_gt["address"])
            sample_addr = set(df["node1"]).union(set(df["node2"]))
            sample_known_addr = sample_addr.intersection(gt_addr)

            df_gt_known = df_gt.loc[df_gt["address"].isin(sample_addr), ['address', 'entity']]

            known_entities = set(df_gt_known['entity'])

            # Give every entity an integer label (its position when enumerating
            # `known_entities`) and map each known address to that label in a single pass
            # over the frame instead of re-filtering the frame once per entity.
            entity_label = {e: idx for idx, e in enumerate(known_entities)}
            known_addr_entity_dict_list = [{} for _ in known_entities]
            known_addr_entity_dict = {}
            for a, e in zip(df_gt_known["address"], df_gt_known["entity"]):
                idx = entity_label[e]
                known_addr_entity_dict_list[idx][a] = idx
                known_addr_entity_dict[a] = idx

            # Per-entity number of known addresses, annotated with the graph dimensions.
            known_entity_counts = df_gt_known['entity'].value_counts().rename("count").to_frame()
            known_entity_counts["file"] = file_name.stem
            known_entity_counts["n_nodes_graph"] = len(sample_addr)
            known_entity_counts["n_edges_graph"] = len(df)

            # The file name carries only the edge-file stem and a minute-resolution timestamp,
            # so exports written within the same minute for the same edge file overwrite
            # each other.
            now = datetime.now()
            date_str = now.strftime("%Y%m%d_%H%M")

            file_name_output = f"known_entity_counts_{file_name.stem}_res_{date_str}.csv"
            output_path = Path(f"data/res/{file_name_output}")

            # Make sure the results directory exists before writing.
            output_path.parent.mkdir(parents=True, exist_ok=True)
            known_entity_counts.to_csv(output_path, index=True)

            def get_labels(cs_addr, known_addr_entity_dict):
                """Pair true entity labels with predicted cluster indices.

                Only addresses contained in the enclosing scope's `sample_known_addr`
                are considered.

                Args:
                    cs_addr: iterable of clusters, each an iterable of addresses.
                    known_addr_entity_dict: mapping from address to its true entity label.

                Returns:
                    (labels_true, labels_pred): two lists of equal length, aligned per
                    address, in order of each address's first appearance in `cs_addr`.
                """
                # If an address shows up in several clusters, the last cluster index wins.
                cluster_of = {
                    addr: cluster_idx
                    for cluster_idx, members in enumerate(cs_addr)
                    for addr in members
                    if addr in sample_known_addr
                }

                labels_true = [known_addr_entity_dict[addr] for addr in cluster_of]
                labels_pred = list(cluster_of.values())

                return labels_true, labels_pred

            # Build an undirected graph from (node1, node2, sum) rows; with weights=True the
            # third tuple element is stored as the "weight" edge attribute.
            edge_columns = ["node1", "node2", "sum"]
            edge_tuples = df[edge_columns].itertuples(index=False)
            g = Graph.TupleList(edge_tuples, directed=False, weights=True)

            # Snapshot of the edge weights; they are written back onto the edges after rewiring.
            g_weights = g.es["weight"]

            def get_initial_and_fixed(sample_size, seed=32512163):
                """Build the `initial` / `fixed` arrays passed to label propagation.

                `sample_size` vertices are drawn without replacement from the vertices of
                `g` whose name is in `sample_known_addr`. Each drawn vertex gets its entity
                label from `known_addr_entity_dict` and is marked as fixed. Every other
                vertex gets its own distinct label, chosen so it does not collide with any
                of the entity labels in use.

                Args:
                    sample_size: number of known vertices to seed with their true label.
                    seed: seed for the `numpy.random.RandomState` used for the draw.

                Returns:
                    (initial, fixed): integer label array and boolean mask, both of
                    length `len(g.vs)`.

                Raises:
                    ValueError: if `sample_size` exceeds the number of known vertices
                        (raised by `RandomState.choice` with `replace=False`).
                """
                n_vertices = len(g.vs)

                # An explicit integer dtype keeps the result usable as an index even when
                # there are no known vertices (an empty list would become a float array).
                known_idxs = np.array(
                    [idx for idx, addr in enumerate(g.vs["name"]) if addr in sample_known_addr],
                    dtype=int,
                )

                rng = np.random.RandomState(seed)
                sample_known_idxs = rng.choice(known_idxs, size=sample_size, replace=False)
                non_sample_idxs = np.setdiff1d(range(n_vertices), sample_known_idxs)

                # `np.int` was removed in NumPy 1.24; it was an alias of the builtin `int`.
                initial = np.zeros(n_vertices, dtype=int)
                for idx in sample_known_idxs:
                    initial[idx] = known_addr_entity_dict[g.vs[idx]["name"]]

                # Labels not used by the seeded vertices; at least n_vertices - sample_size
                # remain, and exactly that many are handed out, one per unseeded vertex.
                unused_labels = np.setdiff1d(range(n_vertices), initial[sample_known_idxs])
                available_labels = unused_labels[:n_vertices - sample_size]
                initial[non_sample_idxs] = available_labels

                fixed = np.zeros(n_vertices, dtype=bool)
                fixed[sample_known_idxs] = True

                return initial, fixed


            # Fractions of the graph's vertices to seed with known labels: 0, 10 %, and
            # 15 geometrically spaced values between 1 % and 40 %, in ascending order.
            props = sorted([0, 0.1, *np.geomspace(start=0.01, stop=0.4, num=15)])

            # Absolute seed counts, limited to what the known addresses can supply.
            # `props` is ascending, so the kept sizes are a prefix of it and sizes[i]
            # still corresponds to props[i].
            candidate_sizes = (int(prop * len(g.vs)) for prop in props)
            sizes = [n_seeded for n_seeded in candidate_sizes if n_seeded <= len(sample_known_addr)]

            # Repeat the whole evaluation for 101 different sampling seeds (0..100).
            for seed in range(101):
                # Number of passes per seed; every pass after the first runs on a rewired graph.
                # With the value 1 the rewiring branch below never executes.
                total_rnd_iter = 1
                for rnd_iter in range(total_rnd_iter):
                        print(f"{rnd_iter + 1} / {total_rnd_iter}")

                        if rnd_iter > 0:
                                print("randomizing...")
                                # Rewire the graph in place, attempting 4 * (number of edges) rewirings.
                                g.rewire(n=4*len(g.es), mode="simple")
                                print("done rewiring")
                                # Write the saved weight list back onto the edges.
                                g.es["weight"] = g_weights
                                print("randomized")

                        # Columns of the per-(seed, pass) result table; one row is added per seed size.
                        res_cols = ["file", "n_nodes_graph", "n_edges_graph", "prop_graph", "prop_known",
                                                "n_clusters", "cluster_sizes", "ami", "homog", "mod", "ars", "urs",
                                                # The extra result columns below are currently disabled.
                                                # "entropy_vs_cluster_entropy", 
                                                # "entropy_vs_cluster_sizes_known_addr", 
                                                # "entropy_vs_cluster_n_entities", 
                                                # "entropy_vs_cluster_sizes_labels_true", 
                                                # "entropy_vs_entity_entropy",
                                                # "entropy_vs_entity_abs_size",
                                                # "entropy_vs_entity_rel_size"
                                                ]

                        # Column name -> list of values, filled row by row.
                        res = {c: [] for c in res_cols}

                        for i, size in enumerate(sizes):
                                # Seed `size` known vertices with their true label, keep them
                                # fixed, and let label propagation assign the remaining vertices.
                                initial, fixed = get_initial_and_fixed(size, seed=seed)
                                cs = g.community_label_propagation(weights="weight", initial=initial, fixed=fixed)

                                # Clusters as lists of vertex names, smallest cluster first.
                                cs_addr = sorted([g.vs.select(c)["name"] for c in cs], key=len)
                                cs_sizes = [len(c) for c in cs_addr]

                                labels_true, labels_pred = get_labels(cs_addr=cs_addr, known_addr_entity_dict=known_addr_entity_dict)

                                ami = adjusted_mutual_info_score(labels_true=labels_true, labels_pred=labels_pred)
                                urs = rand_score(labels_true=labels_true, labels_pred=labels_pred)

                                # Adjusted Rand score computed by hand from the pair confusion
                                # matrix, with the counts converted to float64 once up front
                                # (presumably to avoid integer overflow in the products).
                                pair_counts = pair_confusion_matrix(labels_true, labels_pred)
                                (tn, fp), (fn, tp) = pair_counts.astype(np.float64)

                                if fn == 0 and fp == 0:
                                        # Perfect agreement between the two labelings.
                                        ars = 1.0
                                else:
                                        ars = 2. * (tp * tn - fn * fp) / ((tp + fn) * (fn + tn) + (tp + fp) * (fp + tn))

                                homog = homogeneity_score(labels_true=labels_true, labels_pred=labels_pred)
                                mod = cs.modularity

                                prop_known = size/len(sample_known_addr)

                                # One result row; the keys must match the columns declared in `res_cols`.
                                row = {
                                        "file": file_name.stem,
                                        "n_nodes_graph": len(sample_addr),
                                        "n_edges_graph": len(df),
                                        "prop_graph": props[i],
                                        "prop_known": prop_known,
                                        "n_clusters": len(cs),
                                        "cluster_sizes": cs_sizes,
                                        "ami": ami,
                                        "ars": ars,
                                        "urs": urs,
                                        "homog": homog,
                                        "mod": mod,
                                }
                                for column, value in row.items():
                                        res[column].append(value)

                                print(f"{props[i]:.2%} \t {prop_known:.2%}   \t ami: {ami:.2f} \t urs: {urs:.2f} \t ars: {ars:.2f} \t homog: {homog:.2f} \t {mod:.2f}")

                                # The per-entity / per-cluster entropy diagnostics that used to
                                # follow here sat behind an unconditional `continue` and appended
                                # to result columns that `res_cols` does not declare; being
                                # unreachable, they were removed.

                        df_res = pd.DataFrame(res)

                        # Timestamp suffix: date, hour/minute, then seconds since the epoch.
                        # The epoch part used to come from the "%s" strftime directive, which
                        # is not one of Python's documented format codes and is not available
                        # on every platform; `timestamp()` yields the same local-time-based
                        # value portably.
                        now = datetime.now()
                        date_str = f"{now.strftime('%Y%m%d_%H%M')}_{int(now.timestamp())}"

                        ############# OUR CODE STARTS ##############

                        # One result file per (edge file, heuristic combination, seed).
                        file_name_output = f"{file_name.stem}_${name}$_{seed}_res_{date_str}.csv"

                        ############# OUR CODE ENDS ###############

                        output_path = Path(f"data/res/{file_name_output}")

                        # Make sure the results directory exists before writing.
                        output_path.parent.mkdir(parents=True, exist_ok=True)
                        df_res.to_csv(output_path, index=False)




