In [None]:
import concurrent
import concurrent.futures
import ipaddress
import pickle
import re
import statistics
import warnings
from collections import Counter

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import regex
import torch
import tqdm
from featgenerator import group_features
from featgenerator.config import Config
from featgenerator.exif_feat import ExifFeatures
from featgenerator.featurizer import Featurizer
from featgenerator.floss_general_feat import FlossFeatures
from featgenerator.floss_regex import FlossRegexFeatures
from featgenerator.group_features import GroupAttributionFeatures
from featgenerator.lief_features import LiefFeatures, get_features_from_function_lists
from featgenerator.malcat import MalcatFeatures
from featgenerator.util import ClusteringMetrics, DataProcessor, MinHashLSHForest, Util
from itables import init_notebook_mode, show
from keras import models
from keras.layers import Dense, Input, Reshape
from keras.models import Model
from scipy.signal import normalize
from sklearn.cluster import AffinityPropagation, AgglomerativeClustering, KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import (
    adjusted_mutual_info_score,
    adjusted_rand_score,
    auc,
    classification_report,
    davies_bouldin_score,
    roc_auc_score,
    roc_curve,
    silhouette_score,
    v_measure_score,
)
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import (
    LabelEncoder,
    QuantileTransformer,
    label_binarize,
    normalize,
)
from tqdm import tqdm
from transformers import (
    AutoModel,
    AutoTokenizer,
    BatchEncoding,
    BertModel,
    BertTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
)

from importlib import reload

from featgenerator import util

# Re-execute featgenerator.util so that edits made to the module on disk are
# picked up without restarting the kernel; `util` is rebound to the module
# object that reload() returns.
util = reload(util)

In [None]:
from itables import init_notebook_mode

# Activate itables for this notebook session.
# NOTE(review): all_interactive=True presumably makes every DataFrame output
# render as an interactive table — confirm against the itables documentation.
init_notebook_mode(all_interactive=True)

## Before loading the config, make sure `root_dir` is set correctly in the config file

In [None]:
from importlib import reload
from typing import Optional, Tuple

from featgenerator import util

# Pick up on-disk changes to featgenerator.util without a kernel restart and
# rebind `util` to the freshly reloaded module object.
util = reload(util)

In [None]:
# Step 1: load the four prepared inputs returned by the group_features helper.
(
    exif_features,
    malcat_features,
    joined_df,
    adversary_dataset,
) = group_features.load_and_prepare_datasets()

# Step 2: process the features and merge the four inputs into one frame.
final_features = group_features.process_and_merge_features(
    exif_features,
    malcat_features,
    joined_df,
    adversary_dataset,
)

In [None]:
# String embeddings derived from the floss features. These are optional and
# may not end up being used downstream.
floss_feat = FlossFeatures()

string_embedding_processor = group_features.StringEmbeddingProcessor(
    joined_df=joined_df,
)
string_embedding_df_features = string_embedding_processor.process()

# Cast every column label to str.
# NOTE(review): presumably so the labels are uniformly strings when this frame
# is concatenated with the other feature columns — confirm.
string_embedding_df_features.columns = (
    string_embedding_df_features.columns.astype(str)
)

In [None]:
# Keep only the hash and its normalized tag from the adversary dataset, then
# join those two columns onto joined_df by hash (merge's default join type).
adversary_tags = adversary_dataset[['hash', 'Normalized_Tag']]
merged_adversary_experiment_final = adversary_tags.merge(joined_df, on="hash")

In [None]:
# Full join of joined_df (left) with every adversary_dataset column (right),
# matched on the shared "hash" column.
all_features = pd.merge(joined_df, adversary_dataset, on="hash")


In [None]:
# Feature matrix without the 'Normalized_Tag' column; that column is handed to
# the clustering sweep separately via all_features.
columns_to_drop = ['Normalized_Tag']
feat = final_features.drop(columns=columns_to_drop)

In [None]:
# Column-wise join of the feature frame and the string embeddings. Both frames
# are re-indexed to 0..n-1, so rows are paired purely by position.
feat_rows = feat.reset_index(drop=True)
embedding_rows = string_embedding_df_features.reset_index(drop=True)

# Positional pairing only makes sense when both frames have the same number of
# rows; otherwise pd.concat pads the shorter frame with NaN rows instead of
# failing, and the problem surfaces much later (or not at all).
if len(feat_rows) != len(embedding_rows):
    raise ValueError(
        "Cannot combine features column-wise: feat has "
        f"{len(feat_rows)} rows but string_embedding_df_features has "
        f"{len(embedding_rows)} rows"
    )
# NOTE(review): equal length does not prove equal row order — confirm that both
# frames are ordered by the same hashes before trusting the combined matrix.

combined = pd.concat([feat_rows, embedding_rows], axis=1)


In [None]:
# Candidate cluster counts for the sweep: 5, 10, ..., 115.
n_clusters = list(np.arange(5, 120, 5))
modelling = util.Modelling()

# Feature matrices that can be fed to the sweep. Change FEATURE_SET to compare
# them instead of commenting calls in and out.
feature_sets = {
    # String embeddings combined with the Group Attribution pipeline features.
    "combined": combined,
    # Results without the string embeddings.
    "without_string_embeddings": feat,
    # Results with just the string embeddings.
    "string_embeddings_only": string_embedding_df_features,
}
FEATURE_SET = "combined"

# Name of the tag column, passed both inside the hash/tag frame and on its own.
tag_column = 'Normalized_Tag'

# NOTE(review): find_best_agglo is defined in featgenerator.util and not shown
# here; presumably it evaluates one clustering per entry of n_clusters against
# the tags — confirm there, including how rows are matched to hashes.
all_params, best_param, best_truth_matrix = modelling.find_best_agglo(
    feature_sets[FEATURE_SET],
    n_clusters,
    all_features[['hash', tag_column]],
    tag_column,
)


In [None]:
# Tabulate the all_params result of the sweep and display it through itables.
all_params_table = pd.DataFrame(all_params)
show(all_params_table)