In [1]:
import pandas as pd
import math
from statistics import mean
from math import log
import joblib
import warnings
import time
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.metrics import classification_report,  RocCurveDisplay
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from tqdm import tqdm
from IPython.display import display
tqdm.pandas()
plt.style.use('seaborn-darkgrid')
warnings.filterwarnings('always')
warnings.simplefilter("always")
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
df = pd.read_csv("Data/Data for Malware Query Generation/output.csv")

In [3]:
df

Unnamed: 0,cardno,encode,attack,data_combined
0,8.638540e+15,cac4c1cac7c6c2c5c1c4c1c3cac3cbc4,1,cac4c1cac7c6c2c5c1c4c1c3cac3cbc4.trtsport.cz
1,7.106420e+15,c5c3c2c4c6c0c1cbc5c2cbc1c3c7c3c7,1,c5c3c2c4c6c0c1cbc5c2cbc1c3c7c3c7.trtsport.cz
2,6.492570e+15,c4c6cbc0c7c4c7c7cac0c6c3c1c7c1c2,1,c4c6cbc0c7c4c7c7cac0c6c3c1c7c1c2.trtsport.cz
3,2.868560e+15,c0cac4cac7c4c2c4c7c3c7c0c7c5c2c4,1,c0cac4cac7c4c2c4c7c3c7c0c7c5c2c4.trtsport.cz
4,1.438690e+15,c3c6c1cac4cbc2c4c0c7c2cbcac0c3cb,1,c3c6c1cac4cbc2c4c0c7c2cbcac0c3cb.trtsport.cz
...,...,...,...,...
396,3.168660e+15,c7cbc5cacac6c6cbc3c3c1cbc6c7c4c7,1,c7cbc5cacac6c6cbc3c3c1cbc6c7c4c7.energy-utama.com
397,4.012780e+15,c6c2c3c0c5c5c5c6c4c4cbc2c5c2c7c2,1,c6c2c3c0c5c5c5c6c4c4cbc2c5c2c7c2.energy-utama.com
398,7.677590e+15,c5c4c5c5c7cbc1c5c0c4c5c3c1c4cac2,1,c5c4c5c5c7cbc1c5c0c4c5c3c1c4cac2.energy-utama.com
399,9.458110e+15,c6c2c6c2c5c6c4c6c6c2c2c2a1c6c6c5,1,c6c2c6c2c5c6c4c6c6c2c2c2a1c6c6c5.energy-utama.com


In [4]:
df['data_combined']
df_queries=pd.DataFrame(df[['data_combined','attack']])
df_queries

Unnamed: 0,data_combined,attack
0,cac4c1cac7c6c2c5c1c4c1c3cac3cbc4.trtsport.cz,1
1,c5c3c2c4c6c0c1cbc5c2cbc1c3c7c3c7.trtsport.cz,1
2,c4c6cbc0c7c4c7c7cac0c6c3c1c7c1c2.trtsport.cz,1
3,c0cac4cac7c4c2c4c7c3c7c0c7c5c2c4.trtsport.cz,1
4,c3c6c1cac4cbc2c4c0c7c2cbcac0c3cb.trtsport.cz,1
...,...,...
396,c7cbc5cacac6c6cbc3c3c1cbc6c7c4c7.energy-utama.com,1
397,c6c2c3c0c5c5c5c6c4c4cbc2c5c2c7c2.energy-utama.com,1
398,c5c4c5c5c7cbc1c5c0c4c5c3c1c4cac2.energy-utama.com,1
399,c6c2c6c2c5c6c4c6c6c2c2c2a1c6c6c5.energy-utama.com,1


In [5]:
df[df.attack==1].sample(5)

Unnamed: 0,cardno,encode,attack,data_combined
261,7633800000000000.0,c6c5c2c1c6cac2c2c1c6c6c1c4cac0c2,1,c6c5c2c1c6cac2c2c1c6c6c1c4cac0c2.lakesideresor...
27,1410960000000000.0,c3c6c3c2cbc4c0c0c4c2cbc0c1c6c6c7,1,c3c6c3c2cbc4c0c0c4c2cbc0c1c6c6c7.xo3fhvm5lcvzy...
396,3168660000000000.0,c7cbc5cacac6c6cbc3c3c1cbc6c7c4c7,1,c7cbc5cacac6c6cbc3c3c1cbc6c7c4c7.energy-utama.com
258,6085980000000000.0,cac4cbc3c1cbc1cbc3cac7cbc7cbc1c2,1,cac4cbc3c1cbc1cbc3cac7cbc7cbc1c2.didarmarket.com
54,7665250000000000.0,c5c4c4c7c0c6c4cac4c3c7cac1c7c3c3,1,c5c4c4c7c0c6c4cac4c3c7cac1c7c3c3.christinelebe...


# Extracting all the required stateless features for the models by Mahdavifar et al.

In [6]:
# Code adapted from https://stackoverflow.com/questions/2979174/how-do-i-compute-the-approximate-entropy-of-a-bit-string

def entropy(string):
    "Calculates the Shannon entropy of a string"

    # get probability of chars in string
    prob = [ float(string.count(c)) / len(string) for c in dict.fromkeys(list(string)) ]

    # calculate the entropy
    entropy = - sum([ p * math.log(p) / math.log(2.0) for p in prob ])

    return entropy


# Code adapted from https://stackoverflow.com/questions/8870261/how-to-split-text-without-spaces-into-list-of-words
# Build a cost dictionary, assuming Zipf's law and cost = -math.log(probability).

words = open(".\Data\Other DNS Exfiltration Tools\words-by-frequency.txt").read().split()
wordcost = dict((k, log((i+1)*log(len(words)))) for i,k in enumerate(words))
maxword = max(len(x) for x in words)
def infer_spaces(s):
    """Uses dynamic programming to infer the location of spaces in a string
    without spaces."""

    # Find the best match for the i first characters, assuming cost has
    # been built for the i-1 first characters.
    # Returns a pair (match_cost, match_length).
    def best_match(i):
        candidates = enumerate(reversed(cost[max(0, i-maxword):i]))
        return min((c + wordcost.get(s[i-k-1:i], 9e999), k+1) for k,c in candidates)

    # Build the cost array.
    cost = [0]
    for i in range(1,len(s)+1):
        c,k = best_match(i)
        cost.append(c)

    # Backtrack to recover the minimal-cost string.
    out = []
    i = len(s)
    while i>0:
        c,k = best_match(i)
        assert c == cost[i]
        out.append(s[i-k:i])
        i -= k

    return " ".join(reversed(out))

def extract_features(row):
    labels = row["data_combined"].split(".")

    row["FQDN_count"] = len(row)
    row["subdomain"] = len(labels) >= 2
    if row["subdomain"]:
        subdomain = ".".join((row["data_combined"]).split(".")[:-2])
    else:
        subdomain = ""
    row["subdomain_length"] = len(subdomain)
    upper, lower, numeric, special = 0, 0, 0, 0
    for c in row["data_combined"]:
        if c.islower():
            lower += 1
        if c.isupper():
            upper += 1
        if c.isnumeric():
            numeric += 1
        if not c.isalnum():
            special += 1
    row["lower"] = lower
    row["upper"] = upper
    row["numeric"] = numeric
    row["special"] = special
    row["entropy"] = entropy(row["data_combined"])
    row["labels"] = len(labels)
    row["labels_max"] = max([len(label) for label in labels])
    row["labels_average"] = mean([len(label) for label in labels])
    row["sld"] = labels[-2]
    row["len"] = row["subdomain_length"] + len(labels[-2])
    extracted_words = []
    for label in labels:
        extracted_words += (infer_spaces(''.join(filter(str.isalpha, label)))).split()
    row["longest_word"] = max(extracted_words, key=len)
    return row


  words = open(".\Data\Other DNS Exfiltration Tools\words-by-frequency.txt").read().split()


In [7]:
feature_extraction_start_time = time.time()
df = df.progress_apply(extract_features, axis=1)
feature_extraction_end_time = time.time()

100%|███████████████████████████████████████████████████████████████████████████████| 401/401 [00:03<00:00, 126.58it/s]


In [8]:
feature_extraction_total_time = feature_extraction_end_time - feature_extraction_start_time
print(f"Time taken to extract features from the data: {feature_extraction_total_time:.3f} for {df.shape[0]} queries")
per_query_extraction_time = feature_extraction_total_time/df.shape[0]
print(f"Time taken to extract features per query: {per_query_extraction_time:.3f}")

Time taken to extract features from the data: 3.171 for 401 queries
Time taken to extract features per query: 0.008


In [9]:
df.to_csv("Data/Data for Malware Query Generation/malware_extracted_features.csv", index=None)

In [23]:
# Use if starting from saved data:
df = pd.read_csv("Data/Data for Malware Query Generation/malware_extracted_features.csv")

# Load saved models:

## LR Model baseline

In [25]:
lr_baseline_column_transformer = joblib.load('models/lr_baseline_column_transformer.pkl')
lr_baseline_model = joblib.load('models/lr_baseline_model.pkl')

In [26]:
print(lr_baseline_model)

Pipeline(steps=[('lr', LogisticRegression(max_iter=500, random_state=0))])


## RF Model baseline

In [27]:
rf_baseline_column_transformer = joblib.load('models/rf_baseline_column_transformer.pkl')
rf_baseline_model = joblib.load('models/rf_baseline_model.pkl')

In [28]:
print(rf_baseline_model)

Pipeline(steps=[('rf', RandomForestClassifier(random_state=0))])


# Evaluating the models on the queries from other DNS tools:

## Baseline LR

In [29]:
df = pd.read_csv("Data/Data for Malware Query Generation/malware_extracted_features.csv")
X = df[[col for col in df.columns if col != "attack"]]
y = df[["attack"]]

X["longest_word_islower"] = X["longest_word"].apply(lambda x: str(x).islower())
X["longest_word_isnumeric"] = X["longest_word"].apply(lambda x: str(x).isnumeric())
X["sld_islower"] = X["sld"].apply(lambda x: str(x).islower())
X["sld_isnumeric"] = X["sld"].apply(lambda x: str(x).isnumeric())
X = lr_baseline_column_transformer.transform(X)
predictions = lr_baseline_model.predict(X)
df["predictions"] = predictions

display(pd.DataFrame(classification_report(df["attack"],  df["predictions"], output_dict=True)).T)

Unnamed: 0,precision,recall,f1-score,support
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,401.0
accuracy,0.0,0.0,0.0,0.0
macro avg,0.0,0.0,0.0,401.0
weighted avg,0.0,0.0,0.0,401.0


## Baseline RF

In [30]:
df = pd.read_csv("Data/Data for Malware Query Generation/malware_extracted_features.csv")
X = df[[col for col in df.columns if col != "attack"]]

y = df[["attack"]]

X = rf_baseline_column_transformer.transform(X)
predictions = rf_baseline_model.predict(X)
df["predictions"] = predictions

display(pd.DataFrame(classification_report(df["attack"],  df["predictions"], output_dict=True)).T)

Unnamed: 0,precision,recall,f1-score,support
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,401.0
accuracy,0.0,0.0,0.0,0.0
macro avg,0.0,0.0,0.0,401.0
weighted avg,0.0,0.0,0.0,401.0


# Using the models trained on optimized features:

## LR

In [31]:
lr_optimized_features_column_transformer = joblib.load('models/lr_optimized_features_column_transformer.pkl')
lr_optimized_features_model = joblib.load('models/lr_optimized_features_model.pkl')

In [32]:
df = pd.read_csv("Data/Data for Malware Query Generation/malware_extracted_features.csv")
X = df[[col for col in df.columns if col != "attack"]]
y = df[["attack"]]

X["longest_word_islower"] = X["longest_word"].apply(lambda x: str(x).islower())
X["longest_word_isnumeric"] = X["longest_word"].apply(lambda x: str(x).isnumeric())
X["sld_islower"] = X["sld"].apply(lambda x: str(x).islower())
X["sld_isnumeric"] = X["sld"].apply(lambda x: str(x).isnumeric())

X = lr_optimized_features_column_transformer.transform(X)
predictions = lr_optimized_features_model.predict(X)
df["predictions"] = predictions

display(pd.DataFrame(classification_report(df["attack"],  df["predictions"], output_dict=True)).T)

Unnamed: 0,precision,recall,f1-score,support
0,0.0,0.0,0.0,0.0
1,1.0,0.229426,0.373225,401.0
accuracy,0.229426,0.229426,0.229426,0.229426
macro avg,0.5,0.114713,0.186613,401.0
weighted avg,1.0,0.229426,0.373225,401.0


## RF

In [33]:
rf_optimized_features_column_transformer = joblib.load('models/rf_optimized_features_column_transformer.pkl')
rf_optimized_features_model = joblib.load('models/rf_optimized_features_model.pkl')

In [34]:
df = pd.read_csv("Data/Data for Malware Query Generation/malware_extracted_features.csv")
X = df[[col for col in df.columns if col != "attack"]]
y = df[["attack"]]

X["longest_word_islower"] = X["longest_word"].apply(lambda x: str(x).islower())
X["longest_word_isnumeric"] = X["longest_word"].apply(lambda x: str(x).isnumeric())
X["sld_islower"] = X["sld"].apply(lambda x: str(x).islower())
X["sld_isnumeric"] = X["sld"].apply(lambda x: str(x).isnumeric())

X = rf_optimized_features_column_transformer.transform(X)
predictions = rf_optimized_features_model.predict(X)
df["predictions"] = predictions

display(pd.DataFrame(classification_report(df["attack"],  df["predictions"], output_dict=True)).T)

Unnamed: 0,precision,recall,f1-score,support
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,401.0
accuracy,0.0,0.0,0.0,0.0
macro avg,0.0,0.0,0.0,401.0
weighted avg,0.0,0.0,0.0,401.0


Both baseline and optimized logistic regression models perform well on all the tools except tuns. Both Random forest models are unable to classify any of the queries from other DNS Exfiltration tools.

In [35]:
optimized_features = ['FQDN_count', 'subdomain_length', 'lower', 'numeric', 'special',
       'labels', 'longest_word_islower',
       'longest_word_isnumeric', 'sld_islower', 'sld_isnumeric']

# Testing malware queries on models trained on other DNS Exfiltration tools:

# LR

In [37]:
lr_other_tools_column_transformer = joblib.load('models/lr_other_dns_tools_column_transformer.pkl')
lr_other_tools_model = joblib.load('models/lr_other_dns_tools_model.pkl')

In [38]:
df = pd.read_csv("Data/Data for Malware Query Generation/malware_extracted_features.csv")
X = df[[col for col in df.columns if col != "attack"]]
y = df[["attack"]]

X["longest_word_islower"] = X["longest_word"].apply(lambda x: str(x).islower())
X["longest_word_isnumeric"] = X["longest_word"].apply(lambda x: str(x).isnumeric())
X["sld_islower"] = X["sld"].apply(lambda x: str(x).islower())
X["sld_isnumeric"] = X["sld"].apply(lambda x: str(x).isnumeric())
X = lr_other_tools_column_transformer.transform(X)
predictions = lr_other_tools_model.predict(X)
df["predictions"] = predictions

display(pd.DataFrame(classification_report(df["attack"],  df["predictions"], output_dict=True)).T)

Unnamed: 0,precision,recall,f1-score,support
1,1.0,1.0,1.0,401.0
accuracy,1.0,1.0,1.0,1.0
macro avg,1.0,1.0,1.0,401.0
weighted avg,1.0,1.0,1.0,401.0


# RF

In [39]:
rf_other_tools_column_transformer = joblib.load('models/rf_other_dns_tools_column_transformer.pkl')
rf_other_tools_model = joblib.load('models/rf_other_dns_tools_model.pkl')

In [40]:
df = pd.read_csv("Data/Data for Malware Query Generation/malware_extracted_features.csv")
X = df[[col for col in df.columns if col != "attack"]]
y = df[["attack"]]

X["longest_word_islower"] = X["longest_word"].apply(lambda x: str(x).islower())
X["longest_word_isnumeric"] = X["longest_word"].apply(lambda x: str(x).isnumeric())
X["sld_islower"] = X["sld"].apply(lambda x: str(x).islower())
X["sld_isnumeric"] = X["sld"].apply(lambda x: str(x).isnumeric())

X = rf_other_tools_column_transformer.transform(X)
predictions = rf_other_tools_model.predict(X)
df["predictions"] = predictions
display(pd.DataFrame(classification_report(df["attack"],  df["predictions"], output_dict=True)).T)

Unnamed: 0,precision,recall,f1-score,support
1,1.0,1.0,1.0,401.0
accuracy,1.0,1.0,1.0,1.0
macro avg,1.0,1.0,1.0,401.0
weighted avg,1.0,1.0,1.0,401.0
