In [None]:
# Install into the running kernel's environment; pinned to the version this notebook was run with.
%pip install scikit-multilearn==0.2.0

Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.4/89.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


In [None]:
# Install into the running kernel's environment (%pip targets the kernel, !pip may not).
%pip install xgboost



In [None]:
import pandas as pd
from xgboost import XGBClassifier
from skmultilearn.problem_transform import ClassifierChain
from sklearn.model_selection import train_test_split
from sklearn.metrics import (classification_report, recall_score, hamming_loss, jaccard_score,
                             multilabel_confusion_matrix, f1_score, precision_score, accuracy_score,
                             roc_curve, auc,zero_one_loss)
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import LabelPowerset
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the dataset.
# The path is relative, so the CSV must sit in the notebook's working directory.
file_path = 'Technique_Dataset.csv'
df = pd.read_csv(file_path)

In [None]:
# Drop thirteen technique columns (T-prefixed IDs) from the DataFrame.
# NOTE(review): why these particular techniques are excluded is not stated here — confirm.
# This cell overwrites `df`, so running it a second time raises KeyError
# (the columns are already gone); reload the CSV before re-running.
df = df.drop(columns=['T1458','T1660','T1456','T1631','T1664','T1663','T1461','T1661','T1639','T1641','T1474','T1603','T1638'])

# Verify the columns have been removed
print(df.columns)

Index(['Hash Name', 'com.mp4.videodownloader.receiver.restartservicereceiver',
       'com.dktools.liteforfb.mainactivity',
       'com.google.android.libraries.cast.companionlibrary.remotecontrol.videointentreceiver',
       'com.example.myapplicationtest.composesmsactivity',
       'com.sangcall.kchtmlactivity', 'ir.ali.korosh.hakh.mainactivity',
       'whuxyrlapwrkoxmzamujn.tpgbbryjtpo.ooegchzjunneeobe.wzymtmufclabfoxrtdhdq',
       'sun.photoalbum1.sunservice.sun9.send.whatsupdoc.llll1j1',
       'ehioinw.kjnt.dqpkcxd.qegehdservice',
       ...
       'T1533', 'T1516', 'T1406', 'T1575', 'T1637', 'T1414', 'T1513', 'T1509',
       'T1541', 'T1517'],
      dtype='object', length=13373)


In [None]:
# Assume the last n columns are the labels.
# The feature/label split slices the DataFrame by this count, so it must match
# the number of label columns remaining in `df`.
n_labels = 48  # replace with the actual number of label columns

In [None]:
# Separate features and labels.
# Features: every column except the first one (the printed column index shows
# it is 'Hash Name') and the trailing n_labels label columns.
X = df.iloc[:, 1:-n_labels]
# Labels: the trailing n_labels columns.
y = df.iloc[:, -n_labels:]

In [None]:
# Column-wise total of each label column, i.e. how many rows carry a 1 for that label
label_sums = y.sum(axis="index")
print("Sum of occurrences of '1' in each label:\n", label_sums)

Sum of occurrences of '1' in each label:
 T1424     231
T1604     262
T1532      93
T1404     984
T1422    1832
T1577      18
T1437    2073
T1512    1925
T1430    1988
T1418    1991
T1624      18
T1645    1018
T1429    1982
T1417     482
T1481      17
T1426    2075
T1644    1671
T1471      26
T1420      65
T1623    1170
T1655    2004
T1544      69
T1643     202
T1617      52
T1630    1578
T1646      87
T1640      52
T1398      21
T1521    1917
T1582     328
T1421      66
T1642      39
T1409    1732
T1616    1655
T1636    2372
T1407     991
T1633     129
T1662      10
T1533    1962
T1516      86
T1406    2123
T1575    1778
T1637     295
T1414      18
T1513    1778
T1509      23
T1541      38
T1517     370
dtype: int64


In [None]:
# Collect the label column names as a plain Python list and display it
label_names = list(y.columns)
label_names

['T1424',
 'T1604',
 'T1532',
 'T1404',
 'T1422',
 'T1577',
 'T1437',
 'T1512',
 'T1430',
 'T1418',
 'T1624',
 'T1645',
 'T1429',
 'T1417',
 'T1481',
 'T1426',
 'T1644',
 'T1471',
 'T1420',
 'T1623',
 'T1655',
 'T1544',
 'T1643',
 'T1617',
 'T1630',
 'T1646',
 'T1640',
 'T1398',
 'T1521',
 'T1582',
 'T1421',
 'T1642',
 'T1409',
 'T1616',
 'T1636',
 'T1407',
 'T1633',
 'T1662',
 'T1533',
 'T1516',
 'T1406',
 'T1575',
 'T1637',
 'T1414',
 'T1513',
 'T1509',
 'T1541',
 'T1517']

In [None]:
# Report the dimensions of the feature matrix and the label matrix, one per line
print(f"Features shape: {X.shape}", f"Labels shape: {y.shape}", sep="\n")

Features shape: (2774, 13324)
Labels shape: (2774, 48)


In [None]:

# Evaluate the models
def evaluate_model(y_test, predictions, model_name, file, random_seed):
    """Compute multilabel metrics for one model and report them to a file and stdout.

    Every metric is computed once, and the same values are both written to
    ``file`` and printed, so the two reports always agree.

    Parameters
    ----------
    y_test
        True label indicator matrix for the test split.
    predictions
        Predicted label indicator matrix for the same samples.
    model_name
        Name shown in the "Evaluating ..." heading of the report.
    file
        Open, writable text file object; the report is emitted via ``file.write``.
    random_seed
        Seed shown in the report banner, identifying the train/test split.

    Notes
    -----
    Reads the notebook-level ``label_names`` list to name the rows of the
    classification report.
    """
    # Single-value metrics, reported as "<label>: <value>" in this order.
    scalar_metrics = [
        ("Accuracy", accuracy_score(y_test, predictions)),
        ("Macro F1 Score", f1_score(y_test, predictions, average='macro')),
        ("Weighted F1 score", f1_score(y_test, predictions, average='weighted')),
        ("Micro F1 score", f1_score(y_test, predictions, average='micro')),
        ("Macro Precision", precision_score(y_test, predictions, average='macro')),
        ("Weighted Precision", precision_score(y_test, predictions, average='weighted')),
        ("Micro Precision", precision_score(y_test, predictions, average='micro')),
        ("Macro Recall", recall_score(y_test, predictions, average='macro')),
        ("Weighted Recall", recall_score(y_test, predictions, average='weighted')),
        ("Micro Recall", recall_score(y_test, predictions, average='micro')),
        ("Hamming Loss", hamming_loss(y_test, predictions)),
        ("Zero One Loss", zero_one_loss(y_test, predictions)),
        ("Jaccard Similarity", jaccard_score(y_test, predictions, average='samples')),
    ]
    # Multi-line sections, reported as "<label>:\n<value>".
    block_metrics = [
        ("Classification Report",
         classification_report(y_test, predictions, target_names=label_names)),
        ("Multilabel Confusion Matrix",
         multilabel_confusion_matrix(y_test, predictions)),
    ]

    report_lines = [f"Evaluating {model_name}"]
    report_lines.extend(f"{label}: {value}" for label, value in scalar_metrics)
    report_lines.extend(f"{label}:\n{value}" for label, value in block_metrics)

    # Use the random_seed argument, not the notebook-global loop variable `seed`,
    # so the function also works when called outside the evaluation loop.
    banner = f"\n\n********** Random Seed: {random_seed} **********\n\n"

    # File report: each entry is preceded by a newline, with one trailing newline.
    file.write(banner)
    for line in report_lines:
        file.write(f"\n{line}")
    file.write("\n")

    # Console report: same content, one print call per entry.
    print(banner)
    for line in report_lines:
        print(line)
    print("\n")

In [None]:
# Define different random seeds to evaluate.
# Each seed drives one train/test split in the evaluation loop.
random_seeds = [42, 1433, 2396, 451, 995, 98, 262, 354, 560, 1600]

In [None]:
# Open a single file to write all evaluations.
# Mode "w" overwrites any existing file; reports for every seed and model go into this one file.
with open("XGBoost_model_evaluations.txt", "w") as file:

    for seed in random_seeds:


        # Split the data into training (80%) and testing (20%) sets; the split changes with each seed
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)



        # Binary Relevance wrapper around an XGBClassifier.
        # Note: the classifier's own random_state stays fixed at 42; only the data split varies per seed.
        br_model = BinaryRelevance(XGBClassifier(gamma=1.679,learning_rate=0.124,max_depth=14,n_estimators=156,random_state=42))
        br_model.fit(X_train, y_train)
        br_predictions = br_model.predict(X_test)


        # Classifier Chain: initialize the ClassifierChain wrapper around an XGBClassifier
        chain_model = ClassifierChain(XGBClassifier(gamma=1.896,learning_rate=0.253,max_depth=12,n_estimators=54,random_state=42))
        # Train the classifier chain (XGBoost base estimator) on the training split
        chain_model.fit(X_train, y_train)
        # Predict labels for the test split
        chain_predictions = chain_model.predict(X_test)



        # Label Powerset
        # Initialize the LabelPowerset wrapper around an XGBClassifier
        lb_model = LabelPowerset(XGBClassifier(gamma=0,learning_rate=0.3,max_depth=6,n_estimators=100,random_state=42))
        # Train on the training split
        lb_model.fit(X_train, y_train)
        # Predict labels for the test split
        lb_predictions = lb_model.predict(X_test)


        # Write and print the metrics of each model for the current seed
        evaluate_model(y_test, br_predictions, "Binary Relevance", file, seed)
        evaluate_model(y_test, chain_predictions, "Classifier Chain", file, seed)
        evaluate_model(y_test, lb_predictions, "Label Powerset", file, seed)

# Print path to the saved file
print("Evaluations saved to XGBoost_model_evaluations.txt")