In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
!pip install pefile
!pip install PyGitHub
!pip install nltk
!pip install scipy

# **TASK 1**

In [50]:
import sys
import hashlib
filename = "/content/drive/MyDrive/ColabNotebooks/Lab_2/python-3.10.0-amd64.exe"
BUF_SIZE = 65536
md5 = hashlib.md5()
sha256 = hashlib.sha256()
with open(filename, "rb") as f:
 while True:
  data = f.read(BUF_SIZE)
  if not data:
    break
  md5.update(data)
  sha256.update(data)
print("MD5: {0}".format(md5.hexdigest()))
print("SHA256: {0}".format(sha256.hexdigest()))

MD5: c3917c08a7fe85db7203da6dcaa99a70
SHA256: cb580eb7dc55f9198e650f016645023e8b2224cf7d033857d12880b46c5c94ef


# **TASK 8**

**Bước 1 – Tạo list mẫu và gán nhãn**


In [71]:
import os
from os import listdir
directories_and_labels = [("/content/drive/MyDrive/ColabNotebooks/Lab_2/Benign PE Samples 2", 0),
("/content/drive/MyDrive/ColabNotebooks/Lab_2/Malicious PE Samples 2", 1)]
list_of_samples = []
labels = []
N_spec = 2 # For N-grams
for dataset_path, label in directories_and_labels:
  samples = [f for f in listdir(dataset_path)]
  for sample in samples:
    file_path = os.path.join(dataset_path, sample)
    list_of_samples.append(file_path)
    labels.append(label)
print(list_of_samples[:5]) # Get the first 5 samples
print(list_of_samples[len(list_of_samples) -5: len(list_of_samples)]) # Get the last 5 samples
print(labels[:5]) # Get the first 5 samples
print(labels[len(list_of_samples) -5: len(list_of_samples)]) # Get the last 5 samples


['/content/drive/MyDrive/ColabNotebooks/Lab_2/Benign PE Samples 2/chkntfs.exe', '/content/drive/MyDrive/ColabNotebooks/Lab_2/Benign PE Samples 2/ComSvcConfig.exe', '/content/drive/MyDrive/ColabNotebooks/Lab_2/Benign PE Samples 2/ComputerDefaults.exe', '/content/drive/MyDrive/ColabNotebooks/Lab_2/Benign PE Samples 2/CompMgmtLauncher.exe', '/content/drive/MyDrive/ColabNotebooks/Lab_2/Benign PE Samples 2/CompatTelRunner.exe']
['/content/drive/MyDrive/ColabNotebooks/Lab_2/Malicious PE Samples 2/5a765351046fea1490d20f25.exe', '/content/drive/MyDrive/ColabNotebooks/Lab_2/Malicious PE Samples 2/abba_-_happy_new_year_zaycev_net.exe', '/content/drive/MyDrive/ColabNotebooks/Lab_2/Malicious PE Samples 2/7ZipSetup.exe', '/content/drive/MyDrive/ColabNotebooks/Lab_2/Malicious PE Samples 2/gchrome.exe', '/content/drive/MyDrive/ColabNotebooks/Lab_2/Malicious PE Samples 2/aapt.exe']
[0, 0, 0, 0, 0]
[1, 1, 1, 1, 1]


**Bước 2 – Chia tập Train/Test theo tỷ lệ 70% Train – 30% Test**

In [72]:
from sklearn.model_selection import train_test_split
samples_train, samples_test, target_train, target_test = train_test_split(
  list_of_samples,
  labels,
  test_size=0.3,
  stratify = labels,
  random_state = 11
)

**Bước 3 – Xây dựng hàm trích xuất thuộc tính**

In [73]:
import collections
from nltk import ngrams
import numpy as np
import pefile
def read_file(file_path):
  with open(file_path, "rb") as bin_file:
    data = bin_file.read()
    return data

def byte_seq_to_Ngrams(byte_seq, N_par):
  Ngrams_par = ngrams(byte_seq, N_par)
  return list(Ngrams_par)

def bin_file_to_Ngrams_count(file_path, N_par):
  file_seq = read_file(file_path)
  file_Ngrams = byte_seq_to_Ngrams(file_seq, N_par)
  return collections.Counter(file_Ngrams)

def get_Ngrams_features_from_samples(sample, K1_most_freq_Ngrams_list):
  K1 = len(K1_most_freq_Ngrams_list)
  feature_vector = K1 * [0]
  file_Ngrams = bin_file_to_Ngrams_count(sample, N_spec)
  for i in range(K1):
    feature_vector[i] = file_Ngrams[K1_most_freq_Ngrams_list[i]]
  return feature_vector

def preprocess_imports(list_of_DLLs):
  """ Normalize the name of the imports of a PE file. """
  temp = [x.decode().split(".")[0].lower() for x in list_of_DLLs] # View the transforming of below example
  return " ".join(temp)

def get_imports(pe):
  """ Get a list of the imports of a PE file """
  list_of_imports = []
  for entry in pe.DIRECTORY_ENTRY_IMPORT:
    list_of_imports.append(entry.dll)
  return preprocess_imports(list_of_imports)


def get_section_names(pe):
  """ Get a list of the section names of a PE file """
  list_of_sections = []
  for sect in pe.sections:
    normalized_name = sect.Name.decode().replace("\x00", "").lower()
    list_of_sections.append(normalized_name)
  return "".join(list_of_sections)

**Bước 4 – Lấy 100 N-grams phổ biến nhất**

In [74]:
Ngrams_count_all = collections.Counter([])
for sample in samples_train:
  Ngrams_count_all += bin_file_to_Ngrams_count(sample, N_spec)
K1 = 100
K1_most_common_Ngrams = Ngrams_count_all.most_common(K1)
K1_most_common_Ngrams_list = [x[0] for x in K1_most_common_Ngrams]

**Bước 5 – Trích xuất thuộc tính cho tập Train**


In [75]:
imports_corpus_train = []
num_sect_train = []
sect_name_train = []
Ngram_feat_list_train = []

y_train = []

In [76]:

for i in range(len(samples_train)):
  sample = samples_train[i]
  try:
    # Get all required parameters with predefined functions
    Ngram_features = get_Ngrams_features_from_samples(sample, K1_most_common_Ngrams_list)
    pe = pefile.PE(sample)
    imports = get_imports(pe)
    n_sections = len(pe.sections)
    sec_names = get_section_names(pe)

    # Put above value into lists
    imports_corpus_train.append(imports)
    num_sect_train.append(n_sections)
    sect_name_train.append(sec_names)
    Ngram_feat_list_train.append(Ngram_features)

    # Target train
    y_train.append(target_train[i])
  except Exception as e:
    print(sample + ":")
    print(e)

/content/drive/MyDrive/ColabNotebooks/Lab_2/Benign PE Samples 2/cmak.exe:
'DOS Header magic not found.'
/content/drive/MyDrive/ColabNotebooks/Lab_2/Benign PE Samples 2/dcdiag.exe:
'DOS Header magic not found.'


**Bước 6 - Sử dụng hàm băm tfidf để chuyển imports, section names từ văn bản thành dạng số**


In [77]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

In [78]:
imports_featurizer = Pipeline(
    [
        ("vect", HashingVectorizer(input = "content", ngram_range=(1,2))),
        ("tfidf", TfidfTransformer(use_idf = True,)),
    ]
)

In [79]:
sect_name_featurizer = Pipeline(
    [
        ("vect", HashingVectorizer(input = "content", ngram_range= (1,2))),
        ("tfidf", TfidfTransformer(use_idf = True))
    ]
)

In [80]:
imports_corpus_train_transformed = imports_featurizer.fit_transform(imports_corpus_train)
sect_name_train_transformed = sect_name_featurizer.fit_transform(sect_name_train)


**Bước 7 - Kết hợp các vector thuộc tính thành 1 mảng**


In [81]:
from scipy.sparse import hstack, csr_matrix


In [82]:
X_train = hstack(
    [
        Ngram_feat_list_train,
        imports_corpus_train_transformed,
        sect_name_train_transformed,
        csr_matrix(num_sect_train).transpose(),
    ]
)


**Bước 8 - Huấn luyện  Random Forest cho tập**



In [83]:
from sklearn.ensemble import RandomForestClassifier


In [84]:
clf = RandomForestClassifier(n_estimators = 100)
clf = clf.fit(X_train, y_train)

**Bước 9 - Thu thập các thuộc tính của tập test, giống như tập huấn luyện**

In [85]:
import_corpus_test = []
num_sect_test = []
sect_names_test = []
Ngram_feat_list_test = []

y_test = []

In [None]:
for i in range(len(samples_test)):
  test = samples_test[i]
  try:
    # Get all required parameters with predefined functions
    # The input when getting N-grams features is still "sample"
    Ngram_features = get_Ngrams_features_from_samples(sample, K1_most_common_Ngrams_list)
    pe = pefile.PE(test) # Get test PE file
    imports = get_imports(pe)
    n_sections = len(pe.sections)
    sec_names = get_section_names(pe)

    # Put above value into lists
    import_corpus_test.append(imports)
    num_sect_test.append(n_sections)
    sect_names_test.append(sec_names)
    Ngram_feat_list_test.append(Ngram_features)

    # Target train
    y_test.append(target_test[i])
  except Exception as e:
    print(sample + ":")
    print(e)


In [87]:
y_test


[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

**Bước 10 - Chuyển đổi vector từ thuộc tính test, và kiểm tra kết quả của trình phân loại**


In [90]:
import_corpus_test_transformed = imports_featurizer.transform(import_corpus_test)
sect_names_test_transformed = imports_featurizer.transform(sect_names_test)
X_test = hstack(
    [
        Ngram_feat_list_test,
        import_corpus_test_transformed,
        sect_names_test_transformed,
        csr_matrix(num_sect_test).transpose()
    ]
)

In [91]:
print("The score of our classifier is as follow: ")
print(clf.score(X_test, y_test))


The score of our classifier is as follow: 
0.9310344827586207
