# Requirements and Python imports

In [None]:
!pip install fasttext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.10.4-py3-none-any.whl (222 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4393354 sha256=ed287186fcfbb4d262ba02bda03d0f52008b4bd16ad388cb2dcf094881430da8
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.10.4


In [None]:
import io
import lzma
import os
import random
import shutil

import fasttext
import numpy as np
import pandas as pd
from google.colab import drive

# GDrive and working paths

In [None]:
# Mount Google Drive at an absolute mount point: every path below is derived
# from it, so they all resolve the same way whatever the working directory is
# (previously the mount and most paths were relative while the .vec paths
# were absolute under /content)
gdrive_mount = "/content/gdrive"
drive.mount(gdrive_mount)

# Project roots on the mounted drive
dir_preproc = os.path.join(gdrive_mount, "MyDrive/ACS/dga-dns-preproc")
dir_vec = os.path.join(gdrive_mount, "MyDrive/ACS/dga-mixed-embeddings-ensemble/Code/vec")

# Input and Output directories for logs
dir_logs_compressed = os.path.join(dir_preproc, "logs_compressed")
dir_logs = os.path.join(dir_preproc, "logs")

# Output directories for DNS records
dir_dns = os.path.join(dir_preproc, "dns")

# Output files for uni-grams, di-grams and tri-grams
file_unigrams = os.path.join(dir_preproc, "grams/unigrams.log")
file_digrams = os.path.join(dir_preproc, "grams/digrams.log")
file_trigrams = os.path.join(dir_preproc, "grams/trigrams.log")

# Output binary files for FastText models
model_ft_uni = os.path.join(dir_preproc, "ft_models/unigrams_ft_model.bin")
model_ft_di = os.path.join(dir_preproc, "ft_models/digrams_ft_model.bin")
model_ft_tri = os.path.join(dir_preproc, "ft_models/trigrams_ft_model.bin")

# Output vectors files for FastText models ("DGA-Mixed-Embeddings-Ensemble" git repository folder)
vec_ft_uni = os.path.join(dir_vec, "char.vec")
vec_ft_di = os.path.join(dir_vec, "bigram.vec")
vec_ft_tri = os.path.join(dir_vec, "trigram.vec")

Mounted at gdrive


# Decompress log files

In [None]:
def decompress_xz_files(input_folder, output_folder):
  """Decompress every ``.xz`` file of ``input_folder`` into ``output_folder``.

  Each archive is written under its own name minus the ``.xz`` suffix. One
  line is printed per extracted file, followed by the total count. When
  ``input_folder`` is not a directory a message is printed and nothing else
  happens.

  Args:
    input_folder: Directory scanned (non-recursively) for ``*.xz`` files.
    output_folder: Directory receiving the decompressed files; created
      (including parents) when missing.

  Returns:
    None.
  """
  # Check if the input folder exists
  if not os.path.isdir(input_folder):
    print("The specified input folder does not exist.")
    return

  # exist_ok replaces the separate exists() check and its check-then-create gap
  os.makedirs(output_folder, exist_ok=True)

  # Sorted so archives are processed and reported in a stable order
  xz_files = sorted(name for name in os.listdir(input_folder) if name.endswith(".xz"))

  # Extract the contents of each .xz file
  log_extracted = 0
  for xz_file in xz_files:
    xz_file_path = os.path.join(input_folder, xz_file)
    extraction_file_path = os.path.join(output_folder, os.path.splitext(xz_file)[0])  # Remove the .xz extension

    # Stream chunk by chunk instead of read(): the decompressed log is never
    # held in memory as a whole
    with lzma.open(xz_file_path, "rb") as compressed_file, open(extraction_file_path, "wb") as output_file:
      shutil.copyfileobj(compressed_file, output_file)

    log_extracted += 1  # Increment the log file count
    print(f"Content decompressed from {xz_file} successfully. Saved at {extraction_file_path}")

  print("Logs extracted:", log_extracted)

In [None]:
# Decompress the .xz archives found in dir_logs_compressed into dir_logs
decompress_xz_files(dir_logs_compressed, dir_logs)

Content decompressed from pdns_2023_01_17_00.log.xz successfully. Saved at gdrive/MyDrive/ACS/dga-dns-preproc/logs/pdns_2023_01_17_00.log
Content decompressed from pdns_2023_01_17_01.log.xz successfully. Saved at gdrive/MyDrive/ACS/dga-dns-preproc/logs/pdns_2023_01_17_01.log
Content decompressed from pdns_2023_01_17_02.log.xz successfully. Saved at gdrive/MyDrive/ACS/dga-dns-preproc/logs/pdns_2023_01_17_02.log
Content decompressed from pdns_2023_01_17_03.log.xz successfully. Saved at gdrive/MyDrive/ACS/dga-dns-preproc/logs/pdns_2023_01_17_03.log
Content decompressed from pdns_2023_01_17_04.log.xz successfully. Saved at gdrive/MyDrive/ACS/dga-dns-preproc/logs/pdns_2023_01_17_04.log
Content decompressed from pdns_2023_01_17_05.log.xz successfully. Saved at gdrive/MyDrive/ACS/dga-dns-preproc/logs/pdns_2023_01_17_05.log
Content decompressed from pdns_2023_01_17_06.log.xz successfully. Saved at gdrive/MyDrive/ACS/dga-dns-preproc/logs/pdns_2023_01_17_06.log
Content decompressed from pdns_202

# Extract DNS from logs

In [None]:
def extract_dns(dir_input, dir_output, sample_frac=0.2, random_state=None):
  """Extract a de-duplicated random sample of DNS names from every log file.

  Each regular file of ``dir_input`` is read line by line. A line is kept
  when it splits into exactly 10 ``;``-separated fields and its sixth field
  (index 5, whitespace-stripped) passes ``is_log_clean``. The kept names are
  de-duplicated, sampled, stripped of their last character and written one
  per line to ``<dir_output>/<input name without extension>.log``.

  Compared with the previous version:
    * lines with exactly 5 fields no longer raise IndexError (the guard was
      ``< 5`` while index 5 was read);
    * names come straight from the split fields instead of re-parsing the
      lines with ``read_csv``, so one malformed line cannot drop a whole file;
    * the output path is computed before any work, and the success message
      is printed only when the file really was written;
    * names are written directly rather than through ``to_csv(sep='\\n')``,
      and names that are empty after trimming are dropped instead of being
      written as ``""``.

  Args:
    dir_input: Directory holding the log files.
    dir_output: Directory receiving the DNS lists; created when missing.
    sample_frac: Fraction of the unique names kept per file (default 0.2).
    random_state: Optional seed forwarded to ``Series.sample`` to make the
      sampling reproducible (default None: not reproducible).

  Returns:
    None. Progress and the total number of written names are printed.
  """
  os.makedirs(dir_output, exist_ok=True)

  # Counter for the number of DNS extracted
  dns_extracted = 0

  # Sorted so files are processed and reported in a stable order
  for filename in sorted(os.listdir(dir_input)):
    file_path = os.path.join(dir_input, filename)

    # Skip directories and non-file items in the input directory
    if not os.path.isfile(file_path):
      continue

    # Known up front, so the messages below never use an unbound or stale path
    output_file_path = os.path.join(dir_output, os.path.splitext(filename)[0] + ".log")

    try:
      names = []
      with open(file_path, "r") as file:
        for line in file:
          elem = line.split(";")
          # Only complete records are usable; this also guards elem[5]
          if len(elem) != 10:
            continue
          # Extract the DNS and remove whitespace
          dns = elem[5].strip()
          if is_log_clean(dns):
            names.append(dns)

      if not names:
        print('Empty log file:', filename)
        continue

      sampled = (
        pd.Series(names, dtype=object)
        .drop_duplicates()
        .sample(frac=sample_frac, random_state=random_state)
      )
      # The last character is dropped unconditionally, as before —
      # presumably the trailing root dot of a FQDN; TODO confirm against the logs
      trimmed = sampled.str[:-1]
      trimmed = trimmed[trimmed != ""]

      # Write the extracted DNS to the output file, one name per line
      with open(output_file_path, "w") as output_file:
        output_file.writelines(name + "\n" for name in trimmed)

      dns_extracted += len(trimmed)  # Increment the DNS count
    except Exception as e:
      # Best effort: report the failing file and carry on with the next one
      print('Error while processing file:', filename)
      print(e)
      continue

    print(f"DNS extracted from {filename} successfully. Saved at {output_file_path}")

  print("DNS extracted:", dns_extracted)

def is_log_clean(dns):
  """Tell whether a DNS name should be kept.

  A name is rejected when any of the following holds:
    * it is longer than 100 characters;
    * one of its dot-separated labels is ``arpa`` (any letter case);
    * it has exactly four dot-separated fields that are all digits
      (dotted-quad form);
    * it has exactly three fields, the first empty and the other two all
      digits (e.g. ``.12.34``).

  Args:
    dns: The DNS name as a string.

  Returns:
    True when the name passes every check, False otherwise.
  """
  fields = dns.split(".")

  too_long = len(dns) > 100
  # Match 'arpa' as a whole label: the former substring test also discarded
  # ordinary names that merely contain those letters, e.g. "carparts.com"
  has_arpa_label = any(label.lower() == 'arpa' for label in fields)
  all_digit_quad = len(fields) == 4 and all(field.isdigit() for field in fields)
  leading_dot_digits = (
    len(fields) == 3
    and fields[0] == ''
    and all(field.isdigit() for field in fields[1:])
  )

  return not (too_long or has_arpa_label or all_digit_quad or leading_dot_digits)

In [None]:
# Extract the DNS names from the logs in dir_logs and write them as per-file lists under dir_dns
extract_dns(dir_logs, dir_dns)

DNS extracted from pdns_2023_01_17_00.log successfully. Saved at gdrive/MyDrive/ACS/dga-dns-preproc/dns/pdns_2023_01_17_00.log
DNS extracted from pdns_2023_01_17_01.log successfully. Saved at gdrive/MyDrive/ACS/dga-dns-preproc/dns/pdns_2023_01_17_01.log
DNS extracted from pdns_2023_01_17_02.log successfully. Saved at gdrive/MyDrive/ACS/dga-dns-preproc/dns/pdns_2023_01_17_02.log
DNS extracted from pdns_2023_01_17_03.log successfully. Saved at gdrive/MyDrive/ACS/dga-dns-preproc/dns/pdns_2023_01_17_03.log
DNS extracted from pdns_2023_01_17_04.log successfully. Saved at gdrive/MyDrive/ACS/dga-dns-preproc/dns/pdns_2023_01_17_04.log
DNS extracted from pdns_2023_01_17_05.log successfully. Saved at gdrive/MyDrive/ACS/dga-dns-preproc/dns/pdns_2023_01_17_05.log
DNS extracted from pdns_2023_01_17_06.log successfully. Saved at gdrive/MyDrive/ACS/dga-dns-preproc/dns/pdns_2023_01_17_06.log
DNS extracted from pdns_2023_01_17_07.log successfully. Saved at gdrive/MyDrive/ACS/dga-dns-preproc/dns/pdns_20

# DNS n-gram tokenizer

In [None]:
# Function to create n-grams
def create_ngrams(line, n):
    """Return the character n-grams of ``line`` joined by single spaces.

    Dots are removed first, then leading and trailing whitespace is stripped.
    The n-grams are the overlapping windows of ``n`` characters of the
    resulting text; a text shorter than ``n`` yields an empty string.
    """
    text = line.replace('.', '').strip()

    grams = []
    for start in range(len(text) - n + 1):
        grams.append(text[start:start + n])

    return ' '.join(grams)

def gramsplit_dns(dir_input, unigram_out, digram_out, trigram_out, dns_percentage, seed=None):
    """Write uni-, di- and tri-gram corpora from a random subset of DNS lists.

    A random ``dns_percentage`` share of the ``.log`` files of ``dir_input``
    is selected. Every line of the selected files is tokenized with
    ``create_ngrams`` for n = 1, 2 and 3 and appended, one line per input
    line, to the three output files (which are overwritten).

    Compared with the previous version, the percentage is applied to the
    ``.log`` files only (it used to be applied to every directory entry
    before the ``.log`` filter, so fewer files than requested could end up
    processed), and missing output folders are created.

    Args:
        dir_input: Directory holding the DNS lists (``*.log``).
        unigram_out: Path of the uni-gram output file.
        digram_out: Path of the di-gram output file.
        trigram_out: Path of the tri-gram output file.
        dns_percentage: Fraction of the ``.log`` files to process; the file
            count is truncated to an integer.
        seed: Optional seed making the file selection reproducible
            (default None: a different selection on each run).

    Returns:
        None. Progress is printed.
    """
    # Candidate files, sorted so that a given seed always selects the same ones
    file_list = sorted(name for name in os.listdir(dir_input) if name.endswith('.log'))

    # Shuffle with a private generator so the global random state is left alone
    random.Random(seed).shuffle(file_list)

    # Calculate the number of files to process based on the specified percentage
    num_files = int(len(file_list) * dns_percentage)

    # open(..., 'w') does not create missing folders
    for out_path in (unigram_out, digram_out, trigram_out):
        parent = os.path.dirname(out_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    # Loop through the selected files and create unigrams, digrams, and trigrams for each line
    with open(unigram_out, 'w') as f_out_unigrams, open(digram_out, 'w') as f_out_digrams, open(trigram_out, 'w') as f_out_trigrams:
        for filename in file_list[:num_files]:
            with open(os.path.join(dir_input, filename)) as f_in:
                for line in f_in:
                    name = line.strip()
                    f_out_unigrams.write(create_ngrams(name, 1) + '\n')
                    f_out_digrams.write(create_ngrams(name, 2) + '\n')
                    f_out_trigrams.write(create_ngrams(name, 3) + '\n')

            print("DNSs splitted:", filename)

    print("Grams splitting completed.")

In [None]:
# Tokenize the DNS lists of dir_dns into the three n-gram corpora; 0.25 is the
# dns_percentage argument, i.e. the fraction of the files that gets processed
gramsplit_dns(dir_dns, file_unigrams, file_digrams, file_trigrams, 0.25)

DNSs splitted: pdns_2023_05_05_20.log
DNSs splitted: pdns_2023_01_16_03.log
DNSs splitted: pdns_2023_02_12_10.log
DNSs splitted: pdns_2023_02_19_05.log
DNSs splitted: pdns_2023_03_13_11.log
DNSs splitted: pdns_2023_04_24_19.log
DNSs splitted: pdns_2023_03_22_00.log
DNSs splitted: pdns_2023_01_16_00.log
DNSs splitted: pdns_2023_01_17_14.log
DNSs splitted: pdns_2023_03_13_16.log
DNSs splitted: pdns_2023_04_24_05.log
DNSs splitted: pdns_2023_01_17_20.log
DNSs splitted: pdns_2023_01_29_13.log
DNSs splitted: pdns_2023_03_22_23.log
DNSs splitted: pdns_2023_01_29_07.log
DNSs splitted: pdns_2023_01_29_15.log
DNSs splitted: pdns_2023_02_19_06.log
DNSs splitted: pdns_2023_04_24_10.log
DNSs splitted: pdns_2023_03_25_23.log
DNSs splitted: pdns_2023_03_13_08.log
DNSs splitted: pdns_2023_04_12_02.log
DNSs splitted: pdns_2023_04_12_16.log
DNSs splitted: pdns_2023_01_17_12.log
DNSs splitted: pdns_2023_02_19_18.log
DNSs splitted: pdns_2023_03_25_16.log
DNSs splitted: pdns_2023_05_05_14.log
DNSs splitte

# FastText Training

In [None]:
# Save the models in VEC format (required by the classification network architecture)
# a dedicated function is needed because the attributes "WordVectorModel"
# and "save_word2vec_format" are no longer present in the fasttext module
def save_vec(model, file_path):
    """Write the word vectors of ``model`` to ``file_path`` in textual VEC format.

    The first line holds ``<word count> <dimension>``; every following line
    holds a word followed by its vector components, space separated. The
    components are formatted as float32 values.

    Args:
        model: Object exposing ``get_words()``, ``get_dimension()`` and
            ``get_word_vector(word)``.
        file_path: Destination file; its parent folder is created when missing.

    Returns:
        None. A confirmation line is printed.
    """
    # Get the words and their count and dimension
    words = model.get_words()
    word_count = len(words)
    dim = model.get_dimension()

    # open(..., "w") does not create missing folders
    parent = os.path.dirname(file_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(file_path, "w", encoding="utf-8") as f_out:
        # Write the header with the word count and dimension
        f_out.write(f"{word_count} {dim}\n")

        # Stream one word at a time instead of buffering a full
        # (word_count, dim) matrix first; float32 keeps the number formatting
        for word in words:
            vector = np.asarray(model.get_word_vector(word), dtype=np.float32)
            vector_str = " ".join(str(x) for x in vector)
            f_out.write(f"{word} {vector_str}\n")

    print(f"Word vectors saved to {file_path}")

In [None]:
# Train FastText unsupervised model on uni-grams and save it in binary and VEC formats
# The model folder is created before training: save_model does not appear to
# create missing folders, and failing only after training would waste the run
os.makedirs(os.path.dirname(model_ft_uni), exist_ok=True)
uni_model = fasttext.train_unsupervised(input=file_unigrams, dim=128, model='skipgram')
uni_model.save_model(model_ft_uni)
save_vec(uni_model, vec_ft_uni)

Word vectors saved to /content/gdrive/MyDrive/ACS/dga-mixed-embeddings-ensemble/Code/vec/char.vec


In [None]:
# Train FastText unsupervised model on di-grams and save it in binary and VEC formats
# The model folder is created before training: save_model does not appear to
# create missing folders, and failing only after training would waste the run
os.makedirs(os.path.dirname(model_ft_di), exist_ok=True)
di_model = fasttext.train_unsupervised(input=file_digrams, dim=128, model='skipgram')
di_model.save_model(model_ft_di)
save_vec(di_model, vec_ft_di)

Word vectors saved to /content/gdrive/MyDrive/ACS/dga-mixed-embeddings-ensemble/Code/vec/bigram.vec


In [None]:
# Train FastText unsupervised model on tri-grams and save it in binary and VEC formats
# The model folder is created before training: save_model does not appear to
# create missing folders, and failing only after training would waste the run
os.makedirs(os.path.dirname(model_ft_tri), exist_ok=True)
tri_model = fasttext.train_unsupervised(input=file_trigrams, dim=128, model='skipgram')
tri_model.save_model(model_ft_tri)
save_vec(tri_model, vec_ft_tri)

Word vectors saved to /content/gdrive/MyDrive/ACS/dga-mixed-embeddings-ensemble/Code/vec/trigram.vec
