In [1]:
from tqdm import tqdm

import numpy as np
import pandas as pd
import pandas_bokeh
import matplotlib.pyplot as plt
import seaborn as sns
import time, math, os, glob, wget, shutil, joblib, warnings, sys

from Crypto.Cipher import AES

# time.clock was removed in Python 3.8; alias it to time.time so that any code
# still calling time.clock() keeps working.
# NOTE(review): presumably needed by one of the libraries imported above -
# confirm which one, and drop this shim if it is no longer required.
time.clock = time.time
# Render pandas_bokeh plots inline in the notebook output.
pandas_bokeh.output_notebook()

from IPython.display import display, HTML
# Inject CSS that widens the notebook's ".container" element to 80% of the page.
display(HTML("<style>.container { width:80% !important; }</style>"))

from collections import Counter

%load_ext autotime



time: 114 µs (started: 2024-01-29 17:05:19 -08:00)


## File Encryption

In [53]:
def encrypt_file(filename, key, desired_ext,
                 src_dir='files_dataset/original',
                 dst_dir='files_dataset/encrypted'):
    """Encrypt one file with AES-CBC and store ``IV + ciphertext`` in ``dst_dir``.

    Parameters
    ----------
    filename : str
        Name of the file inside ``src_dir``.
    key : bytes
        AES key; handed unchanged to ``AES.new``.
    desired_ext : str
        Suffix appended to ``filename`` to form the output name (e.g. ``'.wcry'``).
    src_dir : str, optional
        Directory the plaintext file is read from.
    dst_dir : str, optional
        Directory the encrypted file is written to; created if it is missing.

    Returns
    -------
    None
        The result is the file written to ``dst_dir``.

    Notes
    -----
    The whole file is read into memory, padded with ``pad`` and encrypted in
    one call. The source file is left untouched.
    """
    # Fresh random IV per file; for CBC it is one cipher block long.
    iv = os.urandom(AES.block_size)
    cipher = AES.new(key, AES.MODE_CBC, iv)

    with open(os.path.join(src_dir, filename), 'rb') as file:
        plaintext = file.read()

    # CBC only accepts whole blocks, so pad before encrypting.
    ciphertext = cipher.encrypt(pad(plaintext))

    # Make sure the destination exists so a fresh checkout can run this cell.
    os.makedirs(dst_dir, exist_ok=True)

    # The IV is stored in front of the ciphertext so the file is self-contained.
    encrypted_filename = filename + desired_ext
    with open(os.path.join(dst_dir, encrypted_filename), 'wb') as file:
        file.write(iv + ciphertext)

time: 3.2 ms (started: 2024-01-29 18:59:17 -08:00)


In [54]:
def pad(data, block_size=None):
    """Pad ``data`` to a whole number of blocks (PKCS#7-style padding).

    Every padding byte holds the padding length. Input that is already
    block-aligned still receives one full block of padding, so the padding can
    always be identified and removed afterwards.

    Parameters
    ----------
    data : bytes
        The bytes to pad.
    block_size : int, optional
        Block length in bytes. Defaults to ``AES.block_size`` when omitted.

    Returns
    -------
    bytes
        ``data`` followed by between 1 and ``block_size`` padding bytes.

    Raises
    ------
    ValueError
        If ``block_size`` is outside 1..255 (the length must fit in one byte).
    """
    if block_size is None:
        block_size = AES.block_size
    if not 1 <= block_size <= 255:
        raise ValueError(f"block_size must be in 1..255, got {block_size}")

    # Always in 1..block_size: aligned input gets a full extra block.
    padding_len = block_size - (len(data) % block_size)

    return data + bytes([padding_len]) * padding_len

time: 1.08 ms (started: 2024-01-29 18:59:17 -08:00)


In [56]:
folder_path = 'files_dataset/original'
# NOTE(review): fixed 16-byte demo key, hard-coded so the generated dataset is
# reproducible. Do not reuse it for anything that needs real confidentiality.
key = b'Sixteen byte key'

for filename in os.listdir(folder_path):
    # Skip hidden entries (the same rule the entropy scan cells apply) and
    # anything that is not a regular file, which open() could not read.
    if filename.startswith('.'):
        continue
    if not os.path.isfile(os.path.join(folder_path, filename)):
        continue
    encrypt_file(filename, key, desired_ext='.wcry')

time: 138 ms (started: 2024-01-29 19:00:07 -08:00)


## Entropy Calculation

In [17]:
def read_bytearray(filepath):
    """Read a file and return its bytes as a 1-D integer NumPy array.

    Parameters
    ----------
    filepath : str
        Path of the file to read (opened in binary mode).

    Returns
    -------
    numpy.ndarray
        One element per byte, values 0..255, in NumPy's default integer dtype.
        An empty file yields an empty integer array.
    """
    with open(filepath, mode='rb') as file:
        byte_data = file.read()
    # frombuffer views the raw bytes directly instead of building a Python list
    # with one int object per byte. astype(int) then makes a writable copy in
    # the default integer dtype, matching what np.array(list(bytes)) produced.
    return np.frombuffer(byte_data, dtype=np.uint8).astype(int)

time: 2.77 ms (started: 2024-01-29 18:14:54 -08:00)


In [18]:
def array_split_given_size(a, size):
    """Cut ``a`` into consecutive pieces of ``size`` items each.

    The final piece holds whatever is left over and may be shorter. When ``a``
    is empty the result is a list containing one empty piece.

    Parameters
    ----------
    a : array-like
        Sequence to split along its first axis.
    size : int
        Number of items per piece.

    Returns
    -------
    list of numpy.ndarray
        The pieces, in order.
    """
    # Indices where a new piece starts: size, 2*size, ... (all below len(a)).
    cut_points = np.arange(size, len(a), size)
    pieces = np.split(a, cut_points)
    return pieces

time: 507 µs (started: 2024-01-29 18:14:55 -08:00)


In [19]:
def calculate_histogram(block_data):
    """Build a 256-bin byte-value histogram with probabilities for one block.

    Parameters
    ----------
    block_data : iterable of int
        Byte values of the block. Values outside 0..255 are not counted.

    Returns
    -------
    pandas.DataFrame
        256 rows with columns ``bin`` (0..255), ``count`` (occurrences of that
        value) and ``prob`` (``count`` divided by the total count, floored at
        1e-10 so that a later ``log2`` never sees zero). Because of the floor,
        ``prob`` can sum to very slightly more than 1.
        An empty block gives all-zero counts and ``prob`` equal to the floor.
    """
    eps = 1e-10
    counter = Counter(block_data)
    bins = np.arange(0, 256)
    counts = np.array([counter[i] for i in bins], dtype=np.int64)

    total = counts.sum()
    if total > 0:
        probs = counts / total
    else:
        # Empty block: avoid 0/0, which would turn every probability into NaN.
        probs = np.zeros(len(bins))

    return pd.DataFrame({
        'bin': bins,
        'count': counts,
        # One vectorised clamp instead of 256 separate np.max calls.
        'prob': np.maximum(probs, eps),
    })

time: 1.78 ms (started: 2024-01-29 18:14:56 -08:00)


In [20]:
def plot_histogram(df, title='Histogram', show_y='count'):
    """Draw a bar chart of one histogram column against ``bin``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame exposing the ``plot_bokeh`` accessor, with a ``bin`` column and
        the column named by ``show_y``.
    title : str, optional
        Chart title.
    show_y : str, optional
        Column plotted on the y axis; also used as the y-axis label.

    Returns
    -------
    object
        Whatever ``df.plot_bokeh.bar`` returns.
    """
    bar_options = dict(
        x='bin',
        y=show_y,
        ylabel=show_y,
        title=title,
        alpha=0.9,
        figsize=(1000, 500),
        # Sparse ticks across the byte range rather than one per bin.
        xticks=[0, 50, 100, 150, 200, 255],
    )
    return df.plot_bokeh.bar(**bar_options)

time: 1.24 ms (started: 2024-01-29 18:14:56 -08:00)


In [21]:
def calculate_shannon_entropy(probs):
    """Return the Shannon entropy, in bits, of a probability distribution.

    Computes ``-sum(p * log2(p))``. Entries equal to zero contribute nothing
    (the usual ``0 * log2(0) = 0`` convention) instead of producing NaN.

    Parameters
    ----------
    probs : array-like of float
        Probabilities. They are used as given and are not renormalised.

    Returns
    -------
    numpy.float64
        Entropy in bits; 0 for an empty or all-zero input.
    """
    p = np.asarray(probs, dtype=float)
    # Drop exact zeros only: log2(0) is -inf and 0 * -inf is NaN. Negative or
    # NaN entries are kept so that invalid input still surfaces as NaN.
    nonzero = p[p != 0]
    return -np.sum(nonzero * np.log2(nonzero))

time: 401 µs (started: 2024-01-29 18:14:58 -08:00)


In [22]:
def get_entropy(filepath, block_size=4096):
    """Return the mean per-block Shannon entropy (bits per byte) of a file.

    The file is cut into consecutive blocks of ``block_size`` bytes, the
    entropy of each block's byte histogram is computed, and the per-block
    values are averaged. The mean is unweighted, so a short final block counts
    as much as a full one.

    Parameters
    ----------
    filepath : str
        Path of the file to analyse.
    block_size : int, optional
        Block length in bytes (default 4096).

    Returns
    -------
    float
        Mean block entropy; 0.0 for an empty file.

    Raises
    ------
    ValueError
        If ``block_size`` is not positive.
    """
    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    byte_array = read_bytearray(filepath)
    filesize = len(byte_array)

    # An empty file has no blocks to average; report zero entropy rather than
    # failing the block-count check below.
    if filesize == 0:
        return 0.0

    block_datas = array_split_given_size(byte_array, block_size)
    # Sanity check: the split must give exactly ceil(filesize / block_size) blocks.
    assert len(block_datas) == math.ceil(filesize / block_size)

    entropies = [
        calculate_shannon_entropy(calculate_histogram(block_data)['prob'].to_numpy())
        for block_data in block_datas
    ]

    return np.mean(entropies)

time: 944 µs (started: 2024-01-29 18:14:59 -08:00)


In [41]:
folder_path = 'files_dataset/original'

# Collect one record per file and build the frame once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies every earlier row each time.
records = []
for filename in os.listdir(folder_path):
    filepath = os.path.join(folder_path, filename)

    # Ignore hidden entries and anything that is not a regular file.
    if filename.startswith('.') or not os.path.isfile(filepath):
        continue

    records.append({
        # Text before the first dot; the encrypted copies ("<name>.<ext>.wcry")
        # reduce to the same value under this rule.
        'filename': filename.split('.')[0],
        'filesize': os.path.getsize(filepath),
        'entropy': get_entropy(filepath),
        # Text after the last dot.
        'extension': filename.split('.')[-1],
    })

files_df = pd.DataFrame(records, columns=['filename', 'filesize', 'entropy', 'extension'])

time: 3.92 s (started: 2024-01-29 18:41:51 -08:00)


In [57]:
folder_path = 'files_dataset/encrypted'

# Gather the encrypted files' records first, then append them to files_df with
# a single concat instead of one concat per file.
encrypted_records = []
for filename in os.listdir(folder_path):
    filepath = os.path.join(folder_path, filename)

    # Ignore hidden entries and anything that is not a regular file.
    if filename.startswith('.') or not os.path.isfile(filepath):
        continue

    encrypted_records.append({
        # Text before the first dot, so "<name>.<ext>.wcry" maps back to "<name>".
        'filename': filename.split('.')[0],
        'filesize': os.path.getsize(filepath),
        'entropy': get_entropy(filepath),
        # Text after the last dot.
        'extension': filename.split('.')[-1],
    })

# NOTE: this cell appends to files_df, so running it twice duplicates the
# encrypted rows; re-run the cell that builds files_df first.
if encrypted_records:
    files_df = pd.concat(
        [files_df, pd.DataFrame(encrypted_records, columns=files_df.columns)],
        ignore_index=True,
    )

time: 3.82 s (started: 2024-01-29 19:00:46 -08:00)


In [58]:
# Display the combined table: one row per scanned file from the original and
# encrypted folders (pandas truncates the middle rows of long frames).
files_df

Unnamed: 0,filename,filesize,entropy,extension
0,069,3002,4.600665,html
1,074,3898,4.698261,txt
2,016,466245,7.932331,png
3,017,606390,7.929483,png
4,075,2845,4.487290,txt
...,...,...,...,...
195,070,3776,7.949830,wcry
196,035,1440,7.872682,wcry
197,092,79744,7.952313,wcry
198,093,155680,7.876446,wcry


time: 2.89 ms (started: 2024-01-29 19:00:50 -08:00)
