In [1]:
import pandas as pd
import numpy as np
import time, math, os, glob, wget, shutil, joblib, warnings, sys
from collections import Counter
# Lower bound applied to histogram probabilities (see calculate_histogram) so
# that a zero-count bin never reaches log2 as an exact 0.
eps = 1e-10

In [2]:
# Paths of every entry directly under files_dataset/original/ (relative to the
# working directory). glob does not sort its results.
og_file_list = glob.glob('files_dataset/original/*')

In [14]:
# Paths of every entry directly under files_dataset/encrypted/ (relative to the
# working directory). glob does not sort its results.
enc_file_list = glob.glob('files_dataset/encrypted/*')

In [4]:
def read_bytearray(filepath):
    """Load a file's raw bytes into a NumPy array of integers.

    Parameters
    ----------
    filepath : str or path-like
        File to open in binary mode.

    Returns
    -------
    numpy.ndarray
        One element per byte of the file, each an integer in 0..255.
    """
    with open(filepath, mode='rb') as handle:
        raw = handle.read()
    # Iterating a bytes object yields plain ints, one per byte.
    byte_values = list(raw)
    return np.array(byte_values)

In [5]:
def array_split_given_size(a, size):
    """Cut ``a`` into consecutive chunks of ``size`` elements.

    The final chunk holds whatever is left over and may be shorter than
    ``size``. An empty ``a`` yields a list containing one empty chunk.

    Parameters
    ----------
    a : array-like
        One-dimensional data to partition.
    size : int
        Number of elements per chunk.

    Returns
    -------
    list of numpy.ndarray
        The chunks, in order.
    """
    # Boundaries at size, 2*size, ... strictly below len(a).
    cut_points = np.arange(size, len(a), size)
    return np.split(a, cut_points)

In [6]:
def calculate_histogram(block_data):
    """Build a 256-bin byte-value histogram for one block of data.

    Parameters
    ----------
    block_data : iterable of int
        Byte values of the block. Only values in 0..255 are counted; anything
        else is ignored.

    Returns
    -------
    pandas.DataFrame
        256 rows with columns ``bin`` (0..255), ``count`` (occurrences of that
        value) and ``prob`` (``count`` divided by the total count, floored at
        the module-level ``eps`` so no probability is exactly zero).
        For an empty block every ``prob`` equals ``eps``.
    """
    counter = Counter(block_data)
    bins = np.arange(0, 256)
    counts = [counter[i] for i in bins]
    df = pd.DataFrame({
        'bin': bins,
        'count': counts,
    })
    total = np.sum(counts)
    if total > 0:
        df['prob'] = df['count'] / total
    else:
        # Empty block: avoid the 0/0 division that would fill the column with NaN.
        df['prob'] = 0.0
    # Floor at eps in one vectorised step so downstream log2 never sees 0.
    df['prob'] = df['prob'].clip(lower=eps)
    return df

In [7]:
def calculate_shannon_entropy(probs):
    """Compute the Shannon entropy, in bits, of a discrete distribution.

    Parameters
    ----------
    probs : array-like of float
        Probability of each outcome (list, NumPy array or pandas Series).

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` taken over the non-zero entries of ``probs``.
    """
    p = np.asarray(probs, dtype=float)
    # By convention a zero-probability outcome contributes nothing; dropping
    # exact zeros avoids the NaN that 0 * log2(0) = 0 * -inf would produce.
    p = p[p != 0]
    return -np.sum(p * np.log2(p))

In [8]:
def file_metadata(filepath, block_size=4096):
    """Return a file's size and the mean Shannon entropy of its blocks.

    The file is read in full, cut into consecutive blocks of ``block_size``
    bytes (the last block may be shorter), and the entropy of each block's
    byte-value distribution is averaged with equal weight per block.

    Parameters
    ----------
    filepath : str or path-like
        File to analyse.
    block_size : int, optional
        Number of bytes per block. Defaults to 4096.

    Returns
    -------
    tuple
        ``(filesize, mean_entropy)`` where ``filesize`` is the size in bytes
        and ``mean_entropy`` is in bits per byte. A zero-byte file yields
        ``(0, 0.0)``.

    Raises
    ------
    ValueError
        If ``block_size`` is not positive.
    """
    if block_size <= 0:
        raise ValueError("block_size must be a positive integer")
    filesize = os.path.getsize(filepath)
    num_blocks = math.ceil(filesize / block_size)
    byte_array = read_bytearray(filepath)
    assert len(byte_array) == filesize
    if filesize == 0:
        # np.split would hand back one empty block here, which contradicts
        # num_blocks == 0 and has no defined byte distribution.
        return filesize, 0.0
    block_datas = array_split_given_size(byte_array, block_size)
    assert len(block_datas) == num_blocks
    dfs = [calculate_histogram(block_data) for block_data in block_datas]
    entropies = [calculate_shannon_entropy(df['prob'].to_numpy()) for df in dfs]
    return filesize, np.mean(entropies)

In [9]:
# Empty table that will hold one row of metadata per original file.
file_df = pd.DataFrame(
    columns=[
        'file_name',
        'file_extension',
        'file_size',
        'file_entropy',
    ],
)

In [15]:
# Empty table that will hold one row of metadata per encrypted file.
enc_file_df = pd.DataFrame(
    columns=[
        'file_name',
        'file_extension',
        'file_size',
        'file_entropy',
    ],
)

In [11]:
# Collect one record per original file, then build the table in a single step.
# Rebuilding from a fresh list keeps this cell idempotent (re-running it does
# not append duplicate rows) and avoids the quadratic cost of growing a
# DataFrame with pd.concat inside the loop.
file_records = []
for filepath in og_file_list:
    # basename/splitext work regardless of directory depth or path separator,
    # and yield an empty extension instead of raising when a name has no dot.
    file_name = os.path.basename(filepath)
    file_ext = os.path.splitext(file_name)[1].lstrip(".")
    file_size, file_entropy = file_metadata(filepath)
    file_records.append({
        'file_name': file_name,
        'file_extension': file_ext,
        'file_size': file_size,
        'file_entropy': file_entropy,
    })
file_df = pd.DataFrame(
    file_records,
    columns=['file_name','file_extension','file_size','file_entropy'],
)

In [16]:
# Collect one record per encrypted file, then build the table in a single step.
# Rebuilding from a fresh list keeps this cell idempotent (re-running it does
# not append duplicate rows) and avoids the quadratic cost of growing a
# DataFrame with pd.concat inside the loop.
enc_file_records = []
for filepath in enc_file_list:
    # basename/splitext work regardless of directory depth or path separator.
    # The extension is the final suffix (e.g. "wcry" for "059.html.wcry"),
    # and is empty instead of raising when a name has no dot.
    file_name = os.path.basename(filepath)
    file_ext = os.path.splitext(file_name)[1].lstrip(".")
    file_size, file_entropy = file_metadata(filepath)
    enc_file_records.append({
        'file_name': file_name,
        'file_extension': file_ext,
        'file_size': file_size,
        'file_entropy': file_entropy,
    })
enc_file_df = pd.DataFrame(
    enc_file_records,
    columns=['file_name','file_extension','file_size','file_entropy'],
)

In [17]:
# Preview the first rows of the original-file metadata table.
file_df.head()

Unnamed: 0,file_name,file_extension,file_size,file_entropy
0,069.html,html,3002,4.600665
1,074.txt,txt,3898,4.698261
2,016.png,png,466245,7.932331
3,017.png,png,606390,7.929483
4,075.txt,txt,2845,4.48729


In [18]:
# Preview the first rows of the encrypted-file metadata table.
enc_file_df.head()

Unnamed: 0,file_name,file_extension,file_size,file_entropy
0,059.html.wcry,wcry,3104,7.946005
1,048.pdf.wcry,wcry,3392,7.944577
2,049.pdf.wcry,wcry,3248,7.95383
3,084.doc.wcry,wcry,2000,7.903173
4,000.wav.wcry,wcry,1408064,7.954648


In [19]:
# Write both metadata tables as CSV files in the working directory;
# index=False leaves the DataFrame row index out of the output.
file_df.to_csv('Original File Information.csv',index=False)
enc_file_df.to_csv('Encrypted File Information.csv',index=False)