In [1]:
#Credit to https://github.com/BYUCamachoLab/ottoeplitz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import ottoeplitz
from scipy.stats import chi2, binom


""" 
Toeplitz Hashing Example
========================

In this example, we read raw random bitstrings from file, hash them with a
Toeplitz matrix and pass the result through a Von Neumann extractor. We plot
the data before and after processing. The data after hashing should be uniform.

"""





In [2]:
def int_to_binary_string(n, length):
    """Render ``n`` in base 2, left-padded with zeros to ``length`` characters.

    ``n`` is coerced with ``int()`` first, so NumPy integer scalars are accepted
    and floats are truncated. A value that needs more than ``length`` digits is
    not cut down; the returned string is simply longer than ``length``.
    """
    return format(int(n), "0{}b".format(length))
    
def data_to_bitstring(data, N):
    """Concatenate the ``N``-digit binary encoding of every value in ``data``.

    Each value is encoded with ``int_to_binary_string(value, N)`` and the pieces
    are joined in iteration order with no separator.
    """
    return ''.join(int_to_binary_string(value, N) for value in data)

def bitstring_to_int_array(bitstring, N):
    """Split ``bitstring`` into consecutive ``N``-character chunks and parse each as base 2.

    Returns a NumPy array with one integer per chunk. If ``len(bitstring)`` is
    not a multiple of ``N`` the final, shorter chunk is still parsed and
    included as its own value.
    """
    values = []
    for start in range(0, len(bitstring), N):
        chunk = bitstring[start:start + N]
        values.append(int(chunk, 2))
    return np.array(values)

def read_full_file_as_str(filename):
    """Read the whole text file ``filename`` and return it without surrounding whitespace.

    Only leading and trailing whitespace is stripped; whitespace in the middle
    of the file (including newlines) is returned unchanged.
    """
    with open(filename, 'r') as handle:
        contents = handle.read()
    return contents.strip()

# Function to calculate Shannon entropy
def shannon_entropy(bitstring):
    """Return the Shannon entropy, in bits per symbol, of a string of digits.

    Every character of ``bitstring`` is parsed with ``int()`` and the entropy
    ``-sum(p * log2(p))`` is computed over the relative frequency ``p`` of each
    distinct digit that occurs. For a string of '0'/'1' characters the result
    lies between 0 (all symbols equal) and 1 (perfectly balanced).

    Parameters
    ----------
    bitstring : str
        Sequence of single-digit characters, normally only '0' and '1'.

    Returns
    -------
    float
        The entropy estimate; ``0.0`` for an empty input.

    Raises
    ------
    ValueError
        If a character cannot be parsed by ``int()``.
    """
    # An empty input has no symbols to measure. Returning early also avoids
    # handing np.bincount the float64 array an empty list would turn into
    # (which it rejects with a TypeError) and dividing by a zero length.
    if len(bitstring) == 0:
        return 0.0

    # Build the array with an explicit integer dtype, as np.bincount requires.
    bits = np.fromiter((int(bit) for bit in bitstring), dtype=np.int64,
                       count=len(bitstring))

    # Relative frequency of each digit value.
    counts = np.bincount(bits)
    probabilities = counts / len(bits)

    # Drop digits that never occur so log2(0) is never evaluated.
    probabilities = probabilities[probabilities > 0]

    entropy = -np.sum(probabilities * np.log2(probabilities))

    return entropy

def chi_squared_test(bits):
    """Chi-squared goodness-of-fit test of a bit sequence against a 50/50 split.

    Parameters
    ----------
    bits : str or sequence
        The bits to test, either a string of '0'/'1' characters or a sized
        sequence of values that ``int()`` maps to 0 or 1.

    Returns
    -------
    (chi_sq, p_value)
        The test statistic and its p-value with one degree of freedom
        (two categories minus one).
    """
    total = len(bits)

    # Convert explicitly to integers. Passing a list of '0'/'1' characters
    # straight to np.bincount relies on an implicit string-to-int cast that
    # NumPy has deprecated for non-integer input.
    symbols = np.fromiter((int(bit) for bit in bits), dtype=np.int64, count=total)

    observed = np.bincount(symbols, minlength=2)
    expected = np.array([total / 2, total / 2])
    chi_sq = np.sum((observed - expected) ** 2 / expected)

    # The survival function is 1 - cdf computed without the subtraction, so
    # very small p-values are not rounded to exactly 0.
    p_value = chi2.sf(chi_sq, df=1)
    return chi_sq, p_value

def plot_data(data, n):
    """Histogram ``data`` into ``2**n`` bins and show a plot of the digitized values.

    ``data`` is first passed through ``ottoeplitz.Toeplitz._calculate_N`` and
    replaced by the second element of the pair it returns (the first element is
    not used here). The values are then mapped to bin indices with
    ``np.digitize`` and those indices are drawn as a histogram.

    Returns
    -------
    (binned_data, data_digital)
        The per-bin counts from ``np.histogram`` and the array of bin indices
        from ``np.digitize``.
    """
    _, data = ottoeplitz.Toeplitz._calculate_N(data)

    n_bins = 2 ** n
    binned_data, edges = np.histogram(data, bins=n_bins)
    data_digital = np.digitize(data, edges, right=True)

    fig, ax = plt.subplots()
    ax.hist(data_digital, bins=n_bins, label='Digitized Raw Data')
    ax.set_xlabel('Random numbers')
    ax.set_ylabel('Frequency')
    ax.set_title("Plotting Data Before and After Hashing")
    plt.show()
    return binned_data, data_digital
    
def tuple_diff(a, b):
    """Return the element-wise difference ``a - b`` as a tuple.

    One entry is produced for each position of ``a``; ``b`` is indexed at the
    same positions, so it must be at least as long as ``a``.
    """
    return tuple(a[idx] - b[idx] for idx in range(len(a)))

def entropize(full_bitstring, inputdata, N, verbose=True):
    """Measure how Toeplitz hashing plus Von Neumann extraction changes the bit statistics.

    The Shannon entropy and chi-squared result are computed for
    ``full_bitstring``; ``inputdata`` is then hashed with
    ``ottoeplitz.Toeplitz(inputdata, N)``, passed through
    ``von_neumann_int_array`` and re-encoded as an ``N``-bit-per-value
    bitstring, for which the same two statistics are computed.

    Parameters
    ----------
    full_bitstring : str
        The raw bits, used for the "before" statistics.
    inputdata : array-like
        The integer samples handed to ``ottoeplitz.Toeplitz``.
    N : int
        Bits per sample, used for hashing, re-encoding and plotting.
    verbose : bool
        When true, plot the samples and print both statistics for each stage.

    Returns
    -------
    (entropy_change, chi_change)
        ``after - before`` for the entropy, and the element-wise
        ``after - before`` of the ``(chi_sq, p_value)`` tuples.
    """
    def summarize(bitstring, samples):
        # Compute both statistics for one stage and optionally report them.
        entropy = shannon_entropy(bitstring)
        chi = chi_squared_test(bitstring)
        if verbose:
            plot_data(samples, N)
            print("Entropy: ", entropy)
            print("Chi-squared", chi)
        return entropy, chi

    entropy_before, chi_before = summarize(full_bitstring, inputdata)

    hashed = ottoeplitz.Toeplitz(inputdata, N).hash()
    extracted = von_neumann_int_array(hashed, N)
    extracted_bits = data_to_bitstring(extracted, N)

    entropy_after, chi_after = summarize(extracted_bits, extracted)
    return entropy_after - entropy_before, tuple_diff(chi_after, chi_before)

def int_index_map(lst):
    """Group the positions of ``lst`` by value.

    Returns
    -------
    (indexes, counts)
        ``indexes`` maps each distinct value to the list of positions where it
        occurs (in ascending order); ``counts`` maps each distinct value to how
        many times it occurs. Both dicts are keyed in order of first appearance.
    """
    positions = {}
    for pos, item in enumerate(lst):
        positions.setdefault(item, []).append(pos)
    tallies = {key: len(where) for key, where in positions.items()}
    return positions, tallies


def toefflize_with_extra(full_bitstring, inputdata, N):
    """Hash ``inputdata``, overwrite some rare values with common ones, then extract and report.

    Plots and prints the entropy and chi-squared result for the raw input,
    hashes ``inputdata`` with ``ottoeplitz.Toeplitz``, and then, for the rarest
    hashed values, replaces a random sample of their occurrences with one of
    the most common values. The modified data is passed through
    ``von_neumann_int_array``, plotted, and its statistics are printed.

    Parameters
    ----------
    full_bitstring : str
        The raw bits, used for the "before" statistics.
    inputdata : array-like
        The integer samples handed to ``ottoeplitz.Toeplitz``.
    N : int
        Bits per sample.

    Returns
    -------
    None
    """
    t = ottoeplitz.Toeplitz(inputdata, N)
    plot_data(inputdata, N)
    print("Entropy: ", shannon_entropy(full_bitstring))
    print("Chi-squared", chi_squared_test(full_bitstring))

    processed_data = t.hash()
    indexes, counts = int_index_map(processed_data)
    # Ascending by occurrence count: rarest value first, most common last.
    counts = sorted(counts.items(), key=lambda item: item[1])
    # NOTE(review): 200 is used both as the fraction of distinct values to
    # process and as the fraction of occurrences to overwrite; the reason for
    # this particular number is not visible here.
    for i in range(len(counts) // 200):
        swap_dest, dest_count = counts[i]
        num_to_swap = dest_count // 200
        # Pair the i-th rarest value with the i-th most common one. The index
        # has to be -1 - i: plain -i is 0 on the first pass, which would pair
        # the rarest value with itself and skip the most common value.
        swap_source = counts[-1 - i][0]
        for index in random.sample(indexes[swap_dest], num_to_swap):
            processed_data[index] = swap_source
    processed_data = von_neumann_int_array(processed_data, N)
    plot_data(processed_data, N)
    processed_bitstring = data_to_bitstring(processed_data, N)
    print("Entropy: ", shannon_entropy(processed_bitstring))
    print("Chi-squared", chi_squared_test(processed_bitstring))

def von_neumann_extractor(bitstring):
    """Apply a Von Neumann extractor to a string of '0'/'1' characters.

    The input is read in non-overlapping pairs. The pair '01' emits '0', the
    pair '10' emits '1', and every other pair emits nothing. A trailing
    unpaired character is ignored.

    Returns
    -------
    str
        The concatenation of the emitted bits.
    """
    outcome = {'01': '0', '10': '1'}
    pairs = (bitstring[k:k + 2] for k in range(0, len(bitstring) - 1, 2))
    return ''.join(outcome[pair] for pair in pairs if pair in outcome)

def von_neumann_int_array(int_array, N):
    """Run the Von Neumann extractor over an array of ``N``-bit integers.

    ``int_array`` is flattened and encoded as one long bitstring with ``N``
    digits per value, filtered by ``von_neumann_extractor``, and the surviving
    bits are regrouped into integers with ``bitstring_to_int_array``.
    """
    raw_bits = data_to_bitstring(int_array.flatten(), N)
    return bitstring_to_int_array(von_neumann_extractor(raw_bits), N)
    

In [4]:
# Width, in bits, of each sample cut from the raw bitstring.
N = 10
# Run the before/after comparison on each input file in turn.
# NOTE(review): the saved output under this cell lists each file three times
# and contains none of the verbose "Entropy"/"Chi-squared" prints, so it looks
# like it came from a different version of this cell - re-run to refresh it.
for file in ["RNG1-2.txt", "RNG2-2.txt"]:
    print(f"Reading data from {file}")
    full_bitstring = read_full_file_as_str(file)
    # bitstring_to_int_array already returns an ndarray; np.array() only copies it.
    dataset = np.array(bitstring_to_int_array(full_bitstring, N))
    # Alternative experiment, currently disabled:
    #toefflize_with_extra(full_bitstring, dataset, N)
    # Returns (entropy change, (chi-squared change, p-value change)).
    ediff, chidiff = entropize(full_bitstring, dataset, N, True)
    print("Ediff, Cdiff", ediff, chidiff)


Reading data from RNG1-2.txt
Ediff, Cdiff 4.0318520758253484e-05 (-70.17708182654403, 0.4048111587534625)
Reading data from RNG1-2.txt
Ediff, Cdiff 4.0849607301884916e-05 (-70.55305074277854, 0.5728004372972081)
Reading data from RNG1-2.txt
Ediff, Cdiff 4.195577522070337e-05 (-70.72523666666667, 0.702549405575949)
Reading data from RNG2-2.txt
Ediff, Cdiff -1.680797671488321e-06 (0.5054198415442236, -0.43196359128272765)
Reading data from RNG2-2.txt
Ediff, Cdiff -1.4141192948624592e-05 (2.6424515924805, -0.7995737479025173)
Reading data from RNG2-2.txt
Ediff, Cdiff -1.8015561218387433e-06 (0.4127933206590621, -0.389550085590772)
