First convert the output of segmentation into a list of byte arrays instead of a list of binary numbers (this can also be done more directly obviously, but for now it works. 

In [1]:
def number_2_bytearray(num): 
    return bytearray(int(num,2).to_bytes((len(num)+7) // 8, byteorder='big'))

Let's take the example data as it is output by segmentation

In [2]:
data = ['0100100001100101011011000110110001101111001000000111011101101111011100100110110001100100001011000010000001001000011001010110110001101100011011110010000001110111011011110111001001101100011001000010110000100000010010000110010101101100011011000110111100100000',
'0111011101101111011100100110110001100100001011000010000001001000011001010110110001101100011011110010000001110111011011110111001001101100011001000010110000100000010010000110010101101100011011000110111100100000011101110110111101110010011011000110010000101100', '0010000001001000011001010110110001101100011011110010000001110111011011110111001001101100011001000010110000100000010010000110010101101100011011000110111100100000011101110110111101110010011011000110010000101100001000000100100001100101011011000110110001101111', '0010000001110111011011110111001001101100011001000010110000100000010010000110010101101100011011000110111100100000011101110110111101110010011011000110010000101100001000000100100001100101011011000110110001101111001000000111011101101111011100100110110001100100', '0010110000100000010010000110010101101100011011000110111100100000011101110110111101110010011011000110010000101100001000000100100001100101011011000110110001101111001000000111011101101111011100100110110001100100001011000010000001001000011001010110110001101100', '0110111100100000011101110110111101110010011011000110010000101100001000000100100001100101011011000110110001101111001000000111011101101111011100100110110001100100001011000010000001001000011001010110110001101100011011110010000001110111011011110111001001101100', '0110010000101100001000000100100001100101011011000110110001101111001000000111011101101111011100100110110001100100001011000010000001001000011001010110110001101100011011110010000001110111011011110111001001101100011001000010110000100000010010000110010101101100', '0110110001101111001000000111011101101111011100100110110001100100001011000010000001001000011001010110110001101100011011110010000001110111011011110111001001101100011001000010110000100000010010000110010101101100011011000110111100100000011101110110111101110010', '0110110001100100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000']

In [3]:
data_bytearray = []
for i in data:
    data_bytearray.append(number_2_bytearray(i))
data_bytearray

[bytearray(b'Hello world, Hello world, Hello '),
 bytearray(b'world, Hello world, Hello world,'),
 bytearray(b' Hello world, Hello world, Hello'),
 bytearray(b' world, Hello world, Hello world'),
 bytearray(b', Hello world, Hello world, Hell'),
 bytearray(b'o world, Hello world, Hello worl'),
 bytearray(b'd, Hello world, Hello world, Hel'),
 bytearray(b'lo world, Hello world, Hello wor'),
 bytearray(b'ld\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')]

Now we can start using this to implement the formation of seeds. For now I'm just going to be using the robust solition distribution as described in the original paper, if we want we can later always change this distribution depending on what we think necessary. Also, since the implementation of the LFSR is implemented seperately. I will assume I am given a seed and need to generate the corresponding droplet.

In [4]:
import random
import math
import numpy as np
import functools
test_seed = 454761243
# convert the test seed into the index that will be packaged into the droplet.
# so for example with seed = 7 = 00000000 00000000 00000000 00000111. we create a byte array of these bytes.
# first convert to bytearray.
seed_array = bytearray(number_2_bytearray(bin(test_seed)))
if len(seed_array) < 4:  
    seed_index = bytearray(4-len(seed_array)) + seed_array
elif len(seed_array) > 4:
    if seed_array[0] == 0:
        del seed_array[0]
        seed_index = seed_array
    else:
        print("seed too big")
# create a Pseudo random number generator
prng = random.Random()


In order to use a rng to select following a specific distribution, we first need to calculate this distribution. Since the robus soliton distribution is the ideal soliton distribution plus some additional probabilities and normalized, we first calculate the ideal solition distribution. Using the formulas p(1) = 1/K and p(i) = 1/(i*(i-1)). (code inspiration from https://franpapers.com/en/algorithmic/2018-introduction-to-fountain-codes-lt-codes-with-python/) but rewritten so I can actually understand it LOL

In [5]:
def ideal_soliton(K) -> list: 
    # initialize with the first two values, p(0) = 0 and p(1) = 1/K
    probabilities = [0,1/K]
    # calculate the rest of the values using p(i) = 1/(i*(i-1))
    probabilities += [1/(i*(i-1)) for i in range(2, K+1)]
    return probabilities 

We can now use this to define our robust solition distribution, by adding the following probabilities. For i = 1 to K/R-1: p(i) = R/(i*K), for i = K/R: p(i) = (R*ln(R/delta))/K and for i = K/R+1 to K: p(i) = 0. With R = c*ln^2(K/delta)*sqrt(K).
as you can tell this now adds to more parameters c and delta to the distribution. These need to be tuned to achieve optimal decoding capability.

In [34]:
import matplotlib.pyplot as plt
def robust_soliton(K,c,delta) -> list:
    #initialize with the ideal distribution
    probabilities = ideal_soliton(K)
    # Define R
    R = c*(math.log(K/delta)**2)*math.sqrt(K)
    # calculate the additional probabilities
    pivot = int(math.floor(K/R))
    robust_probabilities = [0] + [R/(i*K) for i in range(1, pivot)]
    robust_probabilities += [(R*math.log(R/delta))/K]
    robust_probabilities += [0 for i in range(pivot,K)]
    # add together
    print(K/R, robust_probabilities)
    probabilities = np.add(robust_probabilities, probabilities)
    #normalize 
    probabilities /= sum(probabilities)
    return probabilities
plt.bar(range(0,101), robust_soliton(100,0.001,0.025))

145.36730512712447 [0, 0.006879125943247657, 0.0034395629716238284, 0.0022930419810825524, 0.0017197814858119142, 0.0013758251886495313, 0.0011465209905412762, 0.0009827322776068083, 0.0008598907429059571, 0.0007643473270275174, 0.0006879125943247657, 0.000625375085749787, 0.0005732604952706381, 0.0005291635340959736, 0.0004913661388034041, 0.00045860839621651044, 0.00042994537145297856, 0.0004046544672498622, 0.0003821736635137587, 0.0003620592601709293, 0.00034395629716238283, 0.00032757742586893605, 0.0003126875428748935, 0.0002990924323151155, 0.00028663024763531906, 0.0002751650377299063, 0.0002645817670479868, 0.00025478244234250583, 0.00024568306940170206, 0.000237211239422333, 0.00022930419810825522, 0.0002219072884918599, 0.00021497268572648928, 0.00020845836191659567, 0.0002023272336249311, 0.00019654645552136162, 0.00019108683175687936, 0.00018592232279047722, 0.00018102963008546466, 0.00017638784469865788, 0.00017197814858119142, 0.00016778355959140627, 0.000163788712934468

ValueError: operands could not be broadcast together with shapes (146,) (101,) 

We can then take a selection from this distribution using the python choices() method. This will tell us how many segments to include in a droplet. We can then use the choices() method again to select which segments will be included. (the bit about max_choices is only necessary cause the sample data is very small, but might be something to consider because small amounts of data can cause issues here.) 

In [None]:
prng.seed(test_seed)
max_choices = len(data_bytearray)
amount = prng.choices(range(0,101), robust_soliton(100,0.1,0.05), k = 1)[0]
segments = prng.sample(data_bytearray,k = amount)

we can then use the python reduce() method to apply our XOR to all the segments and create the final droplet, which we then append the seed to.

In [None]:
# create the droplet
droplet = seed_index + bytearray(functools.reduce(lambda i, j: bytes(a^b for (a, b) in zip(i,j)), segments))
len(droplet) 

Last but not least we can add the reed solomon codes.

In [None]:
import reedsolo 
# prepare reedsolomon
rsc = reedsolo.RSCodec(2)
# create the encoded droplet (what will eventually be stored in DNA)
encoded = rsc.encode(droplet)
# length of the encoded droplet is 38 as expected (32 data, 4 seed, 2 RS)
print(encoded, len(encoded))

try to recover droplets from the seed.

In [None]:
prng.seed(test_seed)
amount_recovery = prng.choices(range(0,max_choices+1), robust_soliton(max_choices,0.1,0.05), k = 1)[0]
assert amount_recovery == amount 
#I can't think of a way to do this without having prior knowledge of the amount of stored data... something to discuss
segment_indices = prng.sample(range(len(data_bytearray)), k = amount_recovery)
segment_test = []
for i in segment_indices:
    segment_test.append(data_bytearray[i])
assert segment_test == segments
print(amount_recovery, segment_indices)