In [41]:
## import libraries
%load_ext autoreload
%autoreload 2
import utilities as util
import os
import reedsolo
import random
import timeit
import matplotlib.pyplot as plt
import math
import numpy as np
import hashlib
# import Levenshtein as lev #requires microsoft visual studio build tools

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
## Global settings: fixed parameters that are read by the encoding and decoding cells
# maximum allowed homopolymer length
max_homopolymer = 3
# allowed range for the CG-content of a strand, as the tuple (lower bound, upper bound)
cg_content = (0.45,0.55)
# number of error correcting code bytes per droplet
ecc_bytes = 4
# parameters of the Robust Soliton distribution, as the tuple (c, delta)
robust_soliton_para = (0.001, 0.025)
# expected loss of droplets during storage and synthesis
loss_of_droplets = 0.05 # experimentally determined and purposefully rounded up

In [43]:
## Run-specific settings
# path to file to be encoded
# NOTE(review): this is an absolute path on one user's machine; os.path.relpath rewrites it
# relative to the current working directory, so it has to be adapted before running elsewhere.
path = os.path.relpath("C:\\Users\\20192903\\Documents\\DB_OGO\\input_data\\Lorem ipsum 2mb.txt")
# (disabled) fixed factor of extra droplets, e.g. extra = 1.07 turns 10000 segments into 10700 droplets
# extra = 1.08#can be better calculated with the Robust soliton distribution and approximated rate of failure in storage/synthesis

# filename of encoded droplet DNA strands
DNA_filename = "DNA_strands.txt"
# filename of results from simulation
collapsed_filename = "sequenced.txt"
# filename of final result
result_filename = "recovered.txt"
# Whether or not to correct the droplets using RS (Reed-Solomon)
correct = True

In [44]:
def Encode(file_path, robust_soliton_para, loss_of_droplets, max_homopolymer = 3, cg_content= (0.45,0.55), ecc_bytes=4,
           seedlist_path = "checked_seedlist.txt", meta_copies = 10) -> tuple:
    """ Read and encode a file using the luby transform, returning the DNA sequences and some run statistics
    INPUT:
    file_path: path to the file to be encoded
    robust_soliton_para: parameters of the Robust Soliton distribution, passed on to util.RobustSoliton
    loss_of_droplets: expected fraction of droplets that gets lost; the redundancy is divided by (1 - loss_of_droplets)
    max_homopolymer: maximum length of homopolymer sequences allowed (default 3)
    cg_content: allowed range for the CG-content (default (0.45, 0.55))
    ecc_bytes: number of error correcting code bytes per droplet (default 4)
    seedlist_path: text file holding one integer droplet seed per line (default "checked_seedlist.txt")
    meta_copies: number of times the metadata strand is appended (default 10)
    OUTPUT:
    droplets: list of DNA sequences (the accepted droplets followed by the metadata strands)
    info: tuple (total_segments, redundancy, number of seeds tried, len(droplets))
    RAISES:
    IndexError: when the seed list is exhausted before enough droplets were accepted
    """
    # read the seed list first, then the file itself
    with open(seedlist_path , "r") as f:
        seedlist = f.read().splitlines()
    data = util.OpenFile(file_path)
    # make bytearray from data
    data = bytearray(data)
    # determine how many segments the data will consist of and use it as a seed for randomization
    # (the length is measured on the binary representation, in units of 256)
    data = util.Bytearray2Binary(data)
    randomise_seed = math.ceil(len(data)/256)
    data = util.Binary2Bytearray(data)
    # randomize the data
    data = util.RandomizeMessage(data, randomise_seed)
    # data needs to be converted to string for segmentation
    data = util.Bytearray2Binary(data)
    # add padding and segment
    data = util.Segment(data)

    # now the luby transform needs to be applied
    total_segments = len(data)
    # inflate the redundancy factor so that enough droplets remain after the expected loss
    redundancy = util.RobustSoliton(total_segments, robust_soliton_para, True)/(1-loss_of_droplets)

    # first convert segments to bytearrays
    data_bytearray = [util.Binary2Bytearray(segment) for segment in data]

    # create a Pseudo random number generator
    prng = random.Random()
    # NOTE(review): this LFSR is created but not used anywhere below; the seeds come from the seed list
    seeds_lfsr = util.LFSR() #starting an lfsr with a certain state and a polynomial for 32bits.
    nr_droplets_probablities = util.RobustSoliton(total_segments, robust_soliton_para)
    droplets = []
    droplets_required = int(redundancy*total_segments)
    seeds_tried = 0
    while len(droplets) < droplets_required:
        if seeds_tried >= len(seedlist):
            # same exception type as the plain list lookup would give, but with an explanation
            raise IndexError(
                "seed list '" + str(seedlist_path) + "' exhausted after " + str(seeds_tried)
                + " seeds: only " + str(len(droplets)) + " of " + str(droplets_required)
                + " required droplets passed the biochemical check"
            )
        # take the next seed for droplet making
        seed = int(seedlist[seeds_tried])
        # create droplet
        current_drop = util.MakeDroplet(data_bytearray, seed, prng, nr_droplets_probablities, ecc_bytes)
        # convert drop to string and then DNA
        current_drop = util.Binary2DNA(util.Bytearray2Binary(current_drop))
        # check biochemical requirements and append to list of droplets if ok
        if util.CheckBiochemicalRequirements(current_drop, max_homopolymer, cg_content):
            droplets.append(current_drop)
        seeds_tried += 1

    ## make metadata-strands; the same strand is appended several times
    metastrand = util.CreateMetaStrand(total_segments,ecc_bytes)
    droplets.extend([metastrand] * meta_copies)

    info = (total_segments, redundancy, seeds_tried, len(droplets))
    return droplets, info

In [None]:
starttime = timeit.default_timer()
print("The start time of encoding is :",starttime)

encoded, info = Encode(path, robust_soliton_para, loss_of_droplets, max_homopolymer, cg_content, ecc_bytes)
# write one strand per line; the context manager closes the file even when a write fails
with open(DNA_filename, "w") as f:
    for strand in encoded:
        f.write(strand + "\n")

print("The time of encoding is :", timeit.default_timer() - starttime)
# info = (total_segments, redundancy, number of seeds tried, number of strands written)
print(info)

The start time of encoding is : 2891.4869547


# SIMULATION

In [None]:
def sim_seq_simple(sequences, dropout = 0.01, insert_rate = 0.00042, del_rate = 0.00188, sub_rate = 0.00407, rng = None):
    """ Simulate synthesis, PCR and NGS on a list of DNA sequences
    INPUT:
    sequences: list of DNA sequences (strings); the list itself is not modified
    dropout: dropout rate, int(dropout*len(sequences)) sequences are removed after shuffling
    insert_rate: per-base probability that a random nucleotide is inserted in front of the base
    del_rate: per-base probability that the base is deleted
    sub_rate: per-base probability that the base is replaced by a different nucleotide
    rng: optional random.Random instance for reproducible runs (default: the global random module)
    OUTPUT:
    list of the surviving sequences, in shuffled order, with the base-level errors applied

    Default values for base errors are taken from https://www.nature.com/articles/nbt.4079
    """
    if rng is None:
        rng = random
    bases = ('A', 'C', 'T', 'G')

    survivors = list(sequences)
    rng.shuffle(survivors)
    for _ in range(0, int(dropout*len(survivors))):
        survivors.pop()

    # cumulative thresholds: one uniform draw per base selects at most one error type
    insert_below = insert_rate
    delete_below = insert_rate + del_rate
    substitute_below = insert_rate + del_rate + sub_rate

    for index, sequence in enumerate(survivors):
        # build the mutated strand base by base instead of editing in place, so that no
        # index bookkeeping or placeholder character for deletions is needed
        mutated = []
        for base in sequence:
            draw = rng.random()
            if draw < insert_below:
                # insertion: a random nucleotide lands directly in front of the current base
                mutated.append(rng.choice(bases))
                mutated.append(base)
            elif draw < delete_below:
                # deletion: the base is simply left out
                continue
            elif draw < substitute_below:
                # substitution: never pick the base that is already there, otherwise a
                # quarter of the substitutions would silently be no-ops
                mutated.append(rng.choice([candidate for candidate in bases if candidate != base]))
            else:
                mutated.append(base)
        survivors[index] = ''.join(mutated)
    return survivors

In [None]:
# read the strands written by the encoding cell; use the configured filenames so that
# changing DNA_filename / collapsed_filename keeps the three stages connected
with open(DNA_filename, 'r') as f:
    dropletlist = [line.rstrip('\n') for line in f]
# the rates passed here are one tenth of the defaults of sim_seq_simple
sequenced = sim_seq_simple(dropletlist, dropout = 0.001, insert_rate = 0.000042, del_rate = 0.000188, sub_rate = 0.000407)
with open(collapsed_filename, "w") as file:
    for strand in sequenced:
        file.write(strand + "\n")

# DECODING

In [None]:
starttime = timeit.default_timer()

with open(collapsed_filename, 'r') as f:
    dropletlist = [line.rstrip('\n') for line in f]

# number of droplets that pass the Reed-Solomon check without needing correction
n=0
# keep only the strands of the expected length and split them into metadata strands and droplets
metadata = []
OKdroplets = []
for strand in dropletlist:
    # 144 + 4 per ecc byte; presumably nucleotides at 2 bits each (288 bits of seed + payload) - confirm
    if util.CheckOligoLength (strand,length=144+ecc_bytes*4):
        # exact match on the 16-nucleotide marker: a metadata strand with an error inside the
        # marker ends up in OKdroplets instead
        if strand[:16] == "ACGTACGTACGTACGT":
            metadata.append(strand)
        else:
            OKdroplets.append(strand)

total_segments_list = [util.DecodeMetaStrand(metastrand,ecc_bytes) for metastrand in metadata]
if not total_segments_list:
    # without this guard max() fails with "arg is an empty sequence", which hides the real cause
    raise ValueError("no metadata strand found in " + collapsed_filename
                     + ": the total number of segments cannot be determined")
# majority vote over all metadata strands that were read
total_segments = max(set(total_segments_list), key = total_segments_list.count)
print(total_segments)


def _droplet_from_bits(bits):
    """Turn the bit string of a droplet into (segment_indices, payload).

    The first 32 bits are the droplet seed, bits 32 up to 288 are the payload
    (the XOR of the segments the seed selects)."""
    droplet_seed = int(bits[0:32], 2)
    segment_amount, segment_indices = util.RecoverSeed(droplet_seed, total_segments, robust_soliton_para)
    return segment_indices, bits[32:288]


simplified_droplets = []
corrected_droplets = []
uncorrectable = 0
rsc = reedsolo.RSCodec(ecc_bytes)
for strand in OKdroplets:
    binary_strand = util.DNA2Binary(strand)
    # Reed-Solomon works on bytes, so convert the bit string to a bytearray
    bytearray_strand = util.Binary2Bytearray(binary_strand)
    if rsc.check(bytearray_strand)[0]:
        # the strand is correct according to Reed-Solomon: use it as it is
        simplified_droplets.append(_droplet_from_bits(binary_strand))
        n+=1
    elif correct:
        try:
            binary_strand = util.Bytearray2Binary(rsc.decode(bytearray_strand)[0])
            corrected_droplets.append(_droplet_from_bits(binary_strand))
        except Exception:
            # best effort: a droplet that cannot be repaired is dropped, but counted
            uncorrectable += 1
# corrected droplets are placed after the droplets that needed no correction
simplified_droplets += corrected_droplets

print("The time of checking droplets is :", timeit.default_timer() - starttime)
# corrected droplets, usable droplets, droplets of the right length, strands read
print(len(simplified_droplets)-n,len(simplified_droplets),len(OKdroplets),len(dropletlist))
print("droplets discarded because correction failed :", uncorrectable)

In [None]:
starttime = timeit.default_timer()

# First pass over all checked droplets, starting without any solved segment
input_data, output_data, newsolves = util.Decode(simplified_droplets, {})
# Keep going for as long as the previous pass still solved something new
while newsolves > 0:
    input_data, output_data, newsolves = util.Decode(input_data, output_data)

print("The time of decoding checked droplets is :", timeit.default_timer() - starttime)

In [None]:
if len(output_data)==total_segments:
    # output_data maps segment index -> segment bits; glue the segments together in index order
    solution = ''.join([output_data[index] for index in range(total_segments)])
    solution = util.RemovePadding(solution)
    # RandomizeMessage is applied again, seeded with the number of segments
    # (presumably this reverses the randomisation done while encoding - confirm)
    solution = util.RandomizeMessage(util.Binary2Bytearray(solution),len(output_data))
else:
    solution = bytearray(0)
    # state clearly how far decoding got instead of gluing a bare number onto the message
    print('solution could not be reached, use another more robust method: '
          + str(len(output_data)) + ' of ' + str(total_segments) + ' segments decoded')

In [None]:
# Load the original file once more and compare it byte for byte with the decoded result
file_path = path
data = bytearray(util.OpenFile(file_path))
data == solution

In [None]:
# Feed the droplets to the decoder one at a time and record how many segments are solved
# after each droplet. Note that this overwrites output_data and input_data.
# The `time` module is not imported in this notebook, so use timeit like the other cells.
start_time = timeit.default_timer()
output_data = {}
input_data = []
y = []

for droplet in simplified_droplets:
    input_data.append(droplet)
    newsolves = 1
    # decode until the newly added droplet yields no further solved segments
    while newsolves > 0:
        input_data, output_data, newsolves = util.Decode(input_data, output_data)
    y.append(len(output_data))
print(timeit.default_timer()-start_time)
plt.plot(y)
plt.xlabel("number of droplets processed")
plt.ylabel("number of segments decoded")
plt.show()


In [None]:
# Print every droplet index at which all segments were decoded. Compare against
# total_segments (read from the metadata strands) rather than a number that only
# fits one particular input file.
for x, solved in enumerate(y):
    if solved == total_segments:
        print(x)
    

In [None]:
def IdealSoliton(K) -> list:
    """ Build the ideal soliton distribution for K segments
    INPUT:
        K: number of segments
    OUTPUT:
        probabilities: list with K+1 entries, where entry i holds p(i);
                       p(0) = 0, p(1) = 1/K and p(i) = 1/(i*(i-1)) for i = 2..K
    """
    # entries for i = 0 and i = 1
    probabilities = [0, 1/K]
    # remaining entries, one per i from 2 up to and including K
    for i in range(2, K+1):
        probabilities.append(1/(i*(i-1)))
    return probabilities

def RobustSoliton(K,c,delta, get_redundancy=False) -> "np.ndarray | float":
    """ Generate the robust soliton distribution for K segments with variables c and delta.
    INPUT:
        K: number of segments
        c: value of c variable in distribution
        delta: value of delta variable in distribution
        get_redundancy: when True, return Z instead of the probabilities
    OUTPUT: either of the two
        probabilities: numpy array with K+1 entries (entry 0 is always 0) that sums to 1
        Z: the sum of the distribution before normalisation; this notebook uses it as the
           factor of droplets needed relative to the number of segments
    RAISES:
        ValueError: when the position of the spike, floor(K/R), is not within 1..K
    """
    # initialize with the ideal distribution
    probabilities = IdealSoliton(K)
    # Define R
    # NOTE(review): the logarithm is squared here, whereas Luby's LT-codes paper defines
    # R = c*ln(K/delta)*sqrt(K) - confirm that the square is intended
    R = c*(math.log(K/delta)**2)*math.sqrt(K)
    # position of the spike in the additional probabilities
    pivot = int(math.floor(K/R))
    if not 1 <= pivot <= K:
        # outside this range the two lists below differ in length and numpy would stop with
        # an unhelpful broadcasting error; fail early with the actual cause instead
        raise ValueError(
            "robust soliton pivot floor(K/R) = " + str(pivot) + " is outside 1.." + str(K)
            + " (K=" + str(K) + ", c=" + str(c) + ", delta=" + str(delta) + ", R=" + str(R) + ")"
        )
    # calculate the additional probabilities: R/(i*K) below the pivot, a spike at the pivot, 0 above it
    robust_probabilities = [0] + [R/(i*K) for i in range(1, pivot)]
    robust_probabilities += [(R*math.log(R/delta))/K]
    robust_probabilities += [0 for i in range(pivot,K)]
    # add together
    probabilities = np.add(robust_probabilities, probabilities)
    # normalisation constant, computed once
    Z = np.sum(probabilities)
    if get_redundancy:
        return Z
    return probabilities / Z

Z=RobustSoliton(19000, 0.001, 0.025, True)
# Z is a single number and plotting it gives an empty figure, so plot the distribution itself
plt.plot(RobustSoliton(19000, 0.001, 0.025))
plt.xlabel("degree d")
plt.ylabel("probability")
plt.show()
# NOTE(review): 0.94 presumably stands for 1 - expected droplet loss; the notebook settings
# use loss_of_droplets = 0.05 - confirm which value is intended
print(Z, Z/0.94)
