# READ ME

This file contains the main code for encoding digital information into DNA and decoding it back. Two additional files are needed to run it: utilities.py and checked_seedlist.txt. The second file must be unpacked from the .rar archive to a .txt file in your working directory. If unpacking does not work, the code can be adjusted slightly at the cost of speed: in the Encode function, uncomment the two commented-out LFSR lines and remove the lines that read and use the seed list. Further info is given in the comments of that function.

Certain pieces of code have been copied and adjusted from other sources:
- palindrome detection: https://www.geeksforgeeks.org/find-number-distinct-palindromic-sub-strings-given-string/
- the function sim_seq_simple: provided by our mentor
- Linear Feedback Shift Register (LFSR) used for generating the seed list: Erlich et al. https://github.com/TeamErlich/dna-fountain/blob/master/lfsr.py

The reedsolo library can be installed by running the following in the Windows command line:

In [None]:
# pip install --upgrade reedsolo

# Importing libraries and setting variables

In [None]:
## import libraries
%load_ext autoreload
%autoreload 2
import utilities as util
import os
import reedsolo
import random
import timeit
import matplotlib.pyplot as plt
import math
import numpy as np

In [None]:
## SET GLOBAL VARIABLES
# These settings are used by the encoding and decoding cells below.
max_homopolymer = 3                     # maximum allowed homopolymer length
cg_content = (0.45,0.55)                # (minimum, maximum) tuple for allowed CG-content
ecc_bytes = 4                           # number of error correcting code bytes per droplet (passed to reedsolo.RSCodec when decoding)
robust_soliton_para = (0.001, 0.025)    # parameters of Robust Soliton (c and delta)
loss_of_droplets = 0.06                 # expected fraction (not percent) of droplets lost; Encode divides the redundancy by (1 - loss_of_droplets)
correct = True                          # if True, decoding tries to repair droplets that fail the error-correction check

In [None]:
## set variables for file names/directory
# The paths are built with os.path.join so that the directory separator is
# correct on every operating system (a literal backslash is only a separator
# on Windows; elsewhere it would become part of the file name).

# path to file to be encoded (change this!)
path = os.path.abspath(os.path.join('Input Data', 'Lorem ipsum.txt'))

# path to decoded file (change this!)
decoded_path = os.path.abspath(os.path.join("Output Data", "Lorem ipsum recovered.txt"))

# path to encoded droplet DNA strands (shouldn't need to change)
DNA_path = os.path.abspath(os.path.join("Temp Data", "DNA_strands.txt"))

# path to DNA strands resulting from simulation (shouldn't need to change)
sequenced_path = os.path.abspath(os.path.join("Temp Data", "sequenced.txt"))

# # filename of recovered file
# result_filename = "recovered.txt"


# Encoding

Structure:
- read, randomize and segment the data
- calculate the number of droplets to be made
- make enough droplets that pass the biochemical requirements

Run these cells to encode the file.

In [None]:
def Encode(file_path, robust_soliton_para, loss_of_droplets, max_homopolymer, cg_content, ecc_bytes) -> tuple:
    """ Read and encode a file using the luby transform, returning the DNA sequences and encoding statistics
    INPUT:
        file_path: path to the file to be encoded
        robust_soliton_para: parameters c and delta for the soliton distribution
        loss_of_droplets: expected loss of droplets in synthesis simulation, as a fraction (0 <= value < 1)
        max_homopolymer: maximum length of homopolymer sequences allowed
        cg_content: tuple of minimum and maximum CG content allowed
        ecc_bytes: number of error correcting bytes per droplet
    OUTPUT (returned as the tuple (droplets, info)):
        droplets: list of DNA sequences: the accepted droplets followed by 10 metadata strands
        info: tuple of (total segments, redundancy, number of generated droplets,
              length of `droplets`, i.e. accepted droplets plus the 10 metadata strands)
    RAISES:
        IndexError: if checked_seedlist.txt contains fewer seeds than are needed to
                    produce enough droplets that pass the biochemical requirements
    """
    # read, randomize and segment original file to binary segments
    data = bytearray(util.OpenFile(file_path))
    # NOTE(review): the decoding cell passes the number of solved segments as the
    # seed to util.RandomizeMessage, so this value presumably has to equal the
    # number of segments produced by util.Segment - confirm in utilities.py
    randomise_seed = math.ceil(len(data)/32)
    data = util.RandomizeMessage(data, randomise_seed)
    data = util.Bytearray2Binary(data)
    data = util.Segment(data)
    data_bytearray = [util.Binary2Bytearray(segment) for segment in data]

    # calculate number of droplets to be made
    total_segments = len(data)
    redundancy = util.RobustSoliton(total_segments, robust_soliton_para, True)/(1-loss_of_droplets)
    droplets_required = int(redundancy*total_segments)

    # read the pre-checked seeds; each line holds one seed
    with open("checked_seedlist.txt" , "r") as f:
        seedlist = f.read().splitlines()
#     seeds_lfsr = util.LFSR() #use if checked_seedlist.txt is not available and remove the two lines prior

    # calculate probabilities of number of segments in a droplet
    nr_droplets_probablities = util.RobustSoliton(total_segments, robust_soliton_para)

    # make droplets; i counts every seed that was tried, accepted or not
    droplets = []
    i = 0
    while len(droplets) < droplets_required:
        # fail with an explanation instead of a bare "list index out of range"
        # (remove this check as well when switching to the LFSR below)
        if i >= len(seedlist):
            raise IndexError("checked_seedlist.txt ran out of seeds after " + str(i) + " tries: only "
                             + str(len(droplets)) + " of " + str(droplets_required) + " required droplets were accepted")
        seed = int(seedlist[i])
#         seed = next(seeds_lfsr) #use if checked_seedlist.txt is not available and remove one line prior
        current_drop = util.MakeDroplet(data_bytearray, seed, nr_droplets_probablities, ecc_bytes)
        current_drop = util.Binary2DNA(util.Bytearray2Binary(current_drop))
        # only keep droplets that satisfy the homopolymer and CG-content limits
        if util.CheckBiochemicalRequirements(current_drop, max_homopolymer, cg_content):
            droplets.append(current_drop)
        i+=1

    # make metadatastrands
    for _ in range(10): # now 10 for adding it in 10 times
        metastrand = util.CreateMetaStrand(total_segments,ecc_bytes)
        droplets.append(metastrand)

    info = (total_segments, redundancy, i, len(droplets))
    return droplets, info

In [None]:
starttime = timeit.default_timer()

encoded, info = Encode(path, robust_soliton_para, loss_of_droplets, max_homopolymer, cg_content, ecc_bytes)
# write one DNA strand per line; the context manager closes the file even
# when a write fails part-way through
with open(DNA_path, "w") as f:
    for i in encoded:
        f.write(i + "\n")

print("The time of encoding is :", timeit.default_timer() - starttime)
print(info)

# SIMULATION

Run the following cells to simulate synthesis, PCR and sequencing (strand dropout and base-level errors) on the encoded file.

In [None]:
def sim_seq_simple(sequences, dropout = 0.01, insert_rate = 0.00042, del_rate = 0.00188, sub_rate = 0.00407):
    """ Simulate synthesis, PCR and NGS on a list of unique DNA sequences
    INPUT:
        sequences: list of DNA sequences (the list itself is not modified)
        dropout: dropout rate, i.e. the fraction of sequences that is not recovered
        insert_rate: per-base chance that a random nucleotide is inserted
        del_rate: per-base chance that the base is deleted
        sub_rate: per-base chance that the base is replaced by a random nucleotide
    OUTPUT:
        shuffled list of the surviving sequences with the base-level errors applied

    Default values for base errors are taken from https://www.nature.com/articles/nbt.4079
    """
    nucleotides = ['A', 'C', 'T', 'G']
    # one random draw per base is compared against these cumulative bounds
    del_bound = del_rate + insert_rate
    sub_bound = del_rate + insert_rate + sub_rate

    pool = sequences.copy()
    random.shuffle(pool)
    # after shuffling, dropping from the end removes random strands
    lost = int(dropout * len(pool))
    for _ in range(lost):
        pool.pop()

    for idx, strand in enumerate(pool):
        mutated = []
        for base in strand:
            draw = random.random()
            if draw < insert_rate:
                # random nucleotide goes in next to the (unchanged) base
                mutated.append(random.choice(nucleotides))
                mutated.append(base)
            elif insert_rate < draw < del_bound:
                # mark the base as deleted; markers are stripped below
                mutated.append('_')
            elif del_bound < draw < sub_bound:
                # substitute base
                mutated.append(random.choice(nucleotides))
            else:
                mutated.append(base)
        pool[idx] = ''.join(symbol for symbol in mutated if symbol != '_')
    return pool

In [None]:
# load the encoded strands, one per line
with open(DNA_path, 'r') as f:
    dropletlist = [line.rstrip('\n') for line in f]

# simulate with every rate set to a tenth of the sim_seq_simple defaults
sequenced = sim_seq_simple(
    dropletlist,
    dropout = 0.001,
    insert_rate = 0.000042,
    del_rate = 0.000188,
    sub_rate = 0.000407,
)

# store the simulated reads, again one strand per line
with open(sequenced_path, "w") as file:
    file.writelines(strand + "\n" for strand in sequenced)

# DECODING

Run the cells below to decode the file. The decoded file will also be compared to the original file.

In [None]:
starttime = timeit.default_timer()

with open(sequenced_path, 'r') as f:
    dropletlist = [line.rstrip('\n') for line in f]


# check the length of droplets and select metadata strands
# (strands whose length is wrong, e.g. after an insertion or deletion, are dropped)
metadata = []
OKdroplets = []
for strand in dropletlist:
    if util.CheckOligoLength(strand, length=144+ecc_bytes*4):
        # metadata strands are recognised by this fixed 16-base prefix
        if strand.startswith("ACGTACGTACGTACGT"):
            metadata.append(strand)
        else:
            OKdroplets.append(strand)

# get the total number of segments: every metadata strand contributes to the
# list and the most frequent value wins
total_segments_list = []
for metastrand in metadata:
    total_segments_list = util.DecodeMetaStrand(metastrand, total_segments_list)
if not total_segments_list:
    # without this check max() below fails with an unhelpful "empty sequence" message
    raise ValueError("No usable metadata strand was recovered, so the total number of segments is unknown.")
total_segments = max(set(total_segments_list), key = total_segments_list.count)

# retrieve payload and which segments are in each droplet
simplified_droplets = []
corrected_droplets = []
uncorrectable_droplets = 0  # droplets that failed the check and could not be repaired
rsc = reedsolo.RSCodec(ecc_bytes)
for strand in OKdroplets:
    binary_strand = util.DNA2Binary(strand)
    bytearray_strand = util.Binary2Bytearray(binary_strand)
    if rsc.check(bytearray_strand)[0]:
        # error-correction check passed: use the strand as it is
        target = simplified_droplets
    elif correct:
        try:
            binary_strand = util.Bytearray2Binary(rsc.decode(bytearray_strand)[0])
        except reedsolo.ReedSolomonError:
            # too many errors to repair: skip this droplet, but keep count
            uncorrectable_droplets += 1
            continue
        target = corrected_droplets
    else:
        continue
    # first 32 bits hold the droplet seed, bits 32-288 hold the payload
    droplet_seed = int(binary_strand[0:32], 2)
    segment_amount, segment_indices = util.RecoverSeed(droplet_seed, total_segments, robust_soliton_para)
    payload = binary_strand[32:288]
    target.append((segment_indices, payload))

# repaired droplets are appended after the droplets that needed no repair
simplified_droplets += corrected_droplets

print("The time of fixing droplets and simplifying droplets is :", timeit.default_timer() - starttime)
print(len(corrected_droplets), len(simplified_droplets), len(dropletlist))


In [None]:
starttime = timeit.default_timer()

# Start from all usable droplets and an empty result, then keep calling
# util.Decode until a pass no longer reports a positive `newsolves`
# (presumably the number of segments solved in that pass).
input_data = simplified_droplets
output_data = {}
while True:
    input_data, output_data, newsolves = util.Decode(input_data, output_data)
    if not newsolves > 0:
        break

print("The time of decoding simplified droplets is :", timeit.default_timer() - starttime)

In [None]:
if len(output_data)==total_segments:
    # reconstruct segments to original file: join the segments in index order,
    # strip the padding and undo the randomisation applied before encoding
    solution = ''.join([output_data[x] for x in range(len(output_data))])
    solution = util.RemovePadding(solution)
    solution = util.RandomizeMessage(util.Binary2Bytearray(solution),len(output_data))

    # compare reconstructed file to original
    file_path = path
    data = bytearray(util.OpenFile(file_path))
    if data == solution:
        print("file recovery was 100% succesfull!!")
    else:
        # count differing bytes over the shared length; bytes that are missing
        # from or extra in the reconstruction also count as mistakes
        bad = sum(1 for got, expected in zip(solution, data) if got != expected)
        bad += abs(len(solution) - len(data))
        # the two differ, so at least one of them is non-empty and the divisor is > 0
        mistake_fraction = bad / max(len(solution), len(data))
        print("All segments were solved but contained errors. Mistakes as factor of entire reconstructed file: " + str(mistake_fraction))
else:
    # not every segment was solved: fall back to an empty result
    solution = bytearray(0)
    print("Solution could not be reached, increase redundancy. " + str(len(output_data))+ " segments were solved.")

# Writing decoded file.

In [None]:
# Store the reconstructed bytes at decoded_path (binary mode). When decoding
# did not succeed, `solution` is an empty bytearray and an empty file is written.
with open(decoded_path, "wb") as out_file:
    out_file.write(solution)

# Visualisation

Graph of solved segments versus incoming droplets. For a large number of segments this may take a while.

The only input needed is simplified_droplets.

In [None]:
# Replay the decoding while feeding in one droplet at a time, recording how
# many segments are solved after each droplet has arrived.
input_data = []
output_data = {}
y = []

for droplet in simplified_droplets:
    input_data.append(droplet)
    # keep decoding until a pass no longer reports a positive `newsolves`
    while True:
        input_data, output_data, newsolves = util.Decode(input_data, output_data)
        if not newsolves > 0:
            break
    y.append(len(output_data))

plt.plot(y)
plt.xlabel("Recieved droplets")
plt.ylabel("Solved segments")
plt.title("Solved segments during decoding the luby transform")