# Decode DNA-segments back to the original file

# Imports

In [None]:
import random
import math
import numpy as np
import functools
import reedsolo 

# DNA requirements

In [None]:
def CheckBiochemicalRequirements(s, max_length=3, check=False):
    """
    INPUT:
    s: DNA sequence as a string of the bases A, C, G and T
    max_length: longest homopolymer run that is still accepted, default 3
    check: value returned when the sequence is rejected, default False
    OUTPUT:
    True when s passes both checks, otherwise the value of check
    Checks the biochemical constraints of an oligo: no run of more than
    max_length identical bases, and a CG fraction strictly between 0.45 and 0.55.
    """
    # An empty sequence has no defined CG fraction; reject it instead of
    # dividing by zero below.
    if not s:
        return check
    # A run of max_length + 1 identical bases is a forbidden homopolymer.
    if any(base * (max_length + 1) in s for base in 'CGTA'):
        return check
    per_CG = (s.count('C') + s.count('G')) / len(s)
    if 0.45 < per_CG < 0.55:
        return True
    return check

In [None]:
# Added a check for the oligo length; if the length is wrong, try to repair it with Reed-Solomon
def check_oligo_length(s,length=152,check_lenght=False):
    """
    INPUT:
    s: oligo sequence (anything supporting len())
    length: required oligo length, default 152
    check_lenght: value returned when the length does not match, default False
                  (the misspelled name is kept so existing keyword callers keep working)
    OUTPUT:
    True when len(s) equals length, otherwise the value of check_lenght
    Checks the length of an oligo and prints whether it is too short or too long.
    """
    actual = len(s)
    if actual == length:
        return True
    if actual < length:
        print('oligo length is too short')
    else:
        print('oligo length is too long')
    return check_lenght

# Convert DNA to binary

In [None]:
def DNA2Binary(s):
    """
    INPUT:
    s: DNA sequence, iterated one base at a time
    OUTPUT:
    string of '0' and '1' characters, two per recognised base
    Maps A -> 00, C -> 01, G -> 10 and T -> 11; any other symbol
    contributes nothing to the output.
    """
    two_bit_code = {"A": "00", "C": "01", "G": "10", "T": "11"}
    return "".join(two_bit_code.get(base, "") for base in s)

In [None]:
def string_to_bytearray(bs):
    """
    INPUT:
    bs: string to convert
    OUTPUT:
    ba: bytearray holding the UTF-8 encoding of bs
    Converts a string to a bytearray.
    """
    # The built-in is spelled `bytearray`; the previous `bytarray` raised NameError.
    ba = bytearray(bs, 'utf-8')
    return ba

# Reed-Solomon

In [None]:
def Reed_solo(ba):
    """
    INPUT:
    ba: droplet bytearray with rs encoding.
    OUTPUT:
    decoded_rs: element 0 of the value returned by rsc.decode(ba)
    Uses the RS symbols to solve errors in a droplet.
    NOTE(review): `rsc` is not defined in this section; presumably a
    module-level reedsolo.RSCodec instance - confirm where it is created.
    """
    decode_result = rsc.decode(ba)
    return decode_result[0]

# Recover seed

In [2]:
def ideal_soliton(K) -> list:
    """
    INPUT:
    K: highest index of the distribution
    OUTPUT:
    probabilities: list of K + 1 values (indices 0..K) following the ideal soliton distribution
    Builds the ideal soliton distribution: p(0) = 0, p(1) = 1/K and
    p(i) = 1/(i*(i-1)) for i = 2..K.
    """
    # indices 0 and 1 are the special cases
    probabilities = [0, 1/K]
    # every further index follows the closed form
    for i in range(2, K+1):
        probabilities.append(1/(i*(i-1)))
    return probabilities

In [None]:
def robust_soliton(K,c,delta) -> list:
    """
    INPUT:
    K: highest index of the distribution
    c: value of the c variable in the distribution
    delta: value of the delta variable in the distribution
    OUTPUT:
    probabilities: normalized weights for indices 0..K (a numpy array, as produced by np.add)
    Adds the robust correction terms to the ideal soliton distribution and normalizes the sum.
    NOTE(review): the spike term ends up at index int(K/R) - 1. Any encoder must use
    exactly the same weights, so confirm against the encoder before changing this.
    """
    # start from the ideal distribution
    base = ideal_soliton(K)
    # R as used by the robust soliton distribution
    R = c*(math.log(K/delta))*math.sqrt(K)
    cutoff = int(K/R)
    # correction terms: a leading zero, the R/(i*K) ramp, one spike, then zero padding
    correction = [0]
    correction.extend(R/(i*K) for i in range(1, cutoff-1))
    correction.append((R*math.log(R/delta))/K)
    correction.extend([0] * len(range(cutoff, K+1)))
    # element-wise sum of both parts, then normalize so the weights add up to 1
    combined = np.add(correction, base)
    combined /= sum(combined)
    return combined

In [3]:
def recover_seed(decoded_rs, total_segments):
    """
    INPUT:
    decoded_rs: decoded droplet; its first 32 positions are parsed as a base-2 number
    total_segments: total number of segments in the input file.
    OUTPUT:
    amount: number of segments in the droplet
    segment_indices: indices of the segments in the droplet
    Re-seeds a private random generator with the droplet's seed, draws the number
    of segments from robust_soliton(100,0.1,0.05) over the values 0..100, and then
    samples that many distinct indices from range(total_segments).
    """
    seed_value = int(decoded_rs[:32], 2)
    generator = random.Random(seed_value)
    weights = robust_soliton(100,0.1,0.05)
    degree = generator.choices(range(101), weights, k=1)[0]
    chosen = generator.sample(range(total_segments), k=degree)
    return degree, chosen

# Solve Luby-transformations

In [None]:
# NOTE: the list that collects all droplets belongs in the main file under its
# own name; a module-level `input_data = []` placed here is overwritten
# immediately by the function definition below.
def input_data(segment_indices, decoded_rs, segmentlist=None):
    """
    INPUT:
    segment_indices: indices of the segments that were XORd into the droplet
    decoded_rs: decoded droplet; everything from position 32 onwards is the payload
    segmentlist: optional list to append to; a new list is created when omitted
    OUTPUT:
    segmentlist: the list with one [segment_indices, payload] entry appended
    Strips the first 32 positions from a droplet and stores the remaining
    payload together with its segment indices.
    """
    if segmentlist is None:
        segmentlist = []
    payload = decoded_rs[32:]
    # One entry per droplet: entry[0] holds the indices, entry[1] the payload.
    segmentlist.append([segment_indices, payload])
    return segmentlist

In [None]:
nr_segments = 5 # placeholder: this value comes from the extra oligo that we want to create
output_data = {} # has to move to the main file

def Decoding_step1and2(input_data, output_data):
    """
    INPUT:
    input_data: iterable of droplets, each [segment_indices, payload] where payload
                is a string of '0'/'1' characters (bytes/bytearray are decoded as ASCII)
    output_data: dictionary {segment index: payload string} of segments solved so far;
                 it is updated in place
    OUTPUT:
    output_data: the same dictionary, extended with every segment that could be solved
    Solves the Luby transform: already solved segments are XORd out of each droplet,
    and a droplet with exactly one unsolved segment yields that segment. Passes over
    the droplets are repeated until a full pass solves nothing new. This is done
    iteratively, so the call depth does not grow with the number of segments.
    """
    progress = True
    while progress:
        progress = False
        for droplet in input_data:
            segment_indices, payload = droplet[0], droplet[1]
            # Keep every stored segment a str so the results can be joined later.
            if isinstance(payload, (bytes, bytearray)):
                payload = payload.decode('ascii')
            unsolved = [index for index in segment_indices if index not in output_data]
            if len(unsolved) != 1:
                continue
            solved = [index for index in segment_indices if index in output_data]
            if solved:
                value = int(payload, 2)
                for index in solved:
                    value ^= int(output_data[index], 2)
                # Zero-pad to the droplet's payload width, as leading zeros are data.
                payload = '{0:0{1}b}'.format(value, len(payload))
            output_data[unsolved[0]] = payload
            progress = True
    return output_data

# NOTE(review): it is unclear whether the steps below could simply live inside the function.
# NOTE(review): `input_data` has to be the list of [segment_indices, payload] droplets
# at this point; this file also defines a function with that name, so the list needs
# its own name in the main file.
output_data = Decoding_step1and2(input_data, output_data)

# The file is recovered only when every index 0 .. nr_segments-1 is a key of the
# dictionary. Comparing the dictionary size alone lets a missing key slip through
# and raise KeyError in the join below.
if all(index in output_data for index in range(nr_segments)):
    solution = ''.join([output_data[x] for x in range(nr_segments)])
else:
    solution = 'solution could not be reached, use another more robust method'
solution


# Unrandomize

In [4]:
def unrandomize(solution, keygenseed = 7) -> bytearray:
    """
    INPUT:
    solution: randomized file, in bytearray format
    keygenseed: integer value of the first seed to be used, default 7
    OUTPUT:
    unrandomized: bytearray of the same length as solution
    Unrandomizes the message one byte at a time: byte i is XORd with
    generate_key(keygenseed + i), so the seed advances by one per byte.
    """
    restored = bytearray(len(solution))
    for position, _ in enumerate(restored):
        restored[position] = generate_key(keygenseed + position) ^ solution[position]
    return restored

# Return original file

In [None]:
def original_data(unrandomized):
    """
    INPUT:
    unrandomized: unrandomized message (an object with a decode() method, e.g. a bytearray)
    OUTPUT:
    the result of unrandomized.decode()
    Decodes the recovered bytes back to text using decode()'s default encoding.
    """
    return unrandomized.decode()

# Mainscript

In [None]:
def decode(dropletlist,K,c,delta,total_segments,):
    data = dropletlist
    #put the droplet list in variable data
    OKdroplets = []
    newlistcounter = 0
    for i in length(data):
        s = data[i]
        if CheckBiochemicalRequirements(s):
            newlistcounter += 1
            OKdroplets[newlistcounter]=s
    #put all droplets that have the correct biomedical requirements in variable OKdroplets
    checked_droplets = []
    newlistcounter = 0
    for j in length(OKdroplets):
        if check_oglio_length(OKdroplets):
            newlistcounter += 1
            checked_droplets[newlistcounter] = OKdroplets[j]
    #put all the OKdroplets that have the correct lenght in variable checked_droplets
    for k in length(checked_droplets):
        checked_droplets[k] = DNA2Binary(checked_droplets[k])
    #convert DNA to binary
    for l in length(checked_droplets):
        checked_droplets[k] = string_to_bytearray(checked_droplets[k])
    #get a list of byte arrays
    for l in lenght(checked_droplets):
        checked_droplets[k] = Reed_solo(checked_droplets[k])
    #Decode using reed-solomon
    """
    Ik weet even niet wat de bedoeling is met de recover seed. 
    Kunnen we het hier tijdens de vergadering over hebben?
    """
    probabilities = robust_solution(K,c,delta)
    #recover seed
    input_data = []
    for m in length(checked_droplets):
        
        
    