### DNA to image

In [4]:
import math
import numpy as np
import matplotlib as plt
from skimage import io
import pandas as pd
from embeddings import read_parquet

def kmer_to_integer(kmer):
    """Encode a k-mer as a base-4 number and return it as a decimal string.

    Bases are read left to right, most significant first, with the digit
    mapping A=0, C=1, G=2, T=3 (two bits per base).  For example ``"TTTT"``
    encodes to ``"255"`` and ``"AGAT"`` to ``"35"``.

    Any character outside ``ACGT`` (e.g. ``N``) contributes a zero digit but
    still occupies its position, so the bases before it are preserved.

    Args:
        kmer: Sequence of base characters; may be empty.

    Returns:
        The encoded value as a decimal ``str`` (``"0"`` for an empty k-mer).
    """
    m = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    x = 0
    for ch in kmer:
        # Shift unconditionally, then add the digit.  Writing this as
        # `(x << 2) | m[ch] if ch in m else 0` is wrong: the conditional
        # expression binds looser than `|`, so an unknown character would
        # reset the whole accumulator to 0.
        x = (x << 2) | m.get(ch, 0)

    return str(x)




def dna2rgb(dna, kmer_size, base_kmer = 4): # kmer_size = 4,8 or 12
    """Pack a DNA string into a square 3-channel image, one k-mer window per pixel.

    The sequence is cut into consecutive windows of ``kmer_size`` bases.
    Each window is split into up to three sub-k-mers of ``offset`` bases
    (``offset = kmer_size // (kmer_size // base_kmer)``); the first goes to
    channel 0 (R), the second to channel 1 (G) and the third to channel 2 (B),
    each encoded with ``kmer_to_integer``.  Channels the window is too short
    to fill stay zero, as do the pixels after the last window.

    The side length and each filled channel are printed as a side effect.

    Args:
        dna: Sequence to encode.
        kmer_size: Number of bases consumed per pixel; must be >= ``base_kmer``.
        base_kmer: Nominal number of bases per colour channel (must be > 0).

    Returns:
        ``numpy`` array of shape ``(side, side, 3)`` where ``side`` is the
        smallest integer whose square holds every window (including a trailing
        partial one).  dtype is ``uint8``/``uint16``/``uint32`` depending on
        ``4**kmer_size``.

    Raises:
        ValueError: if ``base_kmer`` is not positive, ``kmer_size`` is smaller
            than ``base_kmer``, or a single channel would need more than 32 bits.
    """
    if base_kmer <= 0 or kmer_size < base_kmer:
        # Without this guard the offset formula below divides by zero.
        raise ValueError(
            f"kmer_size ({kmer_size}) must be >= base_kmer ({base_kmer}) "
            "and base_kmer must be positive"
        )

    # Count windows the same way range(0, len(dna), kmer_size) produces them,
    # i.e. rounding up, so a trailing partial window always gets a pixel.
    n_windows = -(-len(dna) // kmer_size)
    height_width = math.ceil(math.sqrt(n_windows))
    print(f'{height_width=} x {height_width}')

    subkmers_per_window = kmer_size // base_kmer
    offset = kmer_size // subkmers_per_window
    if offset > 16:
        # 4**16 == 2**32, so a wider sub-k-mer cannot be stored in uint32.
        raise ValueError(
            f"each channel would encode {offset} bases, which exceeds 32 bits"
        )

    # NOTE(review): the dtype is sized for the whole window even though each
    # channel only stores `offset` bases; kept as-is so existing outputs for
    # kmer_size 8 and 12 do not change dtype.
    if 4**kmer_size <= 256:
        datatype = np.uint8
    elif 4**kmer_size <= 2**16:
        datatype = np.uint16
    else:
        datatype = np.uint32

    n_channels = min(3, subkmers_per_window)
    window_starts = range(0, len(dna), kmer_size)

    planes = []
    for channel in range(3):
        plane = np.zeros(height_width * height_width, dtype=datatype)
        if channel < n_channels:
            lo = channel * offset
            values = [
                int(kmer_to_integer(dna[start + lo:start + lo + offset]))
                for start in window_starts
            ]
            # len(values) == n_windows <= height_width**2 by construction.
            plane[:len(values)] = values
        plane = plane.reshape(height_width, height_width)
        if channel < n_channels:
            print(plane)
        planes.append(plane)

    return np.dstack(planes)

# Smoke test: encode a short hand-written sequence with 4 bases per pixel
# and write the resulting image next to the notebook.
dna = (
    "TTTTAAAAAGATAGATAACACACTCATCAGTGTGTCAGTCGATCGTAGTATATACGCGATCAGCTAGCTAGCTAGCTGACACACACACATCGCGTAAGGAGATAGATAGACGCATGCATCGATCGA"
)

image = dna2rgb(dna, kmer_size=4, base_kmer=4)

io.imsave('dna.png', image)



height_width=6 x 6
[[255   0  35  35   4  71]
 [ 77  46 237  45 141 178]
 [204 198  99  73 201 201]
 [201 225  17  17  54 108]
 [ 40 140 140 134  78  77]
 [141   8   0   0   0   0]]


In [2]:
# Load one training shard through the project-local `embeddings.read_parquet`
# helper.
# NOTE(review): a later cell uses `len(df)` and `df.iloc[i].dna_sequence`, so
# this presumably returns a pandas DataFrame with a `dna_sequence` column —
# confirm against the helper.
df = read_parquet("../downloads/train_01.parquet")

Loading parquet dataset: ../downloads/train_01.parquet


In [5]:
# Render every sequence in the dataset to parq_<row position>.png.
# Spaces and 'N' characters are removed before encoding; one translate table
# does both deletions in a single pass and is built once outside the loop.
# NOTE(review): assumes `df` is a pandas DataFrame whose `dna_sequence`
# column holds str values — confirm against `read_parquet`.
strip_table = str.maketrans("", "", " N")
for i, sequence in enumerate(df["dna_sequence"]):
    # Iterating the column directly avoids building a full row Series via
    # df.iloc[i] on every iteration.
    image = dna2rgb(sequence.translate(strip_table), 4, base_kmer=4)
    io.imsave(f'parq_{i}.png', image)


height_width=836 x 836
[[164  41 166 ...  88 144 164]
 [185 173 182 ...  69 155  91]
 [151 144 171 ...  93  38   8]
 ...
 [102  54 101 ... 219  93 134]
 [239 119 104 ...   0   0   0]
 [  0   0   0 ...   0   0   0]]
height_width=810 x 810
[[233 190 186 ...  14  52 139]
 [172 253   4 ...  97  24 158]
 [ 24  65  78 ... 182 112 169]
 ...
 [ 28 174 217 ... 248 229 149]
 [184 129 244 ...   5  64 192]
 [222  63  70 ...   0   0   0]]
height_width=801 x 801
[[169 148 108 ...  75 218  62]
 [ 43 121   7 ...  96  45 160]
 [114 225  89 ... 167  70  27]
 ...
 [ 55 160 100 ... 157 129 249]
 [139 150  42 ... 229 101  29]
 [111  93  52 ...   0   0   0]]
height_width=810 x 810
[[125 138 202 ... 214  26  62]
 [194  68 107 ... 101 161 139]
 [ 54 125 234 ... 135 102  57]
 ...
 [103  79  73 ... 100 105 154]
 [ 68 145 138 ... 185   4 166]
 [249 237 233 ...   0   0   0]]
height_width=905 x 905
[[ 27 123  22 ... 163  62  98]
 [107 158 254 ... 148  40 239]
 [111 215 163 ... 141  31  74]
 ...
 [165 214 230 ... 1

KeyboardInterrupt: 