In [16]:
import numpy as np
import random
from numpy.core.numeric import ones_like
from torch_geometric.data import Data, DataLoader
import torch
# s=str(input("Enter the DNA sequence:"))
# k=int(input("k to form k-mers:"))

class DeBruijnGraph:
    """De Bruijn graph of a DNA string, stored as NumPy arrays.

    Nodes are the distinct (k-1)-mers of the input string and every k-mer
    contributes one edge from its (k-1)-prefix to its (k-1)-suffix. Edges are
    kept in the order the k-mers occur in the string, so a repeated k-mer
    yields a repeated edge.

    Attributes:
        x: array of shape (num_nodes, 1) with one (k-1)-mer string per row;
            row ``i`` is the node whose id is ``i``. For an empty graph this
            is the ``np.empty([0, 1])`` placeholder.
        edge_index: integer array of shape (2, num_edges); column ``j`` holds
            the (source id, target id) pair of the j-th k-mer.
    """

    # Column of the one-hot vector assigned to each nucleotide.
    _BASE_INDEX = {"A": 0, "C": 1, "G": 2, "T": 3}

    def __init__(self, st, k):
        """Build the graph of string ``st`` using k-mers of length ``k``.

        Invalid input (see ``generate``) leaves the graph empty.
        """
        self.x = np.empty([0, 1])
        self.edge_index = np.empty([2, 0], dtype=int)
        self.generate(st, k)

    def chop(self, st, k):
        """Split ``st`` into its k-mers and their prefixes and suffixes.

        Args:
            st: the sequence to split.
            k: k-mer length.

        Returns:
            Three 1-D string arrays of equal length ``len(st) - k + 1`` (or
            empty when ``st`` is shorter than ``k``): the k-mers, the
            (k-1)-prefix of each k-mer and the (k-1)-suffix of each k-mer.
        """
        starts = range(0, len(st) - (k - 1))
        # Collect in lists and convert once; np.append copies the whole array
        # on every call, which made the original loop quadratic.
        kmers = np.array([st[i:i + k] for i in starts], dtype=str)
        prefixes = np.array([st[i:i + k - 1] for i in starts], dtype=str)
        suffixes = np.array([st[i + 1:i + k] for i in starts], dtype=str)
        return kmers, prefixes, suffixes

    def generate(self, st, k):
        """(Re)build ``self.x`` and ``self.edge_index`` from ``st``.

        The graph is reset first, so calling this again replaces the previous
        graph instead of appending to it. When ``k <= 1`` or ``st`` is shorter
        than ``k`` a message is printed and the graph stays empty.

        Args:
            st: the sequence to encode.
            k: k-mer length; must be at least 2.
        """
        self.x = np.empty([0, 1])
        self.edge_index = np.empty([2, 0], dtype=int)

        if k <= 1:
            print("invalid value of k returning empty graph")
            return
        if len(st) < k:
            print("insufficient size of string input returning empty graph")
            return

        _, prefixes, suffixes = self.chop(st, k)

        # Ids are handed out in order of first appearance, so the dict's
        # insertion order is also the row order of ``x``.
        node_ids = {}
        sources, targets = [], []
        for prefix, suffix in zip(prefixes.tolist(), suffixes.tolist()):
            sources.append(node_ids.setdefault(prefix, len(node_ids)))
            targets.append(node_ids.setdefault(suffix, len(node_ids)))

        self.edge_index = np.array([sources, targets], dtype=int)
        self.x = np.array(list(node_ids), dtype=str).reshape(-1, 1)

    def reverse(self):
        """Rebuild the original sequence by replaying the edges in order.

        Returns:
            The reconstructed string, or ``''`` when the graph has no edges
            or no nodes.
        """
        if self.edge_index.shape[1] == 0 or self.x.shape[0] == 0:
            return ''
        sources, targets = self.edge_index
        # The first edge contributes its whole (k-1)-prefix; after that every
        # edge adds exactly one new base, the last character of its target.
        # (The previous ``a[:-1] + b`` form was only correct for k == 3.)
        pieces = [self.x[sources[0]][0]]
        pieces.extend(self.x[target][0][-1] for target in targets)
        return ''.join(pieces)

    def one_hot_encode(self, seq):
        """One-hot encode a nucleotide string.

        Args:
            seq: iterable of the characters ``A``, ``C``, ``G``, ``T``.

        Returns:
            Float array of shape (len(seq), 4) with columns ordered A, C, G, T.

        Raises:
            KeyError: if ``seq`` contains any other character.
        """
        indices = [self._BASE_INDEX[base] for base in seq]
        return np.eye(4)[indices]

In [29]:
if __name__=='__main__':
    # Demo: turn random DNA strings into torch_geometric graphs and inspect one.

    # Fixed seed so the random sequences (and every number printed below) are
    # the same on each fresh run of the notebook.
    SEED, NUM_GRAPHS, SEQ_LEN = 0, 100, 200
    random.seed(SEED)

    kmer, DATALIST = 3, []
    for i in range(NUM_GRAPHS):
        # Feed DNA sequences
        s = ''.join(random.choices(['A', 'T', 'G', 'C'], k=SEQ_LEN))
        d = DeBruijnGraph(s, kmer)

        # One (k-1, 4) one-hot matrix per node, in node-id order.
        # NOTE(review): this makes x three-dimensional (nodes, k-1, 4);
        # layers that expect (nodes, features) may need it flattened - confirm
        # against the model that consumes these graphs.
        onehot_x = [d.one_hot_encode(node).tolist() for node in d.x.flatten()]

        # Arrays to pytorch tensors
        onehot_x_tensor = torch.tensor(np.array(onehot_x), dtype=torch.float)
        onehot_edge_index_tensor = torch.tensor(d.edge_index, dtype=torch.long)

        # Add tensors to torch_geometric data object
        data = Data(x=onehot_x_tensor, edge_index=onehot_edge_index_tensor)

        DATALIST.append(data)

    print('Datapoints:', len(DATALIST))

    # Report both shapes and all statistics for the same graph (index 0);
    # previously x came from graph 0 and everything else from graph 1.
    data = DATALIST[0]  # Get the first graph object.

    print('x_shape:', data.x.shape)
    print('edge_index_shape:', data.edge_index.shape)
    print()

    print()
    print(data)
    print('=============================================================')

    # Gather some statistics about the first graph.
    # Repeated k-mers are stored as repeated edges, so num_edges counts them all.
    print(f'Number of nodes: {data.num_nodes}')
    print(f'Number of edges: {data.num_edges}')
    print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
    print(f'Contains self-loops: {data.contains_self_loops()}')
    print(f'Is undirected: {data.is_undirected()}')

    # pytorch dataloader usage
    loader = DataLoader(DATALIST, batch_size=16)

Datapoints: 100
x_shape: torch.Size([16, 2, 4])
edge_index_shape: torch.Size([2, 198])


Data(edge_index=[2, 198], x=[16, 2, 4])
Number of nodes: 16
Number of edges: 198
Average node degree: 12.38
Contains self-loops: True
Is undirected: False
