In [1]:
import numpy as np
import random
from numpy.core.numeric import ones_like
from torch_geometric.data import Data, DataLoader
import torch
from tqdm import tqdm

# s=str(input("Enter the DNA sequence:"))
# k=int(input("k to form k-mers:"))

class DeBruijnGraph:
    """De Bruijn graph of a DNA string.

    Nodes are the distinct (k-1)-mers of the input, numbered in order of first
    appearance. Every k-mer of the input contributes one directed edge from
    its (k-1)-prefix to its (k-1)-suffix, in sequence order, so a repeated
    k-mer yields a repeated edge.

    Attributes set by the constructor:
        x: ``(num_nodes, 1)`` string array; row ``i`` holds the label of node ``i``.
        edge_index: ``(2, num_edges)`` int array; row 0 holds source node ids
            and row 1 holds target node ids.
    """

    # Column of the one-hot matrix assigned to each base.
    _BASE_INDEX = {"A": 0, "C": 1, "G": 2, "T": 3}

    def __init__(self, st, k):
        """Build the graph of ``st`` using k-mers of length ``k``.

        If ``k`` or ``st`` is rejected by ``generate`` the graph stays empty.
        """
        self.x = np.empty([0, 1], dtype=str)
        self.edge_index = np.empty([2, 0], dtype=int)
        self.generate(st, k)

    def chop(self, st, k):
        """Split ``st`` into its k-mers and their (k-1)-prefixes/suffixes.

        Returns:
            Three 1-D string arrays of equal length ``max(len(st) - k + 1, 0)``:
            the k-mers, their prefixes ``st[i:i+k-1]`` and their suffixes
            ``st[i+1:i+k]``.
        """
        starts = range(0, len(st) - (k - 1))
        # Collect in plain lists and convert once; growing an array with
        # np.append copies the whole array on every iteration.
        kmers = [st[i:i + k] for i in starts]
        prefixes = [st[i:i + k - 1] for i in starts]
        suffixes = [st[i + 1:i + k] for i in starts]
        return (
            np.array(kmers, dtype=str),
            np.array(prefixes, dtype=str),
            np.array(suffixes, dtype=str),
        )

    def generate(self, st, k):
        """Populate ``self.x`` and ``self.edge_index`` from ``st``.

        Prints a message and leaves the graph unchanged when ``k <= 1`` or
        when ``st`` is shorter than ``k``. On success the previous contents
        are replaced rather than extended.
        """
        if k <= 1:
            print("invalid value of k returning empty graph")
            return
        if len(st) < k:
            print("insufficient size of string input returning empty graph")
            return

        _, prefixes, suffixes = self.chop(st, k)

        node_ids = {}  # node label -> node id, in order of first appearance
        sources, targets = [], []
        for prefix, suffix in zip(prefixes, suffixes):
            # setdefault assigns the next free id only to unseen labels.
            sources.append(node_ids.setdefault(prefix, len(node_ids)))
            targets.append(node_ids.setdefault(suffix, len(node_ids)))

        self.edge_index = np.array([sources, targets], dtype=int)
        # dicts preserve insertion order, so row i is the label of node i.
        self.x = np.array(list(node_ids), dtype=str).reshape(-1, 1)

    def reverse(self):  # gives back the DNA sequence from the graph
        """Reconstruct the original sequence by walking the stored edges.

        Relies on the edges being stored in sequence order, which is how
        ``generate`` writes them. Returns ``''`` for an empty graph.
        """
        if self.edge_index.shape[1] == 0 or self.x.shape[0] == 0:
            return ''
        labels = self.x[:, 0]
        sources, targets = self.edge_index
        # The sequence is the first node label followed by the last base of
        # every edge target. Stitching the first k-mer as a[:-1] + b (the
        # previous approach) is only correct when k == 3.
        pieces = [str(labels[sources[0]])]
        pieces.extend(labels[target][-1] for target in targets)
        return ''.join(pieces)

    def one_hot_encode(self, seq):
        """One-hot encode ``seq`` over the alphabet A, C, G, T.

        Returns:
            ``(len(seq), 4)`` float array with a single 1 per row.

        Raises:
            KeyError: if ``seq`` contains a character other than A, C, G or T.
        """
        indices = [self._BASE_INDEX[base] for base in seq]
        return np.eye(4)[indices]

In [2]:
# Prepare Data
# Build a synthetic dataset: one De Bruijn graph per random DNA sequence.
# The stdlib RNG is seeded so that the sequences and labels are identical on
# every "Restart & Run All".
SEED = 11
NUM_GRAPHS = 10000   # number of graphs to generate
SEQ_LEN = 200        # length of every random DNA sequence
random.seed(SEED)

kmer, DATALIST = 3, []
for _ in tqdm(range(NUM_GRAPHS)):
    # Generate a random DNA sequence and its graph
    s = ''.join(random.choices(['A', 'T', 'G', 'C'], k=SEQ_LEN))
    d = DeBruijnGraph(s, kmer)

    # Node features: the one-hot encoding of each node label, flattened to a
    # single row per node.
    onehot_x = np.array([d.one_hot_encode(node).flatten() for node in d.x.flatten()])

    # Arrays to pytorch tensors
    onehot_x_tensor = torch.tensor(onehot_x, dtype=torch.float)
    onehot_edge_index_tensor = torch.tensor(d.edge_index, dtype=torch.long)

    # Random binary label. It is drawn independently of the sequence, so it
    # carries no learnable signal: ~50% accuracy is the expected outcome.
    y_tensor = torch.tensor(random.choice([0, 1]))

    # Add tensors to torch_geometric data object
    data = Data(x=onehot_x_tensor, edge_index=onehot_edge_index_tensor, y=y_tensor)

    DATALIST.append(data)

print('Datapoints:', len(DATALIST))

# Report both shapes for the same sample (index 0).
print('x_shape:', DATALIST[0].x.shape)
print('edge_index_shape:', DATALIST[0].edge_index.shape)
print()

100%|██████████| 10000/10000 [00:56<00:00, 176.36it/s]

Datapoints: 10000
x_shape: torch.Size([16, 8])
edge_index_shape: torch.Size([2, 198])






In [3]:
# Inspect one sample: the graph stored at index 1 of DATALIST.
data = DATALIST[1]

print()
print(data)
print('=============================================================')

# Summary statistics for that graph.
node_count, edge_count = data.num_nodes, data.num_edges
print(f'Number of nodes: {node_count}')
print(f'Number of edges: {edge_count}')
print(f'Average node degree: {edge_count / node_count:.2f}')
print(f'Contains self-loops: {data.contains_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')

# Mini-batch loader over the full dataset, 16 graphs per batch.
loader = DataLoader(DATALIST, batch_size=16)


Data(edge_index=[2, 198], x=[16, 8], y=0)
Number of nodes: 16
Number of edges: 198
Average node degree: 12.38
Contains self-loops: True
Is undirected: False


In [4]:
# Split data: the first 7500 graphs form the training set and the remainder
# the test set. The list is used in generation order (no shuffling).
torch.manual_seed(11)

split_at = 7500
train_dataset, test_dataset = DATALIST[:split_at], DATALIST[split_at:]

print('Number of training graph:', len(train_dataset))
print('Number of testing graph:', len(test_dataset))

Number of training graph: 7500
Number of testing graph: 2500


In [5]:
from torch_geometric.data import DataLoader
# Both loaders use the same settings: shuffled batches of 128 graphs.
train_loader, test_loader = (
    DataLoader(split, batch_size=128, shuffle=True)
    for split in (train_dataset, test_dataset)
)

# Walk the training loader once and describe every batch it yields.
for step, data in enumerate(train_loader):
    print(
        f'Step {step+1}:',
        '=====',
        f'Number of graphs in current batch: {data.num_graphs}',
        sep='\n',
    )
    print(data, end='\n\n')

Step 1:
=====
Number of graphs in current batch: 128
Batch(batch=[2048], edge_index=[2, 25344], ptr=[129], x=[2048, 8], y=[128])

Step 2:
=====
Number of graphs in current batch: 128
Batch(batch=[2048], edge_index=[2, 25344], ptr=[129], x=[2048, 8], y=[128])

Step 3:
=====
Number of graphs in current batch: 128
Batch(batch=[2048], edge_index=[2, 25344], ptr=[129], x=[2048, 8], y=[128])

Step 4:
=====
Number of graphs in current batch: 128
Batch(batch=[2048], edge_index=[2, 25344], ptr=[129], x=[2048, 8], y=[128])

Step 5:
=====
Number of graphs in current batch: 128
Batch(batch=[2048], edge_index=[2, 25344], ptr=[129], x=[2048, 8], y=[128])

Step 6:
=====
Number of graphs in current batch: 128
Batch(batch=[2048], edge_index=[2, 25344], ptr=[129], x=[2048, 8], y=[128])

Step 7:
=====
Number of graphs in current batch: 128
Batch(batch=[2048], edge_index=[2, 25344], ptr=[129], x=[2048, 8], y=[128])

Step 8:
=====
Number of graphs in current batch: 128
Batch(batch=[2048], edge_index=[2, 25

In [6]:
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool

class GCN(torch.nn.Module):
    """Graph classifier: three GCNConv layers, mean-pool readout, linear head.

    Args:
        hidden_channels: output width of every GCNConv layer.
        in_channels: size of each node feature vector. Defaults to 8, the
            value that was previously hard-coded.
        num_classes: number of output logits per graph. Defaults to 2, the
            value that was previously hard-coded.
    """

    def __init__(self, hidden_channels, in_channels=8, num_classes=2):
        super().__init__()
        # Re-seeds torch's global RNG so the layer initialisation is the same
        # on every construction. Note this affects later torch randomness too.
        torch.manual_seed(5)
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index, batch):
        """Return one row of ``num_classes`` logits per graph in the batch.

        Args:
            x: node feature matrix.
            edge_index: edge list passed to every GCNConv layer.
            batch: per-node graph assignment used by the mean-pool readout.
        """
        # Node embeddings (no activation after the last convolution)
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)

        # Readout layer: average the node embeddings of each graph
        x = global_mean_pool(x, batch)

        # Output layer (dropout is currently disabled)
        # x = F.dropout(x, p=0.4, training=self.training)
        x = self.lin(x)

        return x
    
# Build a 256-channel instance and print its layer layout.
model = GCN(256)
print(model)

GCN(
  (conv1): GCNConv(8, 256)
  (conv2): GCNConv(256, 256)
  (conv3): GCNConv(256, 256)
  (lin): Linear(in_features=256, out_features=2, bias=True)
)


In [7]:
# Peek at the first training batch only and report its tensor shapes.
for data in train_loader:
    for field in (data.edge_index, data.x, data.batch):
        print(field.shape)
    break

torch.Size([2, 25344])
torch.Size([2048, 8])
torch.Size([2048])


In [8]:
# Pick the compute device: the second GPU when the machine has more than one,
# otherwise the first GPU, otherwise the CPU.
if torch.cuda.is_available():
    gpu_index = 1 if torch.cuda.device_count() > 1 else 0
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")
print(device)

# The CUDA calls are only valid when a GPU was actually selected; running them
# unconditionally crashes on CPU-only and single-GPU machines.
if device.type == "cuda":
    torch.cuda.set_device(device)
    print('Current cuda device ID:',torch.cuda.current_device())
    print('Current cuda device name:', torch.cuda.get_device_name())

cuda:1
Current cuda device ID: 1
Current cuda device name: Tesla V100-PCIE-32GB


In [9]:
# Train/test
# Fresh model placed on the selected device, with its loss and optimiser.
model = GCN(hidden_channels=256).to(device)

loss_func = torch.nn.CrossEntropyLoss()
# NOTE(review): lr=0.1 is large for Adam; confirm this value is intended.
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

def to_device(data, device):
    """Return ``data`` moved to ``device``, requesting a non-blocking copy."""
    moved = data.to(device, non_blocking=True)
    return moved

def train():
    """Run one optimisation epoch over ``train_loader``.

    Uses the module-level ``model``, ``optimizer``, ``loss_func``, ``device``
    and ``train_loader``.

    Returns:
        float: the loss averaged over all graphs seen in the epoch. This
        assumes ``loss_func`` returns a per-batch mean. The function used to
        return ``None``; callers that ignore the result are unaffected.
    """
    model.train()
    total_loss, total_graphs = 0.0, 0
    for data in train_loader:
        a = to_device(data.x, device)
        b = to_device(data.edge_index, device)
        c = to_device(data.batch, device)
        d = to_device(data.y, device)

        # Clear gradients first so nothing left over from earlier work can
        # leak into this step.
        optimizer.zero_grad()
        out = model(a, b, c)
        loss = loss_func(out, d)
        loss.backward()
        optimizer.step()

        # Weight the batch mean by the batch size so the epoch average stays
        # exact when the last batch is smaller.
        batch_graphs = d.size(0)
        total_loss += loss.item() * batch_graphs
        total_graphs += batch_graphs
    return total_loss / max(total_graphs, 1)
    
def test(loader):
    """Return the classification accuracy of ``model`` over ``loader``.

    Uses the module-level ``model`` and ``device``. The result is the number
    of correctly predicted graphs divided by ``len(loader.dataset)``.
    """
    model.eval()
    correct = 0
    # No gradients are needed for evaluation; disabling autograd avoids
    # building a graph for every forward pass.
    with torch.no_grad():
        for data in loader:
            a = to_device(data.x, device)
            b = to_device(data.edge_index, device)
            c = to_device(data.batch, device)
            d = to_device(data.y, device)

            out = model(a, b, c)
            pred = out.argmax(dim=1)  # predicted class = index of largest logit
            correct += int((pred == d).sum())
    return correct / len(loader.dataset)

In [11]:
# Per-epoch accuracy history for the training and test splits.
train_, test_ = [], []
for epoch in range(1, 10):
    train()

    # Score the training split first, then the test split.
    train_acc, test_acc = test(train_loader), test(test_loader)
    train_ += [train_acc]
    test_ += [test_acc]

    print(f'Epoch:{epoch:03d}, Train acc: {train_acc:.4f}, Test acc: {test_acc:.4f}')

Epoch:001, Train acc: 0.4993, Test acc: 0.5020
Epoch:002, Train acc: 0.5007, Test acc: 0.4980
Epoch:003, Train acc: 0.4993, Test acc: 0.5020
Epoch:004, Train acc: 0.5007, Test acc: 0.4980
Epoch:005, Train acc: 0.4993, Test acc: 0.5020
Epoch:006, Train acc: 0.5007, Test acc: 0.4980
Epoch:007, Train acc: 0.4993, Test acc: 0.5020
Epoch:008, Train acc: 0.5007, Test acc: 0.4980
Epoch:009, Train acc: 0.4993, Test acc: 0.5020


## loader.dataset[1]