In [1]:
import os
import math
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import torch.optim as optim

from sklearn.preprocessing import LabelEncoder

In [2]:
csv_path = "/home/richard/labrotation/processed_sample_data/human_origins_labels.csv"
array_path = "//home/richard/labrotation/processed_sample_data/arrays"
index = 0

In [3]:
class sampleDataset(Dataset):
    def __init__(self, csv_file, array_dir, transform=None):
        self.annotations = pd.read_csv(csv_file)
        self.array_dir = array_dir
        self.transform = transform
    
    def __len__(self):
        return len(self.annotations)
    
    def __getitem__(self, index):
        array_path = os.path.join(self.array_dir, self.annotations.iloc[index, 0]+".npy")
        array = np.load(array_path)
        array = array.astype('float32')
        y_label = annotations.iloc[index,1]

        if self.transform:
            y_label = self.transform(y_label)
        
        return array, y_label

In [4]:
annotations = pd.read_csv(csv_path)
array_dir = array_path

In [5]:
def transformation(x):
    x = le.transform((x,))
    x = x[0]
    return(x)

In [6]:
classes = annotations['Origin'].unique()
le = LabelEncoder()
le.fit(classes)
dataset = sampleDataset(csv_file=csv_path, array_dir=array_dir, transform=transformation)
train_size = int(0.7 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
train_loader = DataLoader(dataset=train_dataset, batch_size=5, shuffle=True, drop_last=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=5, shuffle=True, drop_last=True)

In [7]:
test_batch = next(iter(test_loader))
test_array = test_batch[0]
test_batch[1]

tensor([0, 0, 2, 0, 1])

In [8]:
'''
expected input: Tensor of dimensions N*C*in_features
output: Tensor of dimensions N*out_sets*⌈(in_features/m)⌉
N: batch size
C: number of channels (e.g. 4 for one-hot-encoding of SNPs)
in_features: number of input features (e.g. SNP positions)
out_sets: number of output sets (new channels)
m: how many in_features to group together
kernel_size: kernel of flat tensor: m*C
padding: should we padd at the end of the dataframe if in_features%m !=0? 
'''
class LocallyConnectedLayer(torch.nn.Module):
    def __init__(self, in_features, m, C=4, padding=True, bias=False, out_sets=4):
        super().__init__()
        self.in_features = in_features
        self.C = C
        self.m = m
        self.kernel_size = m*C
        self.padding = (m-(in_features%m))%m*C if padding else 0
        self.weight = nn.Parameter(torch.randn(1,self.kernel_size, out_sets))
        self.bias = nn.Parameter(torch.randn(1,out_sets)) if bias else None # with batchnorm we do not need bias
    
    def forward(self, x):
        x = x.transpose(-1,-2) # we need to transpose first to ensure that the channel values of one in_feature are next to each other after flattening
        x = x.flatten(1) # dim(N,in_features*C)
        x = F.pad(x,(0,self.padding))
        x = x.unfold(-1, size=self.kernel_size, step=self.kernel_size)
        x = torch.matmul(x,self.weight)
        if self.bias is not None:
            x = x+self.bias
        x = x = x.transpose(-1,-2) # transpose back to have the more convenient dimension order
        return x


In [9]:
class LCBlock(nn.Module):
    def __init__(self, in_features, m, out_sets=4, p=0.0):
        super().__init__()
        self.bn = nn.BatchNorm1d(out_sets)
        self.silu = nn.SiLU()
        self.drop = nn.Dropout(p)
        self.LCL1 = LocallyConnectedLayer(in_features, m=m, padding=True, out_sets=out_sets)
        self.LCL2 = LocallyConnectedLayer(in_features=math.ceil(in_features/m),m=m, padding=True, out_sets=out_sets)
        self.identity_downsample = nn.Linear(in_features, out_features=math.ceil(in_features/m**2)) if m!=1 else None

    def forward(self, x):
        identity = x

        x = self.bn(x)
        x = self.silu(x)
        x = self.LCL1(x)
        x = self.bn(x)
        x = self.silu(x)
        x = self.drop(x)
        x = self.LCL2(x)

        if self.identity_downsample is not None:
            identity = self.identity_downsample(identity)

        x = x+identity
        return x

In [10]:
'''
expected input: flat tensor of shape N*in_features
expected output: flat tensor of shape N*out_features
N: batch size
'''
class FCBlock(nn.Module):
    def __init__(self, in_features, out_features, p=0.5):
        super().__init__()
        self.bn1 = nn.BatchNorm1d(in_features)
        self.bn2 = nn.BatchNorm1d(out_features)
        self.silu = nn.SiLU()
        self.drop = nn.Dropout(p)
        self.FCL1 = nn.Linear(in_features=in_features, out_features=out_features)
        self.FCL2 = nn.Linear(in_features=out_features, out_features=out_features)
        self.identity_downsample = nn.Linear(in_features, out_features=out_features) if in_features != out_features else None

    def forward(self, x):
        identity = x

        x = self.bn1(x)
        x = self.silu(x)
        x = self.FCL1(x)
        x = self.bn2(x)
        x = self.silu(x)
        x = self.drop(x)
        x = self.FCL2(x)

        if self.identity_downsample is not None:
            identity = self.identity_downsample(identity)

        x = x+identity
        return x

In [11]:
class GLN(nn.Module):
    def __init__(self, in_features, num_classes, num_residual_blocks=2, m1=2, m2=2, C=4, num_predictor_blocks=4):
        super().__init__()
        self.m1 = m1
        self.m2 = m2
        self.num_residual_blocks = num_residual_blocks
        self.num_predictor_blocks = num_residual_blocks
        self.LCL0 = LocallyConnectedLayer(in_features, m=m1)
        Output1 = math.ceil(in_features/m1)
        self.LCLayers = self.make_LCLayers(Output1)
        Output2 = math.ceil(Output1/(2*m2)**num_residual_blocks)*C # we flatten after the last block TO DO: IMPLEMENT ENVIRONMENT CONCATENATION
        self.FCLayers = self.make_predictorLayers(in_features=Output2)
        self.bn = nn.BatchNorm1d(256)
        self.silu = nn.SiLU()
        self.drop = nn.Dropout(p=0.5)
        self.Linear = nn.Linear(256,num_classes)
        

    def make_LCLayers(self, in_features):
        layers = []
        for block in range(self.num_residual_blocks):
            layers.append(LCBlock(in_features=in_features, m=self.m2))
            in_features = math.ceil(in_features/self.m2**2)
        return nn.Sequential(*layers)

    def make_predictorLayers(self, in_features):
        layers = []
        layers.append(FCBlock(in_features=in_features, out_features=256))
        for block in range(self.num_predictor_blocks):
            layers.append(FCBlock(in_features=256, out_features=256))
        return nn.Sequential(*layers)

    
    def forward(self,x):
        x = self.LCL0(x)
        x = self.LCLayers(x)
        x = x.flatten(1)
        x = self.FCLayers(x)
        x = self.bn(x)
        x = self.silu(x)
        x = self.drop(x)
        x = self.Linear(x)
        return x



In [12]:
# set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [13]:
# Hyperparameters
learning_rate = 1e-4
batch_size = 64


In [16]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=learning_rate)

In [17]:
def check_accuracy(loader, model):
    num_correct = 0
    num_samples = 0
    model.eval()

    with torch.no_grad():
        for x,y in loader:
            x = x.to(device=device)
            y = y.to(device=device)
            scores = model(x)
            predictions = torch.argmax(scores, dim=1)
            

            num_correct += (predictions==y).sum()
            num_samples += predictions.shape[0]
        #acc = float(num_correct/num_samples)*100
        #print(f' Got {num_correct} / {num_samples} with accuracy {float(num_correct/num_samples)*100}')

    model.train()
    return(num_correct,num_samples)

In [15]:
model = GLN(in_features=1000, num_classes=6,num_residual_blocks=2)

In [18]:
for epoch in range(151):
    losses = []

    for batch_idx, (data,targets) in enumerate(train_loader):
        data = data.to(device=device)
        targets = targets.flatten().to(device=device)

        # forwards
        scores = model(data)
        loss = criterion(scores, targets)
        losses.append(loss.item())

        # backwards
        optimizer.zero_grad()
        loss.backward()

        # gradient descent
        optimizer.step()
    
    mean_loss = sum(losses)/len(losses)
    
    if epoch % 5 ==0:
        print(f'loss at epoch {epoch} was {mean_loss:.5f}')
        num_correct, num_samples  = check_accuracy(loader = train_loader, model=model)
        print(f' Evaluation with training data got {num_correct} / {num_samples} with accuracy {float(num_correct/num_samples)*100:.5f}')
        num_correct, num_samples  = check_accuracy(loader = test_loader, model=model)
        print(f' Evaluation at testing data got {num_correct} / {num_samples} with accuracy {float(num_correct/num_samples)*100:.5f}')
        


loss at epoch 0 was 1.48890
 Evaluation with training data got 2402 / 3770 with accuracy 63.71353
 Evaluation at testing data got 1010 / 1615 with accuracy 62.53870
loss at epoch 5 was 0.64230
 Evaluation with training data got 3079 / 3770 with accuracy 81.67109
 Evaluation at testing data got 1203 / 1615 with accuracy 74.48916
loss at epoch 10 was 0.42667
 Evaluation with training data got 3539 / 3770 with accuracy 93.87268
 Evaluation at testing data got 1273 / 1615 with accuracy 78.82353
loss at epoch 15 was 0.23138
 Evaluation with training data got 3674 / 3770 with accuracy 97.45358
 Evaluation at testing data got 1271 / 1615 with accuracy 78.69969
loss at epoch 20 was 0.17150
 Evaluation with training data got 3727 / 3770 with accuracy 98.85942
 Evaluation at testing data got 1273 / 1615 with accuracy 78.82353
loss at epoch 25 was 0.10838
 Evaluation with training data got 3749 / 3770 with accuracy 99.44297
 Evaluation at testing data got 1275 / 1615 with accuracy 78.94737
loss a

In [43]:
model.eval()
scores = model(test_array)
print(torch.argmax(scores, dim=1), test_batch[1].flatten())
print(scores.max(1))
model.train()

tensor([1, 4, 5, 4, 1]) tensor([2, 0, 0, 2, 0])
torch.return_types.max(
values=tensor([-0.0575,  0.1082,  0.2198,  0.4747,  0.5401], grad_fn=<MaxBackward0>),
indices=tensor([1, 4, 5, 4, 1]))


GLN(
  (LCL0): LocallyConnectedLayer()
  (LCLayers): Sequential(
    (0): LCBlock(
      (bn): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (silu): SiLU()
      (drop): Dropout(p=0.0, inplace=False)
      (LCL1): LocallyConnectedLayer()
      (LCL2): LocallyConnectedLayer()
      (identity_downsample): Linear(in_features=500, out_features=125, bias=True)
    )
    (1): LCBlock(
      (bn): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (silu): SiLU()
      (drop): Dropout(p=0.0, inplace=False)
      (LCL1): LocallyConnectedLayer()
      (LCL2): LocallyConnectedLayer()
      (identity_downsample): Linear(in_features=125, out_features=32, bias=True)
    )
  )
  (FCLayers): Sequential(
    (0): FCBlock(
      (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (silu): SiLU()
   

In [57]:
acc, num = check_accuracy(loader = test_loader, model=model)

27.399999618530273


In [20]:
torch.save(model.state_dict(), 'my_model')