In [1]:
import glob

import torch
from torch import nn
from torch.nn import MSELoss, BCELoss, CrossEntropyLoss, NLLLoss
from torch.utils.data import TensorDataset, Dataset, DataLoader, random_split
from torchvision import transforms
from torch.optim.lr_scheduler import ReduceLROnPlateau

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from tqdm import tqdm
from schedulers import *
import copy



def load_methylation_pkl(files):    
    '''
    Load methylation data from csv file.

    Note: We set nrows=5000 for test.
    If you want to use full data, it is recommended to read csv file by chunks 
    or other methods since the csv file is very large.
    Note the memory usage when you read csv file.

    We fill nan with 0, you can try other methods.
    '''
    import joblib

    methylation = np.concatenate([joblib.load(file) for file in files], axis= 0)
    # methylation = pd.read_csv(methy_dir, sep=',', index_col=0, nrows=50000)
#     methylation.fillna(0, inplace=True)
    # methylation = methylation.values.T.astype(np.float16)
    
#     methylation = np.nan_to_num(methylation, nan = 0)
    return methylation


disease_mapping = {
  'control': 0,
  "Alzheimer's disease": 1,
  "Graves' disease": 2,
  "Huntington's disease": 3,
  "Parkinson's disease": 4,
  'rheumatoid arthritis': 5,
  'schizophrenia': 6,
  "Sjogren's syndrome": 7,
  'stroke': 8,
  'type 2 diabetes': 9
}
sample_type_mapping = {'control': 0, 'disease tissue': 1}

In [2]:
def load_idmap(idmap_dir):
    idmap = pd.read_csv(idmap_dir, sep=',')
    age = idmap.age.to_numpy()
    age = age.astype(np.float16)
    sample_type = idmap.sample_type.replace(sample_type_mapping)
    return age, sample_type


In [3]:
idmap_train_dir = '/opt/tml/tmp/dataphin-data/user/wangshulin/天池比赛/train/trainmap.csv'
idmap_test_dir = '/opt/tml/tmp/dataphin-data/user/wangshulin/天池比赛/test/testmap.csv'
methy_train_dir = '/opt/tml/tmp/dataphin-data/user/wangshulin/天池比赛/train'
methy_test_dir = '/opt/tml/tmp/dataphin-data/user/wangshulin/天池比赛/test'

age, sample_type = load_idmap(idmap_train_dir)
age = [[i] for i in age]
indices = np.arange(len(age))

In [4]:
# 加载 feature_select 特征
output_filename = '/root/notebook/model_code/dna/data/pearsonr/methylation_4W9.pkl'

methylation = joblib.load(output_filename)

In [5]:
# 分成训练集和验证集
methylation = methylation.to_numpy()

[indices_train, indices_valid, age_train,
age_valid] = train_test_split(indices, age, test_size=0.3, random_state= 0)

methylation_train, methylation_valid = methylation[
indices_train], methylation[indices_valid]

sample_type_train, sample_type_valid = sample_type[
indices_train], sample_type[indices_valid]

feature_size = methylation_train.shape[1]


In [6]:
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [7]:
# 根据Tensor创建数据集

from sklearn import datasets 
# iris = datasets.load_iris()
ds_train = TensorDataset(torch.tensor(methylation_train, dtype=torch.float32).to(device),torch.tensor(age_train,dtype=torch.float32))
ds_valid = TensorDataset(torch.tensor(methylation_valid,dtype=torch.float32).to(device),torch.tensor(age_valid,dtype=torch.float32))


# 分割成训练集和预测集
# n_train = int(len(methylation)*0.8)
# n_valid = len(methylation) - n_train
# ds_train,ds_valid = random_split(methylation,[n_train,n_valid])

print(ds_valid)

<torch.utils.data.dataset.TensorDataset object at 0x7fb406fe9b10>


In [8]:
# 使用DataLoader加载数据集
dl_train,dl_valid = DataLoader(ds_train,batch_size = 128),DataLoader(ds_valid,batch_size = 128)

for features,labels in dl_train:
    print(features,features[0].shape,labels[0])
    break

tensor([[-0.4138, -2.0898, -0.7354,  ..., -4.6914, -6.1641, -3.3145],
        [-2.6328, -1.9727, -1.9365,  ..., -5.0938, -5.7734, -3.2559],
        [-0.7349,  0.4641, -0.3262,  ..., -5.4922, -9.2109, -3.5078],
        ...,
        [-0.8901,  0.2249,  0.2168,  ..., -6.8125, -9.2109, -3.5078],
        [-2.0117, -1.3428, -0.6147,  ..., -4.9414, -6.8125, -3.0527],
        [-2.7676, -1.8828, -0.8853,  ..., -5.2734, -9.2109, -2.6328]],
       device='cuda:0') torch.Size([50553]) tensor([82.])


In [9]:
class DNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, dropout_p=0.1, binary=False):
        super(DNN, self).__init__()
        self.topology = [input_size] + hidden_size + [output_size]
        fc_layers = [nn.Linear(self.topology[i], self.topology[i+1]) for i in range(len(self.topology)-1)]
        for layer in fc_layers:
            nn.init.kaiming_uniform_(layer.weight)
#         fc_net = [nn.Sequential(i, nn.ReLU(), nn.Dropout(dropout_p)) for i in fc_layers]
        fc_net = [nn.Sequential(i, nn.ReLU()) for i in fc_layers]
        if binary:
            out_layer = nn.Sigmoid()
            fc_net.append(out_layer)
        self.dnn = nn.Sequential(*fc_net)
        
    def forward(self, x):
        return self.dnn(x)
        
    

In [10]:
model = DNN(50553, hidden_size=[1024,512,256,128,64,32,16,8,4], output_size=1, dropout_p=0.1).to(device)
model

DNN(
  (dnn): Sequential(
    (0): Sequential(
      (0): Linear(in_features=50553, out_features=1024, bias=True)
      (1): ReLU()
    )
    (1): Sequential(
      (0): Linear(in_features=1024, out_features=512, bias=True)
      (1): ReLU()
    )
    (2): Sequential(
      (0): Linear(in_features=512, out_features=256, bias=True)
      (1): ReLU()
    )
    (3): Sequential(
      (0): Linear(in_features=256, out_features=128, bias=True)
      (1): ReLU()
    )
    (4): Sequential(
      (0): Linear(in_features=128, out_features=64, bias=True)
      (1): ReLU()
    )
    (5): Sequential(
      (0): Linear(in_features=64, out_features=32, bias=True)
      (1): ReLU()
    )
    (6): Sequential(
      (0): Linear(in_features=32, out_features=16, bias=True)
      (1): ReLU()
    )
    (7): Sequential(
      (0): Linear(in_features=16, out_features=8, bias=True)
      (1): ReLU()
    )
    (8): Sequential(
      (0): Linear(in_features=8, out_features=4, bias=True)
      (1): ReLU()
    )
 

In [18]:
class Trainer:
    def __init__(self, model, val_dataloader, n_epoch=100, optimizer=torch.optim.Adam, lr=1e-3, loss_fn=nn.MSELoss()):
        self.model = model
        self.val_dataloader = val_dataloader
        self.n_epoch = n_epoch
        self.loss_fn = loss_fn
        self.optimizer = optimizer(model.parameters(), lr=lr)
        self.scheduler = ReduceLROnPlateau(self.optimizer, mode='min', factor=0.1, patience=1000,verbose = True)


    def train_loop(self, train_dataloader, metric=0, verbose=True):
        self.model.train()

        total_loss = 0.

        for i,(X,y) in enumerate(train_dataloader):
            if torch.cuda.is_available():
                X = X.cuda()
                y = y.cuda()
            y_pred = self.model(X)
            self.optimizer.zero_grad()
            loss = self.loss_fn(y_pred, y)

            loss.backward() # loss 是个grad的tensor,获取数值需要loss.item()
            self.optimizer.step()

            total_loss += loss.item()
            self.scheduler.step(loss) # 每个epoch/batch 更新一次lr
        
        epoch_loss = total_loss/(i+1)

        return epoch_loss


    def val_loop(self, val_dataloader):
        self.model.eval()

        epoch_loss = 0.
        with torch.no_grad():
            for i,(X,y) in enumerate(val_dataloader):
                if torch.cuda.is_available():
                    X = X.cuda()
                    y = y.cuda()

                pred = self.model(X)
                loss = self.loss_fn(pred, y)
                epoch_loss += loss.item()

            return epoch_loss/(i+1)

    # test_dataloader should be wiht no y label             
    def test_loop(self, test_dataloader):
        self.model.eval()

        y_pred = []
        for i,batch_data in enumerate(test_dataloader):
            # TODO
            X = batch_data # or batch_data[0]

        
    def fit(self, train_dataloader, verbose=True):
        for epoch in range(self.n_epoch):
            train_loss = self.train_loop(train_dataloader)
            val_loss = self.val_loop(self.val_dataloader)
            print(f'epoch_{epoch} train_loss: {train_loss},val_loss:{val_loss}')     
            
            if verbose:
                current_lr = self.optimizer.param_groups[0]['lr']
                print(f'Epoch {epoch}: Learning Rate = {current_lr}')
        


In [19]:
DNN_trainer=Trainer(model, val_dataloader=dl_valid, n_epoch=1000, optimizer=torch.optim.AdamW, lr=0.001, loss_fn=nn.MSELoss())

In [20]:
DNN_trainer.fit(dl_train)

epoch_0 train_loss: 3517.6009999150815,val_loss:3573.098742675781
Epoch 0: Learning Rate = 0.001
epoch_1 train_loss: 3517.6009999150815,val_loss:3573.098742675781
Epoch 1: Learning Rate = 0.001
epoch_2 train_loss: 3517.6009999150815,val_loss:3573.098742675781
Epoch 2: Learning Rate = 0.001
epoch_3 train_loss: 3517.6009999150815,val_loss:3573.098742675781
Epoch 3: Learning Rate = 0.001
epoch_4 train_loss: 3517.6009999150815,val_loss:3573.098742675781
Epoch 4: Learning Rate = 0.001
epoch_5 train_loss: 3517.6009999150815,val_loss:3573.098742675781
Epoch 5: Learning Rate = 0.001


KeyboardInterrupt: 

In [224]:
# class fc(nn.Module):
#     def __init__(self, 
#                  input_size, 
#                  hidden_sizes, 
#                  output_size, 
#                  dropout, 
#                  if_bn=True):
#         super().__init__()
        
#         torch.manual_seed(123)
#         torch.cuda.manual_seed(123)
#         self.input_layer = nn.Linear(input_size, hidden_sizes[0])
#         self.hidden_layers = nn.ModuleList([nn.Linear(hidden_sizes[i], hidden_sizes[i+1]) for i in range(len(hidden_sizes)-1)])
#         self.bn_layers = nn.ModuleList([nn.BatchNorm1d(hidden_sizes[i+1]) for i in range(len(hidden_sizes)-1)])
        
#         self.dp = nn.Dropout(dropout)
#         self.output_layer = nn.Linear(hidden_sizes[-1], output_size)

#         torch.nn.init.kaiming_uniform(self.input_layer.weight)
#         [torch.nn.init.kaiming_uniform_(layer.weight) for layer in self.hidden_layers]
#         torch.nn.init.kaiming_uniform(self.output_layer.weight)
#         self.if_bn = if_bn
        
#     def forward(self, x):
#         x = nn.GELU()(self.input_layer(x))

#         if len(x.shape) == 2:
#             for l,bn in zip(self.hidden_layers,self.bn_layers):
#                 x = l(x)
#                 if self.if_bn:
#                     x = bn(x)
#                 else:
#                     pass
#                 x = nn.GELU()(x)
#                 x = self.dp(x)
        
#         elif len(x.shape) == 3:
#             for l,bn in zip(self.hidden_layers,self.bn_layers):
#                 x = l(x)
#                 if self.if_bn:
#                     x = x.permute(0,2,1)
#                     x = bn(x)
#                     x = x.permute(0,2,1)
#                 else:
#                     pass
#                 x = nn.GELU()(x)
#                 x = self.dp(x)
#         else:
#             print('input size needs to be 2d or 3d')
#             raise
#         emb = x
#         x = self.output_layer(x)
#         return x

In [225]:
# class Model(nn.Module):
#     def __init__(self, num_features, num_targets, hidden_sizes):
#         super(Model, self).__init__()
#         self.num_features = num_features
#         self.num_targets = num_targets
#         self.hidden_sizes = hidden_sizes
#         self.dropout = 0.2
#         self.fc = fc(input_size = num_features,
#                      hidden_sizes = self.hidden_sizes,
#                      output_size = self.num_targets,
#                      dropout = self.dropout,
#                      if_bn = False)

#     def forward(self, x):
#         out = self.fc(x)
# #         out = torch.sigmoid(out)

#         return out

In [220]:
model = Model(num_features = 50553, num_targets = 1, hidden_sizes = [1024,512,256,128,64,32,16,8,4,1]).to('cuda')



In [200]:
from torchsummary import summary


In [204]:
summary(model)

Layer (type:depth-idx)                   Param #
├─fc: 1-1                                --
|    └─Linear: 2-1                       51,767,296
|    └─ModuleList: 2-2                   --
|    |    └─Linear: 3-1                  524,800
|    |    └─Linear: 3-2                  131,328
|    |    └─Linear: 3-3                  32,896
|    |    └─Linear: 3-4                  8,256
|    |    └─Linear: 3-5                  2,080
|    |    └─Linear: 3-6                  528
|    |    └─Linear: 3-7                  136
|    |    └─Linear: 3-8                  36
|    |    └─Linear: 3-9                  5
|    └─ModuleList: 2-3                   --
|    |    └─BatchNorm1d: 3-10            1,024
|    |    └─BatchNorm1d: 3-11            512
|    |    └─BatchNorm1d: 3-12            256
|    |    └─BatchNorm1d: 3-13            128
|    |    └─BatchNorm1d: 3-14            64
|    |    └─BatchNorm1d: 3-15            32
|    |    └─BatchNorm1d: 3-16            16
|    |    └─BatchNorm1d: 3-17       

Layer (type:depth-idx)                   Param #
├─fc: 1-1                                --
|    └─Linear: 2-1                       51,767,296
|    └─ModuleList: 2-2                   --
|    |    └─Linear: 3-1                  524,800
|    |    └─Linear: 3-2                  131,328
|    |    └─Linear: 3-3                  32,896
|    |    └─Linear: 3-4                  8,256
|    |    └─Linear: 3-5                  2,080
|    |    └─Linear: 3-6                  528
|    |    └─Linear: 3-7                  136
|    |    └─Linear: 3-8                  36
|    |    └─Linear: 3-9                  5
|    └─ModuleList: 2-3                   --
|    |    └─BatchNorm1d: 3-10            1,024
|    |    └─BatchNorm1d: 3-11            512
|    |    └─BatchNorm1d: 3-12            256
|    |    └─BatchNorm1d: 3-13            128
|    |    └─BatchNorm1d: 3-14            64
|    |    └─BatchNorm1d: 3-15            32
|    |    └─BatchNorm1d: 3-16            16
|    |    └─BatchNorm1d: 3-17       

In [207]:
summary(model)



Layer (type:depth-idx)                   Param #
├─Sequential: 1-1                        --
|    └─Sequential: 2-1                   --
|    |    └─Linear: 3-1                  51,767,296
|    |    └─ReLU: 3-2                    --
|    |    └─Dropout: 3-3                 --
|    └─Sequential: 2-2                   --
|    |    └─Linear: 3-4                  524,800
|    |    └─ReLU: 3-5                    --
|    |    └─Dropout: 3-6                 --
|    └─Sequential: 2-3                   --
|    |    └─Linear: 3-7                  131,328
|    |    └─ReLU: 3-8                    --
|    |    └─Dropout: 3-9                 --
|    └─Sequential: 2-4                   --
|    |    └─Linear: 3-10                 32,896
|    |    └─ReLU: 3-11                   --
|    |    └─Dropout: 3-12                --
|    └─Sequential: 2-5                   --
|    |    └─Linear: 3-13                 8,256
|    |    └─ReLU: 3-14                   --
|    |    └─Dropout: 3-15                --
| 

Layer (type:depth-idx)                   Param #
├─Sequential: 1-1                        --
|    └─Sequential: 2-1                   --
|    |    └─Linear: 3-1                  51,767,296
|    |    └─ReLU: 3-2                    --
|    |    └─Dropout: 3-3                 --
|    └─Sequential: 2-2                   --
|    |    └─Linear: 3-4                  524,800
|    |    └─ReLU: 3-5                    --
|    |    └─Dropout: 3-6                 --
|    └─Sequential: 2-3                   --
|    |    └─Linear: 3-7                  131,328
|    |    └─ReLU: 3-8                    --
|    |    └─Dropout: 3-9                 --
|    └─Sequential: 2-4                   --
|    |    └─Linear: 3-10                 32,896
|    |    └─ReLU: 3-11                   --
|    |    └─Dropout: 3-12                --
|    └─Sequential: 2-5                   --
|    |    └─Linear: 3-13                 8,256
|    |    └─ReLU: 3-14                   --
|    |    └─Dropout: 3-15                --
| 