In [None]:
from deblurring_diffusion_pytorch import Unet, GaussianDiffusion, Trainer, ProteinDataset2ESM, ProteinDataset
import torchvision
import torch
import os
import errno
import shutil
import argparse
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader

In [None]:
# Single size constant reused throughout the notebook: it is the upper bound
# of the sequence-length filter and the `image_size` passed to
# GaussianDiffusion and to both Trainer instances.
inputSize = 64

In [None]:
# Load the raw table from the notebook's working directory.
df1 = pd.read_csv('./data.csv')

# Column names, dtypes and non-null counts (printed to stdout).
df1.info()

# Every value of the 'feature' column as a plain Python list.
feature = df1['feature'].tolist()

# Full-range slice of the raw frame, kept under its own name.
use_df = df1[:]

# Preview of the first 10 rows. A notebook only renders the value of a cell's
# LAST expression, so the preview has to be the final statement to show up.
use_df.head(10)

In [None]:
# Character count of every entry in the 'feature' column.
seq_lengths = df1['feature'].str.len()

# Keep the rows whose length lies between 15 and inputSize; `between` includes
# both end points by default.
df2 = df1[seq_lengths.between(15, inputSize)]

# The message reads "There are {} records in total".
print('一共有{}条数据'.format(len(df2)))
print(df2.head())

# Rebind `feature` to the filtered sequences and echo the list as cell output.
feature = df2['feature'].tolist()
feature

In [None]:
# Backbone network handed to GaussianDiffusion below: single-channel U-Net,
# moved to the GPU with .cuda().
model = Unet(
    dim = 32,
    dim_mults = (1, 2, 4, 8),
    channels=1
).cuda()


# Diffusion wrapper around the U-Net, also moved to the GPU.
# `image_size` reuses the notebook-wide `inputSize`; `results_folder` is the
# same './tmp/4' directory that is given to the Trainer below.
diffusion = GaussianDiffusion(
    model,
    image_size = inputSize,
    device_of_kernel = 'cuda',
    channels = 1,
    timesteps = 20,
    loss_type = 'l1',
    kernel_std=0.1,
    kernel_size=3,
    blur_routine='Incremental',
    train_routine = 'Final',
    sampling_routine = 'x0_step_down',
    discrete=False,
    results_folder = './tmp/4',
    indices_to_skip = []
).cuda()

# Trainer over `feature`, the sequence list produced by the data cells.
# It receives the `diffusion` object as it exists at this point, i.e. before
# the DataParallel wrapping at the bottom of this cell.
trainer = Trainer(
    diffusion,
    feature,
    image_size = inputSize,
    train_batch_size = 5,
    train_lr = 2e-5,
    train_num_steps = 501,
    gradient_accumulate_every = 2,
    ema_decay = 0.995, 
    fp16 = False, 
    results_folder = './tmp/4',
    dataset = 'Protein'
)

# Rebind `diffusion` to a DataParallel wrapper spanning every visible GPU.
# NOTE(review): `trainer` above was already built with the unwrapped object,
# so only code that reads `diffusion` after this line (the later `tester`)
# sees the wrapper. Confirm this ordering is intended and that Trainer and the
# saved checkpoint work with both the wrapped and the unwrapped module.
diffusion = torch.nn.DataParallel(diffusion, device_ids=range(torch.cuda.device_count()))

In [None]:
# Run the trainer configured above (train_num_steps = 501, results in './tmp/4').
# NOTE(review): the tester cell loads './tmp/4/model_500.pt', presumably a
# checkpoint written by this call — confirm against Trainer.
trainer.train()

In [None]:
# One 58-character sequence, wrapped in a list because it is passed to Trainer
# in the same argument position as the `feature` list used for training.
testProtein = ['ADNKFNKEQQNAFYEILHLPNLNEEQRNGFIQSLKDDPSQSANLLAEAKKLNDAQAPK']

In [None]:
import types

def new_method(self):
    """Print a fixed placeholder message; `self` is accepted but not used.

    NOTE(review): nothing in the visible cells references this function; it
    looks like a leftover experiment — confirm before removing.
    """
    print("This is a new method")

# Second Trainer instance used for evaluation: it is fed the single-sequence
# `testProtein` list, writes to './tmp/6', and is given `load_path` pointing at
# './tmp/4/model_500.pt'.
# At this point `diffusion` is the DataParallel wrapper created at the end of
# the model cell, not the object the first trainer was built with.
tester =Trainer(
    diffusion,
    # (leftover alternative data source: './root_mnist')
    testProtein,
    image_size = inputSize,
    train_batch_size = 1,
    train_lr = 2e-5,
    train_num_steps = 1001,         # total training steps
    gradient_accumulate_every = 2,    # gradient accumulation steps
    ema_decay = 0.995,                # exponential moving average decay
    fp16 = False,                       # turn on mixed precision training with apex
    results_folder = './tmp/6',
    dataset = 'Protein',
    load_path = './tmp/4/model_500.pt',
)

In [None]:
# Run the tester on its data and unpack the three returned values.
# Judging by the names these are the original input, its degraded version and
# the restored outputs — TODO confirm against Trainer.test_from_data.
# A later cell indexes `deNoiseProtein[19]` and calls `.cpu().numpy()` on it.
ogProtein, noiseProtein, deNoiseProtein = tester.test_from_data('test', d_times = 20, s_times= 20)

In [None]:
def normalize_onehot(matrix, num_classes=21):
    """Binarise score vectors along the last axis.

    For every vector along the last axis, the maximum over its first
    ``num_classes`` entries is computed; each entry of the vector equal to
    that maximum becomes 1 and every other entry becomes 0.

    Args:
        matrix: array-like with at least one dimension. Any rank is accepted;
            a 4-D input gives the same result as before this parameterisation.
        num_classes: number of leading entries of the last axis that take part
            in the maximum. Defaults to 21.

    Returns:
        Integer ndarray of 0/1 with the same shape as ``matrix``.

    Notes:
        The comparison runs over the WHOLE last axis, not only the first
        ``num_classes`` entries, so an entry beyond that range is also set to
        1 when it happens to equal the maximum. Ties inside the range give
        several 1s in the same vector.
    """
    matrix = np.asarray(matrix)
    # keepdims=True keeps a trailing axis of length 1 so the maxima broadcast
    # against the full matrix in the comparison below.
    class_max = np.max(matrix[..., :num_classes], axis=-1, keepdims=True)
    return np.where(matrix == class_max, 1, 0)

In [None]:
def onehot_to_protein(onehot_tensor):
    """Decode a 2-D score / one-hot matrix into a letter sequence.

    Each row yields one letter: the position of the row's maximum (first
    occurrence on ties) is looked up in the 21-letter alphabet
    ``'ACDEFGHIKLMNPQRSTVWYX'``.

    Only the first ``len(alphabet)`` columns are searched. Rows may be wider
    than the alphabet (e.g. padded to a square matrix); the extra columns are
    ignored, so a large value there no longer raises ``IndexError``.

    Args:
        onehot_tensor: 2-D array-like of shape (rows, columns).

    Returns:
        str with one character per row; ``''`` when there are no rows.

    Raises:
        ValueError: if the input is not 2-D, or if it has rows but no columns.
    """
    amino_acids = 'ACDEFGHIKLMNPQRSTVWYX'
    scores = np.asarray(onehot_tensor)
    if scores.ndim != 2:
        raise ValueError(
            'onehot_tensor must be 2-D, got {} dimension(s)'.format(scores.ndim)
        )
    # Restrict the search to columns that map to a letter of the alphabet.
    class_scores = scores[:, :len(amino_acids)]
    return ''.join(amino_acids[np.argmax(row)] for row in class_scores)

In [None]:
# Take entry 19 of the tester output, move it to host memory as a NumPy array,
# binarise it and drop the singleton axes.
# NOTE(review): 19 is presumably the last of the 20 steps requested through
# s_times = 20 — confirm against Trainer.test_from_data.
onehot_tensor = normalize_onehot(deNoiseProtein[19].cpu().numpy()).squeeze()
protein_sequence = onehot_to_protein(onehot_tensor)

# Show only as many residues as the test sequence has. The length is taken
# from `testProtein` itself (58 for the current sequence) so it cannot go
# stale when the test sequence is replaced.
print(protein_sequence[:len(testProtein[0])])