In [None]:
# Mount Google Drive: later cells save plots and trained models under
# '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Image geometry used throughout the notebook: HEIGHT is the default number
# of simulated reads (rows) per image, WIDTH the default number of bases
# (columns) per clone.
HEIGHT, WIDTH = 100, 180

In [None]:
import math
import os
import random
import sys

import matplotlib.pyplot as plt
import numpy as np
from tqdm.notebook import tqdm

class VariantCalling:
    """Base container for a set of clone sequences and their integer encoding.

    Attributes set by ``__init__``:
        mutation_labels / mutation_type_names: stored as passed in.
        NUCLEOTIDES, transdict, reverse_transdict: the A/C/G/T alphabet and
            the char -> int / int -> char lookup tables (A=0, C=1, G=2, T=3).
        clones: list of clones, each a list of single-character bases.
        nb_clones: number of clones.
        clones_int: ``clones`` encoded through ``char_to_int``.
    """
    def __init__(self, mutation_labels, mutation_types_names, file_name) -> None:
        """Store the label tables and encode the clones.

        Args:
            mutation_labels: mapping describing the mutation classes.
            mutation_types_names: mapping from class id to display name.
            file_name: text file with one clone sequence per line. Only read
                when the instance does not already carry a ``clones``
                attribute (subclasses may set it before calling this
                initialiser).

        Raises:
            OSError: if the clones have to be loaded and ``file_name`` cannot
                be opened.
        """
        self.mutation_labels = mutation_labels
        self.mutation_type_names = mutation_types_names
        self.NUCLEOTIDES = "ACGT"
        self.transdict = {"A":0, "C": 1, "G":2, "T":3}
        self.reverse_transdict = {0: "A", 1: "C", 2: "G", 3: "T"}

        # Subclasses may pre-populate self.clones (e.g. with simulated data);
        # otherwise fall back to the file so the base class is usable alone.
        if not hasattr(self, "clones"):
            self.clones = self._read_clones(file_name)
        self.nb_clones = len(self.clones)
        self.clones_int = self.char_to_int(self.clones)

    @staticmethod
    def _read_clones(file_name):
        """Read one clone per non-blank line and split it into characters."""
        with open(file_name, "r") as f:
            return [list(line.strip()) for line in f if line.strip()]

    def char_to_int(self, alignments=None) -> np.ndarray:
        """Map the characters A/C/G/T to their integer codes.

        Args:
            alignments: (nested) sequence of base characters; defaults to
                ``self.clones`` when omitted.

        Returns:
            Array with the same shape as ``alignments`` holding the codes
            from ``self.transdict``.
        """
        if alignments is None:
            alignments = self.clones
        return np.vectorize(self.transdict.get)(alignments)

class VariantCallingDataMini(VariantCalling):
    """Class for simulated data generation.

    An instance holds one random reference clone plus a few mutated variants
    (``self.clones``) and turns them into images: every image row is one
    simulated read of a randomly chosen clone, encoded as integers 0-3 for
    A/C/G/T.
    """
    def __init__(self,
                 mutation_labels={"no_SNP": 0, "heterozygous_SNP": 1, "homozygous_SNP": 2},
                 mutation_types_names={0: "No mutation", 1: "Heterozygous SNP", 2: "Homozygous SNP"},
                 file_name="clones.txt"
                 ) -> None:
        """Draw an initial set of random clones, then run the base initialiser.

        The arguments are forwarded unchanged to ``VariantCalling.__init__``.
        """
        # The base initialiser reads self.clones, so they must exist first.
        self.generate_random_clones()
        super().__init__(mutation_labels=mutation_labels, mutation_types_names=mutation_types_names,
                         file_name=file_name)
        self.alignments = None
        self.mutation_types = None


    def char_to_int(self, alignments=None) -> np.ndarray:
        """Maps the char ACGT to the corresponding integers.

        Args:
            alignments: (nested) sequence of base characters; defaults to
                ``self.alignments`` when omitted.

        Returns:
            Array of the codes looked up in ``self.transdict``.
        """
        if alignments is None:
            alignments = self.alignments
        return np.vectorize(self.transdict.get)(alignments)



    def _simulate_read(self, clone_type_index: int, alignment_error_prob: float, sequencing_error_prob: float):
        """Simulate one noisy read of ``self.clones[clone_type_index]``.

        For every output base a read pointer walks along the clone. Before
        each base the pointer may jump by 1-4 positions in a random direction
        (alignment error; larger jumps are rarer), and the base under the
        pointer may be replaced by a different one (sequencing error). When
        the pointer is outside the clone a uniformly random base is emitted.

        Args:
            clone_type_index: index into ``self.clones``.
            alignment_error_prob: probability threshold for a jump of any size.
            sequencing_error_prob: per-base probability of a substitution.

        Returns:
            1-D integer array with as many entries as the clone has bases.
        """
        bases = ['A', 'C', 'G', 'T']
        # Work on a private copy: substitutions are written back into it, so
        # a position that is visited again keeps its earlier sequencing error.
        template = list(self.clones[clone_type_index])
        read_length = len(template)

        # (threshold, jump size), checked from the rarest/largest jump down.
        # The divisors are heuristic guesses.
        jump_table = (
            (alignment_error_prob / 15, 4),
            (alignment_error_prob / 8, 3),
            (alignment_error_prob / 5, 2),
            (alignment_error_prob, 1),
        )

        sim_read = []
        pointer = 0
        while len(sim_read) < read_length:
            alignment = random.uniform(0, 1)
            direction = random.choice([1, -1])
            for threshold, jump in jump_table:
                if alignment <= threshold:
                    pointer += direction * jump
                    break

            if pointer < 0 or pointer >= read_length:
                # Pointer fell off the clone: emit pure noise.
                sim_read.append(random.choice(bases))
            else:
                sequencing = random.uniform(0, 1)
                if sequencing < sequencing_error_prob:
                    current = template[pointer]
                    # A sequencing error always yields a *different* base.
                    template[pointer] = random.choice([b for b in bases if b != current])

                sim_read.append(template[pointer])

            pointer += 1
        return self.char_to_int(sim_read)

    def generate_data_for_noise_reduction(self, sample_size=1000, image_height=HEIGHT, alignment_error_prob=0.05, sequencing_error_prob=0.05):
        """Function which returns noisy and non-noisy data.

        A fresh set of random clones is drawn for every sample (this replaces
        ``self.clones`` and ``self.mutation_positions``).

        Args:
            sample_size: number of images to generate.
            image_height: number of reads (rows) per image.
            alignment_error_prob: forwarded to ``_simulate_read``.
            sequencing_error_prob: forwarded to ``_simulate_read``.

        Returns:
            Tuple ``(noisy_images, clean_images, all_mutation_positions)``.
            The first two are integer arrays of shape
            ``(sample_size, image_height, clone_length)``. The third holds the
            mutated positions of each sample: a 2-D integer array when every
            sample has the same number of distinct positions, otherwise a 1-D
            object array of lists.
        """

        noisy_images = []
        clean_images = []
        all_mutation_positions = []

        for _ in tqdm(range(sample_size)):
            self.generate_random_clones()
            noisy = []
            clean = []
            random_array = np.random.randint(len(self.clones), size=image_height)
            for clone_type in random_array:
                noisy.append(self._simulate_read(clone_type, alignment_error_prob, sequencing_error_prob))
                clean.append(self.char_to_int(list(self.clones[clone_type])))
            noisy_images.append(noisy)
            clean_images.append(clean)
            all_mutation_positions.append(self.mutation_positions)

        # The number of distinct mutation positions varies between samples.
        # NumPy >= 1.24 refuses to build an array from such ragged input
        # unless dtype=object is requested explicitly.
        if len({len(positions) for positions in all_mutation_positions}) <= 1:
            positions_array = np.array(all_mutation_positions)
        else:
            positions_array = np.array(all_mutation_positions, dtype=object)

        return np.array(noisy_images), np.array(clean_images), positions_array

    def generate_random_clones(self, num_of_mutants=3, num_of_mutations_per_clone=3, size=WIDTH):
        """Create a random reference clone and mutated variants of it.

        All mutations are placed close to one randomly drawn centre position.
        Every mutation substitutes a base that differs from the reference, so
        each recorded position really differs from the reference in the
        variant that was mutated last at that position.

        Args:
            num_of_mutants: number of variant clones to derive.
            num_of_mutations_per_clone: substitutions attempted per variant
                (several may hit the same position).
            size: length of every clone in bases.

        Returns:
            Tuple ``(clones, mutation_positions)``: the list of clones
            (reference first) and the set of mutated positions. Both are also
            stored on the instance; ``self.mutation_positions`` is kept as a
            sorted list.
        """
        bases = ['A', 'C', 'G', 'T']
        reference = np.random.choice(bases, size=size).tolist()
        clones = [reference]

        # Centre of the mutated region: normally distributed around the
        # middle of the clone, i.e. (0 + (size - 1)) / 2, clipped to range.
        mean_value = (size - 1) / 2
        std_deviation = mean_value / 3.5
        # std_deviation = mean_value / 20
        mutation_pos_1 = np.random.normal(loc=mean_value, scale=std_deviation)
        mutation_pos_1 = np.clip(mutation_pos_1, 0, size - 1)
        mutation_pos_1 = int(round(mutation_pos_1))

        mutation_positions = set()
        for _ in range(num_of_mutants):
            new_variant = reference[:]
            for _ in range(num_of_mutations_per_clone):
                # Scatter the mutation around the centre (std. dev. of 5 bases).
                offset = np.random.normal(loc=0, scale=0.5) * 10
                mutation_pos_variant = int(np.clip(mutation_pos_1 + offset, 0, size - 1))
                # Exclude the reference base so the substitution is never a no-op.
                alternatives = [b for b in bases if b != reference[mutation_pos_variant]]
                new_variant[mutation_pos_variant] = str(np.random.choice(alternatives))
                mutation_positions.add(mutation_pos_variant)
            clones.append(new_variant)

        self.clones = clones
        self.mutation_positions = sorted(mutation_positions)
        return clones, mutation_positions

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, UpSampling2D, Input, Conv2DTranspose
from tensorflow.keras.models import Model
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.optimizers.legacy import Adam

In [None]:
import pickle

In [None]:
# Shared data generator used by gen_data and the training loop; its
# constructor already draws an initial random set of clones.
dg = VariantCallingDataMini()

In [None]:
# Check if GPU is available
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
if not gpu_devices:
    print("No GPU found. Using CPU.")
else:
    # Enable memory growth on every GPU to avoid GPU memory exhaustion
    for gpu in gpu_devices:
        tf.config.experimental.set_memory_growth(gpu, True)


In [None]:
def create_autoencoder_model(input_shape):
    """Build a VGG-style convolutional autoencoder.

    The encoder stacks pairs of 3x3 ReLU convolutions (64, 128, 256 filters)
    separated by two 2x2 max-pooling steps; the decoder mirrors it with two
    2x2 up-sampling steps and ends in a sigmoid convolution.

    Args:
        input_shape: ``(height, width, channels)`` of one input image. Height
            and width are halved twice (with flooring) and doubled twice, so
            the output only matches the input size when both are divisible
            by 4.

    Returns:
        An uncompiled Keras ``Model`` mapping the input image to an output
        with the same number of channels as ``input_shape``.
    """
    def conv_pair(tensor, filters):
        # Two consecutive 3x3 'same' convolutions with ReLU activation.
        for _ in range(2):
            tensor = Conv2D(filters, (3, 3), activation='relu', padding='same')(tensor)
        return tensor

    input_layer = Input(shape=input_shape)

    # Encoder
    x = conv_pair(input_layer, 64)
    x = MaxPooling2D((2, 2))(x)
    x = conv_pair(x, 128)
    x = MaxPooling2D((2, 2))(x)
    encoded = conv_pair(x, 256)

    # Decoder
    x = conv_pair(encoded, 256)
    x = UpSampling2D((2, 2))(x)
    x = conv_pair(x, 128)
    x = UpSampling2D((2, 2))(x)
    x = conv_pair(x, 64)

    # Reconstruct as many channels as the input has (1 for the images used here).
    output_channels = input_shape[-1]
    decoded = Conv2D(output_channels, (3, 3), activation='sigmoid', padding='same')(x)

    autoencoder = Model(input_layer, decoded)

    return autoencoder


def create_cnn_model1(input_shape):
    """Build a shallow convolutional denoiser with 8x8 and 4x4 kernels.

    Two ReLU convolutions (64 then 128 filters) are followed by two ReLU
    transposed convolutions (128 then 64 filters) and a single-channel
    sigmoid convolution. Every layer uses 'same' padding and no layer sets a
    stride.

    Args:
        input_shape: shape of one input image, passed to ``Input``.

    Returns:
        An uncompiled Keras ``Model``.
    """
    relu_same = dict(activation='relu', padding='same')
    inputs = Input(shape=input_shape)

    # Encoder
    features = Conv2D(64, (8, 8), **relu_same)(inputs)
    features = Conv2D(128, (4, 4), **relu_same)(features)

    # Decoder
    restored = Conv2DTranspose(128, (4, 4), **relu_same)(features)
    restored = Conv2DTranspose(64, (8, 8), **relu_same)(restored)
    outputs = Conv2D(1, (8, 8), activation='sigmoid', padding='same')(restored)

    return Model(inputs, outputs)

def create_cnn_model2(input_shape):
    """Build a shallow convolutional denoiser with 6x6 and 3x3 kernels.

    Two ReLU convolutions (64 then 128 filters) are followed by two ReLU
    transposed convolutions (128 then 64 filters) and a single-channel
    sigmoid convolution. Every layer uses 'same' padding and no layer sets a
    stride.

    Args:
        input_shape: shape of one input image, passed to ``Input``.

    Returns:
        An uncompiled Keras ``Model``.
    """
    relu_same = dict(activation='relu', padding='same')
    inputs = Input(shape=input_shape)

    # Encoder
    features = Conv2D(64, (6, 6), **relu_same)(inputs)
    features = Conv2D(128, (3, 3), **relu_same)(features)

    # Decoder
    restored = Conv2DTranspose(128, (3, 3), **relu_same)(features)
    restored = Conv2DTranspose(64, (6, 6), **relu_same)(restored)
    outputs = Conv2D(1, (6, 6), activation='sigmoid', padding='same')(restored)

    return Model(inputs, outputs)

In [None]:
def gen_data(align_err, seq_err, sample_size=6000):
    """Generate a normalised training set with the global generator ``dg``.

    Args:
        align_err: alignment error probability passed to the generator.
        seq_err: sequencing error probability passed to the generator.
        sample_size: number of images to generate (defaults to 6000).

    Returns:
        Tuple ``(noisy, clean, all_mutation_positions)``. ``noisy`` and
        ``clean`` are float32 arrays of shape
        ``(sample_size, HEIGHT, WIDTH, 1)`` scaled to [0, 1];
        ``all_mutation_positions`` is returned exactly as produced by the
        generator.
    """
    def to_model_input(images):
        # Base codes are 0..3, so dividing by 3 maps them onto [0, 1]; the
        # trailing axis is the single image channel the models expect.
        scaled = images.astype('float32') / 3
        return scaled.reshape(scaled.shape[0], HEIGHT, WIDTH, 1)

    print(f"\ngenerating alignment error {align_err}, seq error {seq_err}")
    noisy, clean, all_mutation_positions = dg.generate_data_for_noise_reduction(
        sample_size=sample_size,
        alignment_error_prob=align_err,
        sequencing_error_prob=seq_err,
    )

    return to_model_input(noisy), to_model_input(clean), all_mutation_positions

In [None]:
# Custom Weighted MSE Loss

def WeightedMSELoss(y_true_mutation_pos: tuple[np.ndarray, np.ndarray], y_pred: tf.Tensor):
    """Mean squared error that up-weights the mutated columns.

    Every pixel has weight 1, plus 19 for each time its column index occurs
    in the mutation positions (so a column listed once weighs 20).

    Args:
        y_true_mutation_pos: pair ``(y_true, mutation_pos)``. ``y_true`` is
            expected to have shape ``(batch, HEIGHT, WIDTH, 1)``.
            ``mutation_pos`` holds integer column indices, either 1-D (one
            set shared by the whole batch) or 2-D ``(batch, n_positions)``
            (one set per sample). Ragged / object arrays are not supported.
        y_pred: model output with the same shape as ``y_true``.

    Returns:
        Scalar tensor: the mean of the weighted squared errors.

    NOTE(review): Keras invokes a compiled loss as ``loss(y_true, y_pred)``
    with ``y_true`` being a single tensor - confirm that the
    ``(clean, positions)`` pair passed to ``fit`` really arrives here as a
    2-tuple.
    """
    y_true, mutation_pos = y_true_mutation_pos

    y_pred = tf.convert_to_tensor(y_pred)
    if not y_pred.dtype.is_floating:
        y_pred = tf.cast(y_pred, tf.float32)
    # Align dtypes so e.g. float64 NumPy targets can be subtracted.
    y_true = tf.cast(tf.convert_to_tensor(y_true), y_pred.dtype)

    # Calculate squared error
    squared_error = tf.square(y_true - y_pred)

    # generate weight: count how often each column is listed. The weights do
    # not depend on the row, so a per-column vector broadcast over the rows
    # replaces the former row-by-row scatter.
    positions = tf.cast(tf.convert_to_tensor(mutation_pos), tf.int32)
    hits_per_column = tf.reduce_sum(
        tf.one_hot(positions, depth=WIDTH, dtype=squared_error.dtype), axis=-2
    )
    column_weights = hits_per_column * 19 + 1
    # (WIDTH,) -> (1, 1, WIDTH, 1) or (batch, WIDTH) -> (batch, 1, WIDTH, 1)
    weight_tensor = tf.reshape(column_weights, (-1, 1, WIDTH, 1))

    # apply weights
    weighted_squared_error = squared_error * weight_tensor

    # compute mean of loss
    loss = tf.reduce_mean(weighted_squared_error)

    return loss

In [None]:
# One single-channel image of HEIGHT reads by WIDTH bases.
input_shape = (HEIGHT, WIDTH, 1)

# model1 denoises the raw simulated images; model2 is trained later on
# model1's quantised output.
model1 = create_autoencoder_model(input_shape)
model2 = create_cnn_model2(input_shape)

# Both models share the same optimiser settings and loss.
learning_rate = 0.0005

model1.compile(optimizer=Adam(learning_rate=learning_rate), loss=WeightedMSELoss)
print("\nmodel1:")
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line.
model1.summary()

model2.compile(optimizer=Adam(learning_rate=learning_rate), loss=WeightedMSELoss)
print("\nmodel2:")
model2.summary()



model1:
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100, 180, 1)]     0         
                                                                 
 conv2d (Conv2D)             (None, 100, 180, 64)      640       
                                                                 
 conv2d_1 (Conv2D)           (None, 100, 180, 64)      36928     
                                                                 
 max_pooling2d (MaxPooling2  (None, 50, 90, 64)        0         
 D)                                                              
                                                                 
 conv2d_2 (Conv2D)           (None, 50, 90, 128)       73856     
                                                                 
 conv2d_3 (Conv2D)           (None, 50, 90, 128)       147584    
                                                    

In [None]:
# # Curriculum
# Train model1 (and model2 on model1's output) on progressively noisier data.
batch_size = 64
img_row = HEIGHT
img_col = WIDTH

# Output locations on the mounted Drive.
PLOT_DIR = '/content/drive/My Drive/VC'
MODEL1_PATH = '/content/drive/My Drive/model1_vgg_generic_larger_epochs_01122023'
MODEL2_PATH = '/content/drive/My Drive/model2_vgg_generic_larger_epochs_01122023'

# savefig does not create missing directories; make sure the target exists
# before any training time is spent.
os.makedirs(PLOT_DIR, exist_ok=True)

# Each stage: error rates for the generated data and the number of epochs
# each model is trained on it.
curriculum = [
    {"align_err": 0, "seq_err": 0.01, "model1_epochs": 16, "model2_epochs": 5},
    {"align_err": 0, "seq_err": 0.01, "model1_epochs": 16, "model2_epochs": 5},
    {"align_err": 0, "seq_err": 0.05, "model1_epochs": 24, "model2_epochs": 5},
    {"align_err": 0, "seq_err": 0.05, "model1_epochs": 24, "model2_epochs": 5},
    {"align_err": 0.001, "seq_err": 0.05, "model1_epochs": 24, "model2_epochs": 5},
    {"align_err": 0.001, "seq_err": 0.05, "model1_epochs": 24, "model2_epochs": 5},
    {"align_err": 0.003, "seq_err": 0.05, "model1_epochs": 32, "model2_epochs": 5},
    {"align_err": 0.003, "seq_err": 0.05, "model1_epochs": 32, "model2_epochs": 5},
    {"align_err": 0.005, "seq_err": 0.05, "model1_epochs": 32, "model2_epochs": 5},
    {"align_err": 0.005, "seq_err": 0.05, "model1_epochs": 32, "model2_epochs": 5},
    {"align_err": 0.005, "seq_err": 0.05, "model1_epochs": 32, "model2_epochs": 5},
    {"align_err": 0.01, "seq_err": 0.05, "model1_epochs": 42, "model2_epochs": 8},
    {"align_err": 0.01, "seq_err": 0.05, "model1_epochs": 42, "model2_epochs": 8},
]


def _to_model_input(images):
    """Scale base codes 0..3 to [0, 1] and add the trailing channel axis."""
    scaled = images.astype('float32') / 3
    return scaled.reshape(scaled.shape[0], img_row, img_col, 1)


def _save_stage_plot(image, kind, module):
    """Render ``image`` with the 'jet' colour map and save it for this stage.

    NOTE: the file name only encodes the error rates, so stages that share
    them overwrite each other's plots.
    """
    plt.imshow(image, cmap='jet')
    plt.savefig(f'{PLOT_DIR}/VC_{kind}_{module["align_err"]}_{module["seq_err"]}.png')
    plt.close()


for module in curriculum:
    print("current module: ", module)
    noisy, clean, all_mutation_positions = gen_data(module["align_err"], module["seq_err"])

    epochs = module["model1_epochs"]
    model1.fit(noisy, (clean, all_mutation_positions), batch_size=batch_size, epochs=epochs, shuffle=True)

    # One fresh sample at this stage's noise level, used only for the plots.
    noisy1, clean1, _ = dg.generate_data_for_noise_reduction(sample_size=1,
                                                          alignment_error_prob=module["align_err"],
                                                          sequencing_error_prob=module["seq_err"])
    noisy1 = _to_model_input(noisy1)
    clean1 = _to_model_input(clean1)

    noisy_image = np.array([noisy1[-1]])
    denoised_image = model1.predict(noisy_image)

    _save_stage_plot(noisy1[0], "noisy", module)
    _save_stage_plot(clean1[0], "expected", module)
    # Multiply by 3 and round to get back to the discrete base codes 0..3.
    _save_stage_plot(np.round(denoised_image[0] * 3), "denoised", module)

    # model2 learns to clean up model1's output after it has been snapped to
    # the four valid levels 0, 1/3, 2/3, 1.
    denoised_image1 = model1.predict(noisy)
    denoised_image1_quantised = np.round(denoised_image1 * 3) / 3

    epochs = module["model2_epochs"]
    model2.fit(denoised_image1_quantised, (clean, all_mutation_positions), batch_size=batch_size, epochs=epochs, shuffle=True)
    denoised_image2 = model2.predict(np.round(np.array([denoised_image[0]]) * 3) / 3)
    _save_stage_plot(np.round(denoised_image2[0] * 3), "denoised2", module)

    # Checkpoint both models after every stage.
    model1.save(MODEL1_PATH)
    model2.save(MODEL2_PATH)


current module:  {'align_err': 0, 'seq_err': 0.01, 'model1_epochs': 16, 'model2_epochs': 5}

generating alignment error 0, seq error 0.01


  0%|          | 0/6000 [00:00<?, ?it/s]

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


  0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
current module:  {'align_err': 0, 'seq_err': 0.01, 'model1_epochs': 16, 'model2_epochs': 5}

generating alignment error 0, seq error 0.01


  0%|          | 0/6000 [00:00<?, ?it/s]

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


  0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
current module:  {'align_err': 0, 'seq_err': 0.05, 'model1_epochs': 24, 'model2_epochs': 5}

generating alignment error 0, seq error 0.05


  0%|          | 0/6000 [00:00<?, ?it/s]

Epoch 1/24
Epoch 2/24
Epoch 3/24
Epoch 4/24
Epoch 5/24
Epoch 6/24
Epoch 7/24
Epoch 8/24
Epoch 9/24
Epoch 10/24
Epoch 11/24
Epoch 12/24
Epoch 13/24
Epoch 14/24
Epoch 15/24
Epoch 16/24
Epoch 17/24
Epoch 18/24
Epoch 19/24
Epoch 20/24
Epoch 21/24
Epoch 22/24
Epoch 23/24
Epoch 24/24


  0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
current module:  {'align_err': 0, 'seq_err': 0.05, 'model1_epochs': 24, 'model2_epochs': 5}

generating alignment error 0, seq error 0.05


  0%|          | 0/6000 [00:00<?, ?it/s]

Epoch 1/24
Epoch 2/24
Epoch 3/24
Epoch 4/24
Epoch 5/24
Epoch 6/24
Epoch 7/24
Epoch 8/24
Epoch 9/24
Epoch 10/24
Epoch 11/24
Epoch 12/24
Epoch 13/24
Epoch 14/24
Epoch 15/24
Epoch 16/24
Epoch 17/24
Epoch 18/24
Epoch 19/24
Epoch 20/24
Epoch 21/24
Epoch 22/24
Epoch 23/24
Epoch 24/24


  0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
current module:  {'align_err': 0.001, 'seq_err': 0.05, 'model1_epochs': 24, 'model2_epochs': 5}

generating alignment error 0.001, seq error 0.05


  0%|          | 0/6000 [00:00<?, ?it/s]

Epoch 1/24
Epoch 2/24
Epoch 3/24
Epoch 4/24
Epoch 5/24
Epoch 6/24
Epoch 7/24
Epoch 8/24
Epoch 9/24
Epoch 10/24
Epoch 11/24
Epoch 12/24
Epoch 13/24
Epoch 14/24
Epoch 15/24
Epoch 16/24
Epoch 17/24
Epoch 18/24
Epoch 19/24
Epoch 20/24
Epoch 21/24
Epoch 22/24
Epoch 23/24
Epoch 24/24


  0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
current module:  {'align_err': 0.001, 'seq_err': 0.05, 'model1_epochs': 24, 'model2_epochs': 5}

generating alignment error 0.001, seq error 0.05


  0%|          | 0/6000 [00:00<?, ?it/s]

Epoch 1/24
Epoch 2/24
Epoch 3/24
Epoch 4/24
Epoch 5/24
Epoch 6/24
Epoch 7/24
Epoch 8/24
Epoch 9/24
Epoch 10/24
Epoch 11/24
Epoch 12/24
Epoch 13/24
Epoch 14/24
Epoch 15/24
Epoch 16/24
Epoch 17/24
Epoch 18/24
Epoch 19/24

In [None]:
# model1.save('/content/drive/My Drive/model1_generic_larger_epochs_29102023')
# model2.save('/content/drive/My Drive/model2_generic_larger_epochs_29102023')

In [None]:
# noisy1, clean1 = dg.generate_data_for_noise_reduction(sample_size=1, alignment_error_prob=0.003, sequencing_error_prob=0.05)
# noisy1 = noisy1.astype('float32') / 3
# clean1 = clean1.astype('float32') / 3
# noisy1 = noisy1.reshape(noisy1.shape[0], img_row, img_col, 1)
# clean1 = clean1.reshape(clean1.shape[0], img_row, img_col, 1)

In [None]:
# noisy_image = np.array([noisy1[-1]])

In [None]:
# denoised_image = model1.predict(noisy_image)

In [None]:
# plt.imshow(noisy1[0], cmap='jet')

In [None]:
# plt.imshow(clean1[0], cmap='jet')

In [None]:
# plt.imshow(np.round(denoised_image[0] * 3), cmap='jet')

In [None]:
# plt.imshow(np.round(denoised_image[0] * 3), cmap='jet')
# denoised_image2 = model2.predict(np.round(np.array([denoised_image[0]]) * 3) / 3)

In [None]:
# plt.imshow(np.round(denoised_image2[0] * 3), cmap="jet")

In [None]:
# clone_1  = dg.char_to_int(dg.clones[0])
# clone_2  = dg.char_to_int(dg.clones[1])
# clone_3  = dg.char_to_int(dg.clones[2])


In [None]:
# reshaped_test_data = np.round(denoised_image2[0] * 3).reshape(100,178)

In [None]:
# clone_counter = [0,0,0,0,]
# for row in reshaped_test_data:
#     if (row == clone_1).all():
#         clone_counter[0] += 1
#     elif (row == clone_2).all():
#         clone_counter[1] += 1
#     elif (row == clone_3).all():
#         clone_counter[2] += 1
#     else:
#         clone_counter[3] += 1

# clone_counter