In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from convert_encodings import m2
import re

In [2]:
def combine(inputs, labels, new_inputs, new_labels):
    """Append augmented samples and their labels to an existing dataset.

    Parameters
    ----------
    inputs : np.ndarray
        2-D array of existing samples, one row per sample.
    labels : np.ndarray
        1-D array of labels, one per row of ``inputs``.
    new_inputs : sequence of array-like
        Rows to append; they are stacked with ``np.vstack``.
    new_labels : sequence
        Labels for ``new_inputs``; they are flattened with ``np.hstack``.

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        ``(inputs, labels)`` with the new rows and labels appended.  When the
        two input arrays have different row widths, both are truncated to the
        narrower width so that they can be stacked.
    """
    new_inputs = np.vstack(new_inputs)
    new_labels = np.hstack(new_labels)

    # Stacking requires equal row *widths* (axis 1).  Comparing the number of
    # rows (axis 0) here would slice the columns by an unrelated quantity and
    # make the vstack below fail whenever there are fewer rows than columns.
    width = min(inputs.shape[1], new_inputs.shape[1])

    inputs = np.vstack((inputs[:, :width], new_inputs[:, :width]))
    labels = np.hstack((labels, new_labels))

    return inputs, labels


def random_replace(inputs, labels, factor):
    """Overwrite a random fraction of each sequence's tokens with random ids.

    For every row, the unpadded length is the index of the first ``0`` (or the
    full row length when the row contains no ``0``).  ``round(length * factor)``
    distinct positions inside that prefix are overwritten with ids drawn
    uniformly, with replacement, from ``5..24``.

    The caller's ``inputs`` array is never modified: each row is copied before
    it is changed.

    Parameters
    ----------
    inputs : np.ndarray
        2-D array of token ids, one zero-padded sequence per row.
    labels : sequence
        Label for each row of ``inputs``.
    factor : float
        Fraction of the unpadded tokens to replace.

    Returns
    -------
    tuple[list[np.ndarray], list]
        The augmented rows and their labels, in input order.
    """
    new_inputs = []
    new_labels = []
    for idx in range(inputs.shape[0]):
        # Copy the row: ``inputs[idx]`` is a view into the caller's array, and
        # writing through it would silently corrupt the original data.
        ip = np.array(inputs[idx])
        label = labels[idx]

        zero_positions = np.flatnonzero(ip == 0)
        unpadded_len = int(zero_positions[0]) if zero_positions.size else len(ip)
        num_to_replace = int(round(unpadded_len * factor))
        if num_to_replace > 0:
            indices = np.random.choice(unpadded_len, num_to_replace, replace=False)
            ip[indices] = np.random.choice(np.arange(5, 25), num_to_replace, replace=True)

        new_inputs.append(ip)
        new_labels.append(label)

    return new_inputs, new_labels


def random_delete(inputs, labels, factor, max_len=200):
    """Delete a random fraction of each sequence's tokens and re-pad with zeros.

    For every row, the unpadded length is the index of the first ``0`` (or the
    full row length when the row contains no ``0``).  ``round(length * factor)``
    distinct positions are removed from that prefix, the remaining tokens keep
    their order, and the result is right-padded with ``0`` up to ``max_len``.
    Rows that are already longer than ``max_len`` are not truncated.

    Parameters
    ----------
    inputs : np.ndarray
        Array whose first axis indexes the sequences.
    labels : sequence
        Label for each entry of ``inputs``.
    factor : float
        Fraction of the unpadded tokens to delete.
    max_len : int, optional
        Length every output row is padded to.  Defaults to 200, the width
        that was previously hard-coded.

    Returns
    -------
    tuple[list[np.ndarray], list]
        The augmented rows and their labels, in input order.
    """
    new_inputs = []
    new_labels = []
    for idx in range(inputs.shape[0]):
        ip = inputs[idx]
        label = labels[idx]

        # flatnonzero also copes with a scalar comparison result (e.g. when a
        # row is a plain string), yielding "no padding found".
        zero_positions = np.flatnonzero(ip == 0)
        unpadded_len = int(zero_positions[0]) if zero_positions.size else len(ip)

        tokens = list(ip[:unpadded_len])
        num_to_delete = int(round(unpadded_len * factor))
        doomed = set(
            np.random.choice(unpadded_len, num_to_delete, replace=False).tolist()
        )
        tokens = [tok for pos, tok in enumerate(tokens) if pos not in doomed]
        tokens.extend([0] * (max_len - len(tokens)))

        new_inputs.append(np.asarray(tokens))
        new_labels.append(label)

    return new_inputs, new_labels


def random_replace_with_A(inputs, labels, factor):
    """Overwrite a random fraction of each sequence's tokens with ``m2['A']``.

    For every row, the unpadded length is the index of the first ``0`` (or the
    full row length when the row contains no ``0``).  ``round(length * factor)``
    distinct positions inside that prefix are set to the id stored under
    ``'A'`` in the ``m2`` encoding table.

    The caller's ``inputs`` array is never modified: each row is copied before
    it is changed.

    Parameters
    ----------
    inputs : np.ndarray
        2-D array of token ids, one zero-padded sequence per row.
    labels : sequence
        Label for each row of ``inputs``.
    factor : float
        Fraction of the unpadded tokens to replace.

    Returns
    -------
    tuple[list[np.ndarray], list]
        The augmented rows and their labels, in input order.
    """
    new_inputs = []
    new_labels = []
    for idx in range(inputs.shape[0]):
        # Copy the row: ``inputs[idx]`` is a view into the caller's array, and
        # writing through it would silently corrupt the original data.
        ip = np.array(inputs[idx])
        label = labels[idx]

        zero_positions = np.flatnonzero(ip == 0)
        unpadded_len = int(zero_positions[0]) if zero_positions.size else len(ip)
        num_to_replace = int(round(unpadded_len * factor))
        if num_to_replace > 0:
            indices = np.random.choice(unpadded_len, num_to_replace, replace=False)
            ip[indices] = m2['A']

        new_inputs.append(ip)
        new_labels.append(label)

    return new_inputs, new_labels


def random_swap(inputs, labels, factor, max_len=200):
    """Swap a random selection of adjacent token pairs in each sequence.

    For every row, the unpadded length is the index of the first ``0`` (or the
    full row length when the row contains no ``0``).  The candidate swaps are
    the disjoint neighbour pairs ``(0, 1), (2, 3), ...``; ``round(length *
    factor)`` of them are chosen without replacement and exchanged.  The
    result is right-padded with ``0`` up to ``max_len``.

    Parameters
    ----------
    inputs : np.ndarray
        Array whose first axis indexes the sequences.  Each entry may be a row
        of token ids or a plain string (the notebook passes raw strings).
    labels : sequence
        Label for each entry of ``inputs``.
    factor : float
        Fraction of the unpadded length that determines how many pairs are
        swapped.  The count is capped at the number of available pairs.
    max_len : int, optional
        Length every output row is padded to.  Defaults to 200, the width
        that was previously hard-coded.

    Returns
    -------
    tuple[list[np.ndarray], list]
        The augmented rows and their labels, in input order.
    """
    new_inputs = []
    new_labels = []
    for idx in range(inputs.shape[0]):
        ip = inputs[idx]
        label = labels[idx]

        # flatnonzero also copes with a scalar comparison result (a string row
        # compared with 0), yielding "no padding found".
        zero_positions = np.flatnonzero(ip == 0)
        unpadded_len = int(zero_positions[0]) if zero_positions.size else len(ip)
        tokens = list(ip[:unpadded_len])

        # Each odd index i stands for the pair (i - 1, i); pairs never overlap.
        pair_ends = np.arange(1, unpadded_len, 2)
        # round() can ask for more swaps than there are pairs (e.g. length 3,
        # factor 0.5 -> 2 swaps but 1 pair), which would make
        # choice(replace=False) raise.  Cap the request instead.
        num_to_swap = min(int(round(unpadded_len * factor)), len(pair_ends))
        if num_to_swap > 0:
            for i in np.random.choice(pair_ends, num_to_swap, replace=False):
                tokens[i - 1], tokens[i] = tokens[i], tokens[i - 1]
        tokens.extend([0] * (max_len - len(tokens)))

        new_inputs.append(np.asarray(tokens))
        new_labels.append(label)

    return new_inputs, new_labels


def random_insertion_with_A(inputs, labels, factor, max_len=200):
    """Insert ``m2['A']`` at random positions of each sequence.

    For every row, the unpadded length is the index of the first ``0`` (or the
    full row length when the row contains no ``0``).  ``round(length * factor)``
    distinct positions are drawn from that prefix and the id stored under
    ``'A'`` in the ``m2`` encoding table is inserted at each of them, one after
    another.  The result is then right-padded with ``0`` or truncated so that
    it is exactly ``max_len`` long.

    Parameters
    ----------
    inputs : np.ndarray
        Array whose first axis indexes the sequences.
    labels : sequence
        Label for each entry of ``inputs``.
    factor : float
        Fraction of the unpadded length that determines how many tokens are
        inserted.
    max_len : int, optional
        Exact length of every output row.  Defaults to 200, the width that
        was previously hard-coded.

    Returns
    -------
    tuple[list[np.ndarray], list]
        The augmented rows and their labels, in input order.
    """
    new_inputs = []
    new_labels = []
    for idx in range(inputs.shape[0]):
        ip = inputs[idx]
        label = labels[idx]

        # flatnonzero also copes with a scalar comparison result (e.g. when a
        # row is a plain string), yielding "no padding found".
        zero_positions = np.flatnonzero(ip == 0)
        unpadded_len = int(zero_positions[0]) if zero_positions.size else len(ip)

        tokens = list(ip[:unpadded_len])
        num_to_insert = int(round(unpadded_len * factor))
        for i in np.random.choice(unpadded_len, num_to_insert, replace=False):
            tokens.insert(i, m2['A'])

        # Pad with zeros, then cut: yields exactly max_len entries whether the
        # sequence ended up shorter or longer than the target width.
        tokens = (tokens + [0] * max_len)[:max_len]

        new_inputs.append(np.asarray(tokens))
        new_labels.append(label)

    return new_inputs, new_labels


def random_masking(sequences, mask_prob=0.15, mask_token_id=0):
    """Return a copy of ``sequences`` with random entries set to a mask token.

    One uniform draw in ``[0, 1)`` is made per entry; every entry whose draw is
    below ``mask_prob`` is overwritten with ``mask_token_id``.  The input
    array itself is left unchanged.

    Parameters
    ----------
    sequences : np.ndarray
        Array of token ids (any shape).
    mask_prob : float, optional
        Per-entry probability of being masked.
    mask_token_id : int, optional
        Value written into masked positions.

    Returns
    -------
    np.ndarray
        The masked copy, same shape as ``sequences``.
    """
    draws = np.random.rand(*sequences.shape)
    result = np.copy(sequences)
    result[draws < mask_prob] = mask_token_id
    return result

In [3]:
# Load the labelled sequences; the cells below read the 'Sequence' and
# 'Label' columns from this frame.
seqs = pd.read_csv('./data/VEGF_seqs.csv')
seqs

Unnamed: 0,Sequence,Label
0,RRPKGRGKRRREKQRP,1
1,RRPKGRGKRRREKQRPCDKPRR,1
2,RRPKGRGKRRREKQRPSDKPRR,1
3,RRPKGRGKRRREKQRPDAVPRR,1
4,CKGRGKRCREKQRPSDKPRR,1
5,KGRGKRRREKQRPCDKPRR,1
6,RRREKQRPCDKPRR,1
7,KGRGKRRREKQRPSDKPR,1
8,KGRGKRRREKQRPSDKP,1
9,KGRGKRRREKQRPSDKPRR,1


In [4]:
# Pull the sequences and their labels out of the frame as NumPy arrays.
inputs = seqs['Sequence'].to_numpy()
labels = seqs['Label'].to_numpy()

In [109]:
# Sanity check: how many sequences are available for augmentation.
inputs.shape

(240,)

In [122]:
 # new_inputs1, new_labels1 = random_replace(inputs, labels, 0.02)
# new_inputs2, new_labels2 = random_delete(inputs, labels, 0.02)
# new_inputs3, new_labels3 = random_replace_with_A(inputs, labels, 0.02)
# Only the swap augmentation is active; the other strategies are kept
# commented out for reference.
new_inputs4, new_labels4 = random_swap(inputs, labels, 0.04)
# new_inputs5, new_labels5 = random_insertion_with_A(inputs, labels, 0.02)
# NOTE(review): random_masking returns a single array, not an
# (inputs, labels) pair, so the two-name unpacking below needs adjusting
# before it is re-enabled.
#new_inputs6, new_labels6 = random_masking(inputs, mask_prob=0.15, mask_token_id=0)


  unpadded_len = np.where(ip == 0)[0][0]


In [123]:
# Stack the per-sequence arrays returned by random_swap into one 2-D array.
new_inputs4 = np.vstack(new_inputs4)
new_inputs = pd.DataFrame(new_inputs4 )
# Join each row's entries back into a single string and strip every '0'
# character, which removes the zero padding appended by random_swap.
augmented = new_inputs.apply(lambda row: re.sub(r'[0]','',''.join(row.values.astype(str))), axis=1)




In [124]:
# Number of augmented sequences produced.
augmented.shape

(264,)

In [125]:
# Wrap the joined strings in a one-column frame named 'Sequence'.
augmented = pd.DataFrame(augmented, columns = ['Sequence'])

In [126]:
# Attach the labels returned by random_swap as a 'Label' column.  Selecting
# only 'Sequence' first keeps the cell idempotent: re-running it replaces the
# label column instead of appending a duplicate one.
augmented = pd.concat(
    [augmented[['Sequence']], pd.DataFrame(new_labels4, columns=['Label'])],
    axis=1,
)
augmented

Unnamed: 0,Sequence,Label
0,RRKPGRGKRRREQKRP,1
1,RRPKRGGKRRERKQRPDCKPRR,1
2,RRKPGRKGRRREQKPRDSPKRR,1
3,RRPKRGGKRRERKQRPDAVPRR,1
4,KCGRKGRCREKQRPDSKPRR,1
...,...,...
259,KGRGRKAAEKQAPSDKPRR,1
260,AGRGARAAEAQRSPDKPRR,1
261,RRPKRGGKRRREKQRPSDAAAR,1
262,KSVRGKGKGQKRKRKKRSYK,1


In [127]:
# Name the two columns by position.  A rename dict that uses the key 0 for
# both names collapses to a single entry ({0: 'Label'}), so it can never
# assign 'Sequence'; positional assignment states the intent unambiguously
# and is a no-op when the columns are already named.
augmented.columns = ['Sequence', 'Label']
augmented

Unnamed: 0,Sequence,Label
0,RRKPGRGKRRREQKRP,1
1,RRPKRGGKRRERKQRPDCKPRR,1
2,RRKPGRKGRRREQKPRDSPKRR,1
3,RRPKRGGKRRERKQRPDAVPRR,1
4,KCGRKGRCREKQRPDSKPRR,1
...,...,...
259,KGRGRKAAEKQAPSDKPRR,1
260,AGRGARAAEAQRSPDKPRR,1
261,RRPKRGGKRRREKQRPSDAAAR,1
262,KSVRGKGKGQKRKRKKRSYK,1


In [128]:
# Size of the original frame, for comparison with the augmented one.
seqs.shape

(24, 2)

In [129]:
# Append the original sequences beneath the augmented ones and renumber rows.
# NOTE(review): this rebinds `augmented`, so re-running the cell appends
# `seqs` again — run it once per kernel session.
augmented = pd.concat([augmented, seqs], axis = 0, ignore_index=True)
augmented

Unnamed: 0,Sequence,Label
0,RRKPGRGKRRREQKRP,1
1,RRPKRGGKRRERKQRPDCKPRR,1
2,RRKPGRKGRRREQKPRDSPKRR,1
3,RRPKRGGKRRERKQRPDAVPRR,1
4,KCGRKGRCREKQRPDSKPRR,1
...,...,...
283,KGRGKRAAEKQAPSDKPRR,1
284,AGRGARAAEAQRPSDKPRR,1
285,RRPKGRGKRRREKQRPSDAAAR,1
286,KSVRGKGKGQKRKRKKSRYK,1


In [134]:
# Persist the combined dataset.
# NOTE(review): to_csv writes the row index as an extra unnamed first column
# by default; pass index=False if downstream readers expect only 'Sequence'
# and 'Label' — confirm against consumers before changing.
augmented.to_csv('./data/vegf_augmented.csv')

In [130]:
# Rebind inputs/labels to the combined (augmented + original) data; this
# replaces the arrays taken from `seqs` earlier in the notebook.
inputs = augmented['Sequence'].to_numpy()
labels = augmented['Label'].to_numpy()

In [131]:
# Keep an independent copy of the combined frame.
# NOTE(review): aug_tmp is not referenced again in the visible cells.
aug_tmp = augmented.copy()

In [132]:
# Total number of sequences after combining augmented and original data.
inputs.shape

(288,)

In [133]:
# Re-check the shape of the original frame.
seqs.shape

(24, 2)