<a href="https://colab.research.google.com/github/RodrigoEslava/Keras/blob/main/Deep_Learing_para_Bioinform%C3%A1tica_deep_learning_com_Keras_parte_II.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the peptide list to anticp.txt; the next cell reads it as one peptide per line.
# NOTE(review): 'pos_train_main' is presumably the AntiCP 2.0 positive training set — confirm.
!wget -O anticp.txt 'https://webs.iiitd.edu.in/raghava/anticp2/pos_train_main'

--2023-03-13 18:13:32--  https://webs.iiitd.edu.in/raghava/anticp2/pos_train_main
Resolving webs.iiitd.edu.in (webs.iiitd.edu.in)... 103.25.231.42
Connecting to webs.iiitd.edu.in (webs.iiitd.edu.in)|103.25.231.42|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14940 (15K)
Saving to: ‘anticp.txt’


2023-03-13 18:13:33 (366 KB/s) - ‘anticp.txt’ saved [14940/14940]



In [None]:
# Build a next-character dataset: every proper prefix of '$' + peptide + '@'
# becomes an input (X) and the character that follows it becomes the target (y).
# '$' marks the start of a peptide and '@' marks its end.
X = []
y = []

# `with` guarantees the file handle is closed once all lines have been read.
with open('anticp.txt') as peptide_file:
  for line in peptide_file:
    # strip() also removes '\r' and stray spaces, not only the trailing '\n'.
    residues = line.strip()
    if not residues:
      # A blank line would otherwise become the training pair '$' -> '@',
      # i.e. an example of an empty peptide.
      continue
    peptide = '$' + residues + '@'
    for i in range(1, len(peptide)):
      X.append(peptide[:i])
      y.append(peptide[i])

In [None]:
from sklearn.model_selection import train_test_split

# Fix the seed so the split (and everything fitted on it) is identical after
# Restart & Run All.
# NOTE(review): X holds every prefix of every peptide, so a random row-level
# split puts prefixes of the same peptide in both train and test; the
# validation score is therefore optimistic. Split by peptide if that matters.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Character-level vocabulary, learned from the training prefixes only.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(X_train)

# Convert each prefix string into a list of integer character ids.
X_train_tokens, X_test_tokens = map(
    tokenizer.texts_to_sequences, (X_train, X_test)
)

In [None]:
# Show the character -> integer id mapping learned from the training prefixes.
tokenizer.word_index

{'k': 1,
 'g': 2,
 'l': 3,
 'a': 4,
 '$': 5,
 'c': 6,
 'f': 7,
 'i': 8,
 's': 9,
 'r': 10,
 'v': 11,
 'p': 12,
 't': 13,
 'n': 14,
 'w': 15,
 'e': 16,
 'd': 17,
 'h': 18,
 'y': 19,
 'q': 20,
 'm': 21}

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Bring every prefix to exactly 50 ids. 'pre' padding/truncating are the
# pad_sequences defaults, spelled out here: zeros are added on the left and,
# for prefixes longer than 50, the oldest characters are dropped so the most
# recent context is kept.
X_train_tokens_padded, X_test_tokens_padded = (
    pad_sequences(token_lists, maxlen=50, padding='pre', truncating='pre')
    for token_lists in (X_train_tokens, X_test_tokens)
)

In [None]:
# Sanity check: (number of training prefixes, padded length).
X_train_tokens_padded.shape

(11205, 50)

In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Map each target character to an integer class id (fitted on the training
# targets), then one-hot encode both splits with one column per class.
le = LabelEncoder().fit(y_train)

y_train_encoded, y_test_encoded = (
    to_categorical(le.transform(targets), num_classes=len(le.classes_))
    for targets in (y_train, y_test)
)

In [None]:
# Class order behind the one-hot targets: column k of y_*_encoded is le.classes_[k].
le.classes_

array(['@', 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N',
       'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y'], dtype='<U1')

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
import numpy as np
import os
import re
import string
import random

In [None]:
def causal_attention_mask(batch_size, n_dest, n_src, dtype):
    """
    Build a batched causal mask for self-attention.

    Entry [b, i, j] is true (1) when destination position i may attend to
    source position j, i.e. when i >= j - n_src + n_dest. This is a lower
    triangle anchored at the lower-right corner, so no position can see
    tokens that come after it.

    Args:
        batch_size: scalar (tensor or int) giving how many copies to tile.
        n_dest: number of destination (query) positions.
        n_src: number of source (key) positions.
        dtype: dtype the mask is cast to.

    Returns:
        Tensor of shape [batch_size, n_dest, n_src].
    """
    dest_positions = tf.expand_dims(tf.range(n_dest), axis=-1)
    src_positions = tf.range(n_src)
    visible = tf.greater_equal(dest_positions, src_positions - n_src + n_dest)
    single_mask = tf.reshape(tf.cast(visible, dtype), [1, n_dest, n_src])
    # Repeat along the batch axis only: multiples = [batch_size, 1, 1].
    batch_multiple = tf.expand_dims(batch_size, -1)
    keep_other_axes = tf.constant([1, 1], dtype=tf.int32)
    multiples = tf.concat([batch_multiple, keep_other_axes], 0)
    return tf.tile(single_mask, multiples)


class TransformerBlock(layers.Layer):
    """
    One post-norm Transformer block with causally masked self-attention.

    The block applies multi-head self-attention, dropout, a residual
    connection and layer normalisation, then a two-layer feed-forward
    network with the same dropout / residual / normalisation pattern.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        """
        Args:
            embed_dim: used as the attention key dimension and as the output
                width of the feed-forward network.
            num_heads: number of attention heads.
            ff_dim: width of the hidden feed-forward layer.
            rate: dropout rate applied after attention and after the
                feed-forward network.
        """
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(ff_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        """Run the block on `inputs`; axis 0 is read as batch and axis 1 as sequence."""
        dims = tf.shape(inputs)
        n_batch, n_steps = dims[0], dims[1]
        # Each position may only attend to itself and to earlier positions.
        mask = causal_attention_mask(n_batch, n_steps, n_steps, tf.bool)
        attended = self.dropout1(self.att(inputs, inputs, attention_mask=mask))
        normed_attention = self.layernorm1(inputs + attended)
        fed_forward = self.dropout2(self.ffn(normed_attention))
        return self.layernorm2(normed_attention + fed_forward)

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """
    Sum of a learned token embedding and a learned position embedding.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        """
        Args:
            maxlen: number of distinct positions the position table holds.
            vocab_size: number of distinct token ids the token table holds.
            embed_dim: width of both embedding tables.
        """
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids `x`; the last axis of `x` is treated as the sequence axis."""
        seq_len = tf.shape(x)[-1]
        # Positions 0 .. seq_len-1, looked up once and broadcast over the batch.
        position_vectors = self.pos_emb(tf.range(seq_len))
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

In [None]:
from tensorflow import keras
# Import everything from tensorflow.keras, like the rest of the notebook,
# instead of mixing in the standalone `keras` package.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, Dropout, Flatten, Dense

# Next-residue classifier: a padded prefix of 50 character ids goes in, a
# probability distribution over the classes in le.classes_ comes out.
model = Sequential()
# +1 because id 0 is reserved for padding and never appears in word_index.
model.add(Embedding(len(tokenizer.word_index)+1, 20, input_length=50))
model.add(Conv1D(32,8))
model.add(Dropout(0.5))
model.add(Conv1D(32,4))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
# One output unit per target class. The original passed `len=(...)` as a
# keyword argument, which raises a TypeError instead of setting `units`.
model.add(Dense(len(le.classes_), activation='softmax'))
# The targets are one-hot encoded, so the matching loss is
# categorical cross-entropy ('c' is not a valid loss identifier).
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 token_and_position_embeddin  (None, 50, 20)           1440      
 g_2 (TokenAndPositionEmbedd                                     
 ing)                                                            
                                                                 
 transformer_block_3 (Transf  (None, 50, 20)           15596     
 ormerBlock)                                                     
                                                                 
 dense_17 (Dense)            (None, 50, 32)            672       
                                                                 
 dropout_17 (Dropout)        (None, 50, 32)            0         
                                                                 
 dense_18 (Dense)            (None, 50, 21)            693       
                                                      

In [None]:
# Train for 10 epochs, validating on the held-out prefixes after each epoch.
model.fit(
    x=X_train_tokens_padded,
    y=y_train_encoded,
    validation_data=(X_test_tokens_padded, y_test_encoded),
    epochs=10,
)

Epoch 1/10


ValueError: ignored

In [None]:
import numpy as np

def sample(array, temperature=1.0):
  """Draw one class index from a probability vector, reshaped by temperature.

  The probabilities are moved to log space, divided by `temperature`,
  turned back into a distribution with a softmax, and one index is drawn
  from it. Temperatures below 1 sharpen the distribution towards the most
  likely class; temperatures above 1 flatten it.

  Args:
    array: 1-D array-like of non-negative probabilities (e.g. a softmax output).
    temperature: positive scaling factor for the log-probabilities.

  Returns:
    The sampled index (NumPy integer), in range(len(array)).
  """
  # Work in float64: float32 model outputs can sum to slightly more than 1
  # after renormalisation, which makes np.random.multinomial raise
  # "sum(pvals[:-1]) > 1.0".
  probabilities = np.asarray(array, dtype=np.float64)
  # log(0) == -inf is the intended result for impossible classes; silence the warning.
  with np.errstate(divide='ignore'):
    logits = np.log(probabilities) / temperature
  # Subtract the maximum before exponentiating so the softmax cannot
  # underflow to all zeros at low temperature.
  weights = np.exp(logits - np.max(logits))
  probabilities = weights / np.sum(weights)
  return np.argmax(np.random.multinomial(1, probabilities, 1))

def generate_peptide(max_length=50, temperature=1.0):
  """Generate one peptide by sampling from the model one character at a time.

  Starts from the start marker '$' and repeatedly asks the model for the
  distribution of the next character, samples from it and appends the
  result. Generation stops when the end marker '@' is drawn or when the
  sequence (including '$') reaches max_length - 1 characters.

  Args:
    max_length: upper bound on the generated length; also used as the padded
      input width passed to the model.
      NOTE(review): the training inputs were padded to 50, so values other
      than 50 only work if the model accepts that input width.
    temperature: sampling temperature forwarded to `sample`.

  Returns:
    The generated residues with the '$' and '@' markers removed. The string
    may be empty if '@' is drawn on the first step.
  """
  sequence = '$'
  next_token = None
  while len(sequence) + 1 < max_length and next_token != "@":
    sequence_tokens = tokenizer.texts_to_sequences([sequence])
    sequence_tokens_padded = pad_sequences(sequence_tokens, maxlen=max_length)
    y_pred = model.predict(sequence_tokens_padded, verbose=0)[0]
    next_token = le.inverse_transform([sample(y_pred, temperature)])[0]
    sequence += next_token
  return sequence.strip('$').strip('@')

# Print 100 generated peptides as FASTA-style records (">index" then sequence).
for i in range(100):
  anticancer_peptide = generate_peptide()
  print(f'>{i}')
  # Print the peptide generated above. The original called generate_peptide()
  # a second time here, doubling the work and discarding the first result.
  print(anticancer_peptide)

>0
KAAAILEKFVKKVL
>1
LKCAPHGEAAGFGPI
>2
WLF
>3
ILKKLGKKLSLFALIHVLPKLKTAKK
>4
NGFFGLFKSIWKTAGCFIRELFLHKIVQANRKPERKGA
>5
ILVKGALGLKAALAKFLAKKAAKKLG
>6
RTSGRCSGVLPALGVCSMCHHILGTFGCARKRCSLRQYR
>7
SKK
>8
EGDKSPFGSRLFCGRIKG
>9
LLAKIIKWLVKHLA
>10
FFAKLIKKLAKLAKKAVK
>11
ALQGSLALGNAGSGKLFVGKCRNMVTWGLCKTPIDCKNYIHSGLWVHE
>12
LAFLLKALKKAA
>13
HIEVADYRCG
>14
GTLFLLIKKI
>15
LKLLKKALAAFFKLAK
>16
FLFHPFLLTHNPPFL
>17
GENTGLKYDLQRACYGGFLS
>18
VFFAKLSCKLKCRSCQPNPKWIKMCHLRPKDRRYDS
>19
GALDCILTVCGKQDISACALPCDGHCHRRKKGNY
>20
DLLLNLLKVFA
>21
IVGSDIN
>22
KFKFLWKALLKLL
>23
WAGFSCAFLCKKLT
>24
PRYGADFWGGVREKNCILESKWHPI
>25
D
>26
ITTVLLAALHIANVVL
>27
CKGLKKIAKFI
>28
AAPAKIVAGPGIDFTG
>29
FLF
>30
IAHLGCGAACRNP
>31
TWAAALKLFNGI
>32
DCAHGSPIEYWLGDCKVSKDGTMYAYPRKRCSWLHTT
>33
AKFLKKALKAAK
>34
LWPLWAPALKKLASKA
>35
AKGWGKAFKKALKKA
>36

>37
LAKRFPKAKLLLKAAK
>38
WSAIKTRPCAVTTVNGPKHGSVTCDGSTKGVYRVGGKCGERVRH
>39
WGKAGKFIGLAGKAA
>40
FLLILKRKIDKAAARLTFAKT
>41
PGIDQNNLGYGLNHPSWSGEAKARHAKNAFCDAGPKCPIPC
>42
GQKS
>43
TNFLLLKIDKW
>