<a href="https://colab.research.google.com/github/RodrigoEslava/Keras/blob/main/Deep_Learning_para_Bioinform%C3%A1tica_deep_learning_com_Keras_parte_I.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Baixando dados

In [None]:
# Download the UniProtKB FASTA stream for the URL-encoded query
# ((taxonomy_id:2) AND (cc_scl_term:SL-0162) AND (reviewed:true)) into membrane.fasta.
# NOTE(review): presumably SL-0162 is the membrane subcellular-location term,
# judging by the output filename — confirm against UniProt.
!wget -O membrane.fasta 'https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28taxonomy_id%3A2%29%20AND%20%28cc_scl_term%3ASL-0162%29%20AND%20%28reviewed%3Atrue%29%29'

--2023-03-09 23:08:04--  https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28taxonomy_id%3A2%29%20AND%20%28cc_scl_term%3ASL-0162%29%20AND%20%28reviewed%3Atrue%29%29
Resolving rest.uniprot.org (rest.uniprot.org)... 193.62.193.81
Connecting to rest.uniprot.org (rest.uniprot.org)|193.62.193.81|:443... connected.
HTTP request sent, awaiting response... 200 
Length: unspecified [text/plain]
Saving to: ‘membrane.fasta’

membrane.fasta          [               <=>  ]  23.14M  3.10MB/s    in 10s     

2023-03-09 23:08:16 (2.21 MB/s) - ‘membrane.fasta’ saved [24260077]



In [None]:
# Download the UniProtKB FASTA stream for the URL-encoded query
# ((taxonomy_id:2) AND (cc_scl_term:SL-0086) AND (reviewed:true)) into cytoplasm.fasta.
# NOTE(review): presumably SL-0086 is the cytoplasm subcellular-location term,
# judging by the output filename — confirm against UniProt.
!wget -O cytoplasm.fasta 'https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28taxonomy_id%3A2%29%20AND%20%28cc_scl_term%3ASL-0086%29%20AND%20%28reviewed%3Atrue%29%29'

--2023-03-09 23:08:16--  https://rest.uniprot.org/uniprotkb/stream?format=fasta&query=%28%28taxonomy_id%3A2%29%20AND%20%28cc_scl_term%3ASL-0086%29%20AND%20%28reviewed%3Atrue%29%29
Resolving rest.uniprot.org (rest.uniprot.org)... 193.62.193.81
Connecting to rest.uniprot.org (rest.uniprot.org)|193.62.193.81|:443... connected.
HTTP request sent, awaiting response... 200 
Length: unspecified [text/plain]
Saving to: ‘cytoplasm.fasta’

cytoplasm.fasta         [                 <=>]  56.33M  2.30MB/s    in 26s     

2023-03-09 23:08:43 (2.15 MB/s) - ‘cytoplasm.fasta’ saved [59070211]



In [None]:
!pip install biopython

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# 2. Pré-processamento

In [None]:
from Bio import SeqIO
import pandas as pd

# Each FASTA file contributes one class; its records all receive the paired label.
fasta_sources = (
    ('membrane.fasta', 'membrane'),
    ('cytoplasm.fasta', 'cytoplasm'),
)

# One row per FASTA record: membrane records first, then cytoplasm records.
rows = [
    {'sequence': str(record.seq), 'location': label}
    for fasta_path, label in fasta_sources
    for record in SeqIO.parse(fasta_path, 'fasta')
]

df = pd.DataFrame(rows)

In [None]:
# Number of sequences per location in the full dataset.
df.groupby('location').count()

Unnamed: 0_level_0,sequence
location,Unnamed: 1_level_1
cytoplasm,120252
membrane,49634


In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Balance the classes by randomly discarding rows of the majority class.
# random_state pins which rows are kept, so a fresh kernel run reproduces the
# same balanced frame.
sampler = RandomUnderSampler(random_state=42)
df_resampled, _ = sampler.fit_resample(df, df['location'])

In [None]:
# Number of sequences per location after undersampling.
df_resampled.groupby('location').count()

Unnamed: 0_level_0,sequence
location,Unnamed: 1_level_1
cytoplasm,49634
membrane,49634


In [None]:
# Keep a random 30,000-row subset of the balanced frame.
# random_state makes the chosen rows identical on every run.
df_resampled = df_resampled.sample(30000, random_state=42)

In [None]:
# Number of sequences per location in the 30,000-row subset.
df_resampled.groupby('location').count()

Unnamed: 0_level_0,sequence
location,Unnamed: 1_level_1
cytoplasm,15076
membrane,14924


In [None]:
# Features are the raw sequences; targets are the location labels.
X, y = df_resampled['sequence'], df_resampled['location']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows (scikit-learn's default test_size, stated explicitly).
# stratify=y keeps the membrane/cytoplasm ratio the same in both splits;
# random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

In [None]:
from tensorflow import keras

# Character-level vocabulary: one integer id per distinct character,
# learned from the training sequences only.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(X_train)

# Encode both splits with that vocabulary (train first, then test).
X_train_tokens, X_test_tokens = map(
    tokenizer.texts_to_sequences, (X_train, X_test)
)

In [None]:
# Length of the longest sequence in the training split.
MAX_SEQ_LEN = max(len(sequence) for sequence in X_train)
MAX_SEQ_LEN

10624

In [None]:
# Bring every token list to exactly 1500 entries using pad_sequences' default
# padding/truncation settings; train and test are processed identically.
X_train_tokens_padded, X_test_tokens_padded = (
    keras.preprocessing.sequence.pad_sequences(token_lists, maxlen=1500)
    for token_lists in (X_train_tokens, X_test_tokens)
)

In [None]:
# Sanity check: (number of test sequences, padded length).
X_test_tokens_padded.shape

(7500, 1500)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels.
le = LabelEncoder().fit(y_train)

train_class_ids = le.transform(y_train)
test_class_ids = le.transform(y_test)

# One-hot encode the integer class ids into two columns, one per class.
y_train_encoded = keras.utils.to_categorical(train_class_ids, num_classes=2)
y_test_encoded = keras.utils.to_categorical(test_class_ids, num_classes=2)

y_train_encoded

array([[0., 1.],
       [0., 1.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [1., 0.]], dtype=float32)

# 3. Treinamento de modelo

In [None]:
# Baseline CNN: embedding -> one 1D convolution -> dense classifier head.
# The embedding vocabulary is len(word_index) + 1 because the tokenizer's ids
# start at 1 and id 0 is what pad_sequences fills with by default.
model = keras.models.Sequential([
    keras.layers.Embedding(len(tokenizer.word_index)+1, 20, input_length=1500),
    keras.layers.Conv1D(32, 8),
    keras.layers.Flatten(),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(2, activation='softmax'),
])

In [None]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 1500, 20)          460       
                                                                 
 conv1d_2 (Conv1D)           (None, 1493, 32)          5152      
                                                                 
 flatten_2 (Flatten)         (None, 47776)             0         
                                                                 
 dense_4 (Dense)             (None, 32)                1528864   
                                                                 
 dense_5 (Dense)             (None, 2)                 66        
                                                                 
Total params: 1,534,542
Trainable params: 1,534,542
Non-trainable params: 0
_________________________________________________________________


In [None]:
# The targets are one-hot vectors and the network ends in a 2-unit softmax, so
# categorical cross-entropy is the matching loss. For exactly two classes it
# yields the same values as 'binary_crossentropy', but it remains correct if
# more locations are ever added.
# NOTE(review): the test split is also passed as validation_data, so the
# per-epoch validation metrics are computed on the held-out test set.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.fit(
    X_train_tokens_padded,
    y_train_encoded,
    validation_data=(X_test_tokens_padded, y_test_encoded),
    epochs=10,
    batch_size=32
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f6400216100>

In [None]:
# Deeper CNN: two stacked convolutions, with dropout before each dense layer.
# NOTE(review): both Conv1D layers use the default (linear) activation, so there
# is no non-linearity between them — consider activation='relu'.
model = keras.models.Sequential()
model.add(keras.layers.Embedding(len(tokenizer.word_index)+1, 20, input_length=1500))
model.add(keras.layers.Conv1D(64, 8))
model.add(keras.layers.Conv1D(32, 8))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(32, activation='relu'))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(2, activation='softmax'))

# One-hot targets with a softmax head call for categorical cross-entropy. For
# exactly two classes it yields the same values as 'binary_crossentropy', but
# it remains correct if more locations are ever added.
# NOTE(review): the test split is also passed as validation_data, so the
# per-epoch validation metrics are computed on the held-out test set.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.fit(
    X_train_tokens_padded,
    y_train_encoded,
    validation_data=(X_test_tokens_padded, y_test_encoded),
    epochs=10,
    batch_size=32
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f63f82ff3a0>

In [None]:
# Recurrent variant: a single LSTM layer takes the place of the
# convolution/flatten stack; the dense head with dropout is unchanged.
model = keras.models.Sequential()
model.add(keras.layers.Embedding(len(tokenizer.word_index)+1, 20, input_length=1500))
model.add(keras.layers.LSTM(32))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(32, activation='relu'))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(2, activation='softmax'))

# One-hot targets with a softmax head call for categorical cross-entropy. For
# exactly two classes it yields the same values as 'binary_crossentropy', but
# it remains correct if more locations are ever added.
# NOTE(review): the test split is also passed as validation_data, so the
# per-epoch validation metrics are computed on the held-out test set.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.fit(
    X_train_tokens_padded,
    y_train_encoded,
    validation_data=(X_test_tokens_padded, y_test_encoded),
    epochs=10,
    batch_size=32
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f63f93d8850>

In [None]:
import numpy as np

def predict_location(protein_sequence, maxlen=1500):
  """Predict the location label for a single protein sequence.

  The sequence is tokenized with the notebook-level ``tokenizer``, padded or
  truncated to ``maxlen`` tokens, and scored by whichever network is currently
  bound to the notebook-level ``model``.

  Args:
    protein_sequence: Amino-acid sequence as a string.
    maxlen: Padded length fed to the network. Defaults to 1500, the length
      used when padding the training data in this notebook; it should match
      the input length of the model being used.

  Returns:
    The label that ``le`` decodes for the highest-scoring output unit.
  """
  sequences_tokens = tokenizer.texts_to_sequences([protein_sequence])
  sequences_tokens_padded = keras.utils.pad_sequences(sequences_tokens, maxlen=maxlen)
  # verbose=0 is the documented integer form of "no progress bar".
  class_scores = model.predict(sequences_tokens_padded, verbose=0)
  y_pred = np.argmax(class_scores, axis=1)
  return le.inverse_transform(y_pred)[0]

predict_location('MWSSRTV')

'membrane'

In [None]:
from itertools import islice

# Spot-check the classifier on the first few membrane records only. Every
# record costs one separate predict call, so looping over the entire file is
# impractical to run to completion inside the notebook.
N_PREVIEW = 30
for record in islice(SeqIO.parse('membrane.fasta', 'fasta'), N_PREVIEW):
  print(record.id, predict_location(str(record.seq)))

sp|A0A089QRB9|MSL3_MYCTU membrane
sp|A0A0H2URG7|GTFA_STRPN cytoplasm
sp|A0A0H2V871|IROE_ECOL6 membrane
sp|A0A0H2V8B5|TCPC_ECOL6 membrane
sp|A0A0H2VG78|GLCP_STAES membrane
sp|A0A0H2ZMF9|PBP2A_STRP2 membrane
sp|A0A0H3GDH9|PGDA_LISM4 membrane
sp|A0A0H3GGY3|PGPH_LISM4 membrane
sp|A0A0H3M5A8|PPMNT_MYCBP membrane
sp|A0A0K8P8E7|MHETH_IDESA membrane
sp|A0A1S7LCW6|MAMP_MAGMO cytoplasm
sp|A0A2S3R7M0|MARTX_VIBVL membrane
sp|A0A2S4N3N0|OMPA_SHIFL cytoplasm
sp|A0A2T4VDM4|GSDM_VITXG membrane
sp|A0A3Q0NBH7|PGDA_LISMG membrane
sp|A0A3S5YBC7|EGCSE_RHOH1 cytoplasm
sp|A0LNN5|SFMCT_SYNFM membrane
sp|A0QNG1|PKNB_MYCS2 cytoplasm
sp|A0QP27|MMPL3_MYCS2 membrane
sp|A0QQF4|TTFA_MYCS2 cytoplasm
sp|A0QR29|MSPA_MYCS2 membrane
sp|A0QVH8|RIP1_MYCS2 membrane
sp|A0QWG5|ACYLT_MYCS2 membrane
sp|A0QWG6|PIMA_MYCS2 membrane
sp|A0QZ13|LNT_MYCS2 membrane
sp|A0R1E8|PKS5_MYCS2 membrane
sp|A1C3L9|GTFA_STRPA cytoplasm
sp|A1C3M0|GTFB_STRPA cytoplasm
sp|A1JUB7|YADA_YERE8 membrane
sp|A1YKW7|RTXA_KINKI membrane
sp|A5A616|MGTS_ECOLI 

KeyboardInterrupt: ignored

In [None]:
# Persist whichever network `model` currently refers to in the file model.h5.
model.save('model.h5')

In [None]:
# Reload the saved file and print its architecture to check the round trip.
# NOTE(review): the recorded summary lists Conv1D layers, i.e. the stacked-CNN
# architecture rather than the LSTM defined last in the notebook. This suggests
# the cells were not executed top to bottom — confirm which model was saved.
load_model = keras.models.load_model('model.h5')
load_model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 1500, 20)          460       
                                                                 
 conv1d_4 (Conv1D)           (None, 1493, 64)          10304     
                                                                 
 conv1d_5 (Conv1D)           (None, 1486, 32)          16416     
                                                                 
 flatten_4 (Flatten)         (None, 47552)             0         
                                                                 
 dropout_2 (Dropout)         (None, 47552)             0         
                                                                 
 dense_8 (Dense)             (None, 64)                3043392   
                                                                 
 dropout_3 (Dropout)         (None, 64)               