In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from keras.utils import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import to_categorical
from keras_contrib.layers import CRF  # Use CRF from keras_contrib
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_accuracy


from plot_keras_history import plot_history

In [6]:
positive_data = pd.read_csv('MIBiG.pfam.tsv', sep='\t')
positive_data


Unnamed: 0,sequence_id,protein_id,gene_start,gene_end,gene_strand,pfam_id,in_cluster
0,BGC0000001.1,AEK75490.1,0,1083,1,PF02353,1
1,BGC0000001.1,AEK75490.1,0,1083,1,PF01135,1
2,BGC0000001.1,AEK75490.1,0,1083,1,PF01269,1
3,BGC0000001.1,AEK75490.1,0,1083,1,PF13489,1
4,BGC0000001.1,AEK75490.1,0,1083,1,PF01596,1
...,...,...,...,...,...,...,...
96407,BGC0001833.1,AYA44686.1,0,15051,1,PF13193,1
96408,BGC0001833.1,AYA44686.1,0,15051,1,PF00668,1
96409,BGC0001833.1,AYA44686.1,0,15051,1,PF00550,1
96410,BGC0001833.1,AYA44686.1,0,15051,1,PF00975,1


In [7]:
negative_data = pd.read_csv('GeneSwap_Negatives.pfam.tsv', sep='\t', index_col=0)
negative_data.rename(columns={"contig_id": "sequence_id"}, inplace=True)

negative_data

Unnamed: 0,sequence_id,protein_id,gene_start,gene_end,gene_strand,pfam_id,domain_start,domain_end,bitscore,in_cluster
0,NEG_FAKE_CLUSTER|AE000511.1|AF269227.1,AE000511_1072,0,2237,1,PF00702,0.0,210.0,138.5,0
1,NEG_FAKE_CLUSTER|AE000511.1|AF269227.1,AE000511_1072,0,2237,1,PF00403,1.0,61.0,45.8,0
2,NEG_FAKE_CLUSTER|AE000511.1|AF269227.1,AE000511_1072,0,2237,1,PF00122,2.0,178.0,162.4,0
3,NEG_FAKE_CLUSTER|AE000511.1|AF269227.1,AE000511_1072,0,2237,1,PF08282,196.0,255.0,20.6,0
4,NEG_FAKE_CLUSTER|AE000511.1|AF269227.1,AE000511_1503,2237,4870,1,PF00593,1.0,469.0,93.7,0
...,...,...,...,...,...,...,...,...,...,...
706945,NEG_FAKE_CLUSTER|U00096.3|AB007043.2,U00096_4155,4452,5399,-1,PF13384,16.0,49.0,18.5,0
706946,NEG_FAKE_CLUSTER|U00096.3|AB007043.2,U00096_4155,4452,5399,-1,PF09339,17.0,40.0,16.5,0
706947,NEG_FAKE_CLUSTER|U00096.3|AB007043.2,U00096_4155,4452,5399,-1,PF00532,22.0,274.0,40.2,0
706948,NEG_FAKE_CLUSTER|U00096.3|AB007043.2,U00096_3308,5399,6625,-1,PF01676,1.0,243.0,169.8,0


In [25]:
positive_data['label'] = 1
negative_data['label'] = 0

combined_data = pd.concat([positive_data, negative_data], ignore_index=True)
combined_data.head(40)

Unnamed: 0,sequence_id,protein_id,gene_start,gene_end,gene_strand,pfam_id,in_cluster,label,domain_start,domain_end,bitscore
0,BGC0000001.1,AEK75490.1,0,1083,1,PF02353,1,1,,,
1,BGC0000001.1,AEK75490.1,0,1083,1,PF01135,1,1,,,
2,BGC0000001.1,AEK75490.1,0,1083,1,PF01269,1,1,,,
3,BGC0000001.1,AEK75490.1,0,1083,1,PF13489,1,1,,,
4,BGC0000001.1,AEK75490.1,0,1083,1,PF01596,1,1,,,
5,BGC0000001.1,AEK75490.1,0,1083,1,PF13847,1,1,,,
6,BGC0000001.1,AEK75490.1,0,1083,1,PF13649,1,1,,,
7,BGC0000001.1,AEK75490.1,0,1083,1,PF08241,1,1,,,
8,BGC0000001.1,AEK75492.1,1886,2633,1,PF00486,1,1,,,
9,BGC0000001.1,AEK75492.1,1886,2633,1,PF03704,1,1,,,


In [9]:
pfam_ids = combined_data['pfam_id'].unique()
pfam_ids, len(pfam_ids)

(array(['PF02353', 'PF01135', 'PF01269', ..., 'PF15011', 'PF12378',
        'PF12597'], dtype=object),
 9633)

In [10]:
pfam_to_idx = {pfam: idx for idx, pfam in enumerate(pfam_ids)}
pfam_to_idx

{'PF02353': 0,
 'PF01135': 1,
 'PF01269': 2,
 'PF13489': 3,
 'PF01596': 4,
 'PF13847': 5,
 'PF13649': 6,
 'PF08241': 7,
 'PF00486': 8,
 'PF03704': 9,
 'PF00067': 10,
 'PF00196': 11,
 'PF13424': 12,
 'PF14559': 13,
 'PF13401': 14,
 'PF13191': 15,
 'PF13428': 16,
 'PF07719': 17,
 'PF00515': 18,
 'PF13176': 19,
 'PF13432': 20,
 'PF05593': 21,
 'PF00108': 22,
 'PF08545': 23,
 'PF08541': 24,
 'PF00550': 25,
 'PF00198': 26,
 'PF06500': 27,
 'PF12697': 28,
 'PF16197': 29,
 'PF00698': 30,
 'PF02801': 31,
 'PF14765': 32,
 'PF01370': 33,
 'PF02719': 34,
 'PF03435': 35,
 'PF00109': 36,
 'PF08659': 37,
 'PF13561': 38,
 'PF00106': 39,
 'PF08990': 40,
 'PF08240': 41,
 'PF00107': 42,
 'PF13602': 43,
 'PF14246': 44,
 'PF00440': 45,
 'PF07690': 46,
 'PF00083': 47,
 'PF03209': 48,
 'PF00296': 49,
 'PF00496': 50,
 'PF00528': 51,
 'PF13555': 52,
 'PF02463': 53,
 'PF13671': 54,
 'PF13304': 55,
 'PF00005': 56,
 'PF13481': 57,
 'PF08352': 58,
 'PF06902': 59,
 'PF13459': 60,
 'PF13370': 61,
 'PF03358': 62,
 '

In [11]:
combined_data['pfam_idx'] = combined_data['pfam_id'].map(pfam_to_idx)
combined_data

Unnamed: 0,sequence_id,protein_id,gene_start,gene_end,gene_strand,pfam_id,in_cluster,label,domain_start,domain_end,bitscore,pfam_idx
0,BGC0000001.1,AEK75490.1,0,1083,1,PF02353,1,1,,,,0
1,BGC0000001.1,AEK75490.1,0,1083,1,PF01135,1,1,,,,1
2,BGC0000001.1,AEK75490.1,0,1083,1,PF01269,1,1,,,,2
3,BGC0000001.1,AEK75490.1,0,1083,1,PF13489,1,1,,,,3
4,BGC0000001.1,AEK75490.1,0,1083,1,PF01596,1,1,,,,4
...,...,...,...,...,...,...,...,...,...,...,...,...
803357,NEG_FAKE_CLUSTER|U00096.3|AB007043.2,U00096_4155,4452,5399,-1,PF13384,0,0,16.0,49.0,18.5,83
803358,NEG_FAKE_CLUSTER|U00096.3|AB007043.2,U00096_4155,4452,5399,-1,PF09339,0,0,17.0,40.0,16.5,86
803359,NEG_FAKE_CLUSTER|U00096.3|AB007043.2,U00096_4155,4452,5399,-1,PF00532,0,0,22.0,274.0,40.2,825
803360,NEG_FAKE_CLUSTER|U00096.3|AB007043.2,U00096_3308,5399,6625,-1,PF01676,0,0,1.0,243.0,169.8,471


In [12]:
sequences = combined_data.groupby('sequence_id')['pfam_idx'].apply(list).values
len(sequences)

12112

In [13]:
labels = combined_data.groupby('sequence_id')['label'].first().values
labels

array([1, 1, 1, ..., 0, 0, 0])

In [14]:
max_len = max(len(seq) for seq in sequences)
max_len

373

In [15]:
X = pad_sequences(sequences, maxlen=max_len, padding='post')
X, len(X)

(array([[   0,    1,    2, ...,    0,    0,    0],
        [  68,   69,   70, ...,    0,    0,    0],
        [ 108,   37,   42, ...,    0,    0,    0],
        ...,
        [6684, 4012, 4013, ...,    0,    0,    0],
        [ 118,  119, 4110, ...,    0,    0,    0],
        [ 168, 2121, 1944, ...,    0,    0,    0]], dtype=int32),
 12112)

In [16]:
y = to_categorical(labels, num_classes=2)
y, len(y)

(array([[0., 1.],
        [0., 1.],
        [0., 1.],
        ...,
        [1., 0.],
        [1., 0.],
        [1., 0.]], dtype=float32),
 12112)

In [17]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
input_dim = len(pfam_ids)
input_dim

9633

In [28]:
output_dim = 50  # Embedding dimension
input_length = max_len


In [29]:
input_layer = Input(shape=(input_length,))
embedding_layer = Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length)(input_layer)
bilstm_layer = Bidirectional(LSTM(units=50, return_sequences=True))(embedding_layer)
dense_layer = TimeDistributed(Dense(2))(bilstm_layer)  # 2 output classes (0 and 1)

# Add CRF layer
crf_layer = CRF(2)  # 2 output classes
output_layer = crf_layer(dense_layer)

# Define the model
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
model.compile(optimizer='adam', loss=crf_loss, metrics=[crf_accuracy])

model.summary()


2025-03-03 23:48:29.972381: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2025-03-03 23:48:29.976141: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2025-03-03 23:48:29.977966: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 373)]             0         
                                                                 
 embedding_1 (Embedding)     (None, 373, 50)           481650    
                                                                 
 bidirectional_1 (Bidirectio  (None, 373, 100)         40400     
 nal)                                                            
                                                                 
 time_distributed_1 (TimeDis  (None, 373, 2)           202       
 tributed)                                                       
                                                                 
 crf_1 (CRF)                 (None, 373, 2)            14        
                                                                 
Total params: 522,266
Trainable params: 522,266
Non-trainab

In [30]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)
]

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    callbacks=callbacks
)


Epoch 1/10


2025-03-03 23:48:35.040434: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2025-03-03 23:48:35.042644: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2025-03-03 23:48:35.043879: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

AttributeError: in user code:

    File "/home/ultron/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "/home/ultron/.local/lib/python3.10/site-packages/keras_contrib/losses/crf_losses.py", line 54, in crf_loss  *
        crf, idx = y_pred._keras_history[:2]

    AttributeError: 'Tensor' object has no attribute '_keras_history'
