In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from keras.utils import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import to_categorical
from keras_contrib.layers import CRF
from keras_contrib import losses
from keras_contrib import metrics
from keras import optimizers


2025-03-03 23:41:07.744415: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-03 23:41:07.783023: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-03 23:41:07.989558: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-03 23:41:07.992469: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
positive_data = pd.read_csv('MIBiG.pfam.tsv', sep='\t')
negative_data = pd.read_csv('GeneSwap_Negatives.pfam.tsv', sep='\t')
negative_data.rename(columns={"contig_id": "sequence_id"}, inplace=True)

# Combine datasets
positive_data['label'] = 1
negative_data['label'] = 0
combined_data = pd.concat([positive_data, negative_data], ignore_index=True)
combined_data


Unnamed: 0.1,sequence_id,protein_id,gene_start,gene_end,gene_strand,pfam_id,in_cluster,label,Unnamed: 0,domain_start,domain_end,bitscore
0,BGC0000001.1,AEK75490.1,0,1083,1,PF02353,1,1,,,,
1,BGC0000001.1,AEK75490.1,0,1083,1,PF01135,1,1,,,,
2,BGC0000001.1,AEK75490.1,0,1083,1,PF01269,1,1,,,,
3,BGC0000001.1,AEK75490.1,0,1083,1,PF13489,1,1,,,,
4,BGC0000001.1,AEK75490.1,0,1083,1,PF01596,1,1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
803357,NEG_FAKE_CLUSTER|U00096.3|AB007043.2,U00096_4155,4452,5399,-1,PF13384,0,0,706945.0,16.0,49.0,18.5
803358,NEG_FAKE_CLUSTER|U00096.3|AB007043.2,U00096_4155,4452,5399,-1,PF09339,0,0,706946.0,17.0,40.0,16.5
803359,NEG_FAKE_CLUSTER|U00096.3|AB007043.2,U00096_4155,4452,5399,-1,PF00532,0,0,706947.0,22.0,274.0,40.2
803360,NEG_FAKE_CLUSTER|U00096.3|AB007043.2,U00096_3308,5399,6625,-1,PF01676,0,0,706948.0,1.0,243.0,169.8


In [3]:
pfam_ids = combined_data['pfam_id'].unique()
pfam_to_idx = {pfam: idx for idx, pfam in enumerate(pfam_ids)}
combined_data['pfam_idx'] = combined_data['pfam_id'].map(pfam_to_idx)

# Prepare sequences
sequences = combined_data.groupby('sequence_id')['pfam_idx'].apply(list).values
labels = combined_data.groupby('sequence_id')['label'].first().values

# Pad sequences
max_len = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_len, padding='post')
y = to_categorical(labels, num_classes=2)

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Build BiLSTM-CRF model
input_dim = len(pfam_ids)
output_dim = 50  # Embedding dimension
input_length = max_len


In [4]:
WORD_COUNT = len(pfam_ids)  # Vocabulary size
DENSE_EMBEDDING = 50  # Embedding dimension
LSTM_UNITS = 50  # Number of LSTM units
LSTM_DROPOUT = 0.1  # LSTM dropout rate
DENSE_UNITS = 2  # Number of output classes (0 and 1)
TAG_COUNT = 2  # Number of output classes (0 and 1)
BATCH_SIZE = 32  # Batch size
MAX_EPOCHS = 10  # Number of epochs

# Build the model
input_layer = Input(shape=(max_len,))

# Embedding layer
model = Embedding(input_dim=WORD_COUNT, output_dim=DENSE_EMBEDDING, input_length=max_len)(input_layer)

# Bidirectional LSTM layer
model = Bidirectional(LSTM(units=LSTM_UNITS, recurrent_dropout=LSTM_DROPOUT, return_sequences=True))(model)

# TimeDistributed Dense layer
model = TimeDistributed(Dense(DENSE_UNITS, activation="relu"))(model)

# CRF layer
crf_layer = CRF(units=TAG_COUNT)
output_layer = crf_layer(model)

# Define the model
ner_model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
loss = losses.crf_loss
acc_metric = metrics.crf_accuracy
opt = optimizers.Adam(learning_rate=0.001)

ner_model.compile(optimizer=opt, loss=loss, metrics=[acc_metric])

# Print model summary
ner_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 373)]             0         
                                                                 
 embedding (Embedding)       (None, 373, 50)           481650    
                                                                 
 bidirectional (Bidirectiona  (None, 373, 100)         40400     
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 373, 2)           202       
 ibuted)                                                         
                                                                 
 crf (CRF)                   (None, 373, 2)            14        
                                                                 
Total params: 522,266
Trainable params: 522,266
Non-trainable

In [5]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)
]

# Train the model
history = ner_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=MAX_EPOCHS,
    callbacks=callbacks,
    verbose=2
)


Epoch 1/10


AttributeError: in user code:

    File "/home/ultron/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1284, in train_function  *
        return step_function(self, iterator)
    File "/home/ultron/.local/lib/python3.10/site-packages/keras_contrib/losses/crf_losses.py", line 54, in crf_loss  *
        crf, idx = y_pred._keras_history[:2]

    AttributeError: 'Tensor' object has no attribute '_keras_history'


In [8]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split  
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
import tensorflow_addons as tfa
from tensorflow_addons.layers import CRF

# Load datasets
positive_data = pd.read_csv('MIBiG.pfam.tsv', sep='\t')
negative_data = pd.read_csv('GeneSwap_Negatives.pfam.tsv', sep='\t')
negative_data.rename(columns={"contig_id": "sequence_id"}, inplace=True)

# Combine datasets
positive_data['label'] = 1
negative_data['label'] = 0
combined_data = pd.concat([positive_data, negative_data], ignore_index=True)

# Encode Pfam IDs
pfam_ids = combined_data['pfam_id'].unique()
pfam_to_idx = {pfam: idx for idx, pfam in enumerate(pfam_ids)}
combined_data['pfam_idx'] = combined_data['pfam_id'].map(pfam_to_idx)

# Prepare sequences
sequences = combined_data.groupby('sequence_id')['pfam_idx'].apply(list).values
labels = combined_data.groupby('sequence_id')['label'].first().values

# Pad sequences
max_len = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_len, padding='post')
y = tf.keras.utils.to_categorical(labels, num_classes=2)

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=1234)

# Define model parameters
WORD_COUNT = len(pfam_ids)  
DENSE_EMBEDDING = 50  
LSTM_UNITS = 50  
LSTM_DROPOUT = 0.1  
DENSE_UNITS = 2  
TAG_COUNT = 2  
BATCH_SIZE = 32  
MAX_EPOCHS = 10  

# Build the model
input_layer = Input(shape=(max_len,))

# Embedding layer
model = Embedding(input_dim=WORD_COUNT, output_dim=DENSE_EMBEDDING, input_length=max_len)(input_layer)

# Bidirectional LSTM layer
model = Bidirectional(LSTM(units=LSTM_UNITS, recurrent_dropout=LSTM_DROPOUT, return_sequences=True))(model)

# TimeDistributed Dense layer
model = TimeDistributed(Dense(DENSE_UNITS, activation="relu"))(model)

# CRF layer
crf_layer = CRF(TAG_COUNT)
output_layer = crf_layer(model)

# Define the model
ner_model = Model(inputs=input_layer, outputs=output_layer)

# ✅ Fix Loss Function
loss = crf_layer.get_loss()  # ✅ Correct
acc_metric = crf_layer.get_accuracy()  # ✅ Correct

# Compile the model
opt = Adam(learning_rate=0.001)
ner_model.compile(optimizer=opt, loss=loss, metrics=[acc_metric])

# Print model summary
ner_model.summary()

# Add callbacks for early stopping and model checkpointing
callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)
]

# Train the model
history = ner_model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=BATCH_SIZE,
    epochs=MAX_EPOCHS,
    callbacks=callbacks,
    verbose=2
)

# Load test data
test_data = pd.read_csv('test.csv')
test_sequences = test_data['pfam_id'].map(pfam_to_idx).values
test_X = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Predict
predictions = ner_model.predict(test_X)
predicted_labels = np.argmax(predictions, axis=-1)

# Output predictions
test_data['predicted_label'] = predicted_labels.flatten()
test_data.to_csv('predictions.csv', index=False)


AttributeError: 'CRF' object has no attribute 'get_loss'

In [9]:
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow_addons.layers import CRF
from tensorflow_addons.text.crf import crf_log_likelihood


# Helper function to unpack data
def unpack_data(data):
    if isinstance(data, tuple):
        return data if len(data) == 3 else (data[0], data[1], None)
    return data, None, None


# ✅ Custom CRF Model Wrapper
class ModelWithCRFLoss(tf.keras.Model):
    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model

    def call(self, inputs):
        return self.base_model(inputs)

    def compute_loss(self, x, y, sample_weight=None, training=False):
        y_pred = self(x, training=training)
        potentials, sequence_length, chain_kernel = y_pred

        # Compute CRF loss
        crf_loss = -crf_log_likelihood(potentials, y, sequence_length, chain_kernel)[0]

        if sample_weight is not None:
            crf_loss *= sample_weight

        return tf.reduce_mean(crf_loss) + sum(self.losses)

    def train_step(self, data):
        x, y, sample_weight = unpack_data(data)

        with tf.GradientTape() as tape:
            total_loss = self.compute_loss(x, y, sample_weight, training=True)

        gradients = tape.gradient(total_loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

        return {"loss": total_loss}

    def test_step(self, data):
        x, y, sample_weight = unpack_data(data)
        total_loss = self.compute_loss(x, y, sample_weight)
        return {"val_loss": total_loss}


# ✅ Define Model
WORD_COUNT = 1000  # Example vocabulary size
DENSE_EMBEDDING = 50
LSTM_UNITS = 50
MAX_LEN = 100
TAG_COUNT = 2  # Number of output labels

input_layer = Input(shape=(MAX_LEN,))
embedding_layer = Embedding(input_dim=WORD_COUNT, output_dim=DENSE_EMBEDDING, input_length=MAX_LEN)(input_layer)
bi_lstm_layer = Bidirectional(LSTM(units=LSTM_UNITS, return_sequences=True))(embedding_layer)
dense_layer = Dense(TAG_COUNT)(bi_lstm_layer)

# ✅ Apply CRF correctly
crf_layer = CRF(TAG_COUNT)
output_layer = crf_layer(dense_layer)

# ✅ Wrap the model with CRF Loss Handling
base_model = Model(inputs=input_layer, outputs=output_layer)
crf_model = ModelWithCRFLoss(base_model)

# ✅ Compile with Optimizer
crf_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))

# ✅ Summary
crf_model.summary()



2025-03-04 00:22:09.725213: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2025-03-04 00:22:09.725998: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2025-03-04 00:22:09.727166: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

ValueError: This model has not yet been built. Build the model first by calling `build()` or by calling the model on a batch of data.