In [76]:
import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from keras.layers import Dense, Conv1D, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import LSTM
from keras.layers import TimeDistributed, Bidirectional
from keras.models import Sequential
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, ShuffleSplit, KFold

In [77]:
# Load the best saved model from disk; it is used below for inference only.
# NOTE: loading a serialized model can execute code embedded in the file, so
# only load model files that come from a trusted source.
model = load_model("2.best_model.h5")




In [78]:
# Class table: one row per contig_id, one 0/1 indicator column per BGC class.
labels = (
    pd.read_csv("MIBiG.classes.csv")
    .set_index('contig_id')
)
# Every remaining column is one class label.
NUM_LABELS = len(labels.columns)
labels

Unnamed: 0_level_0,Alkaloid,NRP,Other,Polyketide,RiPP,Saccharide,Terpene
contig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
BGC0000001.1,0,0,0,1,0,0,0
BGC0000002.1,0,0,0,1,0,0,0
BGC0000003.1,0,0,0,1,0,0,0
BGC0000004.1,0,0,0,1,0,0,0
BGC0000005.1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...
BGC0001826.1,0,1,0,0,0,0,0
BGC0001827.1,0,0,0,1,0,0,0
BGC0001828.1,0,0,0,1,0,0,0
BGC0001829.1,0,1,0,1,0,0,0


In [79]:
# Number of sequence positions every input is padded/truncated to; used as the
# default `timesteps` of preprocess_sequence.
TIMESTEPS = 128
# Batch size. Not referenced by the visible cells — presumably the value used
# when the model was trained; TODO confirm.
BATCH_SIZE = 16
# Width of one Pfam2Vec embedding vector. Not referenced by the visible cells —
# presumably has to match the loaded Word2Vec model; TODO confirm.
PFAM2VEC_DIMENSIONS = 100

In [80]:
# Load the saved Word2Vec (Pfam2Vec) model with gensim. `Word2Vec` is already
# imported in the first cell, so it is not imported again here.
# NOTE: Word2Vec.load unpickles the file — only load models from a trusted source.
pfam2vec_bin = Word2Vec.load('Word2Vec/word2vec_model.bin')

# Embedding table: one row per vocabulary key (Pfam ID), one column per
# embedding dimension.
pfam2vec = pd.DataFrame(pfam2vec_bin.wv.vectors, index=pfam2vec_bin.wv.index_to_key)

# Display the embedding table (pandas abbreviates long frames in the output).
pfam2vec


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
PF00005,0.233514,0.052725,0.033491,0.240617,-0.052133,-0.383191,0.097324,0.307650,-0.516893,-0.119899,...,-0.130947,-0.349238,-0.052996,0.266614,0.556849,-0.025646,0.321105,-0.287103,0.029798,0.149110
PF07690,0.084144,0.282901,-0.045280,0.076797,-0.022403,-0.326592,0.109536,0.147560,0.070727,-0.395535,...,0.271868,-0.034904,-0.008765,-0.342772,-0.019638,-0.114983,-0.203931,-0.463244,0.079770,0.186862
PF13304,0.111681,0.023277,0.181347,0.151265,-0.179347,-0.140607,0.081777,0.406897,-0.509895,-0.132071,...,-0.100485,-0.220790,-0.087275,0.204365,0.523665,-0.074779,0.398022,-0.301283,0.035700,0.046756
PF00072,-0.303923,-0.144928,0.174792,0.111320,-0.050851,-0.241886,0.114559,0.242707,0.023629,-0.339719,...,0.259481,-0.173986,-0.335270,-0.064067,0.594384,-0.297773,-0.032209,-0.532959,0.102281,0.016370
PF00528,-0.073311,0.541115,-0.048413,-0.371761,0.181419,0.124935,0.017770,0.555671,-0.512254,-0.206273,...,0.077985,-0.153859,0.116167,0.072122,0.970655,0.079777,0.441720,-0.321656,0.043775,0.187739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
PF00518,-0.070158,-0.065413,0.019068,0.022858,0.069239,-0.179477,0.119020,0.293031,-0.051709,-0.138381,...,0.104844,-0.152967,0.031874,0.091196,-0.031648,-0.022964,0.043654,-0.201326,0.148341,0.097215
PF07033,-0.010519,0.020789,-0.006451,-0.060890,-0.009975,-0.159964,0.054209,0.259067,-0.032121,-0.150636,...,-0.011022,0.020476,0.119876,0.037193,0.120064,-0.122264,0.046126,-0.287325,0.053798,-0.106282
PF15873,-0.110694,-0.012107,0.059395,-0.005941,0.106874,-0.165131,-0.093995,0.093994,0.076426,-0.070573,...,0.111926,-0.116411,0.112473,0.085117,0.157332,0.065394,0.070263,-0.179310,0.149701,-0.101817
PF15049,-0.126276,0.086068,0.140609,-0.026558,0.106151,-0.213212,-0.017060,0.259988,0.099858,-0.165359,...,-0.014387,-0.078912,0.023920,0.009382,0.090026,0.048761,0.020485,-0.080630,0.066559,-0.094392


In [81]:
def preprocess_sequence(sequence, pfam2vec, timesteps=TIMESTEPS):
    """
    Convert a sequence of Pfam IDs into Pfam2Vec vectors padded to the model input shape.

    Parameters
    ----------
    sequence : list of str
        Pfam IDs in order. IDs that are not in the index of ``pfam2vec`` are
        silently dropped before padding.
    pfam2vec : pandas.DataFrame
        Embedding table indexed by Pfam ID, one column per embedding dimension.
    timesteps : int, optional
        Number of positions in the output. Shorter sequences are zero-padded at
        the end; longer ones are truncated at the end.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(1, timesteps, n_dimensions)`` — a batch that
        contains the single encoded sequence.
    """
    # reindex() produces an all-NaN row for every ID missing from the
    # vocabulary; dropna() removes those rows so only known domains remain.
    vectors = pfam2vec.reindex(sequence).dropna()

    if len(vectors) == 0:
        # With no known domain left, pad_sequences cannot infer the embedding
        # dimension and would return a 2-D (1, timesteps) array. Build the
        # all-padding input explicitly so the shape stays 3-D.
        n_steps = 0 if timesteps is None else timesteps
        return np.zeros((1, n_steps, pfam2vec.shape[1]), dtype=np.float64)

    padded_sequence = pad_sequences(
        [vectors.values],
        maxlen=timesteps,
        dtype=np.float64,
        padding='post',
        truncating='post',
    )
    return padded_sequence


In [83]:
# Ordered Pfam domain IDs of the example to classify (swap in any other sequence).
sequence = [
    'PF01926', 'PF03938', 'PF00350', 'PF02108', 'PF14400', 'PF01053', 'PF01041', 'PF00155',
    'PF00266', 'PF01370', 'PF16363', 'PF01073', 'PF02719', 'PF07993', 'PF04321', 'PF01411',
    'PF07973', 'PF02272', 'PF13662', 'PF02132', 'PF01370', 'PF16363', 'PF01073', 'PF02719',
    'PF07993', 'PF04321', 'PF13662', 'PF02132', 'PF00701', 'PF08241', 'PF08242', 'PF02353',
    'PF13649', 'PF13847', 'PF13489', 'PF12847', 'PF13679', 'PF05175', 'PF08123', 'PF01209',
    'PF01596', 'PF01135', 'PF08003', 'PF01555', 'PF06325', 'PF13245', 'PF00004', 'PF13238',
    'PF00910', 'PF13086', 'PF00580', 'PF09848', 'PF05729', 'PF03266', 'PF13604', 'PF04851',
    'PF00005', 'PF13401', 'PF02562', 'PF01078', 'PF13476', 'PF13191', 'PF03193', 'PF00437',
    'PF13905', 'PF00085', 'PF13098', 'PF13192', 'PF00462', 'PF13899', 'PF02966', 'PF00578',
    'PF08534', 'PF06110', 'PF14595', 'PF04756', 'PF00294', 'PF01467', 'PF08543', 'PF13662',
    'PF02132', 'PF02667', 'PF03606', 'PF03806', 'PF05670', 'PF05833', 'PF13184', 'PF08529',
    'PF13538', 'PF13245', 'PF00580', 'PF12705', 'PF01930', 'PF13361', 'PF13184', 'PF08529',
    'PF02602', 'PF03572', 'PF08241', 'PF08242', 'PF13649', 'PF13847', 'PF01209', 'PF13489',
    'PF13793', 'PF00156', 'PF14572', 'PF14681', 'PF00294', 'PF01467', 'PF08543', 'PF17190',
    'PF07160', 'PF07926', 'PF04791', 'PF08317', 'PF01926', 'PF03938', 'PF00350', 'PF00593',
    'PF07715',
]

# Embed and pad the domains. preprocess_sequence pads a one-element list, so its
# result already carries a leading batch axis and no np.expand_dims is needed.
X_new = preprocess_sequence(sequence=sequence, pfam2vec=pfam2vec)

# Per-class scores for the single example in the batch.
y_pred = model.predict(X_new)

# Multi-label decision: every class score is thresholded independently at 0.5.
# (A single-label, multi-class decision would take the argmax instead.)
predicted_labels = np.greater(y_pred, 0.5).astype(int)






[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 403ms/step


In [84]:
# Boolean mask over the classes of the first (and only) example in the batch.
is_predicted = predicted_labels[0] == 1
# Map the mask back to class names; assumes the columns of `labels` are in the
# same order as the model outputs — TODO confirm.
predicted_classes = labels.columns[is_predicted]
print("Predicted BGC Classes:", predicted_classes.tolist())


Predicted BGC Classes: ['Saccharide']


In [85]:
# Inspect the raw scores returned by model.predict for the example sequence.
y_pred

array([[7.6468277e-06, 1.2575827e-02, 3.7826832e-02, 5.1329928e-03,
        4.2308844e-08, 9.4445586e-01, 7.3752011e-07]], dtype=float32)

In [86]:
# Attach the class names to the raw scores so each value can be read by label
# (one row per predicted example, one column per class in `labels`).
y_pred_df = pd.DataFrame(data=y_pred, columns=labels.columns)
y_pred_df

Unnamed: 0,Alkaloid,NRP,Other,Polyketide,RiPP,Saccharide,Terpene
0,8e-06,0.012576,0.037827,0.005133,4.230884e-08,0.944456,7.375201e-07
