In [None]:
# 🧬 Protein Function Prediction – CAFA 6 Kaggle
# Author: Tenika Powell
# ------------------------------------------------------------
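# Goal: predict Gene Ontology (GO) terms from amino-acid sequences
# using a simple neural network (Keras / TensorFlow).
# NOTE: this baseline does not read sequences yet; it trains on random
# placeholder features and classifies only the GO aspect column.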


import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt

print("✅ Libraries loaded successfully")

# NOTE: machine-specific absolute path; point this at your local copy of the data.
terms_path = r"C:\Users\imiss\Desktop\protein-function-prediction\data\Train\train_terms.tsv"

# train_terms.tsv starts with its own "EntryID / term / aspect" header line.
# header=0 consumes that line while `names` still pins the column names.
# With header=None the header line was parsed as a data row, which added a
# bogus record and a bogus "aspect" class to the labels.
terms = pd.read_csv(terms_path, sep="\t", names=["EntryID", "term", "aspect"], header=0)
print("train_terms shape:", terms.shape)
print(terms.head())

# Placeholder features: one row of 100 uniform random values per annotation.
# They carry no information about the labels, so the model cannot do better
# than predicting the most frequent class until real features replace them.
X = np.random.rand(len(terms), 100)
# Target: the aspect column of each annotation row.
y = terms["aspect"]


# Map the string aspect labels to consecutive integer class ids.
le = LabelEncoder()
le.fit(y)
y_enc = le.transform(y)

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    test_size=0.2,
    random_state=42,
)


# Small fully connected classifier: 64 -> 32 hidden units with dropout after
# the first hidden layer, then a softmax over the encoded classes.
# The input shape is declared with an explicit Input object instead of
# `input_shape=` on the first Dense layer, which Keras warns against for
# Sequential models.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(64, activation="relu"),
    Dropout(0.2),
    Dense(32, activation="relu"),
    # One output unit per distinct encoded label.
    Dense(len(np.unique(y_enc)), activation="softmax")
])

# The sparse loss variant is used because the targets are integer class ids
# (LabelEncoder output), not one-hot vectors.
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

print("\n✅ Model built successfully!")
model.summary()


# Training configuration: 25 passes over the data in mini-batches of 32,
# with a fifth of the training rows set aside for validation metrics and
# per-epoch progress output enabled.
training_options = {
    "epochs": 25,
    "batch_size": 32,
    "validation_split": 0.2,
    "verbose": 1,
}

# `history` keeps the per-epoch metrics that are plotted further down.
history = model.fit(X_train, y_train, **training_options)


# Predicted class = index of the largest softmax output for each test row.
y_pred = np.argmax(model.predict(X_test), axis=1)

# Pass the full set of encoded labels explicitly. Without `labels`,
# classification_report infers the classes from y_test / y_pred and raises
# ValueError when that count differs from len(target_names), e.g. when a
# rare class is absent from the test split.
report_labels = np.arange(len(le.classes_))
print("\nClassification Report:\n", classification_report(y_test, y_pred, labels=report_labels, target_names=le.classes_))
print("Accuracy:", accuracy_score(y_test, y_pred))


# Draw the training and validation accuracy curves on the same axes.
for metric_key, curve_label in (("accuracy", "Train Acc"), ("val_accuracy", "Val Acc")):
    plt.plot(history.history[metric_key], label=curve_label)

plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.title("Model Accuracy Over Epochs")
plt.legend()
plt.show()


# Persist the trained model to an HDF5 file in the current working directory.
output_file = "protein_function_model.h5"
model.save(output_file)
print("✅ Model saved as protein_function_model.h5")


✅ Libraries loaded successfully
train_terms shape: (537028, 3)
   EntryID        term  aspect
0  EntryID        term  aspect
1   Q5W0B1  GO:0000785       C
2   Q5W0B1  GO:0004842       F
3   Q5W0B1  GO:0051865       P
4   Q5W0B1  GO:0006275       P


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



✅ Model built successfully!


Epoch 1/25
[1m10741/10741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 12ms/step - accuracy: 0.4659 - loss: 1.0606 - val_accuracy: 0.4668 - val_loss: 1.0584
Epoch 2/25
[1m10741/10741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 10ms/step - accuracy: 0.4666 - loss: 1.0581 - val_accuracy: 0.4668 - val_loss: 1.0580
Epoch 3/25
[1m10741/10741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 10ms/step - accuracy: 0.4666 - loss: 1.0580 - val_accuracy: 0.4668 - val_loss: 1.0580
Epoch 4/25
[1m10741/10741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 8ms/step - accuracy: 0.4666 - loss: 1.0580 - val_accuracy: 0.4668 - val_loss: 1.0581
Epoch 5/25
[1m10741/10741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 9ms/step - accuracy: 0.4666 - loss: 1.0580 - val_accuracy: 0.4668 - val_loss: 1.0580
Epoch 6/25
[1m10741/10741[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 11ms/step - accuracy: 0.4666 - loss: 1.0579 - val_accuracy: 0.4668 - val_l