In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("/content/labeled_protein_sequences.csv")
df.head()

Unnamed: 0,Sequence_ID,Protein_Sequence,Label
0,UIF27901.1,ITVEELKKLLEQWNLVIGFLFLTWICLLQFAYANRNRFLYIIKLIF...,1
1,QJX19961.1,MAFSASLFKPVQLVPVSPAFHRIESTDSIVFTYIPASGYVAALAVN...,0
2,UMG10730.1,MSDNGPQNQRNAPRITFGGPSDSTGSNQNGERSGARSKQRRPQGLP...,1
3,ASU90795.1,MAFSASLFKPVQLVPVSPAFHRIESPDSIVFTYIPASGYVAALAVN...,0
4,UBU60787.1,MDLFMRIFTIGTVTLKQGEIKDATPLDFVRATATIPIQASLPFGWL...,1


In [3]:
df.shape

(17398, 3)

In [4]:
third_col = df.iloc[:, 2]

In [5]:
third_col.shape

(17398,)

In [6]:
third_col_array = third_col.to_numpy()

# Save to .npy file
np.save('labels.npy', third_col_array)

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import numpy as np

# ---------- STEP 1: Load your data ----------
features = np.load("features.npy")   # shape: [num_samples, num_features]
labels = np.load("labels.npy")       # shape: [num_samples]

# ---------- STEP 2: Train-test split ----------
train_idx, test_idx = train_test_split(np.arange(len(labels)), test_size=0.2, stratify=labels, random_state=42)

# ---------- STEP 3: Build edge_index (graph connections) ----------
def build_edge_index(features, threshold=0.9):
    sim_matrix = cosine_similarity(features)
    edge_index = []

    num_nodes = sim_matrix.shape[0]
    for i in range(num_nodes):
        for j in range(num_nodes):
            if i != j and sim_matrix[i][j] > threshold:
                edge_index.append([i, j])

    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    return edge_index

edge_index = build_edge_index(features)

# ---------- STEP 4: Create PyTorch Geometric Data ----------
x = torch.tensor(features, dtype=torch.float)
y = torch.tensor(labels, dtype=torch.long)
data = Data(x=x, edge_index=edge_index, y=y)

# ---------- STEP 5: Define GCN ----------
class GCN(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x

model = GCN(in_channels=x.shape[1], hidden_channels=64, out_channels=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

# ---------- STEP 6: Train ----------
for epoch in range(1, 101):
    model.train()
    out = model(data)
    train_logits = out[train_idx]
    train_labels = y[train_idx]

    loss = criterion(train_logits, train_labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    # Metrics
    with torch.no_grad():
        model.eval()
        test_logits = out[test_idx]
        test_labels = y[test_idx]

        train_pred = train_logits.argmax(dim=1)
        train_acc = (train_pred == train_labels).float().mean().item()

        test_loss = criterion(test_logits, test_labels)
        test_pred = test_logits.argmax(dim=1)
        test_acc = (test_pred == test_labels).float().mean().item()

    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch:03d} | Train Loss: {loss.item():.4f} | Train Acc: {train_acc:.4f} | "
              f"Test Loss: {test_loss.item():.4f} | Test Acc: {test_acc:.4f}")

Epoch 001 | Train Loss: 1.0839 | Train Acc: 0.4457 | Test Loss: 1.0783 | Test Acc: 0.4405
Epoch 010 | Train Loss: 0.7167 | Train Acc: 0.6067 | Test Loss: 0.7285 | Test Acc: 0.6055
Epoch 020 | Train Loss: 0.6952 | Train Acc: 0.5428 | Test Loss: 0.7012 | Test Acc: 0.5353
Epoch 030 | Train Loss: 0.6723 | Train Acc: 0.6196 | Test Loss: 0.6811 | Test Acc: 0.6167
Epoch 040 | Train Loss: 0.6664 | Train Acc: 0.6173 | Test Loss: 0.6741 | Test Acc: 0.6092
Epoch 050 | Train Loss: 0.6632 | Train Acc: 0.6217 | Test Loss: 0.6726 | Test Acc: 0.6170
Epoch 060 | Train Loss: 0.6609 | Train Acc: 0.6232 | Test Loss: 0.6713 | Test Acc: 0.6178
Epoch 070 | Train Loss: 0.6586 | Train Acc: 0.6243 | Test Loss: 0.6708 | Test Acc: 0.6187
Epoch 080 | Train Loss: 0.6564 | Train Acc: 0.6262 | Test Loss: 0.6712 | Test Acc: 0.6187
Epoch 090 | Train Loss: 0.6540 | Train Acc: 0.6284 | Test Loss: 0.6717 | Test Acc: 0.6181
Epoch 100 | Train Loss: 0.6515 | Train Acc: 0.6293 | Test Loss: 0.6727 | Test Acc: 0.6138


In [18]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

# Load your data
X = np.load('features.npy')  # Shape: (17000, N)
y = np.load('labels.npy')    # Shape: (17000,)

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Define the model
model = Sequential([
    Dense(512, activation='relu', input_shape=(X.shape[1],), kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.5),

    Dense(256, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.5),

    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, activation='relu'),
    Dropout(0.2),

    Dense(1, activation='sigmoid')  # Binary classification
])

# Compile
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Callbacks
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=5, factor=0.5, min_lr=1e-6)

# Train
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=64,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

# Evaluate
loss, acc = model.evaluate(X_val, y_val)
print(f"\n✅ Validation Accuracy: {acc:.4f}")


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 12ms/step - accuracy: 0.5297 - loss: 1.3915 - val_accuracy: 0.6230 - val_loss: 1.2034 - learning_rate: 0.0010
Epoch 2/100
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.5727 - loss: 1.2274 - val_accuracy: 0.6221 - val_loss: 1.1447 - learning_rate: 0.0010
Epoch 3/100
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.5889 - loss: 1.1464 - val_accuracy: 0.6233 - val_loss: 1.0794 - learning_rate: 0.0010
Epoch 4/100
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.6143 - loss: 1.0673 - val_accuracy: 0.6221 - val_loss: 1.0122 - learning_rate: 0.0010
Epoch 5/100
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.6097 - loss: 1.0032 - val_accuracy: 0.6227 - val_loss: 0.9558 - learning_rate: 0.0010
Epoch 6/100
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3

In [19]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
print(clf.score(X_val, y_val))

0.6135057471264368
