In [1]:
import pandas as pd 
import numpy as np 
import keras 
import tarfile
import os
import lzma
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Dense, Dropout, Flatten
from sklearn.model_selection import StratifiedKFold, train_test_split
from scikeras.wrappers import KerasClassifier
import matplotlib.pyplot as plt
from keras.backend import clear_session
from sklearn.metrics import f1_score
from scipy import stats
import keras_tuner as kt
from keras.layers import GRU, SimpleRNN, Dense, Dropout, Embedding
from sklearn.model_selection import StratifiedKFold, train_test_split


In [2]:
# Source archive and the directory it is unpacked into.
filepath = "data/cath.tar.xz"
output_dir = "data/extracted_data/cath"

# Files this notebook reads from the unpacked archive.
FN_DOMAIN_LIST = 'data/extracted_data/cath/proteins/domain_classification.txt'
FN_SF_NAMES = 'data/extracted_data/cath/proteins/superfamily_names.txt'
FN_SEQ_S60 = 'data/extracted_data/cath/proteins/seqs_S60.fa'

os.makedirs(output_dir, exist_ok=True)

# Unpack only when one of the required files is missing, so re-running the
# cell does not decompress the whole archive again.
# NOTE: remove the extracted directory to force a fresh extraction if the
# archive itself is ever replaced.
if not all(os.path.isfile(p) for p in (FN_DOMAIN_LIST, FN_SF_NAMES, FN_SEQ_S60)):
    # tarfile handles the .xz compression itself; no separate lzma wrapper needed.
    with tarfile.open(filepath, mode="r:xz") as tar:
        if hasattr(tarfile, "data_filter"):
            # Reject members that would escape output_dir (absolute paths,
            # "..", unsafe links) on Python versions that support filters.
            tar.extractall(path=output_dir, filter="data")
        else:
            tar.extractall(path=output_dir)

# Parse the FASTA file into a list of {'domain_id', 'sequence'} records.
sequences = []
current_id = None   # id of the record currently being collected
current_seq = []    # sequence chunks seen since the last header line

with open(FN_SEQ_S60, 'r') as f:
    for raw_line in f:
        line = raw_line.strip()

        # Anything that is not a header is a chunk of the current sequence.
        if not line.startswith('>'):
            current_seq.append(line)
            continue

        # A new header closes the record collected so far (if there is one).
        if current_id:
            sequences.append(
                dict(domain_id=current_id, sequence="".join(current_seq))
            )

        header = line[1:]
        fields = header.split('|')
        if len(fields) >= 3:
            # Third '|' field looks like "<id>/<range>"; keep only the id part.
            current_id = fields[2].split('/')[0]
        else:
            # Fallback: first whitespace-delimited token of the header.
            current_id = header.split()[0]
        current_seq = []

    # The last record has no following header to trigger the flush above.
    if current_id:
        sequences.append(
            dict(domain_id=current_id, sequence="".join(current_seq))
        )

df_seq = pd.DataFrame(sequences)


# Names assigned to the whitespace-separated columns of the domain list.
col_names = ['domain_id', 'C', 'A', 'T', 'H', 'S', 'O', 'L', 'I', 'D', 'len', 'res']

df_domains = pd.read_csv(
    FN_DOMAIN_LIST,
    sep=r'\s+',
    comment='#',      # skip the file's '#' comment lines
    header=None,
    names=col_names,
    usecols=col_names,
)

# Create Superfamily ID (C.A.T.H)
# Built column-wise instead of with a row-wise apply(axis=1): same strings,
# but without calling a Python lambda once per row.
cath_levels = df_domains[['C', 'A', 'T', 'H']].astype(str)
df_domains['superfamily_id'] = (
    cath_levels['C'] + '.' + cath_levels['A'] + '.'
    + cath_levels['T'] + '.' + cath_levels['H']
)

# Map the first whitespace-delimited token of each line to the remainder of
# the line. Lines starting with '#' and lines with fewer than two fields are
# skipped; a repeated key keeps its last value.
with open(FN_SF_NAMES, 'r') as names_file:
    data_lines = (ln for ln in names_file if not ln.startswith('#'))
    split_lines = (ln.strip().split(maxsplit=1) for ln in data_lines)
    sf_names = {
        fields[0]: fields[1]
        for fields in split_lines
        if len(fields) == 2
    }

# Keep only domains that have both a sequence and a classification row.
df_merged = df_seq.merge(df_domains, on='domain_id', how='inner')

# Count members per superfamily, drop those with 1000 or more members,
# then take the five largest of what is left.
sf_counts = df_merged['superfamily_id'].value_counts()
small_sfs = sf_counts.loc[sf_counts < 1000]
top_5_sfs = small_sfs.nlargest(5).index.tolist()

# Row mask shared by both derived frames below.
in_top_5 = df_merged['superfamily_id'].isin(top_5_sfs)
df_filtered = df_merged.loc[in_top_5].copy()

# Modelling table: every value cast to str, only the two needed columns,
# with a fresh 0..n-1 index.
final_data = (
    df_merged.loc[in_top_5]
    .astype(str)
    .loc[:, ["sequence", "superfamily_id"]]
    .reset_index(drop=True)
)

# Length of the longest sequence; used as the fixed encoded length.
max_len = int(final_data["sequence"].str.len().max())

# Character-level tokenizer whose vocabulary is learned from the sequences.
vectorizer = keras.layers.TextVectorization(
    split="character",
    output_sequence_length=max_len,
)
vectorizer.adapt(final_data["sequence"])
x = vectorizer(final_data["sequence"])

# Integer class labels for the superfamily ids.
le = LabelEncoder()
y = le.fit_transform(final_data["superfamily_id"])
num_classes = len(le.classes_)


def _as_numpy(values):
    """Return ``values.numpy()`` when that method exists, else ``np.array(values)``."""
    return values.numpy() if hasattr(values, 'numpy') else np.array(values)


x_numpy = _as_numpy(x)
y_numpy = _as_numpy(y).astype(int)
rng = np.random.RandomState(42)

In [None]:
# Model sizes are read from the fitted preprocessing objects instead of being
# hard-coded: an Embedding whose input_dim is smaller than the largest token
# id fails (or silently misbehaves) at lookup time, and the softmax width
# must match the number of encoded classes.
TOKENS = vectorizer.vocabulary_size()  # vocabulary size, incl. padding and OOV entries
DIMENSIONS = 16
CLASSES = num_classes
SIZE = 4  # Kernel/Pool size constant

# Hold out 20% of the samples for validation, stratified on the labels that
# are actually being split so class proportions match in both parts.
x_train, x_val, y_train, y_val = train_test_split(
    x_numpy, y_numpy, test_size=0.2, stratify=y_numpy, random_state=42
)

# --- 1. DEFINE CNN MODEL ---
def build_cnn(hp):
    """Build and compile the 1-D CNN classifier for one tuner trial.

    Hyperparameters registered on ``hp``:
        units         -- 32..128 in steps of 32; used both as the number of
                         Conv1D filters and as the width of the hidden Dense layer
        dropout       -- 0.2 or 0.5
        learning_rate -- 1e-2 or 1e-3 (Adam)

    Reads ``max_len``, ``TOKENS``, ``DIMENSIONS``, ``SIZE`` and ``CLASSES``
    from the notebook's global scope.

    Returns:
        A compiled ``keras.Sequential`` model (sparse categorical
        cross-entropy loss, accuracy metric).
    """
    n_units = hp.Int('units', min_value=32, max_value=128, step=32)
    drop_rate = hp.Choice('dropout', values=[0.2, 0.5])
    step_size = hp.Choice('learning_rate', values=[1e-2, 1e-3])

    net = keras.Sequential()
    net.add(keras.Input(shape=(max_len,)))

    # No masking here: the convolution/pooling stack consumes the padded
    # positions like any other token.
    net.add(Embedding(TOKENS, DIMENSIONS, mask_zero=False))

    # padding='same' keeps the sequence length unchanged through the
    # convolution, before pooling and flattening.
    net.add(Conv1D(filters=n_units, kernel_size=SIZE, activation="relu", padding='same'))
    net.add(MaxPooling1D(pool_size=SIZE))
    net.add(Flatten())

    # Classification head.
    net.add(Dense(n_units, activation="relu"))
    net.add(Dropout(drop_rate))
    net.add(Dense(CLASSES, activation="softmax"))

    net.compile(
        optimizer=keras.optimizers.Adam(learning_rate=step_size),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return net

# --- 2. SETUP THE TUNER ---
# Random search over the CNN hyperparameters defined in build_cnn.
cnn_tuner = kt.RandomSearch(
    build_cnn,
    objective='val_accuracy',
    max_trials=5,
    executions_per_trial=1,
    seed=42,                      # fixed so the sampled trials are repeatable across runs
    directory='my_dir',
    project_name='cnn_tuning_v1', # Unique name
    overwrite=True                # Discard results of earlier runs stored under this project name
)

# --- 3. RUN SEARCH ---
print("Starting CNN Tuner Search...")
cnn_tuner.search(
    x_train, y_train,
    epochs=50,
    validation_data=(x_val, y_val),
    batch_size=32,
    verbose=1
)

# --- 4. GET RESULTS ---
# Stored under a CNN-specific name because `best_hps` is reassigned by the
# GRU tuning cell further down; `best_hps` is still set for existing code.
cnn_best_hps = cnn_tuner.get_best_hyperparameters(num_trials=1)[0]
best_hps = cnn_best_hps
print(f"""
CNN Search Complete.
Best units: {cnn_best_hps.get('units')}
Best learning rate: {cnn_best_hps.get('learning_rate')}
Best dropout: {cnn_best_hps.get('dropout')}
""")

Starting CNN Tuner Search...

Search: Running Trial #1

Value             |Best Value So Far |Hyperparameter
64                |64                |units
0.5               |0.5               |dropout
0.01              |0.01              |learning_rate

Epoch 1/50
80/80 ━━━━━━━━━━━━━━━━━━━━ 4s 23ms/step - accuracy: 0.4463 - loss: 1.3403 - val_accuracy: 0.4733 - val_loss: 1.1512
Epoch 2/50
80/80 ━━━━━━━━━━━━━━━━━━━━ 0s 20ms/step - accuracy: 0.5019 - loss: 1.1452

KeyboardInterrupt: 

In [5]:

# --- CONFIGURATION ---
EPOCHS_TUNING = 50
MAX_TRIALS = 5
# Sizes come from the fitted preprocessing objects rather than assumed
# literals: the Embedding's input_dim must cover every token id the
# vectorizer can emit, and the softmax width must equal the class count.
TOKENS = vectorizer.vocabulary_size()  # vocabulary size, incl. padding and OOV entries
CLASSES = num_classes

# --- 1. DATA SPLIT FOR TUNING ---
# 80/20 train/validation split, stratified on the labels being split.
x_tune_train, x_tune_val, y_tune_train, y_tune_val = train_test_split(
    x_numpy, y_numpy, test_size=0.2, stratify=y_numpy, random_state=42
)

# --- 2. DEFINE THE HYPERMODEL (GRU + RNN) ---
def build_gru_rnn(hp):
    """Build and compile the stacked GRU -> SimpleRNN classifier for one tuner trial.

    Hyperparameters registered on ``hp``:
        embedding_dim -- 8, 16, 32 or 64
        units         -- 32..128 in steps of 32; GRU width and hidden Dense
                         width (the SimpleRNN uses ``units // 2``)
        dropout       -- 0.2 or 0.5
        learning_rate -- 1e-2, 1e-3 or 1e-4 (Adam)

    Reads ``max_len``, ``TOKENS`` and ``CLASSES`` from the notebook's
    global scope.

    Returns:
        A compiled ``keras.Sequential`` model (sparse categorical
        cross-entropy loss, accuracy metric).
    """
    # Registration order is kept: embedding_dim, units, dropout, learning_rate.
    emb_width = hp.Choice('embedding_dim', values=[8, 16, 32, 64])
    n_units = hp.Int('units', min_value=32, max_value=128, step=32)
    drop_rate = hp.Choice('dropout', values=[0.2, 0.5])
    step_size = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    net = keras.Sequential()
    net.add(keras.Input(shape=(max_len,)))

    # mask_zero=True: token id 0 is treated as padding by the recurrent layers.
    net.add(Embedding(input_dim=TOKENS, output_dim=emb_width, mask_zero=True))

    # The GRU emits its full output sequence so the SimpleRNN can consume it;
    # the SimpleRNN (half the GRU width) returns only its final state.
    net.add(GRU(n_units, return_sequences=True))
    net.add(SimpleRNN(n_units // 2))

    # Classification head.
    net.add(Dense(n_units, activation="relu"))
    net.add(Dropout(drop_rate))
    net.add(Dense(CLASSES, activation="softmax"))

    net.compile(
        optimizer=keras.optimizers.Adam(learning_rate=step_size),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return net

# --- 3. PHASE 1: HYPERPARAMETER TUNING ---
print(f"--- Phase 1: Tuning Hyperparameters (Max Trials: {MAX_TRIALS}) ---")

# Random search over the hyperparameters defined in build_gru_rnn.
tuner = kt.RandomSearch(
    build_gru_rnn,
    objective='val_accuracy',
    max_trials=MAX_TRIALS,
    executions_per_trial=1,
    seed=42,                          # fixed so the sampled trials are repeatable across runs
    directory='my_dir',
    project_name='gru_rnn_tuning_v2', # Distinct from the CNN project stored in the same directory
    overwrite=True
)

# Run the search
tuner.search(
    x_tune_train, y_tune_train,
    epochs=EPOCHS_TUNING,
    validation_data=(x_tune_val, y_tune_val),
    verbose=1
)

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print("\n--- Tuning Complete. Best Hyperparameters: ---")
print(f"Embedding Dimensions: {best_hps.get('embedding_dim')}")
print(f"Units: {best_hps.get('units')}")
print(f"Dropout: {best_hps.get('dropout')}")
print(f"Learning Rate: {best_hps.get('learning_rate')}")

Trial 5 Complete [00h 18m 47s]
val_accuracy: 0.5047169923782349

Best val_accuracy So Far: 0.7924528121948242
Total elapsed time: 01h 30m 49s

--- Tuning Complete. Best Hyperparameters: ---
Embedding Dimensions: 32
Units: 32
Dropout: 0.2
Learning Rate: 0.01
