# Protein Classification Challenge – Fourth Challenge
 
**Name:** AJ Book  
**Course:** EN.605.656.8VL – Computational Drug Discovery and Development  
**Due Date:** 04/30/2025  
 


This module begins by loading the preprocessed training set (`metadata_org_w_features.csv`) and the held‐out evaluation set (`testing_data_w_features.csv`).  Feature augmentation is then performed by merging:

1. **Pfam** domain presence/absence profiles  
2. **Peptide descriptors** (Kidera factors, VHSE-8, Atchley factors, Z-scales)  
3. **Protlearn** physicochemical descriptors (e.g., Shannon entropy, autocorrelation)  
4. **ProtBert** contextual embeddings  
5. **ESM2** evolutionary language model embeddings  

Each feature block is validated for row and column consistency before proceeding to preprocessing, model training, evaluation, and final submission formatting.  


In [1]:
# === Standard Library Imports ===
import re
import time
import random
import warnings
import itertools
from pathlib import Path
from collections import Counter

# === Third-Party Scientific Computing Libraries ===
import numpy as np
import pandas as pd
from scipy.stats import randint, uniform, loguniform

# === Bioinformatics Libraries ===
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# === Machine Learning Libraries (scikit-learn) ===
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import (
    train_test_split, RandomizedSearchCV, cross_val_score, GridSearchCV, cross_validate, StratifiedKFold
)
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report
)
from sklearn.decomposition import PCA
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier

# === Specialized Machine Learning Libraries ===
from xgboost import XGBClassifier

# === Visualization Libraries ===
import matplotlib.pyplot as plt
import seaborn as sns

# === Utilities ===
import joblib
from tqdm import tqdm

# === Setup Path for Custom Utilities ===
import sys
import os

# Add parent directory to sys.path so we can import from utils/
sys.path.append(os.path.abspath(".."))


# === Custom Utility Functions (our own helper scripts) ===
from utils.data_utils import save_npy, load_npy, load_npy_from_tracker
from utils.model_utils import save_model, load_model, load_best_model_from_tracker


# Read in data

In [2]:
# Read in data
data = pd.read_csv("../data/metadata_org_w_features.csv")
data_evaluation = pd.read_csv("../data/testing_data_w_features.csv")

# Information and sample data

In [3]:
# information on data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681 entries, 0 to 680
Columns: 589 entries, Entry to Gravy
dtypes: float64(584), int64(1), object(4)
memory usage: 3.1+ MB


In [4]:
# describe on data
data.describe()

Unnamed: 0,SequenceLength,A,C,D,E,F,G,H,I,K,...,DDD,MolecularWeight,IsoelectricPoint,Aromaticity,InstabilityIndex,Flexibility,Helix,Sheet,Turn,Gravy
count,681.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0,...,681.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0
mean,888.660793,0.071173,0.023348,0.044993,0.055845,0.046743,0.065586,0.022028,0.056255,0.047262,...,0.002206,99103.867857,7.097584,0.096863,42.16057,0.995021,0.304995,0.279123,0.389817,-0.051027
std,827.859592,0.022832,0.0123,0.013231,0.017925,0.015308,0.01884,0.00775,0.018204,0.017596,...,0.003108,92308.45646,1.551349,0.021673,7.412366,0.006859,0.029081,0.032611,0.043874,0.325872
min,103.0,0.024283,0.0,0.007605,0.005714,0.007782,0.016807,0.004065,0.00984,0.002915,...,0.0,11710.27,4.383673,0.033074,15.359896,0.974438,0.222539,0.18315,0.24863,-0.757339
25%,384.0,0.056695,0.015251,0.037879,0.04428,0.035479,0.052764,0.016825,0.044171,0.034237,...,0.0,42771.0835,5.723255,0.082587,37.259593,0.991538,0.284024,0.257261,0.361333,-0.284269
50%,676.0,0.06746,0.021417,0.046046,0.058594,0.044747,0.065963,0.021544,0.056726,0.048017,...,0.001377,74697.6635,6.556855,0.094744,42.218182,0.996686,0.302326,0.277,0.382857,-0.12035
75%,1015.0,0.081784,0.030753,0.053254,0.067669,0.055838,0.078212,0.026769,0.068,0.058943,...,0.003229,112207.0978,8.660054,0.109974,46.958534,0.999503,0.323615,0.300946,0.416667,0.136484
max,7096.0,0.164,0.074873,0.116598,0.123102,0.110701,0.153509,0.056093,0.125461,0.104956,...,0.031637,794048.9258,11.114366,0.180812,66.024733,1.013136,0.403005,0.419863,0.546125,1.008772


In [5]:
data.head()

Unnamed: 0,Entry,CleanSequence,Selected_PDB,ProteinClass,SequenceLength,A,C,D,E,F,...,DDD,MolecularWeight,IsoelectricPoint,Aromaticity,InstabilityIndex,Flexibility,Helix,Sheet,Turn,Gravy
0,P21611,MGKAAAVVLVTLVALLGLAQADLTPKVQVYSRFPASAGTKNVLNCF...,3p73,MHC,119,0.109244,0.016807,0.058824,0.042017,0.058824,...,0.0,13041.7928,5.834204,0.109244,22.404202,0.996347,0.336134,0.277311,0.369748,-0.047059
1,Q66GT5,MAASAWLEAGLARVLFYPTLLYTVFRGRVRGPAHRDWYHRIDHTVL...,3rgo,Phosphatase,193,0.108808,0.015544,0.020725,0.067358,0.025907,...,0.0,21942.2575,9.739448,0.082902,37.915078,0.992205,0.367876,0.186528,0.38342,-0.16114
2,Q9Y006,MNLTIKEEDFTNTFMKNEESFNTFRVTKVKRWNAKRLFKILFVTVF...,3qvc,Protease,451,0.028825,0.008869,0.04878,0.068736,0.079823,...,0.002227,51692.5641,8.043414,0.137472,37.618204,1.002078,0.305987,0.290466,0.43459,-0.249667
3,P05622,MGLPGVIPALVLRGQLLLSVLWLLGPQTSRGLVITPPGPEFVLNIS...,1aya,RTK,1098,0.043716,0.017304,0.056466,0.069217,0.030055,...,0.00365,122788.2434,4.99401,0.083789,47.564763,0.999045,0.281421,0.311475,0.385246,-0.203097
4,P06343,MALQIPSLLLLAAVVVLTVLSSPGTEGGNSERHFVHQFQPFCYFTN...,1d9k,MHC,263,0.041825,0.019011,0.022814,0.076046,0.038023,...,0.0,29966.8989,8.99168,0.091255,46.750989,0.99569,0.262357,0.254753,0.39924,-0.303042


# Extract the relevant fields from the dataset

In [6]:
def extract_fields_from_data(data):
    column_start = data.columns.get_loc("SequenceLength")+1
    # Entry and ProteinClass
    df = data.loc[:,['Entry', 'ProteinClass']]
    # PDB
    selected_PDB = data.loc[:,['Selected_PDB']]
    # Sequence
    seq = data.loc[:,['CleanSequence']]
    # Sequence Length
    seq_L = data.loc[:,['SequenceLength']]
    # Amino Acid Frequencies
    freq = data.iloc[:, column_start : column_start+20]
    # All possible dipeptide frequencies
    dipep = data.iloc[:, column_start+20 : column_start+20 + 400]
    # Reduced Amino Acid Alphabet Frequencies
    red_freq = data.iloc[:, column_start+20 + 400 : column_start+20 + 400 + 5]
    # N-Gram Profiles of Reduced Amino Acid Alphabet
    red_ngram = data.iloc[:, column_start+20 + 400 + 5 : column_start+20 + 400 + 5 + 150]
    # Protein Properties
    prop = data.iloc[:, column_start+20 + 400 + 5 + 150 :]

    return df, selected_PDB, seq, seq_L, freq, dipep, red_freq, red_ngram, prop

# For data with no ProteinClass
def extract_fields_from_data_evaluation(data):
    column_start = data.columns.get_loc("SequenceLength")+1
    # Entry and ProteinClass
    df = data.loc[:,['Entry']]
    # PDB
    selected_PDB = data.loc[:,['Selected_PDB']]
    # Sequence
    seq = data.loc[:,['CleanSequence']]
    # Sequence Length
    seq_L = data.loc[:,['SequenceLength']]
    # Amino Acid Frequencies
    freq = data.iloc[:, column_start : column_start+20]
    # All possible dipeptide frequencies
    dipep = data.iloc[:, column_start+20 : column_start+20 + 400]
    # Reduced Amino Acid Alphabet Frequencies
    red_freq = data.iloc[:, column_start+20 + 400 : column_start+20 + 400 + 5]
    # N-Gram Profiles of Reduced Amino Acid Alphabet
    red_ngram = data.iloc[:, column_start+20 + 400 + 5 : column_start+20 + 400 + 5 + 150]
    # Protein Properties
    prop = data.iloc[:, column_start+20 + 400 + 5 + 150 :]

    return df, selected_PDB, seq, seq_L, freq, dipep, red_freq, red_ngram, prop

In [7]:
(df, selected_PDB, seq, seq_L, freq, dipep,
 red_freq, red_ngram, prop) = extract_fields_from_data(data)

# Evaluation data
(df_evaluation, selected_PDB_evaluation, seq_evaluation, seq_L_evaluation,
  freq_evaluation, dipep_evaluation, red_freq_evaluation, red_ngram_evaluation, prop_evaluation) = extract_fields_from_data_evaluation(data_evaluation)

# You can use this section if you wanted to add more features to the data

## Merge External Features into Dataset
To enhance protein classification, external biological and structural features are incorporated. These include Pfam domain annotations, peptide physicochemical descriptors, and deep embeddings (ProtBert, ESM2).

## Step: Merge and Prefix Base Features

**Purpose:**  
This step consolidates the base feature blocks extracted earlier — including amino acid frequencies, dipeptide frequencies, reduced alphabet profiles, N-gram profiles, and physicochemical properties — into a single feature matrix. It then prefixes all feature columns with `base_` for clarity and easy tracking when additional external features are merged later.

**Actions Performed:**

- **Merging Base Features:**
  - Concatenates `freq`, `dipep`, `red_freq`, `red_ngram`, and `prop` side-by-side into a unified dataframe `base_features`.
  - Combines `base_features` with `Entry` and `ProteinClass` metadata into `df_base`.

- **Prefixing Base Features:**
  - Adds a `base_` prefix to all columns corresponding to engineered features, while leaving `Entry` and `ProteinClass` unchanged.
  - This step ensures future merging with external features (e.g., Pfam, ProtBert, ESM2 embeddings) avoids column name conflicts.

**Output after Execution:**

- `df_base` shape: `(681, 586)`  
- First few prefixed columns:  
  `['Entry', 'ProteinClass', 'base_A', 'base_C', 'base_D', 'base_E', 'base_F', 'base_G', 'base_H', 'base_I']`



In [8]:
# Feel free to add other feature to this data

# --- Step 1: Merge extracted Base features ---
# You already extracted: df, freq, dipep, red_freq, red_ngram, prop

# Merge base feature blocks into one
base_features = pd.concat([freq, dipep, red_freq, red_ngram, prop], axis=1)

# Combine base features with Entry and ProteinClass
df_base = pd.concat([df, base_features], axis=1)

print(f"Base features merged shape: {df_base.shape}")
print("Sample columns:", df_base.columns[:10].tolist())


# --- Step 2: Prefix Base Features with 'base_' ---

# Identify all columns that are actual features (not Entry or ProteinClass)
base_feature_cols = [col for col in df_base.columns if col not in ["Entry", "ProteinClass"]]

# Rename these columns by prefixing with 'base_'
df_base.rename(columns={col: f"base_{col}" for col in base_feature_cols}, inplace=True)

print(f"Prefixed base feature columns. Now total columns: {len(df_base.columns)}")
print(f" Sample prefixed columns: {df_base.columns[:10].tolist()}")



Base features merged shape: (681, 586)
Sample columns: ['Entry', 'ProteinClass', 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
Prefixed base feature columns. Now total columns: 586
 Sample prefixed columns: ['Entry', 'ProteinClass', 'base_A', 'base_C', 'base_D', 'base_E', 'base_F', 'base_G', 'base_H', 'base_I']


In [9]:
# --- Step 2: Load External Engineered Features (ProtLearn and Peptide Descriptors) ---

# Load feature CSVs from the organized folder
protlearn_features = pd.read_csv("../data/features/protlearn_features.csv")
peptide_descriptors = pd.read_csv("../data/features/peptide_descriptors.csv")

# Clean column names to ensure no trailing whitespace
protlearn_features.columns = protlearn_features.columns.str.strip()
peptide_descriptors.columns = peptide_descriptors.columns.str.strip()

# Insert 'Entry' column manually (assumes row order matches main dataset)
protlearn_features.insert(0, "Entry", data["Entry"].values)
peptide_descriptors.insert(0, "Entry", data["Entry"].values)

print("ProtLearn feature columns:", protlearn_features.columns[:5].tolist())
print("Peptide descriptor columns:", peptide_descriptors.columns[:5].tolist())

# --- Step 3: Merge Base Features with External Features ---

# Merge base features first
df_full_features = df_base.merge(protlearn_features, on="Entry", how="left")
df_full_features = df_full_features.merge(peptide_descriptors, on="Entry", how="left")

print(f"Full merged feature matrix shape after adding ProtLearn and Peptides: {df_full_features.shape}")

# Build the final modeling matrices
X_full_features = df_full_features.drop(columns=["Entry", "ProteinClass"])  # Features only
y_labels_full = df_full_features[["Entry", "ProteinClass"]]                  # Entry and Labels

print("X_full_features shape (features only):", X_full_features.shape)
print("y_labels_full shape (Entry + Label):", y_labels_full.shape)


ProtLearn feature columns: ['Entry', 'PL_AAIndex1_0', 'PL_AAIndex1_1', 'PL_AAIndex1_2', 'PL_AAIndex1_3']
Peptide descriptor columns: ['Entry', 'PEP_AF1', 'PEP_AF2', 'PEP_AF3', 'PEP_AF4']
Full merged feature matrix shape after adding ProtLearn and Peptides: (681, 2392)
X_full_features shape (features only): (681, 2390)
y_labels_full shape (Entry + Label): (681, 2)


In [10]:
# Step 3: Define clean function to load Pfam hits

def load_clean_pfam(domtblout_path: Path, entries: pd.Series, e_cut: float = 1e-3) -> pd.DataFrame:
    """
    Parse hmmscan domtblout.txt and build a clean binary Pfam domain hit matrix.

    Args:
        domtblout_path (Path): Path to domtblout.txt file.
        entries (pd.Series): List of Entry IDs (protein IDs).
        e_cut (float): E-value cutoff for accepting a domain hit.

    Returns:
        pd.DataFrame: Binary matrix (Entry × PfamAcc), 1 = domain present, 0 = absent.
    """
    rows = []
    with domtblout_path.open() as fh:
        for line in fh:
            if line.startswith("#"):
                continue
            parts = line.strip().split()
            if len(parts) < 23:
                continue
            pfam_acc = parts[1]
            entry = parts[3]
            evalue = float(parts[6])
            if evalue <= e_cut:
                rows.append((entry, pfam_acc))

    hits_df = pd.DataFrame(rows, columns=["Entry", "PfamAcc"])
    pfam_matrix = pd.crosstab(hits_df["Entry"], hits_df["PfamAcc"])
    pfam_matrix = pfam_matrix.reindex(entries.values, fill_value=0)
    pfam_matrix.index.name = "Entry"

    return pfam_matrix

In [11]:
# Step 4: Load clean Pfam domains

# Define correct domtblout path inside the data folder
domtblout_path = Path("../data/pfam/domtblout.txt")

# Load Pfam matrix (binary hits Entry × PfamAcc)
pfam_df_clean = load_clean_pfam(domtblout_path, y_labels_full["Entry"])

# Reset index to make 'Entry' a regular column
pfam_df_clean = pfam_df_clean.reset_index()

print(f"Clean Pfam matrix shape: {pfam_df_clean.shape}")
print(f"Example Pfam accession columns: {pfam_df_clean.columns[1:6].tolist()}")


Clean Pfam matrix shape: (681, 529)
Example Pfam accession columns: ['PF00001.24', 'PF00002.27', 'PF00003.25', 'PF00004.32', 'PF00005.30']


In [12]:
# Step 5: Check for existing PF-prefixed columns before cleaning

# Identify any columns in X_full_features starting with "PF"
pfam_cols_in_features = [col for col in X_full_features.columns if col.startswith("PF")]

print(f"Found {len(pfam_cols_in_features)} existing columns starting with 'PF':")
print(pfam_cols_in_features)

#  Preview some rows combining Entry ID and PF-prefixed features
print("\nPreview of existing PF-prefixed features:")
preview_df = pd.concat([y_labels_full[["Entry"]], X_full_features[pfam_cols_in_features]], axis=1)
display(preview_df.head())


Found 0 existing columns starting with 'PF':
[]

Preview of existing PF-prefixed features:


Unnamed: 0,Entry
0,P21611
1,Q66GT5
2,Q9Y006
3,P05622
4,P06343


In [13]:
# === Step 6: Merge Clean Pfam Domains into Full Features ===

# Recalculate PF-prefixed columns (in case running this cell standalone)
pfam_cols_in_full = [col for col in df_full_features.columns if col.startswith("PF")]

# Drop old PF-prefixed columns (if any were found earlier)
df_full_features = df_full_features.drop(columns=pfam_cols_in_full, errors="ignore")
print(f"Dropped {len(pfam_cols_in_full)} old PF-prefixed columns before clean Pfam merge.")

# Merge the cleaned Pfam matrix into df_full_features
df_full_features = df_full_features.merge(pfam_df_clean, on="Entry", how="left").fillna(0)
print(f"Final df_full_features shape after clean Pfam merge: {df_full_features.shape}")

# === Final Sanity Check: Validate Pfam Features ===

pfam_columns_final = [col for col in df_full_features.columns if col.startswith("PF")]

print(f"Final number of Pfam domain features: {len(pfam_columns_final)}")
print("\nPreview of final clean Pfam features (first 5 domains):")
print(df_full_features[["Entry"] + pfam_columns_final[:5]].head(10))


Dropped 0 old PF-prefixed columns before clean Pfam merge.
Final df_full_features shape after clean Pfam merge: (681, 2920)
Final number of Pfam domain features: 528

Preview of final clean Pfam features (first 5 domains):
    Entry  PF00001.24  PF00002.27  PF00003.25  PF00004.32  PF00005.30
0  P21611           0           0           0           0           0
1  Q66GT5           0           0           0           0           0
2  Q9Y006           0           0           0           0           0
3  P05622           0           0           0           0           0
4  P06343           0           0           0           0           0
5  P50281           0           0           0           0           0
6  Q13255           0           0           1           0           0
7  P08069           0           0           0           0           0
8  B0V2N1           0           0           0           0           0
9  Q9Y5Z0           0           0           0           0           0


In [14]:
# --- Inspect Pfam Domain Features After Merge ---

# Step 1: Identify all Pfam domain columns by their 'PF' prefix
pfam_columns = [c for c in df_full_features.columns if c.startswith("PF")]
print(f"Found {len(pfam_columns)} Pfam domain features.")

# Step 2: Preview first few Pfam features for the first 10 proteins
print(df_full_features[["Entry"] + pfam_columns[:5]].head(10))

# Step 3: Identify proteins with zero Pfam domain hits
no_pfam = df_full_features[pfam_columns].sum(axis=1) == 0
print(f"Proteins with NO Pfam hits: {no_pfam.sum()} out of {len(df_full_features)}")

# Step 4: Show a sample of proteins with no detected Pfam domains
print(df_full_features.loc[no_pfam, ["Entry"] + pfam_columns[:5]].head())


Found 528 Pfam domain features.
    Entry  PF00001.24  PF00002.27  PF00003.25  PF00004.32  PF00005.30
0  P21611           0           0           0           0           0
1  Q66GT5           0           0           0           0           0
2  Q9Y006           0           0           0           0           0
3  P05622           0           0           0           0           0
4  P06343           0           0           0           0           0
5  P50281           0           0           0           0           0
6  Q13255           0           0           1           0           0
7  P08069           0           0           0           0           0
8  B0V2N1           0           0           0           0           0
9  Q9Y5Z0           0           0           0           0           0
Proteins with NO Pfam hits: 0 out of 681
Empty DataFrame
Columns: [Entry, PF00001.24, PF00002.27, PF00003.25, PF00004.32, PF00005.30]
Index: []


In [15]:
# --- Load ProtBERT Embeddings (Training Set) ---

protbert_path = "../embed_models/protbert_train_full.npy"
protbert_embeddings = np.load(protbert_path)  # Expected shape: (681, 1024)

# Convert to DataFrame and align Entry index
protbert_df = pd.DataFrame(
    protbert_embeddings,
    columns=[f"ProtBERT_{i}" for i in range(protbert_embeddings.shape[1])]
)
protbert_df.insert(0, "Entry", df_full_features["Entry"].values)

print(f"ProtBERT embedding DataFrame shape: {protbert_df.shape}")
print(f"Sample ProtBERT columns: {protbert_df.columns[:5].tolist()}")

# Merge ProtBERT features into the full feature matrix
df_full_features = df_full_features.merge(protbert_df, on="Entry", how="left")
print(f"Shape after merging ProtBERT embeddings: {df_full_features.shape}")


# --- Load ESM2 Embeddings (Training Set) ---

esm2_path = "../embed_models/esm2_train_full.npy"
esm2_embeddings = np.load(esm2_path)  # Expected shape: (681, 1280)

# Convert to DataFrame and align Entry index
esm2_df = pd.DataFrame(
    esm2_embeddings,
    columns=[f"ESM2_{i}" for i in range(esm2_embeddings.shape[1])]
)
esm2_df.insert(0, "Entry", df_full_features["Entry"].values)

print(f"ESM2 embedding DataFrame shape: {esm2_df.shape}")
print(f"Sample ESM2 columns: {esm2_df.columns[:5].tolist()}")

# Merge ESM2 features into the full feature matrix
df_full_features = df_full_features.merge(esm2_df, on="Entry", how="left")
print(f"Shape after merging ESM2 embeddings: {df_full_features.shape}")


# --- Final Sanity Check ---

print("\nFinal columns after embedding merge:")
print(df_full_features.columns[:10])

# Construct final modeling matrices
X_full_features = df_full_features.drop(columns=["Entry", "ProteinClass"])
y_labels_full = df_full_features[["Entry", "ProteinClass"]]

print(f"Final X_full_features shape: {X_full_features.shape}")
print(f"Final y_labels_full shape: {y_labels_full.shape}")


ProtBERT embedding DataFrame shape: (681, 1025)
Sample ProtBERT columns: ['Entry', 'ProtBERT_0', 'ProtBERT_1', 'ProtBERT_2', 'ProtBERT_3']
Shape after merging ProtBERT embeddings: (681, 3944)
ESM2 embedding DataFrame shape: (681, 1281)
Sample ESM2 columns: ['Entry', 'ESM2_0', 'ESM2_1', 'ESM2_2', 'ESM2_3']
Shape after merging ESM2 embeddings: (681, 5224)

Final columns after embedding merge:
Index(['Entry', 'ProteinClass', 'base_A', 'base_C', 'base_D', 'base_E',
       'base_F', 'base_G', 'base_H', 'base_I'],
      dtype='object')
Final X_full_features shape: (681, 5222)
Final y_labels_full shape: (681, 2)


In [16]:
# --- Step 5: Final Data Double-Check ---

# Check for missing values
n_missing_X = X_full_features.isnull().sum().sum()
n_missing_y = y_labels_full.isnull().sum().sum()

print(f"Missing values in X_full_features: {n_missing_X}")
print(f"Missing values in y_labels_full: {n_missing_y}")

assert n_missing_X == 0, "Found missing values in X_full_features after merging!"
assert n_missing_y == 0, "Found missing values in y_labels_full after merging!"

print("Double-check passed: No missing values.")


Missing values in X_full_features: 0
Missing values in y_labels_full: 0
Double-check passed: No missing values.


In [17]:
df_full_features.head()

Unnamed: 0,Entry,ProteinClass,base_A,base_C,base_D,base_E,base_F,base_G,base_H,base_I,...,ESM2_1270,ESM2_1271,ESM2_1272,ESM2_1273,ESM2_1274,ESM2_1275,ESM2_1276,ESM2_1277,ESM2_1278,ESM2_1279
0,P21611,MHC,0.109244,0.016807,0.058824,0.042017,0.058824,0.058824,0.02521,0.016807,...,0.004602,-0.082712,-0.138449,-0.015294,-0.006374,-0.047119,0.072656,-0.077782,-0.014603,0.038721
1,Q66GT5,Phosphatase,0.108808,0.015544,0.020725,0.067358,0.025907,0.051813,0.041451,0.046632,...,-0.061771,-0.063909,-0.09209,-0.00935,0.042447,0.023802,0.101591,-0.167533,0.036926,0.024987
2,Q9Y006,Protease,0.028825,0.008869,0.04878,0.068736,0.079823,0.053215,0.015521,0.062084,...,0.183224,0.005093,-0.064172,0.07959,-0.049639,0.000521,0.14536,-0.155799,-0.135549,-0.052576
3,P05622,RTK,0.043716,0.017304,0.056466,0.069217,0.030055,0.06102,0.021858,0.050091,...,0.041743,0.00576,-0.123973,0.010424,0.021136,-0.079837,0.051444,-0.076187,0.03254,0.041431
4,P06343,MHC,0.041825,0.019011,0.022814,0.076046,0.038023,0.072243,0.030418,0.041825,...,0.032086,-0.010952,-0.108737,-0.008614,0.030564,0.006659,0.102912,-0.017738,0.003438,0.00277


# Construct Dataset with any of the extracted relevant fields

In [18]:
# This is an example of construction of the data with freq and red_freq, but you can add and make any field
Dataset = pd.concat([df, freq, red_freq, prop], axis=1)
Dataset_evaluation = pd.concat([df_evaluation, freq_evaluation, red_freq_evaluation, prop_evaluation], axis=1)

In [19]:
# Training Data with ProteinClass
Dataset.head()

Unnamed: 0,Entry,ProteinClass,A,C,D,E,F,G,H,I,...,D.1,MolecularWeight,IsoelectricPoint,Aromaticity,InstabilityIndex,Flexibility,Helix,Sheet,Turn,Gravy
0,P21611,MHC,0.109244,0.016807,0.058824,0.042017,0.058824,0.058824,0.02521,0.016807,...,0.10084,13041.7928,5.834204,0.109244,22.404202,0.996347,0.336134,0.277311,0.369748,-0.047059
1,Q66GT5,Phosphatase,0.108808,0.015544,0.020725,0.067358,0.025907,0.051813,0.041451,0.046632,...,0.088083,21942.2575,9.739448,0.082902,37.915078,0.992205,0.367876,0.186528,0.38342,-0.16114
2,Q9Y006,Protease,0.028825,0.008869,0.04878,0.068736,0.079823,0.053215,0.015521,0.062084,...,0.117517,51692.5641,8.043414,0.137472,37.618204,1.002078,0.305987,0.290466,0.43459,-0.249667
3,P05622,RTK,0.043716,0.017304,0.056466,0.069217,0.030055,0.06102,0.021858,0.050091,...,0.125683,122788.2434,4.99401,0.083789,47.564763,0.999045,0.281421,0.311475,0.385246,-0.203097
4,P06343,MHC,0.041825,0.019011,0.022814,0.076046,0.038023,0.072243,0.030418,0.041825,...,0.098859,29966.8989,8.99168,0.091255,46.750989,0.99569,0.262357,0.254753,0.39924,-0.303042


In [20]:
# Evaluation Data without ProteinClass
Dataset_evaluation.head()

Unnamed: 0,Entry,A,C,D,E,F,G,H,I,K,...,D.1,MolecularWeight,IsoelectricPoint,Aromaticity,InstabilityIndex,Flexibility,Helix,Sheet,Turn,Gravy
0,Q9LF79,0.081006,0.010242,0.054935,0.053073,0.031657,0.089385,0.01676,0.069832,0.065177,...,0.108007,116172.6688,7.86561,0.05959,33.680642,0.999017,0.323091,0.292365,0.384544,0.028026
1,P9WI81,0.108626,0.003195,0.073482,0.043131,0.023962,0.08147,0.019169,0.052716,0.028754,...,0.116613,66509.0117,5.223413,0.049521,28.408163,1.001449,0.263578,0.332268,0.34984,-0.173163
2,P04439,0.09863,0.013699,0.065753,0.063014,0.021918,0.079452,0.024658,0.035616,0.030137,...,0.128767,40840.2477,5.655275,0.090411,36.957014,1.000302,0.293151,0.265753,0.345205,-0.49863
3,Q16581,0.056017,0.03112,0.045643,0.03112,0.070539,0.045643,0.020747,0.051867,0.026971,...,0.076763,53863.6435,6.20218,0.114108,40.862697,0.989447,0.255187,0.307054,0.425311,0.204979
4,Q6QNK2,0.072082,0.020595,0.026316,0.040046,0.050343,0.064073,0.04119,0.050343,0.038902,...,0.066362,96528.7218,8.021753,0.110984,33.635481,0.990173,0.283753,0.28833,0.416476,0.15595


# Evaluation (Testing) Feature Construction

In [21]:
## To Do ##
# Construct your own dataset with these field and any other field that you see fit
# Make sure you do both the training and evaluation data as haveing the same fields


# === Build Correct Base Feature DataFrame for Evaluation ===

# Step 1: Concatenate all the extracted base feature blocks
base_features_eval = pd.concat(
    [freq_evaluation, dipep_evaluation, red_freq_evaluation, red_ngram_evaluation, prop_evaluation],
    axis=1
)

# Step 2: Prefix all base feature columns with 'base_' for clarity
base_features_eval.columns = [f"base_{col}" for col in base_features_eval.columns]

# Step 3: Merge with Entry column (used for future joins)
df_base_evaluation = pd.concat([df_evaluation, base_features_eval], axis=1)

# Step 4: Preview shape and sample column names
print(f"df_base_evaluation shape: {df_base_evaluation.shape}")
print("Sample columns:", df_base_evaluation.columns[:10].tolist())



df_base_evaluation shape: (171, 585)
Sample columns: ['Entry', 'base_A', 'base_C', 'base_D', 'base_E', 'base_F', 'base_G', 'base_H', 'base_I', 'base_K']


In [22]:
# Load descriptor files from the standardized data/features directory
protlearn_eval_df = pd.read_csv("../data/features/protlearn_eval_features.csv")
peptide_eval_df = pd.read_csv("../data/features/peptide_eval_descriptors.csv")

# Clean up column whitespace if any
protlearn_eval_df.columns = protlearn_eval_df.columns.str.strip()
peptide_eval_df.columns = peptide_eval_df.columns.str.strip()

# Preview the structure
print("ProtLearn evaluation columns:", protlearn_eval_df.columns[:5].tolist())
print("Peptide evaluation columns:", peptide_eval_df.columns[:5].tolist())

# Merge descriptor matrices into base evaluation DataFrame
df_evaluation_full = df_base_evaluation.copy()
df_evaluation_full = df_evaluation_full.merge(protlearn_eval_df, on="Entry", how="left")
df_evaluation_full = df_evaluation_full.merge(peptide_eval_df, on="Entry", how="left")

print(f"Full merged evaluation feature matrix shape after adding ProtLearn and Peptides: {df_evaluation_full.shape}")

# Final feature matrix (exclude 'Entry' for modeling)
X_eval_full = df_evaluation_full.drop(columns=["Entry"])
print("X_eval_full shape (features only):", X_eval_full.shape)


ProtLearn evaluation columns: ['Entry', 'PL_AAIndex1_0', 'PL_AAIndex1_1', 'PL_AAIndex1_2', 'PL_AAIndex1_3']
Peptide evaluation columns: ['Entry', 'PEP_AF1', 'PEP_AF2', 'PEP_AF3', 'PEP_AF4']
Full merged evaluation feature matrix shape after adding ProtLearn and Peptides: (171, 2391)
X_eval_full shape (features only): (171, 2390)


In [23]:
# Write evaluation sequences to FASTA for Pfam scanning
with open("../data/pfam/query_eval.fasta", "w") as fh:
    for entry, seq in zip(df_evaluation["Entry"], seq_evaluation["CleanSequence"]):
        fh.write(f">{entry}\n{seq}\n")

print("Evaluation FASTA for hmmscan prepared at: data/pfam/query_eval.fasta")


Evaluation FASTA for hmmscan prepared at: data/pfam/query_eval.fasta


In [24]:
#!hmmscan --cpu 8 --domtblout data/pfam/domtblout_eval.txt data/pfam/Pfam-A.hmm data/pfam/query_eval.fasta > data/pfam/hmmscan_eval.log


In [25]:
# Load Pfam hits for evaluation set (domain presence matrix)
pfam_eval_df = load_clean_pfam(Path("../data/pfam/domtblout_eval.txt"), df_evaluation["Entry"])

# Reset index to prepare for merging with full evaluation DataFrame
pfam_eval_df = pfam_eval_df.reset_index()

# Clean up residual index naming if present
pfam_eval_df.columns.name = None

# Quick preview of the parsed domain matrix
print("Cleaned Pfam matrix preview:")
print(pfam_eval_df.head())

print(f"Evaluation Pfam domain matrix shape: {pfam_eval_df.shape}")


Cleaned Pfam matrix preview:
    Entry  PF00001.24  PF00002.27  PF00003.25  PF00004.32  PF00005.30  \
0  Q9LF79           0           0           0           0           0   
1  P9WI81           0           0           0           0           0   
2  P04439           0           0           0           0           0   
3  Q16581           2           0           0           0           0   
4  Q6QNK2           0           1           0           0           0   

   PF00023.33  PF00026.26  PF00027.32  PF00036.35  ...  PF18861.4  PF19028.3  \
0           0           0           0           0  ...          0          0   
1           0           0           0           0  ...          0          0   
2           0           0           0           0  ...          0          0   
3           0           0           0           0  ...          0          0   
4           0           0           0           0  ...          0          0   

   PF19030.3  PF19035.3  PF19188.3  PF19285.2  PF19

In [26]:
# Merge the clean binary Pfam matrix into the evaluation feature matrix
df_evaluation_full = df_evaluation_full.merge(pfam_eval_df, on="Entry", how="left").fillna(0)

# Sanity check: updated matrix shape after domain merge
print(f"Shape after merging Pfam domains: {df_evaluation_full.shape}")


Shape after merging Pfam domains: (171, 2651)


In [27]:
# Drop the Entry column to create the final evaluation feature matrix
X_eval_full = df_evaluation_full.drop(columns=["Entry"])

# Sanity check: print shape of evaluation matrix
print(f"X_eval_full shape (features only): {X_eval_full.shape}")


X_eval_full shape (features only): (171, 2650)


In [28]:
# === Load ProtBERT Evaluation Embeddings ===
protbert_eval_path = "../embed_models/protbert_eval_full.npy"
protbert_eval_embeddings = np.load(protbert_eval_path)  # Expected shape: (171, 1024)

# Create DataFrame and align with Entry order
protbert_eval_df = pd.DataFrame(
    protbert_eval_embeddings,
    columns=[f"ProtBERT_{i}" for i in range(protbert_eval_embeddings.shape[1])]
)
protbert_eval_df.insert(0, "Entry", df_evaluation_full["Entry"].values)

print(f"ProtBERT evaluation DataFrame shape: {protbert_eval_df.shape}")
print(f"Sample ProtBERT columns: {protbert_eval_df.columns[:5].tolist()}")

# Merge ProtBERT into evaluation feature matrix
df_evaluation_full = df_evaluation_full.merge(protbert_eval_df, on="Entry", how="left")
print(f"Shape after merging ProtBERT embeddings: {df_evaluation_full.shape}")

# === Load ESM2 Evaluation Embeddings ===
esm2_eval_path = "../embed_models/esm2_eval_full.npy"
esm2_eval_embeddings = np.load(esm2_eval_path)  # Expected shape: (171, 1280)

# Create DataFrame and align with Entry order
esm2_eval_df = pd.DataFrame(
    esm2_eval_embeddings,
    columns=[f"ESM2_{i}" for i in range(esm2_eval_embeddings.shape[1])]
)
esm2_eval_df.insert(0, "Entry", df_evaluation_full["Entry"].values)

print(f"ESM2 evaluation DataFrame shape: {esm2_eval_df.shape}")
print(f"Sample ESM2 columns: {esm2_eval_df.columns[:5].tolist()}")

# Merge ESM2 into evaluation feature matrix
df_evaluation_full = df_evaluation_full.merge(esm2_eval_df, on="Entry", how="left")
print(f"Shape after merging ESM2 embeddings: {df_evaluation_full.shape}")

# === Final Sanity Check ===
print("\nFinal columns after embedding merge:")
print(df_evaluation_full.columns[:10].tolist())

# Drop Entry to form clean evaluation matrix for model input
X_eval_full = df_evaluation_full.drop(columns=["Entry"])
print(f"\nFinal X_eval_full shape (features only): {X_eval_full.shape}")


ProtBERT evaluation DataFrame shape: (171, 1025)
Sample ProtBERT columns: ['Entry', 'ProtBERT_0', 'ProtBERT_1', 'ProtBERT_2', 'ProtBERT_3']
Shape after merging ProtBERT embeddings: (171, 3675)
ESM2 evaluation DataFrame shape: (171, 1281)
Sample ESM2 columns: ['Entry', 'ESM2_0', 'ESM2_1', 'ESM2_2', 'ESM2_3']
Shape after merging ESM2 embeddings: (171, 4955)

Final columns after embedding merge:
['Entry', 'base_A', 'base_C', 'base_D', 'base_E', 'base_F', 'base_G', 'base_H', 'base_I', 'base_K']

Final X_eval_full shape (features only): (171, 4954)


In [29]:
print(f"Total final number of evaluation samples: {X_eval_full.shape[0]}")  # Should be 171
print(f"Total number of features: {X_eval_full.shape[1]}")  # Should match your training feature set


Total final number of evaluation samples: 171
Total number of features: 4954


In [30]:
# === Check Matching of Columns ===

# Retrieve feature column names
train_columns = X_full_features.columns.tolist()
eval_columns = X_eval_full.columns.tolist()

# Check if columns match as sets (order does not matter here)
columns_match = set(train_columns) == set(eval_columns)

print(f"\nDo training and evaluation feature columns match (ignoring order)? {columns_match}")

# Align evaluation feature column order to match training
if columns_match:
    X_eval_full = X_eval_full[train_columns]
    print("Aligned evaluation columns to match training feature order.")
else:
    # If mismatch, identify differences
    print("\nMismatch detected! Columns missing or different:")
    print("In training but not in evaluation:", set(train_columns) - set(eval_columns))
    print("In evaluation but not in training:", set(eval_columns) - set(train_columns))



Do training and evaluation feature columns match (ignoring order)? False

Mismatch detected! Columns missing or different:
In training but not in evaluation: {'PF08777.14', 'PF10316.12', 'PF02714.18', 'PF13473.9', 'PF06478.16', 'PF12763.10', 'PF02026.19', 'PF14901.9', 'PF11633.11', 'PF16697.8', 'PF12146.11', 'PF06736.14', 'PF12122.11', 'PF13499.9', 'PF18613.4', 'PF12424.11', 'PF10609.12', 'PF03266.18', 'PF10430.12', 'PF08447.15', 'PF18587.4', 'PF05409.16', 'PF02124.18', 'PF01694.25', 'PF13555.9', 'PF13423.9', 'PF13207.9', 'PF11956.11', 'PF08321.15', 'PF13833.9', 'PF13620.9', 'PF08344.14', 'PF07965.15', 'PF01443.21', 'PF17812.4', 'PF00076.25', 'PF00757.23', 'PF02354.19', 'PF04253.18', 'PF01108.20', 'PF00431.23', 'PF14533.9', 'PF10591.12', 'PF19212.3', 'PF19213.3', 'PF00335.23', 'PF01437.28', 'PF01935.20', 'PF17757.4', 'PF01062.24', 'PF01590.29', 'PF14610.9', 'PF14396.9', 'PF13181.9', 'PF13479.9', 'PF13462.9', 'PF14814.9', 'PF14843.9', 'PF02060.18', 'PF04969.19', 'PF04389.20', 'PF07724.

In [31]:
# === Check Matching of Columns ===

# Retrieve feature column names
train_columns = X_full_features.columns.tolist()
eval_columns = X_eval_full.columns.tolist()

# Check if columns match as sets (order does not matter here)
columns_match = set(train_columns) == set(eval_columns)

print(f"\nDo training and evaluation feature columns match (ignoring order)? {columns_match}")

# Align evaluation feature column order to match training
if columns_match:
    X_eval_full = X_eval_full[train_columns]
    print("Aligned evaluation columns to match training feature order.")
else:
    # If mismatch, identify differences
    print("\nMismatch detected! Columns missing or different:")
    print("In training but not in evaluation:", set(train_columns) - set(eval_columns))
    print("In evaluation but not in training:", set(eval_columns) - set(train_columns))



Do training and evaluation feature columns match (ignoring order)? False

Mismatch detected! Columns missing or different:
In training but not in evaluation: {'PF08777.14', 'PF10316.12', 'PF02714.18', 'PF13473.9', 'PF06478.16', 'PF12763.10', 'PF02026.19', 'PF14901.9', 'PF11633.11', 'PF16697.8', 'PF12146.11', 'PF06736.14', 'PF12122.11', 'PF13499.9', 'PF18613.4', 'PF12424.11', 'PF10609.12', 'PF03266.18', 'PF10430.12', 'PF08447.15', 'PF18587.4', 'PF05409.16', 'PF02124.18', 'PF01694.25', 'PF13555.9', 'PF13423.9', 'PF13207.9', 'PF11956.11', 'PF08321.15', 'PF13833.9', 'PF13620.9', 'PF08344.14', 'PF07965.15', 'PF01443.21', 'PF17812.4', 'PF00076.25', 'PF00757.23', 'PF02354.19', 'PF04253.18', 'PF01108.20', 'PF00431.23', 'PF14533.9', 'PF10591.12', 'PF19212.3', 'PF19213.3', 'PF00335.23', 'PF01437.28', 'PF01935.20', 'PF17757.4', 'PF01062.24', 'PF01590.29', 'PF14610.9', 'PF14396.9', 'PF13181.9', 'PF13479.9', 'PF13462.9', 'PF14814.9', 'PF14843.9', 'PF02060.18', 'PF04969.19', 'PF04389.20', 'PF07724.

In [32]:
# === Find full set of all columns in training and evaluation ===

full_feature_set = list(sorted(set(X_full_features.columns) | set(X_eval_full.columns)))

print(f"\nTotal unified feature set length: {len(full_feature_set)}")

# === Reindex X_full_features and X_eval_full to this full feature set ===

X_full_features = X_full_features.reindex(columns=full_feature_set, fill_value=0)
X_eval_full = X_eval_full.reindex(columns=full_feature_set, fill_value=0)

# Final check
print(f"\nX_full_features shape after reindexing: {X_full_features.shape}")
print(f"X_eval_full shape after reindexing: {X_eval_full.shape}")
print("\nColumns match after reindexing?", set(X_full_features.columns) == set(X_eval_full.columns))



Total unified feature set length: 5254

X_full_features shape after reindexing: (681, 5254)
X_eval_full shape after reindexing: (171, 5254)

Columns match after reindexing? True


# Build Train and Evaluation Datasets

# Overview of Constructed Datasets for Protein Function Modeling

This project generates several structured datasets combining different feature groups to support feature ablation, model comparison, and ensemble learning.

Each dataset contains the **Entry** identifier, the **ProteinClass** label (for training data only), and a curated subset of features as described below.

---

## Datasets

| Dataset Name             | Base Features | Pfam Domains | Peptide Descriptors | ProtLearn Features | ProtBERT/ESM2 Embeddings | Notes |
|---------------------------|---------------|--------------|---------------------|---------------------|--------------------------|-------|
| **Dataset**               | Yes           | No           | No                  | No                  | No                       | Base features only |
| **Dataset_pfam**          | Yes           | Yes          | No                  | No                  | No                       | Base + Pfam |
| **Dataset_embed**         | Yes           | No           | No                  | No                  | Yes                      | Base + Embeddings |
| **Dataset_full**          | Yes           | Yes          | Yes                 | Yes                 | Yes                      | All features included |
| **Dataset_full_noPL**     | Yes           | Yes          | Yes                 | No                  | Yes                      | All features except ProtLearn |

---

## Definitions

- **Base Features**:  
  Fundamental sequence-derived properties such as amino acid frequencies, reduced alphabet n-grams, dipeptides, and physicochemical properties.

- **Pfam Domains**:  
  Binary matrix indicating the presence or absence of protein domains annotated from the Pfam-A database.

- **Peptide Descriptors**:  
  Aggregated physicochemical descriptors including VHSE8, Atchley factors, Kidera factors, and Blosum-based scores.

- **ProtLearn Features**:  
  Handcrafted complexity and compositional features generated using the ProtLearn package.

- **ProtBERT / ESM2 Embeddings**:  
  Deep learning-based representations of protein sequences using pretrained models such as ProtBERT and ESM2.

---

## Notes on Evaluation Sets

- Each **evaluation dataset** (e.g., `Dataset_eval`, `Dataset_pfam_eval`, `Dataset_embed_eval`, `Dataset_full_eval`, `Dataset_full_noPL_eval`) is structured to match its corresponding training dataset.
- **Evaluation datasets do not contain `ProteinClass` labels**, as they are intended for final competition submissions and blind testing.

---

## Motivation for `Dataset_full_noPL`

- **Purpose**:  
  To assess the independent contribution of ProtLearn features to model performance through feature ablation.
  
- **Use Cases**:
  - Quantify the impact of removing handcrafted complexity features.
  - Evaluate whether deep embeddings and structured biological descriptors are sufficient alone.
  - Train potentially lighter models that maintain high biological interpretability.

---

# The datasets are now fully prepared for downstream model training, evaluation, and ensemble construction.


In [33]:
# === Step 1: Define feature groups ===

# Base physicochemical features
base_cols = [col for col in df_full_features.columns if col.startswith("base_")]

# Pfam domain features
pfam_cols_train = [col for col in df_full_features.columns if col.startswith("PF")]
pfam_cols_eval = [col for col in df_evaluation_full.columns if col.startswith("PF")]
common_pfam_cols = list(sorted(set(pfam_cols_train) & set(pfam_cols_eval)))

# Embeddings (ProtBERT and ESM2)
embed_cols_train = [col for col in df_full_features.columns if col.startswith("ProtBERT_") or col.startswith("ESM2_")]
embed_cols_eval = [col for col in df_evaluation_full.columns if col.startswith("ProtBERT_") or col.startswith("ESM2_")]

# Other descriptors (ProtLearn + Peptide descriptors)
other_cols_train = [col for col in df_full_features.columns if col.startswith("PL_") or col.startswith("PEP_")]
other_cols_eval = [col for col in df_evaluation_full.columns if col.startswith("PL_") or col.startswith("PEP_")]

# === Step 2: Build Train and Evaluation Datasets ===

# Dataset: base only
Dataset = pd.concat([df[["Entry", "ProteinClass"]], df_full_features[base_cols]], axis=1)
Dataset_eval = pd.concat([df_evaluation[["Entry"]], df_evaluation_full[base_cols]], axis=1)

# Dataset + Pfam domains
Dataset_pfam = pd.concat([df[["Entry", "ProteinClass"]], df_full_features[base_cols + common_pfam_cols]], axis=1)
Dataset_pfam_eval = pd.concat([df_evaluation[["Entry"]], df_evaluation_full[base_cols + common_pfam_cols]], axis=1)

# Dataset + embeddings
Dataset_embed = pd.concat([df[["Entry", "ProteinClass"]], df_full_features[base_cols + embed_cols_train]], axis=1)
Dataset_embed_eval = pd.concat([df_evaluation[["Entry"]], df_evaluation_full[base_cols + embed_cols_eval]], axis=1)

# Full Dataset (base + Pfam + ProtLearn + Peptide + Embeddings)
full_feature_cols_train = base_cols + pfam_cols_train + other_cols_train + embed_cols_train
full_feature_cols_eval = base_cols + pfam_cols_eval + other_cols_eval + embed_cols_eval

common_full_cols = list(sorted(set(full_feature_cols_train) & set(full_feature_cols_eval)))

Dataset_full = pd.concat([df[["Entry", "ProteinClass"]], df_full_features[common_full_cols]], axis=1)
Dataset_full_eval = pd.concat([df_evaluation[["Entry"]], df_evaluation_full[common_full_cols]], axis=1)

# === Step 3: Confirm Shapes ===

print(f"Dataset shape: {Dataset.shape}")
print(f"Dataset_eval shape: {Dataset_eval.shape}")

print(f"Dataset_pfam shape: {Dataset_pfam.shape}")
print(f"Dataset_pfam_eval shape: {Dataset_pfam_eval.shape}")

print(f"Dataset_embed shape: {Dataset_embed.shape}")
print(f"Dataset_embed_eval shape: {Dataset_embed_eval.shape}")

print(f"Dataset_full shape: {Dataset_full.shape}")
print(f"Dataset_full_eval shape: {Dataset_full_eval.shape}")


Dataset shape: (681, 586)
Dataset_eval shape: (171, 585)
Dataset_pfam shape: (681, 814)
Dataset_pfam_eval shape: (171, 813)
Dataset_embed shape: (681, 2890)
Dataset_embed_eval shape: (171, 2889)
Dataset_full shape: (681, 4924)
Dataset_full_eval shape: (171, 4923)


In [34]:
# === Construct Dataset_full_noPL (everything except ProtLearn features) ===

# Define feature groups
base_cols = [col for col in df_full_features.columns if col.startswith("base_")]
pfam_cols_train = [col for col in df_full_features.columns if col.startswith("PF")]
peptide_cols = [col for col in df_full_features.columns if col.startswith("PEP_")]
embed_cols = [col for col in df_full_features.columns if col.startswith("ProtBERT_") or col.startswith("ESM2_")]

# Build full feature set excluding ProtLearn (PL_) columns
full_noPL_feature_cols_train = base_cols + pfam_cols_train + peptide_cols + embed_cols

# Evaluation side (careful to match)
pfam_cols_eval = [col for col in df_evaluation_full.columns if col.startswith("PF")]
peptide_cols_eval = [col for col in df_evaluation_full.columns if col.startswith("PEP_")]
embed_cols_eval = [col for col in df_evaluation_full.columns if col.startswith("ProtBERT_") or col.startswith("ESM2_")]

full_noPL_feature_cols_eval = base_cols + pfam_cols_eval + peptide_cols_eval + embed_cols_eval

# Find intersection of available features
common_full_noPL_cols = list(sorted(set(full_noPL_feature_cols_train) & set(full_noPL_feature_cols_eval)))

# Build the datasets
Dataset_full_noPL = pd.concat([
    df[["Entry", "ProteinClass"]],
    df_full_features[common_full_noPL_cols]
], axis=1)

Dataset_full_noPL_eval = pd.concat([
    df_evaluation[["Entry"]],
    df_evaluation_full[common_full_noPL_cols]
], axis=1)

# === Confirm ===
print(f"Dataset_full_noPL shape: {Dataset_full_noPL.shape}")
print(f"Dataset_full_noPL_eval shape: {Dataset_full_noPL_eval.shape}")


Dataset_full_noPL shape: (681, 3220)
Dataset_full_noPL_eval shape: (171, 3219)


- Dataset shape: (681, 586)
- Dataset_eval shape: (171, 585)
- Dataset_pfam shape: (681, 814)
- Dataset_pfam_eval shape: (171, 813)
- Dataset_embed shape: (681, 2890)
- Dataset_embed_eval shape: (171, 2889)
- Dataset_full shape: (681, 4924)
- Dataset_full_eval shape: (171, 4923)
- Dataset_full_noPL shape: (681, 3220)
- Dataset_full_noPL_eval shape: (171, 3219)

In [35]:

def save_dataset_full_csv(dataset: pd.DataFrame, output_path: str) -> None:
    """
    Save a Dataset (containing Entry, ProteinClass, and features) to CSV after validation.

    Args:
        dataset (pd.DataFrame): DataFrame with Entry, ProteinClass, and features.
        output_path (str): File path to save the dataset CSV.

    Raises:
        ValueError: If required columns are missing or dataset has NaNs.
    """
    # Clean up column names
    dataset.columns = dataset.columns.str.strip()

    # Check essentials
    if not {'Entry', 'ProteinClass'}.issubset(dataset.columns):
        raise ValueError("Dataset must include 'Entry' and 'ProteinClass' columns.")

    if dataset.isnull().values.any():
        raise ValueError("Dataset has missing values. Please fix before saving.")

    # Save dataset
    dataset.to_csv(output_path, index=False)
    print(f"Successfully saved dataset to {output_path}")

# Save Dataset_full
save_dataset_full_csv(Dataset_full, "../processed_data/Dataset_Full.csv")


Successfully saved dataset to ../processed_data/Dataset_Full.csv


# Processing Steps

In [36]:
# === Processing Functions ===

def processing_data(df: pd.DataFrame):
    """
    Process the training dataset: split into train/test sets, scale features, encode labels.

    Args:
        df (pd.DataFrame): Full dataset containing Entry, ProteinClass, and features.

    Returns:
        tuple: (label_encoder, X, X_scaled, X_train, X_train_scaled, X_test, X_test_scaled,
                y, y_label_encoded, y_train, y_train_label_encoded, y_test, y_test_label_encoded, scaler)
    """
    # Define target and features
    y = df['ProteinClass']
    X = df.drop(['Entry', 'ProteinClass'], axis=1)

    # Split into training and testing
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Feature scaling
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define label classes
    labels = np.array([
        'ATPase', 'Aquaporin', 'Channel', 'GPCR', 'Integrin',
        'MHC', 'Phosphatase', 'Protease', 'RTK', 'Ser:Thr'
    ])

    # Encode labels
    label_encoder = LabelEncoder()
    label_encoder.fit(labels)
    y_label_encoded = label_encoder.transform(y)
    y_train_label_encoded = label_encoder.transform(y_train)
    y_test_label_encoded = label_encoder.transform(y_test)

    return (
        label_encoder, X.values, X_scaled,
        X_train, X_train_scaled,
        X_test, X_test_scaled,
        y, y_label_encoded,
        y_train, y_train_label_encoded,
        y_test, y_test_label_encoded,
        scaler
    )


def processing_data_evaluation(df: pd.DataFrame, scaler: StandardScaler):
    """
    Process evaluation dataset using the provided scaler.

    Args:
        df (pd.DataFrame): Evaluation dataset (no ProteinClass labels).
        scaler (StandardScaler): Scaler fitted on training data.

    Returns:
        tuple: (X, X_scaled)
    """
    # Define features
    X = df.drop(['Entry'], axis=1)

    # Apply scaling
    X_scaled = scaler.transform(X)

    return X.values, X_scaled


In [37]:
# === Process Training and Evaluation Sets ===

(
    label_encoder,
    X_full_features, X_full_features_scaled,
    X_train_full_features, X_train_full_features_scaled,
    X_test_full_features, X_test_full_features_scaled,
    y_labels_full, y_labels_full_encoded,
    y_train_labels, y_train_labels_encoded,
    y_test_labels, y_test_labels_encoded,
    full_scaler
) = processing_data(Dataset_full)

# Evaluation data
X_evaluation_full_features, X_evaluation_full_features_scaled = processing_data_evaluation(
    Dataset_full_eval,
    scaler=full_scaler
)

print(f"Training set (full features) shape: {X_train_full_features_scaled.shape}")
print(f"Testing set (full features) shape: {X_test_full_features_scaled.shape}")
print(f"Evaluation set (full features) shape: {X_evaluation_full_features_scaled.shape}")


Training set (full features) shape: (544, 4922)
Testing set (full features) shape: (137, 4922)
Evaluation set (full features) shape: (171, 4922)


In [38]:
# === Save LabelEncoder Object ===

from utils.model_utils import save_model


save_model(
    model=label_encoder,
    model_type="labelencoder",
    featureset="full",
    optimizer="none",
    scoremetric="none",
    version="v1",
    accuracy=1.0,  # trivial for encoders
    output_dir="../final_models",
    tracker_file="../final_models/model_tracker.csv"
)


Model successfully saved to /home/secondbook5/JHU_Bioinformatics/ComputationalDrugDiscovery/ProtClassify/../final_models/labelencoder_full_none_none_v1.joblib
Model tracker updated at /home/secondbook5/JHU_Bioinformatics/ComputationalDrugDiscovery/ProtClassify/../final_models/model_tracker.csv


In [41]:
# === Process Training Data ===
(
    label_encoder,
    X_full_array,
    X_full_scaled,
    X_train,
    X_train_scaled,
    X_test,
    X_test_scaled,
    y_full,
    y_full_label_encoded,
    y_train,
    y_train_label_encoded,
    y_test,
    y_test_label_encoded,
    scaler
) = processing_data(Dataset_full)

# === Process Evaluation Data ===
X_eval_array, X_eval_scaled = processing_data_evaluation(Dataset_full_eval, scaler)


## Apply Processing to Full Training and Evaluation Datasets



The full training dataset `Dataset_full` (681 samples, 4924 columns including `Entry` and `ProteinClass`) is processed using the `processing_data()` function.

This processing performs the following:

- **Feature and Label Splitting**:
  - Features `X_full_values` are extracted by dropping `Entry` and `ProteinClass`.
  - Labels `y_full` are extracted as the `ProteinClass` column.

- **Train/Test Split**:
  - An 80/20 split is performed using `train_test_split`, stratified to preserve class distributions.
  - `X_train` and `y_train` are the training subsets (544 samples).
  - `X_test` and `y_test` are the testing subsets (137 samples).

- **Feature Scaling**:
  - A `StandardScaler` (`scaler_full`) is fitted to the full feature matrix.
  - `X_train_scaled` and `X_test_scaled` are scaled independently using the fitted scaler.
  - `X_full_scaled` is the scaled version of the complete dataset.

- **Label Encoding**:
  - A `LabelEncoder` (`label_encoder`) is fitted to the ten known protein classes.
  - The labels are transformed into integers for model compatibility:
    - `y_full_label_encoded`, `y_train_label_encoded`, `y_test_label_encoded`.

The function returns all versions of the features, labels, the fitted scaler, and the label encoder for later use.

## Apply Processing to Dataset_full_eval

The evaluation dataset `Dataset_full_eval` (171 samples, 4923 columns including `Entry`) is processed using the `processing_data_evaluation()` function.

This processing performs the following:

- **Feature Extraction**:
  - Features `X_eval_values` are extracted by dropping the `Entry` column.

- **Feature Scaling**:
  - The `scaler_full` fitted on the training data is applied without refitting to produce `X_eval_scaled`.
  - This ensures the evaluation features are scaled consistently with the training and testing datasets.

No labels are available in the evaluation set, as it is intended for competition submission.

## Confirmed Data Shapes After Processing

The resulting dataset shapes are:

| Variable | Shape | Description |
|:---|:---|:---|
| `X_train` | (544, 4922) | Raw features for training (80 percent split) |
| `X_train_scaled` | (544, 4922) | Scaled features for training |
| `y_train` | (544,) | Raw labels for training |
| `y_train_label_encoded` | (544,) | Encoded labels for training |
| `X_test` | (137, 4922) | Raw features for testing (20 percent split) |
| `X_test_scaled` | (137, 4922) | Scaled features for testing |
| `y_test` | (137,) | Raw labels for testing |
| `y_test_label_encoded` | (137,) | Encoded labels for testing |
| `X_full_values` | (681, 4922) | Raw full dataset features |
| `X_full_scaled` | (681, 4922) | Scaled full dataset features |
| `X_eval_values` | (171, 4922) | Raw evaluation features |
| `X_eval_scaled` | (171, 4922) | Scaled evaluation features |

This confirms that the feature dimensions match correctly across the training, testing, and evaluation datasets, ensuring readiness for model training and competition evaluation.

## Quick Variable Legend

| Variable | Description |
|:---|:---|
| `X_train_scaled`, `y_train_label_encoded` | Scaled features and encoded labels for training |
| `X_test_scaled`, `y_test_label_encoded` | Scaled features and encoded labels for testing |
| `X_full_scaled`, `y_full_label_encoded` | Scaled full dataset and encoded full labels |
| `X_eval_scaled` | Scaled evaluation dataset |


In [42]:
# === Define processing functions ===

def processing_data(df):
    """
    Process the training dataset:
    - Split into X and y
    - Train/test split
    - Standard scaling
    - Label encoding
    """
    y = df['ProteinClass']
    X = df.drop(['Entry', 'ProteinClass'], axis=1)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Feature scaling (important for logistic regression, gradient boosting, MLP, etc.)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define all possible labels
    labels = np.array([
        'ATPase', 'Aquaporin', 'Channel', 'GPCR', 'Integrin', 'MHC',
        'Phosphatase', 'Protease', 'RTK', 'Ser:Thr'
    ])

    # Create a Label Encoder
    label_encoder = LabelEncoder()
    label_encoder.fit(labels)

    # Encode labels
    y_label_encoded = label_encoder.transform(y)
    y_train_label_encoded = label_encoder.transform(y_train)
    y_test_label_encoded = label_encoder.transform(y_test)

    return (label_encoder, X.values, X_scaled,
            X_train, X_train_scaled, X_test, X_test_scaled,
            y, y_label_encoded, y_train, y_train_label_encoded, y_test, y_test_label_encoded, scaler)


In [43]:
def processing_data_evaluation(df, scaler):
    """
    Process the evaluation dataset:
    - Only X (no y)
    - Apply scaling based on training scaler
    """
    X = df.drop(['Entry'], axis=1)

    # Use the same scaler fitted on training data
    X_scaled = scaler.transform(X)

    return (X.values, X_scaled)

In [44]:
# ===  Apply processing to Dataset_full ===

(label_encoder,
 X_full_values, X_full_scaled,
 X_train, X_train_scaled, X_test, X_test_scaled,
 y_full, y_full_label_encoded, y_train, y_train_label_encoded, y_test, y_test_label_encoded,
 scaler_full) = processing_data(Dataset_full)

# === Apply processing to Dataset_full_eval ===

X_eval_values, X_eval_scaled = processing_data_evaluation(Dataset_full_eval, scaler_full)

# === Step 4: Confirm shapes ===

print("\nTraining X shape:", X_train.shape)
print("Training y shape:", y_train.shape)

print("Testing X shape:", X_test.shape)
print("Testing y shape:", y_test.shape)

print("Evaluation X shape:", X_eval_values.shape)


Training X shape: (544, 4922)
Training y shape: (544,)
Testing X shape: (137, 4922)
Testing y shape: (137,)
Evaluation X shape: (171, 4922)


In [45]:
# === Save Training, Testing, and Evaluation Datasets ===

save_npy(
    array=X_train_full_features_scaled,
    name="X_train",
    description="Scaled full feature set for training",
    featureset="full",
    split="train",
    version="v1"
)

save_npy(
    array=X_test_full_features_scaled,
    name="X_test",
    description="Scaled full feature set for testing",
    featureset="full",
    split="test",
    version="v1"
)

save_npy(
    array=X_evaluation_full_features_scaled,
    name="X_eval",
    description="Scaled full feature set for evaluation",
    featureset="full",
    split="eval",
    version="v1"
)

save_npy(
    array=y_train_labels_encoded,
    name="y_train",
    description="Encoded labels for training set",
    featureset="full",
    split="train",
    version="v1"
)

save_npy(
    array=y_test_labels_encoded,
    name="y_test",
    description="Encoded labels for testing set",
    featureset="full",
    split="test",
    version="v1"
)


Array successfully saved to /home/secondbook5/JHU_Bioinformatics/ComputationalDrugDiscovery/ProtClassify/processed_data/X_train_full_train_v1.npy
Data tracker updated at /home/secondbook5/JHU_Bioinformatics/ComputationalDrugDiscovery/ProtClassify/processed_data/data_tracker.csv
Array successfully saved to /home/secondbook5/JHU_Bioinformatics/ComputationalDrugDiscovery/ProtClassify/processed_data/X_test_full_test_v1.npy
Data tracker updated at /home/secondbook5/JHU_Bioinformatics/ComputationalDrugDiscovery/ProtClassify/processed_data/data_tracker.csv
Array successfully saved to /home/secondbook5/JHU_Bioinformatics/ComputationalDrugDiscovery/ProtClassify/processed_data/X_eval_full_eval_v1.npy
Data tracker updated at /home/secondbook5/JHU_Bioinformatics/ComputationalDrugDiscovery/ProtClassify/processed_data/data_tracker.csv
Array successfully saved to /home/secondbook5/JHU_Bioinformatics/ComputationalDrugDiscovery/ProtClassify/processed_data/y_train_full_train_v1.npy
Data tracker updated 

In [48]:
# === Save UNscaled data ===
# Directory: ../processed_data/unscaled/

save_npy(
    array=X_train_full_features,
    name="X_train",
    description="Training feature matrix (unscaled, full features)",
    featureset="full",
    split="train",
    version="v1",
    output_dir="processed_data/unscaled",
    tracker_file="processed_data/unscaled/data_tracker.csv"
)

save_npy(
    array=X_test_full_features,
    name="X_test",
    description="Testing feature matrix (unscaled, full features)",
    featureset="full",
    split="test",
    version="v1",
    output_dir="processed_data/unscaled",
    tracker_file="processed_data/unscaled/data_tracker.csv"
)

save_npy(
    array=X_evaluation_full_features,
    name="X_eval",
    description="Evaluation feature matrix (unscaled, full features)",
    featureset="full",
    split="eval",
    version="v1",
    output_dir="processed_data/unscaled",
    tracker_file="processed_data/unscaled/data_tracker.csv"
)


Array successfully saved to /home/secondbook5/JHU_Bioinformatics/ComputationalDrugDiscovery/ProtClassify/processed_data/unscaled/X_train_full_train_v1.npy
Data tracker updated at /home/secondbook5/JHU_Bioinformatics/ComputationalDrugDiscovery/ProtClassify/processed_data/unscaled/data_tracker.csv
Array successfully saved to /home/secondbook5/JHU_Bioinformatics/ComputationalDrugDiscovery/ProtClassify/processed_data/unscaled/X_test_full_test_v1.npy
Data tracker updated at /home/secondbook5/JHU_Bioinformatics/ComputationalDrugDiscovery/ProtClassify/processed_data/unscaled/data_tracker.csv
Array successfully saved to /home/secondbook5/JHU_Bioinformatics/ComputationalDrugDiscovery/ProtClassify/processed_data/unscaled/X_eval_full_eval_v1.npy
Data tracker updated at /home/secondbook5/JHU_Bioinformatics/ComputationalDrugDiscovery/ProtClassify/processed_data/unscaled/data_tracker.csv


# Variable and Model Tracker

This section documents all major variables and models used in the tuning and evaluation process to ensure clarity and prevent confusion.

---

## Feature and Label Matrices

- **X_train** → Raw training features (not scaled)
- **X_train_scaled** → Scaled training features (using StandardScaler)
- **X_test** → Raw testing features (not scaled)
- **X_test_scaled** → Scaled testing features (using StandardScaler)
- **X_full_values** → Raw full dataset features (entire Dataset_full)
- **X_full_scaled** → Scaled full dataset features (entire Dataset_full)
- **X_eval_values** → Raw evaluation features (unlabeled competition data)
- **X_eval_scaled** → Scaled evaluation features (unlabeled competition data)

- **y_train** → Original training labels (not label-encoded)
- **y_train_label_encoded** → Encoded training labels (after LabelEncoder)
- **y_test** → Original testing labels (not label-encoded)
- **y_test_label_encoded** → Encoded testing labels (after LabelEncoder)
- **y_full** → Full labels (entire Dataset_full, not label-encoded)
- **y_full_label_encoded** → Full labels (entire Dataset_full, label-encoded)

---

## Base (Untuned) Models

- **my_model** → Random model for baseline comparison (random guesses)
- **lr_model** → Logistic Regression model (before tuning)
- **rf_model** → Random Forest model (before tuning)
- **xgb_model** → XGBoost model (before tuning)
- **mlp_model** → Multilayer Perceptron (MLP) model (before tuning, handled in Google Colab if necessary)
- **tabnet_model** → TabNet model (before tuning, handled in Google Colab for GPU acceleration)

---

## Tuned (After Hyperparameter Tuning) Models

- **best_lr_model** → Tuned Logistic Regression model (after RandomizedSearchCV)
- **best_rf_model** → Tuned Random Forest model (after RandomizedSearchCV)
- **best_xgb_model** → Tuned XGBoost model (after RandomizedSearchCV)
- **best_mlp_model** → Tuned MLP model (after Optuna hyperparameter tuning and cross-validation)
- **best_tabnet_model** → Tuned TabNet model (after Optuna or manual tuning in Colab)

---

## Model Performance Variables

- **y_pred_random** → Predictions from Random model on test set
- **y_pred_lr** → Predictions from base Logistic Regression on test set
- **y_pred_rf** → Predictions from base Random Forest on test set
- **y_pred_xgb** → Predictions from base XGBoost on test set
- **y_pred_mlp** → Predictions from base MLP on test set
- **y_pred_tabnet** → Predictions from base TabNet on test set

- **y_pred_best_lr** → Predictions from tuned Logistic Regression
- **y_pred_best_rf** → Predictions from tuned Random Forest
- **y_pred_best_xgb** → Predictions from tuned XGBoost
- **y_pred_best_mlp** → Predictions from tuned MLP
- **y_pred_best_tabnet** → Predictions from tuned TabNet

- **acc_random** → Test accuracy of Random model
- **acc_lr** → Test accuracy of base Logistic Regression
- **acc_rf** → Test accuracy of base Random Forest
- **acc_xgb** → Test accuracy of base XGBoost
- **acc_mlp** → Test accuracy of base MLP
- **acc_tabnet** → Test accuracy of base TabNet

- **acc_best_lr** → Test accuracy of tuned Logistic Regression
- **acc_best_rf** → Test accuracy of tuned Random Forest
- **acc_best_xgb** → Test accuracy of tuned XGBoost
- **acc_best_mlp** → Test accuracy of tuned MLP
- **acc_best_tabnet** → Test accuracy of tuned TabNet

---

## Important Notes

- Always use **X_train_scaled** and **X_test_scaled** for models that require feature scaling, such as Logistic Regression and MLP.
- Random Forest and XGBoost typically perform better on raw (non-scaled) features, so **X_train** and **X_test** are used without scaling for these models.
- TabNet can accept unscaled data but may also benefit from min-max scaling depending on implementation.
- Carefully distinguish between base models (untuned) and tuned models (after hyperparameter optimization).
- RandomizedSearchCV (for tree models) and Optuna (for MLP and TabNet) are used for hyperparameter tuning.
- The evaluation dataset does not have labels, so only `X_eval_scaled` is used for competition predictions.

---

(Last Updated: [Fill in date])


# Creating Model

In [None]:
# === Model 1: Random Model (Dr. Yasin starter) ===

class random_model:
    def __init__(self):
        pass

    def fit(self, X, y):
        pass

    def predict(self, X):
        pred = []
        for x in X:
            pred.append(np.random.randint(10))  # Random class between 0–9
        return np.array(pred)

    def get_params(self, deep=True):
        return {}

# Initialize Random Model
my_model = random_model()

# === Model 2: Logistic Regression (Dr. Yasin starter) ===

from sklearn.linear_model import LogisticRegression

# Define the base logistic regression model 
lr_model = LogisticRegression(
    penalty="l2",
    solver="lbfgs",
    max_iter=1000,
    random_state=42
)


# === Model 3: Random Forest Classifier ===

from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    random_state=42,
    n_jobs=-1
)

# === Model 4: XGBoost Classifier ===

from xgboost import XGBClassifier

xgb_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multi:softprob",
    eval_metric="mlogloss",
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1
)

# === Confirm Initialization ===

print("Models initialized:")
print("- Random Model (baseline)")
print("- Logistic Regression")
print("- Random Forest")
print("- XGBoost")


Models initialized:
- Random Model (baseline)
- Logistic Regression
- Random Forest
- XGBoost


# Train Model

In [None]:
# Fit My Model to the training data
my_model.fit(X_train, y_train)

# Fit the model to the training data without tuning
lr_model.fit(X_train_scaled, y_train)

## To Do ##
# Fit your models to the training data you can use
# raw, scaled, or other forms

In [None]:
# === Baseline Fitting and Timing ===

# Random Model
start = time.time()
my_model.fit(X_train, y_train_label_encoded)
end = time.time()
print(f"My Random Model fit time: {end - start:.2f} seconds")

# Evaluate Random Model (FIX: use X_test.values)
y_pred_random = my_model.predict(X_test.values)  # FIXED
acc_random = accuracy_score(y_test_label_encoded, y_pred_random)
print(f"My Random Model baseline accuracy: {acc_random:.4f}")

# Logistic Regression
start = time.time()
lr_model.fit(X_train_scaled, y_train_label_encoded)
end = time.time()
print(f"Logistic Regression fit time: {end - start:.2f} seconds")

# Evaluate Logistic Regression
y_pred_lr = lr_model.predict(X_test_scaled)
acc_lr = accuracy_score(y_test_label_encoded, y_pred_lr)
print(f"Logistic Regression baseline accuracy: {acc_lr:.4f}")

# Random Forest
start = time.time()
rf_model.fit(X_train, y_train_label_encoded)
end = time.time()
print(f"Random Forest fit time: {end - start:.2f} seconds")

# Evaluate Random Forest
y_pred_rf = rf_model.predict(X_test)
acc_rf = accuracy_score(y_test_label_encoded, y_pred_rf)
print(f"Random Forest baseline accuracy: {acc_rf:.4f}")

# XGBoost
start = time.time()
xgb_model.fit(X_train, y_train_label_encoded)
end = time.time()
print(f"XGBoost fit time: {end - start:.2f} seconds")

# Evaluate XGBoost
y_pred_xgb = xgb_model.predict(X_test)
acc_xgb = accuracy_score(y_test_label_encoded, y_pred_xgb)
print(f"XGBoost baseline accuracy: {acc_xgb:.4f}")


My Random Model fit time: 0.00 seconds
My Random Model baseline accuracy: 0.1314
Logistic Regression fit time: 0.23 seconds
Logistic Regression baseline accuracy: 0.9781
Random Forest fit time: 1.82 seconds
Random Forest baseline accuracy: 0.9562


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost fit time: 206.23 seconds
XGBoost baseline accuracy: 0.9708


# Training and Tuning

In [None]:
# Define the parameter distribution for 'C'
param_dist_lr = {
    'C': loguniform(1e-3, 1e3),
    'penalty': ['l2'],
    'solver': ['lbfgs'],
    'max_iter': [500, 1000, 2000],
    'random_state': [42]
}

# Set up RandomizedSearchCV
random_search_lr = RandomizedSearchCV(
    estimator=lr_model,
    param_distributions=param_dist_lr,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)

# Fit the model to the training data with tuning
random_search_lr.fit(X_train_scaled, y_train)
# Get the best estimator
best_lr_model = random_search_lr.best_estimator_
print("Best parameters found for Logistic Regression:")
print(random_search_lr.best_params_)

## To Do ##
# Fit your models to the training data you can use
# raw, scaled, or other forms
# Make sure to do tuning this time

Best parameters found for Logistic Regression:
{'C': 0.04848496183873291, 'max_iter': 500, 'penalty': 'l2', 'random_state': 42, 'solver': 'lbfgs'}


## Random Forest Tuning Setup

- **rf_model**: Random Forest Classifier base model before tuning.
- **param_dist_rf**: Dictionary defining the hyperparameter search space for RandomizedSearchCV.
- **random_search_rf**: RandomizedSearchCV object to perform tuning on rf_model.
- **best_rf_model**: The best Random Forest model selected after tuning.


In [None]:
import os
import joblib
from tqdm import tqdm

"""
Run full hyperparameter tuning and evaluation for multiple models.
- Loads pre-trained model from disk if available (skips re-training).
- If not available, performs RandomizedSearchCV to find best hyperparameters.
- Performs final 10-fold cross-validation for evaluation.
- Tracks progress with a tqdm progress bar.

Returns:
    best_models (dict): Dictionary of {model_name: (best_model, cv_scores)}.
"""

# === Step 1: Helper Functions ===

def track_time(func):
    """
    Decorator to time and print the runtime of any function.
    Used for profiling expensive steps like tuning or CV.
    """
    def wrapper(*args, **kwargs):
        print(f"Starting: {func.__name__}")
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        print(f"Completed: {func.__name__} in {end_time - start_time:.2f} seconds.")
        return result
    return wrapper

def save_best_model(model, filename):
    """
    Save a trained model to disk using joblib for future reuse.
    Prevents unnecessary recomputation.
    """
    joblib.dump(model, filename)
    print(f"Model saved as {filename}")

def load_or_train_model(filename, train_function, X_train, y_train):
    """
    If a model file exists, load it from disk.
    Otherwise, run the provided training function and save the model.
    This checkpointing saves huge amounts of time on re-runs.
    """
    if os.path.exists(filename):
        print(f"\nLoading existing model from {filename}...")
        model = joblib.load(filename)
    else:
        print(f"\n{filename} not found. Training new model...")
        model = train_function(X_train, y_train)
        save_best_model(model, filename)
    return model

# === Step 2: Encode Labels ===

# Encode the string labels (e.g., 'GPCR', 'Channel') into integers (0-9) once.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# === Step 3: Define Randomized Search Hyperparameter Spaces ===

# Random Forest search space: broad but bounded around reasonable values
rf_param_grid = {
    'n_estimators': randint(500, 700),     # Number of trees
    'max_depth': randint(20, 30),           # Max tree depth
    'min_samples_split': randint(2, 10),    # Min samples to split node
    'min_samples_leaf': randint(1, 4),      # Min samples at leaf
    'max_features': ['sqrt', 'log2'],       # Feature subset strategy
    'bootstrap': [True, False],             # Bootstrap sampling
    'random_state': [42]                    # Reproducibility
}

# XGBoost search space: includes learning rate, gamma, regularization
xgb_param_grid = {
    'n_estimators': randint(100, 700),      # Number of boosting rounds
    'max_depth': randint(3, 12),             # Depth of trees
    'learning_rate': uniform(0.01, 0.3),     # Shrinkage
    'subsample': uniform(0.6, 0.4),          # Fraction of training rows
    'colsample_bytree': uniform(0.6, 0.4),   # Fraction of features per tree
    'gamma': uniform(0, 0.5),                # Minimum loss reduction for split
    'reg_alpha': uniform(0, 1),              # L1 regularization
    'reg_lambda': uniform(1, 4),             # L2 regularization
    'random_state': [42]                     # Reproducibility
}

# === Step 4: Randomized Search Tuning Functions ===

@track_time
def randomized_search_rf(X_train, y_train, n_iter=50, cv_folds=5, random_state=42):
    """
    Perform RandomizedSearchCV for Random Forest.
    Broad hyperparameter exploration before local refinement.
    """
    rf_base = RandomForestClassifier()
    random_search = RandomizedSearchCV(
        estimator=rf_base,
        param_distributions=rf_param_grid,
        n_iter=n_iter,
        cv=cv_folds,
        scoring='accuracy',
        random_state=random_state,
        n_jobs=-1,
        verbose=1,
        error_score='raise'
    )
    random_search.fit(X_train, y_train)
    print("\nBest Parameters Found for Random Forest:")
    print(random_search.best_params_)
    return random_search.best_estimator_

@track_time
def randomized_search_xgb(X_train, y_train, n_iter=50, cv_folds=5, random_state=42):
    """
    Perform RandomizedSearchCV for XGBoost.
    Used to precondition the space for GridSearchCV and Bayesian optimization.
    """
    xgb_base = XGBClassifier(
        use_label_encoder=False,
        eval_metric='mlogloss',
        verbosity=0,
        tree_method='hist',
        predictor='cpu_predictor',
        n_jobs=-1
    )
    random_search = RandomizedSearchCV(
        estimator=xgb_base,
        param_distributions=xgb_param_grid,
        n_iter=n_iter,
        cv=cv_folds,
        scoring='accuracy',
        random_state=random_state,
        n_jobs=-1,
        verbose=1,
        error_score='raise'
    )
    random_search.fit(X_train, y_train)
    print("\nBest Parameters Found for XGBoost:")
    print(random_search.best_params_)
    return random_search.best_estimator_

# === Step 5: Cross-Validation Function ===

@track_time
def final_cross_validation(model, X_train, y_train, cv_folds=10):
    """
    Perform full 10-fold CV after tuning.
    Report mean accuracy and standard deviation.
    """
    scores = cross_val_score(
        model, X_train, y_train,
        cv=cv_folds, scoring='accuracy', n_jobs=-1
    )
    print(f"\nCross-Validation Accuracy Scores: {scores}")
    print(f"Mean Accuracy: {np.mean(scores):.4f}")
    print(f"Standard Deviation: {np.std(scores):.4f}")
    return scores

# === Step 6: Combined Full Pipeline ===

def full_combined_pipeline(X_train, y_train):
    """
    Run entire tuning pipeline:
    - Load existing tuned models from joblib if available.
    - Otherwise tune Random Forest and XGBoost using RandomizedSearchCV.
    - Perform full 10-fold cross-validation.
    - Return best models and their CV scores.
    """
    steps = [
        ("Random Forest", randomized_search_rf, "rf_randomizedcv.joblib"),
        ("XGBoost", randomized_search_xgb, "xgb_randomizedcv.joblib")
    ]

    best_models = {}

    with tqdm(total=len(steps), desc="Combined Model Tuning Pipeline", ncols=100) as pbar:
        for model_name, tuning_function, filename in steps:
            print(f"\n--- {model_name} Tuning and Evaluation ---")
            model = load_or_train_model(filename, tuning_function, X_train, y_train)
            print(f"\nPerforming Final 10-Fold Cross-Validation for {model_name}:")
            scores = final_cross_validation(model, X_train, y_train)
            best_models[model_name] = (model, scores)
            pbar.update(1)

    return best_models

# === Step 7: Run the Pipeline ===

# Run tuning + evaluation
best_models_dict = full_combined_pipeline(X_train, y_train_encoded)

# === Step 8: Final Results ===

print("\n\n=== Final Model Summary ===")
for model_name, (model, scores) in best_models_dict.items():
    print(f"\n{model_name}")
    print(f"Mean Accuracy: {np.mean(scores):.4f}")
    print(f"Std Dev: {np.std(scores):.4f}")

# === Step 9: Load Models Separately After Pipeline (if needed) ===

print("\n\n=== Loading Best Models Separately ===")

# Load Random Forest Model
best_rf_model = joblib.load("rf_randomizedcv.joblib")
print("Successfully loaded best Random Forest model.")

# Load XGBoost Model
best_xgb_model = joblib.load("xgb_randomizedcv.joblib")
print("Successfully loaded best XGBoost model.")

# Confirm Model Types
print("\nModel Type Checks:")
print(f"Random Forest Model Type: {type(best_rf_model)}")
print(f"XGBoost Model Type: {type(best_xgb_model)}")


Combined Model Tuning Pipeline:   0%|                                         | 0/2 [00:00<?, ?it/s]


--- Random Forest Tuning and Evaluation ---

Loading existing model from rf_randomizedcv.joblib...

Performing Final 10-Fold Cross-Validation for Random Forest:
Starting: final_cross_validation


Combined Model Tuning Pipeline:  50%|████████████████▌                | 1/2 [00:51<00:51, 51.16s/it]


Cross-Validation Accuracy Scores: [0.94545455 0.90909091 0.92727273 0.89090909 0.90740741 0.92592593
 0.94444444 0.94444444 0.94444444 0.94444444]
Mean Accuracy: 0.9284
Standard Deviation: 0.0188
Completed: final_cross_validation in 50.90 seconds.

--- XGBoost Tuning and Evaluation ---

Loading existing model from xgb_randomizedcv.joblib...

Performing Final 10-Fold Cross-Validation for XGBoost:
Starting: final_cross_validation


Combined Model Tuning Pipeline: 100%|████████████████████████████████| 2/2 [05:12<00:00, 156.49s/it]


Cross-Validation Accuracy Scores: [0.98181818 0.94545455 0.92727273 0.94545455 0.88888889 0.96296296
 0.94444444 0.96296296 0.96296296 0.96296296]
Mean Accuracy: 0.9485
Standard Deviation: 0.0245
Completed: final_cross_validation in 261.55 seconds.


=== Final Model Summary ===

Random Forest
Mean Accuracy: 0.9284
Std Dev: 0.0188

XGBoost
Mean Accuracy: 0.9485
Std Dev: 0.0245


=== Loading Best Models Separately ===





Successfully loaded best Random Forest model.
Successfully loaded best XGBoost model.

Model Type Checks:
Random Forest Model Type: <class 'sklearn.ensemble._forest.RandomForestClassifier'>
XGBoost Model Type: <class 'xgboost.sklearn.XGBClassifier'>


In [None]:
import os
import time
import joblib

from sklearn.model_selection import GridSearchCV
from tqdm import tqdm

# === Step 1: Helper Functions ===

def track_time(func):
    def wrapper(*args, **kwargs):
        print(f"\nStarting: {func.__name__}")
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        print(f"Completed: {func.__name__} in {end_time - start_time:.2f} seconds.")
        return result
    return wrapper

def save_best_model(model, filename):
    joblib.dump(model, filename)
    print(f"✔ Model saved as {filename}")

def inspect_model(model, param_keys=None):
    """
    Print the model class, then:
      - if it's a CV object, show .best_params_
      - otherwise show only the params in param_keys (if given),
        or else the full .get_params().
    """
    cls = type(model).__name__
    print(f"\n→ Loaded model type: {cls}")
    if hasattr(model, 'best_params_'):
        print("→ best_params_:", model.best_params_)
    else:
        params = model.get_params()
        if param_keys is not None:
            subset = {k: params[k] for k in param_keys}
            print("→ tuned params:", subset)
        else:
            print("→ all params:", params)

def load_or_train_gridsearch(
    filename, base_model, param_grid, X_train, y_train, search_fn
):
    """
    If `filename` exists, load & inspect it;
    otherwise run `search_fn`, save & inspect the new model.
    """
    if os.path.exists(filename):
        print(f"\nLoading existing model from {filename}...")
        model = joblib.load(filename)
        # pass the keys of the grid so we only print the tuned ones:
        inspect_model(model, param_keys=list(param_grid.keys()))
    else:
        print(f"\n{filename} not found. Running GridSearchCV tuning…")
        model = search_fn(X_train, y_train, base_model, param_grid)
        save_best_model(model, filename)
        inspect_model(model, param_keys=list(param_grid.keys()))
    return model

# === Step 2: Load RandomizedSearch Best Models ===

print("\nLoading models from RandomizedSearchCV…")
best_rf_model  = joblib.load("rf_randomizedcv.joblib")
best_xgb_model = joblib.load("xgb_randomizedcv.joblib")

# speed tweaks for XGB
best_xgb_model.set_params(
    tree_method='hist',
    predictor='cpu_predictor',
    use_label_encoder=False,
    eval_metric='mlogloss'
)
print("Successfully loaded RF and XGB randomized-CV models.")

# === Step 3: Define narrower GridSearch spaces ===

rf_grid = {
    'n_estimators':    [best_rf_model.n_estimators-50, best_rf_model.n_estimators, best_rf_model.n_estimators+50],
    'max_depth':       [best_rf_model.max_depth-2,    best_rf_model.max_depth,    best_rf_model.max_depth+2],
    'min_samples_split': [best_rf_model.min_samples_split, best_rf_model.min_samples_split+1],
    'min_samples_leaf':  [best_rf_model.min_samples_leaf,  best_rf_model.min_samples_leaf+1],
    'max_features':      [best_rf_model.max_features],
    'bootstrap':         [best_rf_model.bootstrap],
    'random_state':      [42]
}

xgb_grid = {
    'n_estimators':    [best_xgb_model.n_estimators-50, best_xgb_model.n_estimators, best_xgb_model.n_estimators+50],
    'max_depth':       [best_xgb_model.max_depth-1,    best_xgb_model.max_depth,    best_xgb_model.max_depth+1],
    'learning_rate':   [best_xgb_model.learning_rate*0.8, best_xgb_model.learning_rate, best_xgb_model.learning_rate*1.2],
    'subsample':       [best_xgb_model.subsample],
    'colsample_bytree':[best_xgb_model.colsample_bytree],
    'gamma':           [best_xgb_model.gamma],
    'reg_alpha':       [best_xgb_model.reg_alpha],
    'reg_lambda':      [best_xgb_model.reg_lambda],
    'random_state':    [42]
}

# === Step 4: GridSearchCV function ===

def custom_gridsearch(X_train, y_train, base_model, param_grid, model_name):
    cv_folds = 5
    # estimate total fits for progress bar
    n_total = cv_folds * sum(len(vals) for vals in param_grid.values())
    with tqdm(total=n_total, desc=f"{model_name} GridSearchCV", ncols=100) as pbar:
        class Callback:
            def __call__(self, *args, **kwargs):
                pbar.update(1)

        grid = GridSearchCV(
            estimator=base_model,
            param_grid=param_grid,
            cv=cv_folds,
            scoring='accuracy',
            n_jobs=-1,
            verbose=0
        )
        grid.fit(X_train, y_train, callbacks=[Callback()])

    print(f"\nBest params for {model_name}: {grid.best_params_}")
    return grid.best_estimator_

# === Step 5: Full GridSearch Pipeline ===

def full_gridsearch_pipeline(X_train, y_train):
    specs = [
        ("Random Forest", best_rf_model,  rf_grid,  "rf_gridsearch.joblib"),
        ("XGBoost",       best_xgb_model, xgb_grid, "xgb_gridsearch.joblib"),
    ]
    results = {}
    for name, base, grid, fname in specs:
        print(f"\n--- {name} Fine-Tuning ---")
        results[name] = load_or_train_gridsearch(
            fname, base, grid, X_train, y_train,
            lambda X,y,mdl,grd: custom_gridsearch(X, y, mdl, grd, name)
        )
    return results

# === Step 6: Run it ===

best_grid_models = full_gridsearch_pipeline(X_train, y_train_encoded)



Loading models from RandomizedSearchCV…
Successfully loaded RF and XGB randomized-CV models.

--- Random Forest Fine-Tuning ---

Loading existing model from rf_gridsearch.joblib...

→ Loaded model type: RandomForestClassifier
→ tuned params: {'n_estimators': 635, 'max_depth': 25, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'bootstrap': False, 'random_state': 42}

--- XGBoost Fine-Tuning ---

Loading existing model from xgb_gridsearch.joblib...

→ Loaded model type: XGBClassifier
→ tuned params: {'n_estimators': 303, 'max_depth': 5, 'learning_rate': 0.22365300524649903, 'subsample': 0.8654007076432223, 'colsample_bytree': 0.6336559859980195, 'gamma': 0.08081435704730688, 'reg_alpha': 0.009197051616629648, 'reg_lambda': 1.4058861714641284, 'random_state': 42}


In [None]:
# === Step 0: Imports ===
import time
import optuna
import joblib
import numpy as np
import os
from tqdm import tqdm
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

# === Step 0.5: Load your GridSearchCV best‐estimators & extract their params ===
print("Loading gridsearch best models…")
rf_best  = joblib.load("rf_gridsearch.joblib")
xgb_best = joblib.load("xgb_gridsearch.joblib")

grid_rf_best = {
    'n_estimators':      rf_best.n_estimators,
    'max_depth':         rf_best.max_depth,
    'min_samples_split': rf_best.min_samples_split,
    'min_samples_leaf':  rf_best.min_samples_leaf,
    'max_features':      rf_best.max_features,
    'bootstrap':         rf_best.bootstrap
}

grid_xgb_best = {
    'n_estimators':     xgb_best.n_estimators,
    'max_depth':        xgb_best.max_depth,
    'learning_rate':    xgb_best.learning_rate,
    'subsample':        xgb_best.subsample,
    'colsample_bytree': xgb_best.colsample_bytree,
    'gamma':            xgb_best.gamma,
    'reg_alpha':        xgb_best.reg_alpha,
    'reg_lambda':       xgb_best.reg_lambda
}

print("rf_grid_best =", grid_rf_best)
print("xgb_grid_best =", grid_xgb_best)

# === Step 1: Helper Functions ===
def track_time(func):
    def wrapper(*args, **kwargs):
        print(f"\nStarting: {func.__name__}")
        start = time.time()
        result = func(*args, **kwargs)
        print(f"Completed: {func.__name__} in {time.time() - start:.1f}s")
        return result
    return wrapper

def save_optuna_study(study, filename):
    joblib.dump(study, filename)
    print(f"Saved Optuna study to {filename}")

def load_optuna_study(filename):
    if os.path.exists(filename):
        print(f"Loading existing Optuna study from {filename}…")
        return joblib.load(filename)
    return None

# === Step 2: Objective Functions (±20% around grid best) ===
def objective_rf(trial, X, y, cv_folds=5):
    lo_ne = max(10, int(grid_rf_best['n_estimators'] * 0.8))
    hi_ne = int(grid_rf_best['n_estimators'] * 1.2)
    lo_md = max(1, int(grid_rf_best['max_depth'] * 0.7))
    hi_md = int(grid_rf_best['max_depth'] * 1.3)

    params = {
        'n_estimators':      trial.suggest_int('n_estimators', lo_ne, hi_ne),
        'max_depth':         trial.suggest_int('max_depth', lo_md, hi_md),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf':  trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features':      trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'bootstrap':         trial.suggest_categorical('bootstrap', [True, False]),
        'random_state':      42
    }
    clf = RandomForestClassifier(**params)
    scores = cross_val_score(clf, X, y, cv=cv_folds, scoring='accuracy', n_jobs=-1)
    return scores.mean()

def objective_xgb(trial, X, y, cv_folds=5):
    lo_ne = max(10, int(grid_xgb_best['n_estimators'] * 0.8))
    hi_ne = int(grid_xgb_best['n_estimators'] * 1.2)
    lo_md = max(1, int(grid_xgb_best['max_depth'] * 0.7))
    hi_md = int(grid_xgb_best['max_depth'] * 1.3)
    lo_lr = max(1e-3, grid_xgb_best['learning_rate'] * 0.5)
    hi_lr = min(1.0, grid_xgb_best['learning_rate'] * 1.5)

    params = {
        'n_estimators':      trial.suggest_int('n_estimators', lo_ne, hi_ne),
        'max_depth':         trial.suggest_int('max_depth', lo_md, hi_md),
        'learning_rate':     trial.suggest_float('learning_rate', lo_lr, hi_lr),
        'subsample':         trial.suggest_float('subsample',
                                    max(0.1, grid_xgb_best['subsample'] - 0.2),
                                    min(1.0, grid_xgb_best['subsample'] + 0.2)),
        'colsample_bytree':  trial.suggest_float('colsample_bytree',
                                    max(0.1, grid_xgb_best['colsample_bytree'] - 0.2),
                                    min(1.0, grid_xgb_best['colsample_bytree'] + 0.2)),
        'gamma':             trial.suggest_float('gamma',
                                    max(0.0, grid_xgb_best['gamma'] - 0.1),
                                    grid_xgb_best['gamma'] + 0.1),
        'reg_alpha':         trial.suggest_float('reg_alpha',
                                    max(0.0, grid_xgb_best['reg_alpha'] - 0.5),
                                    grid_xgb_best['reg_alpha'] + 0.5),
        'reg_lambda':        trial.suggest_float('reg_lambda',
                                    max(0.1, grid_xgb_best['reg_lambda'] - 1),
                                    grid_xgb_best['reg_lambda'] + 1),
        'use_label_encoder': False,
        'eval_metric':       'mlogloss',
        'tree_method':       'hist',
        'predictor':         'cpu_predictor',
        'n_jobs':            -1,
        'random_state':      42
    }
    clf = XGBClassifier(**params)
    scores = cross_val_score(clf, X, y, cv=cv_folds, scoring='accuracy', n_jobs=-1)
    return scores.mean()

# === Step 3: Tuning + warm-start ===
@track_time
def optuna_tune(name, objective_fn, X, y, study_filename, n_trials=30, grid_params=None):
    study = load_optuna_study(study_filename)
    if study is None:
        study = optuna.create_study(
            direction='maximize',
            sampler=TPESampler(seed=42),
            pruner=MedianPruner(n_warmup_steps=5)
        )
        if grid_params:
            study.enqueue_trial(grid_params)

        with tqdm(total=n_trials, desc=f"{name} Optuna", ncols=80) as pbar:
            study.optimize(
                lambda t: objective_fn(t, X, y),
                n_trials=n_trials,
                callbacks=[lambda study, trial: pbar.update(1)]
            )
        save_optuna_study(study, study_filename)
    else:
        print(f"Loaded existing Optuna study from {study_filename}, best_params = {study.best_params}")
    return study

# === Step 4: Full Optuna Pipeline ===
def full_optuna_pipeline(X_train, y_train):
    return {
        'rf':  optuna_tune('RF',  objective_rf, X_train, y_train, 'rf_optuna_warm.joblib',  n_trials=30, grid_params=grid_rf_best),
        'xgb': optuna_tune('XGB', objective_xgb, X_train, y_train, 'xgb_optuna_warm.joblib', n_trials=30, grid_params=grid_xgb_best)
    }

# === Step 5: Execute ===
best_studies = full_optuna_pipeline(X_train, y_train_encoded)


Loading gridsearch best models…
rf_grid_best = {'n_estimators': 635, 'max_depth': 25, 'min_samples_split': 7, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'bootstrap': False}
xgb_grid_best = {'n_estimators': 303, 'max_depth': 5, 'learning_rate': 0.22365300524649903, 'subsample': 0.8654007076432223, 'colsample_bytree': 0.6336559859980195, 'gamma': 0.08081435704730688, 'reg_alpha': 0.009197051616629648, 'reg_lambda': 1.4058861714641284}

Starting: optuna_tune
Loading existing Optuna study from rf_optuna_warm.joblib…
Loaded existing Optuna study from rf_optuna_warm.joblib, best_params = {'n_estimators': 526, 'max_depth': 33, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'bootstrap': False}
Completed: optuna_tune in 0.0s

Starting: optuna_tune
Loading existing Optuna study from xgb_optuna_warm.joblib…
Loaded existing Optuna study from xgb_optuna_warm.joblib, best_params = {'n_estimators': 274, 'max_depth': 6, 'learning_rate': 0.28417166725519344, 'subsample': 0.9

# Save Preprocessed Datasets for MLP Training

The following `.npy` files are saved:

- `X_train_scaled.npy`: Scaled features for training.
- `y_train_label_encoded.npy`: Encoded labels for training.
- `X_test_scaled.npy`: Scaled features for testing.
- `y_test_label_encoded.npy`: Encoded labels for testing.
- `X_eval_scaled.npy`: Scaled features for competition evaluation.

Important: It is critical to save `X_eval_scaled` and not older partial matrices (such as `X_scaled_evaluation`) to ensure feature dimensions match between training and evaluation.


In [None]:
np.save("X_train_scaled.npy", X_train_scaled)
np.save("y_train_label_encoded.npy", y_train_label_encoded)
np.save("X_test_scaled.npy", X_test_scaled)
np.save("y_test_label_encoded.npy", y_test_label_encoded)
np.save("X_eval_scaled.npy", X_eval_scaled)



In [None]:
# === Step 0: Imports ===
import numpy as np
import pandas as pd
import joblib
import optuna
import os
import time
from tqdm import tqdm

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, StratifiedKFold

# === Step 1: Load Data ===

print("\nLoading datasets...")

X_train_scaled = np.load("X_train_scaled.npy")
y_train_label_encoded = np.load("y_train_label_encoded.npy")
X_test_scaled = np.load("X_test_scaled.npy")
y_test_label_encoded = np.load("y_test_label_encoded.npy")
X_eval_scaled = np.load("X_eval_scaled.npy")

print(f"Training set shape: {X_train_scaled.shape}")
print(f"Testing set shape: {X_test_scaled.shape}")
print(f"Evaluation set shape: {X_eval_scaled.shape}")

# === Step 2: Define Optuna Objective Function (10-Fold CV, Dynamic Early Stopping) ===

def objective(trial):
    # Hyperparameter search space
    hidden_layer_1 = trial.suggest_int('hidden_layer_1', 64, 1024, step=64)
    hidden_layer_2 = trial.suggest_int('hidden_layer_2', 32, hidden_layer_1, step=32)
    activation = trial.suggest_categorical('activation', ['relu', 'tanh'])
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
    alpha = trial.suggest_float('alpha', 1e-5, 1e-1, log=True)  # L2 regularization
    early_stopping = trial.suggest_categorical('early_stopping', [True, False])

    if early_stopping:
        validation_fraction = trial.suggest_float('validation_fraction', 0.05, 0.2)
    else:
        validation_fraction = 0.1  # Default; will not matter because early_stopping=False

    mlp = MLPClassifier(
        hidden_layer_sizes=(hidden_layer_1, hidden_layer_2),
        activation=activation,
        learning_rate_init=learning_rate,
        alpha=alpha,
        early_stopping=early_stopping,
        validation_fraction=validation_fraction,
        n_iter_no_change=15,
        max_iter=500,
        random_state=42,
        verbose=False
    )

    # 10-fold Stratified Cross Validation
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(mlp, X_train_scaled, y_train_label_encoded, cv=cv, scoring='accuracy', n_jobs=-1)
    return scores.mean()

# === Step 3: Run Optuna Study ===

print("\nStarting Optuna study to tune MLP hyperparameters...")

study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

print("\nBest Hyperparameters Found:")
print(study.best_params)

# Save the Optuna study (optional)
joblib.dump(study, "optuna_mlp_study.joblib")
print("Optuna study saved as optuna_mlp_study.joblib")

# === Step 4: Train Final Best MLP Model ===

best_params = study.best_params

best_mlp_model = MLPClassifier(
    hidden_layer_sizes=(best_params['hidden_layer_1'], best_params['hidden_layer_2']),
    activation=best_params['activation'],
    learning_rate_init=best_params['learning_rate'],
    alpha=best_params['alpha'],
    early_stopping=best_params['early_stopping'],
    validation_fraction=best_params.get('validation_fraction', 0.1),
    n_iter_no_change=15,
    max_iter=500,
    random_state=42,
    verbose=True
)

print("\nTraining final best MLP model on full training data...")

best_mlp_model.fit(X_train_scaled, y_train_label_encoded)

# === Step 5: Save Best MLP Model ===

joblib.dump(best_mlp_model, "best_mlp_model_optuna.joblib")
print("\nBest MLP model (Optuna tuned) saved as best_mlp_model_optuna.joblib")

# === Step 6: Evaluate on Test Set ===

print("\nEvaluating final model on test set...")

y_pred_test = best_mlp_model.predict(X_test_scaled)

print("\n=== Test Set Evaluation ===")
print("Accuracy:", accuracy_score(y_test_label_encoded, y_pred_test))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_label_encoded, y_pred_test))
print("\nClassification Report:")
print(classification_report(y_test_label_encoded, y_pred_test))

# === Step 7: Predict on Evaluation Set ===

print("\nGenerating competition evaluation predictions...")

y_pred_eval = best_mlp_model.predict(X_eval_scaled)
np.save("y_pred_mlp_eval_optuna.npy", y_pred_eval)

print("\nMLP predictions on evaluation set saved as y_pred_mlp_eval_optuna.npy")


[I 2025-04-28 22:27:32,745] A new study created in memory with name: no-name-7cbf4648-2566-4c06-a4e7-d02a0e861ab4



Loading datasets...
Training set shape: (544, 4922)
Testing set shape: (137, 4922)
Evaluation set shape: (171, 4922)

Starting Optuna study to tune MLP hyperparameters...


[I 2025-04-28 22:28:20,737] Trial 0 finished with value: 0.9484848484848485 and parameters: {'hidden_layer_1': 384, 'hidden_layer_2': 384, 'activation': 'relu', 'learning_rate': 0.0002051338263087451, 'alpha': 4.207053950287933e-05, 'early_stopping': False}. Best is trial 0 with value: 0.9484848484848485.
[I 2025-04-28 22:29:50,038] Trial 1 finished with value: 0.9320538720538719 and parameters: {'hidden_layer_1': 640, 'hidden_layer_2': 480, 'activation': 'tanh', 'learning_rate': 0.004622589001020831, 'alpha': 7.068974950624602e-05, 'early_stopping': False}. Best is trial 0 with value: 0.9484848484848485.
[I 2025-04-28 22:30:20,661] Trial 2 finished with value: 0.9392929292929292 and parameters: {'hidden_layer_1': 320, 'hidden_layer_2': 192, 'activation': 'relu', 'learning_rate': 0.0016738085788752138, 'alpha': 3.613894271216525e-05, 'early_stopping': False}. Best is trial 0 with value: 0.9484848484848485.
[I 2025-04-28 22:31:02,455] Trial 3 finished with value: 0.9264646464646464 and 


Best Hyperparameters Found:
{'hidden_layer_1': 448, 'hidden_layer_2': 320, 'activation': 'relu', 'learning_rate': 0.00016752982011682978, 'alpha': 4.359545639994281e-05, 'early_stopping': False}
Optuna study saved as optuna_mlp_study.joblib

Training final best MLP model on full training data...
Iteration 1, loss = 1.96315979
Iteration 2, loss = 0.68537782
Iteration 3, loss = 0.31518991
Iteration 4, loss = 0.16237408
Iteration 5, loss = 0.08558686
Iteration 6, loss = 0.05078589
Iteration 7, loss = 0.02946432
Iteration 8, loss = 0.01935015
Iteration 9, loss = 0.01368698
Iteration 10, loss = 0.00996452
Iteration 11, loss = 0.00768884
Iteration 12, loss = 0.00617040
Iteration 13, loss = 0.00518563
Iteration 14, loss = 0.00443581
Iteration 15, loss = 0.00387174
Iteration 16, loss = 0.00348439
Iteration 17, loss = 0.00317171
Iteration 18, loss = 0.00292495
Iteration 19, loss = 0.00272195
Iteration 20, loss = 0.00255806
Iteration 21, loss = 0.00241008
Iteration 22, loss = 0.00228943
Iterati

# Ensemble Model

In [None]:
# === Step 0: Imports ===
import numpy as np
import pandas as pd
import joblib
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# === Step 1: Load Final Models ===
print("Loading models...")
rf_model = joblib.load("rf_optuna_warm.joblib")        # Random Forest
xgb_model = joblib.load("xgb_optuna_warm.joblib")      # XGBoost
mlp_model = joblib.load("best_mlp_model_optuna.joblib") # MLP

# === Step 2: Load Data ===
print("Loading data...")
X_test_scaled = np.load("X_test_scaled.npy")
X_eval_scaled = np.load("X_eval_scaled.npy")
y_test_label_encoded = np.load("y_test_label_encoded.npy")

# === Step 3: Build Soft Voting Ensemble ===
print("Building ensemble...")
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf_model),
        ('xgb', xgb_model),
        ('mlp', mlp_model)
    ],
    voting='soft',
    n_jobs=-1
)

# === Step 4: Evaluate on Test Set ===
print("Evaluating ensemble on test set...")
voting_clf.fit(X_test_scaled, y_test_label_encoded)

y_pred_test = voting_clf.predict(X_test_scaled)

print("\n=== Ensemble Test Set Evaluation ===")
print("Accuracy:", accuracy_score(y_test_label_encoded, y_pred_test))
print("\nConfusion Matrix:\n", confusion_matrix(y_test_label_encoded, y_pred_test))
print("\nClassification Report:\n", classification_report(y_test_label_encoded, y_pred_test))

# === Step 5: Predict on Evaluation Set ===
print("Predicting on evaluation set...")
y_pred_eval = voting_clf.predict(X_eval_scaled)

# === Step 6: Prepare Submission CSV ===
print("Preparing submission file...")
entry_df = pd.read_csv("X_eval_full_with_entry.csv")

submission = pd.DataFrame({
    "Entry": entry_df["Entry"],
    "ProteinClass": y_pred_eval
})

submission.to_csv("ensemble_submission.csv", index=False)
print("\n Submission file saved as ensemble_submission.csv")


Loading models...
Loading data...
Building ensemble...
Evaluating ensemble on test set...


ValueError: The estimator Study should be a classifier.

# Evaluating model

In [None]:
def evaluate_model(model, X_test, y_test, model_name):

    y_pred = model.predict(X_test)

    # Evaluation
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n=== {model_name} ===")
    print(f"Accuracy for testing data: {accuracy:.4f}")
    return y_pred

def evaluate_model_testing(model, X_test, model_name):

    y_pred = model.predict(X_test)

    print(f"\n=== {model_name} ===")
    print("Output Evaluated")
    return y_pred

In [None]:
# Evaluate my model
y_pred_my_model = evaluate_model(my_model, X_test_scaled, y_test_label_encoded, model_name="My Random Model")
# Convert back to labels
y_pred_my_model = label_encoder.inverse_transform(y_pred_my_model)

# Evaluate Logistic Regression
y_pred_lr_model = evaluate_model(lr_model, X_test_scaled, y_test, model_name="Logistic Regression")


# Evaluate Logistic Regression Best estimator
y_pred_best_lr_model = evaluate_model(best_lr_model, X_test_scaled, y_test, model_name="Logistic Regression Best")



## To Do ##
# Evaluate all models



=== My Random Model ===
Accuracy for testing data: 0.0511

=== Logistic Regression ===
Accuracy for testing data: 0.0000

=== Logistic Regression Best ===
Accuracy for testing data: 0.9781


## Evaluate the provided data for the competition

In [None]:
# Evaluate my model
y_pred_my_model = evaluate_model_testing(my_model, X_evaluation, model_name="My Random Model")
# Convert back to labels
y_pred_my_model_evaluation = label_encoder.inverse_transform(y_pred_my_model)


# Evaluate Logistic Regression
y_pred_lr_model_evaluation = evaluate_model_testing(lr_model, X_scaled_evaluation, model_name="Logistic Regression")


# Evaluate Logistic Regression Best Model
y_pred_lr_best_model_evaluation = evaluate_model_testing(best_lr_model, X_scaled_evaluation, model_name="Logistic Regression Best")

## To Do ##
# Evaluate all the models with the competition data



=== My Random Model ===
Output Evaluated


ValueError: X has 34 features, but LogisticRegression is expecting 4922 features as input.

In [None]:
# Prediction of my model on testing data
print("\n=== My Model ===")
print(y_pred_my_model_evaluation[:5])

# Prediction of my LR on testing data
print("\n=== Logistic Regression ===")
print(y_pred_lr_model_evaluation[:5])

# Prediction of my LR on testing data
print("\n=== Logistic Regression Best ===")
print(y_pred_lr_best_model_evaluation[:5])

## To Do ##
# Print the first 5 examples for all model output for the competition data
# Make sure that the output is converted back into a list of string for the Protien class

# Creat csv file for Evaluation

In [None]:
# Function to creat csv output for uploading
def save_predictions(_fn, _y_pred, _df):
    import csv
    with open(_fn, 'w') as fout:
        writer = csv.writer(fout, delimiter=',', lineterminator='\n')
        writer.writerow(['Entry', 'ProteinClass'])
        for y, Entry in zip(_df['Entry'], _y_pred):
            writer.writerow([y, Entry])

In [None]:
# Saving My random model output
save_predictions('Student_name_attempt_1.csv', y_pred_my_model_evaluation, Dataset_evaluation )
# Saving LR model output
save_predictions('Student_name_attempt_2.csv', y_pred_lr_model_evaluation, Dataset_evaluation )
# Saving Best LR model output
save_predictions('Student_name_attempt_3.csv', y_pred_lr_best_model_evaluation, Dataset_evaluation )

## To Do ##
# Save the output of your models

In [None]:
## To Do ##
# Make sure to upload at least 3 attemps to the website
# Good Luck