# Protein Classification Challenge – Fourth Challenge
 
**Name:** AJ Book  
**Course:** EN.605.656.8VL – Computational Drug Discovery and Development  
**Due Date:** 04/21/2025  
 


This module begins by loading the preprocessed training set (`metadata_org_w_features.csv`) and the held‐out evaluation set (`testing_data_w_features.csv`).  Feature augmentation is then performed by merging:

1. **Pfam** domain presence/absence profiles  
2. **Peptide descriptors** (Kidera factors, VHSE-8, Atchley factors, Z-scales)  
3. **Protlearn** physicochemical descriptors (e.g., Shannon entropy, autocorrelation)  
4. **ProtBert** contextual embeddings  
5. **ESM2** evolutionary language model embeddings  

Each feature block is validated for row and column consistency before proceeding to preprocessing, model training, evaluation, and final submission formatting.  


In [None]:
# === Standard Library Imports ===
import os
import re
import time
import random
import warnings
import itertools
from pathlib import Path
from collections import Counter

# === Third-Party Scientific Computing Libraries ===
import numpy as np
import pandas as pd
from scipy.stats import randint, uniform, loguniform

# === Bioinformatics Libraries ===
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# === Machine Learning Libraries (scikit-learn) ===
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import (
    train_test_split, RandomizedSearchCV, cross_val_score, GridSearchCV, cross_validate, StratifiedKFold
)
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report
)
from sklearn.decomposition import PCA
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

# === Specialized Machine Learning Libraries ===
from xgboost import XGBClassifier

# === Visualization Libraries ===
import matplotlib.pyplot as plt
import seaborn as sns

# === Utilities ===
import joblib
from tqdm import tqdm



# Read in data

In [2]:
# Read in data
data = pd.read_csv("metadata_org_w_features.csv")
data_evaluation = pd.read_csv("testing_data_w_features.csv")

# Information and sample data

In [3]:
# information on data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 681 entries, 0 to 680
Columns: 589 entries, Entry to Gravy
dtypes: float64(584), int64(1), object(4)
memory usage: 3.1+ MB


In [4]:
# describe on data
data.describe()

Unnamed: 0,SequenceLength,A,C,D,E,F,G,H,I,K,...,DDD,MolecularWeight,IsoelectricPoint,Aromaticity,InstabilityIndex,Flexibility,Helix,Sheet,Turn,Gravy
count,681.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0,...,681.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0,681.0
mean,888.660793,0.071173,0.023348,0.044993,0.055845,0.046743,0.065586,0.022028,0.056255,0.047262,...,0.002206,99103.867857,7.097584,0.096863,42.16057,0.995021,0.304995,0.279123,0.389817,-0.051027
std,827.859592,0.022832,0.0123,0.013231,0.017925,0.015308,0.01884,0.00775,0.018204,0.017596,...,0.003108,92308.45646,1.551349,0.021673,7.412366,0.006859,0.029081,0.032611,0.043874,0.325872
min,103.0,0.024283,0.0,0.007605,0.005714,0.007782,0.016807,0.004065,0.00984,0.002915,...,0.0,11710.27,4.383673,0.033074,15.359896,0.974438,0.222539,0.18315,0.24863,-0.757339
25%,384.0,0.056695,0.015251,0.037879,0.04428,0.035479,0.052764,0.016825,0.044171,0.034237,...,0.0,42771.0835,5.723255,0.082587,37.259593,0.991538,0.284024,0.257261,0.361333,-0.284269
50%,676.0,0.06746,0.021417,0.046046,0.058594,0.044747,0.065963,0.021544,0.056726,0.048017,...,0.001377,74697.6635,6.556855,0.094744,42.218182,0.996686,0.302326,0.277,0.382857,-0.12035
75%,1015.0,0.081784,0.030753,0.053254,0.067669,0.055838,0.078212,0.026769,0.068,0.058943,...,0.003229,112207.0978,8.660054,0.109974,46.958534,0.999503,0.323615,0.300946,0.416667,0.136484
max,7096.0,0.164,0.074873,0.116598,0.123102,0.110701,0.153509,0.056093,0.125461,0.104956,...,0.031637,794048.9258,11.114366,0.180812,66.024733,1.013136,0.403005,0.419863,0.546125,1.008772


In [5]:
data.head()

Unnamed: 0,Entry,CleanSequence,Selected_PDB,ProteinClass,SequenceLength,A,C,D,E,F,...,DDD,MolecularWeight,IsoelectricPoint,Aromaticity,InstabilityIndex,Flexibility,Helix,Sheet,Turn,Gravy
0,P21611,MGKAAAVVLVTLVALLGLAQADLTPKVQVYSRFPASAGTKNVLNCF...,3p73,MHC,119,0.109244,0.016807,0.058824,0.042017,0.058824,...,0.0,13041.7928,5.834204,0.109244,22.404202,0.996347,0.336134,0.277311,0.369748,-0.047059
1,Q66GT5,MAASAWLEAGLARVLFYPTLLYTVFRGRVRGPAHRDWYHRIDHTVL...,3rgo,Phosphatase,193,0.108808,0.015544,0.020725,0.067358,0.025907,...,0.0,21942.2575,9.739448,0.082902,37.915078,0.992205,0.367876,0.186528,0.38342,-0.16114
2,Q9Y006,MNLTIKEEDFTNTFMKNEESFNTFRVTKVKRWNAKRLFKILFVTVF...,3qvc,Protease,451,0.028825,0.008869,0.04878,0.068736,0.079823,...,0.002227,51692.5641,8.043414,0.137472,37.618204,1.002078,0.305987,0.290466,0.43459,-0.249667
3,P05622,MGLPGVIPALVLRGQLLLSVLWLLGPQTSRGLVITPPGPEFVLNIS...,1aya,RTK,1098,0.043716,0.017304,0.056466,0.069217,0.030055,...,0.00365,122788.2434,4.99401,0.083789,47.564763,0.999045,0.281421,0.311475,0.385246,-0.203097
4,P06343,MALQIPSLLLLAAVVVLTVLSSPGTEGGNSERHFVHQFQPFCYFTN...,1d9k,MHC,263,0.041825,0.019011,0.022814,0.076046,0.038023,...,0.0,29966.8989,8.99168,0.091255,46.750989,0.99569,0.262357,0.254753,0.39924,-0.303042


# Extract the relevant fields from the dataset

In [6]:
def extract_fields_from_data(data):
    column_start = data.columns.get_loc("SequenceLength")+1
    # Entry and ProteinClass
    df = data.loc[:,['Entry', 'ProteinClass']]
    # PDB
    selected_PDB = data.loc[:,['Selected_PDB']]
    # Sequence
    seq = data.loc[:,['CleanSequence']]
    # Sequence Length
    seq_L = data.loc[:,['SequenceLength']]
    # Amino Acid Frequencies
    freq = data.iloc[:, column_start : column_start+20]
    # All possible dipeptide frequencies
    dipep = data.iloc[:, column_start+20 : column_start+20 + 400]
    # Reduced Amino Acid Alphabet Frequencies
    red_freq = data.iloc[:, column_start+20 + 400 : column_start+20 + 400 + 5]
    # N-Gram Profiles of Reduced Amino Acid Alphabet
    red_ngram = data.iloc[:, column_start+20 + 400 + 5 : column_start+20 + 400 + 5 + 150]
    # Protein Properties
    prop = data.iloc[:, column_start+20 + 400 + 5 + 150 :]

    return df, selected_PDB, seq, seq_L, freq, dipep, red_freq, red_ngram, prop

# For data with no ProteinClass
def extract_fields_from_data_evaluation(data):
    column_start = data.columns.get_loc("SequenceLength")+1
    # Entry and ProteinClass
    df = data.loc[:,['Entry']]
    # PDB
    selected_PDB = data.loc[:,['Selected_PDB']]
    # Sequence
    seq = data.loc[:,['CleanSequence']]
    # Sequence Length
    seq_L = data.loc[:,['SequenceLength']]
    # Amino Acid Frequencies
    freq = data.iloc[:, column_start : column_start+20]
    # All possible dipeptide frequencies
    dipep = data.iloc[:, column_start+20 : column_start+20 + 400]
    # Reduced Amino Acid Alphabet Frequencies
    red_freq = data.iloc[:, column_start+20 + 400 : column_start+20 + 400 + 5]
    # N-Gram Profiles of Reduced Amino Acid Alphabet
    red_ngram = data.iloc[:, column_start+20 + 400 + 5 : column_start+20 + 400 + 5 + 150]
    # Protein Properties
    prop = data.iloc[:, column_start+20 + 400 + 5 + 150 :]

    return df, selected_PDB, seq, seq_L, freq, dipep, red_freq, red_ngram, prop

In [7]:
(df, selected_PDB, seq, seq_L, freq, dipep,
 red_freq, red_ngram, prop) = extract_fields_from_data(data)

# Evaluation data
(df_evaluation, selected_PDB_evaluation, seq_evaluation, seq_L_evaluation,
  freq_evaluation, dipep_evaluation, red_freq_evaluation, red_ngram_evaluation, prop_evaluation) = extract_fields_from_data_evaluation(data_evaluation)

# You can use this section if you wanted to add more features to the data

## Merge External Features into Dataset
To enhance protein classification, external biological and structural features are incorporated. These include Pfam domain annotations, peptide physicochemical descriptors, intrinsic disorder predictions (AIUPred), and deep embeddings (ProtBert, ESM2).

In [8]:
# Feel free to add other feature to this data

# --- Step 1: Merge extracted Base features ---
# You already extracted: df, freq, dipep, red_freq, red_ngram, prop

# Merge base feature blocks into one
base_features = pd.concat([freq, dipep, red_freq, red_ngram, prop], axis=1)

# Combine base features with Entry and ProteinClass
df_base = pd.concat([df, base_features], axis=1)

print(f"Base features merged shape: {df_base.shape}")
print("Sample columns:", df_base.columns[:10].tolist())


# --- Step 2: Prefix Base Features with 'base_' ---

# Identify all columns that are actual features (not Entry or ProteinClass)
base_feature_cols = [col for col in df_base.columns if col not in ["Entry", "ProteinClass"]]

# Rename these columns by prefixing with 'base_'
df_base.rename(columns={col: f"base_{col}" for col in base_feature_cols}, inplace=True)

print(f"Prefixed base feature columns. Now total columns: {len(df_base.columns)}")
print(f" Sample prefixed columns: {df_base.columns[:10].tolist()}")



Base features merged shape: (681, 586)
Sample columns: ['Entry', 'ProteinClass', 'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
Prefixed base feature columns. Now total columns: 586
 Sample prefixed columns: ['Entry', 'ProteinClass', 'base_A', 'base_C', 'base_D', 'base_E', 'base_F', 'base_G', 'base_H', 'base_I']


In [9]:
# --- Step 2: Load ProtLearn and Peptide descriptors ---

protlearn_df = pd.read_csv("protlearn_features.csv")
peptide_df = pd.read_csv("peptide_descriptors.csv")

# Clean column names
protlearn_df.columns = protlearn_df.columns.str.strip()
peptide_df.columns = peptide_df.columns.str.strip()

# Insert Entry column manually
protlearn_df.insert(0, "Entry", data["Entry"].values)  # assumes correct order
peptide_df.insert(0, "Entry", data["Entry"].values)

print("ProtLearn columns:", protlearn_df.columns[:5])
print("Peptide columns:", peptide_df.columns[:5])



# Start from existing correctly prefixed df_base
# Merge ProtLearn and Peptide descriptors

df_full = df_base.merge(protlearn_df, on="Entry", how="left")
df_full = df_full.merge(peptide_df, on="Entry", how="left")

print(f"Full merged feature matrix shape after adding ProtLearn and Peptides: {df_full.shape}")

# Build modeling matrices
X_full = df_full.drop(columns=["Entry", "ProteinClass"])  # only numeric features
y_full = df_full[["Entry", "ProteinClass"]]                # Entry and class labels

print("X_full shape (features only):", X_full.shape)
print("y_full shape (Entry + Label):", y_full.shape)


ProtLearn columns: Index(['Entry', 'PL_AAIndex1_0', 'PL_AAIndex1_1', 'PL_AAIndex1_2',
       'PL_AAIndex1_3'],
      dtype='object')
Peptide columns: Index(['Entry', 'PEP_AF1', 'PEP_AF2', 'PEP_AF3', 'PEP_AF4'], dtype='object')
Full merged feature matrix shape after adding ProtLearn and Peptides: (681, 2392)
X_full shape (features only): (681, 2390)
y_full shape (Entry + Label): (681, 2)


In [10]:
df_full.head()

Unnamed: 0,Entry,ProteinClass,base_A,base_C,base_D,base_E,base_F,base_G,base_H,base_I,...,PEP_VSTPV2,PEP_VSTPV3,PEP_VSTPV4,PEP_VSTPV5,PEP_VSTPV6,PEP_Z1,PEP_Z2,PEP_Z3,PEP_Z4,PEP_Z5
0,P21611,MHC,0.109244,0.016807,0.058824,0.042017,0.058824,0.058824,0.02521,0.016807,...,0.027647,0.413613,-0.383109,0.097479,-0.35042,-0.116134,-0.56958,-0.063361,-0.327899,0.277479
1,Q66GT5,Phosphatase,0.108808,0.015544,0.020725,0.067358,0.025907,0.051813,0.041451,0.046632,...,0.15,0.398601,-0.355959,0.152902,-0.195492,0.006114,-0.442176,-0.514404,-0.167306,0.208187
2,Q9Y006,Protease,0.028825,0.008869,0.04878,0.068736,0.079823,0.053215,0.015521,0.062084,...,0.104235,0.298448,-0.326984,0.184612,-0.244235,-0.061197,-0.288692,-0.267118,-0.491641,0.207406
3,P05622,RTK,0.043716,0.017304,0.056466,0.069217,0.030055,0.06102,0.021858,0.050091,...,0.063643,0.376421,-0.366494,0.159189,-0.229271,0.004599,-0.49327,-0.203634,-0.507723,0.244117
4,P06343,MHC,0.041825,0.019011,0.022814,0.076046,0.038023,0.072243,0.030418,0.041825,...,0.108479,0.340684,-0.393688,0.182548,-0.224068,0.076274,-0.435741,-0.441939,-0.360951,0.158023


In [None]:
# Step 1: Define clean function to load Pfam hits

def load_clean_pfam(domtblout_path: Path, entries: pd.Series, e_cut: float = 1e-3) -> pd.DataFrame:
    """
    Parse hmmscan domtblout.txt and build a clean binary Pfam domain hit matrix.

    Args:
        domtblout_path (Path): Path to domtblout.txt file.
        entries (pd.Series): List of Entry IDs (protein IDs).
        e_cut (float): E-value cutoff for accepting a domain hit.

    Returns:
        pd.DataFrame: Binary matrix (Entry × PfamAcc), 1 = domain present, 0 = absent.
    """
    rows = []
    with domtblout_path.open() as fh:
        for line in fh:
            if line.startswith("#"):
                continue
            parts = line.strip().split()
            if len(parts) < 23:
                continue
            pfam_acc = parts[1]
            entry = parts[3]
            evalue = float(parts[6])
            if evalue <= e_cut:
                rows.append((entry, pfam_acc))

    hits_df = pd.DataFrame(rows, columns=["Entry", "PfamAcc"])
    pfam_matrix = pd.crosstab(hits_df["Entry"], hits_df["PfamAcc"])
    pfam_matrix = pfam_matrix.reindex(entries.values, fill_value=0)
    pfam_matrix.index.name = "Entry"

    return pfam_matrix

In [12]:
# Step 2: Load clean Pfam domains

domtblout_path = Path("data/pfam/domtblout.txt")

pfam_df_clean = load_clean_pfam(domtblout_path, df_full["Entry"])
pfam_df_clean = pfam_df_clean.reset_index()

print(f"Clean Pfam matrix shape: {pfam_df_clean.shape}")
print(f"Example Pfam columns: {pfam_df_clean.columns[1:6].tolist()}")

# Step 3: Check PF-prefixed columns BEFORE dropping

# Find PF-prefixed columns
pfam_cols_in_full = [c for c in df_full.columns if c.startswith("PF")]

print(f"Found {len(pfam_cols_in_full)} columns starting with 'PF':")
print(pfam_cols_in_full)

# Optional: Preview a few values
print("\nPreview of existing PF-prefixed columns before dropping:")
print(df_full[["Entry"] + pfam_cols_in_full].head())




Clean Pfam matrix shape: (681, 529)
Example Pfam columns: ['PF00001.24', 'PF00002.27', 'PF00003.25', 'PF00004.32', 'PF00005.30']
Found 0 columns starting with 'PF':
[]

Preview of existing PF-prefixed columns before dropping:
    Entry
0  P21611
1  Q66GT5
2  Q9Y006
3  P05622
4  P06343


In [13]:
# (Pause here to manually review if needed)

# Step 4: Drop old PF columns after verifying

# Only proceed after verifying!
df_full = df_full.drop(columns=pfam_cols_in_full, errors="ignore")
print(f"Dropped {len(pfam_cols_in_full)} old PF-prefixed columns before clean merge.")

# Step 5: Merge clean Pfam matrix into df_full

df_full = df_full.merge(pfam_df_clean, on="Entry", how="left").fillna(0)
print(f"Final df_full shape after clean Pfam merge: {df_full.shape}")

# Step 6: Final Sanity Check

pfam_columns = [c for c in df_full.columns if c.startswith("PF")]
print(f"Final number of Pfam domain features: {len(pfam_columns)}")
print("\nPreview of final clean Pfam features:")
print(df_full[["Entry"] + pfam_columns[:5]].head(10))

Dropped 0 old PF-prefixed columns before clean merge.
Final df_full shape after clean Pfam merge: (681, 2920)
Final number of Pfam domain features: 528

Preview of final clean Pfam features:
    Entry  PF00001.24  PF00002.27  PF00003.25  PF00004.32  PF00005.30
0  P21611           0           0           0           0           0
1  Q66GT5           0           0           0           0           0
2  Q9Y006           0           0           0           0           0
3  P05622           0           0           0           0           0
4  P06343           0           0           0           0           0
5  P50281           0           0           0           0           0
6  Q13255           0           0           1           0           0
7  P08069           0           0           0           0           0
8  B0V2N1           0           0           0           0           0
9  Q9Y5Z0           0           0           0           0           0


In [14]:
# --- See what Pfam columns look like after merging ---

# Step 1: Find all Pfam domain columns
pfam_columns = [c for c in df_full.columns if c.startswith("PF")]

print(f" Found {len(pfam_columns)} Pfam domain features.")

# Step 2: Look at the first few Pfam columns and first few proteins
print(df_full[["Entry"] + pfam_columns[:5]].head(10))

# Step 3: Look for proteins that have no Pfam hits
no_pfam = df_full[pfam_columns].sum(axis=1) == 0

print(f" Proteins with NO Pfam hits: {no_pfam.sum()} out of {len(df_full)}")

# See a few proteins with no Pfam hits
print(df_full.loc[no_pfam, ["Entry"] + pfam_columns[:5]].head())


 Found 528 Pfam domain features.
    Entry  PF00001.24  PF00002.27  PF00003.25  PF00004.32  PF00005.30
0  P21611           0           0           0           0           0
1  Q66GT5           0           0           0           0           0
2  Q9Y006           0           0           0           0           0
3  P05622           0           0           0           0           0
4  P06343           0           0           0           0           0
5  P50281           0           0           0           0           0
6  Q13255           0           0           1           0           0
7  P08069           0           0           0           0           0
8  B0V2N1           0           0           0           0           0
9  Q9Y5Z0           0           0           0           0           0
 Proteins with NO Pfam hits: 0 out of 681
Empty DataFrame
Columns: [Entry, PF00001.24, PF00002.27, PF00003.25, PF00004.32, PF00005.30]
Index: []


In [None]:


# ---  Load ProtBERT embeddings (training set) ---

protbert_path = "embed_models/protbert_train_full.npy"
protbert_embeddings = np.load(protbert_path)  # Should be shape (681, 1024)

# Convert to DataFrame and add Entry
protbert_df = pd.DataFrame(
    protbert_embeddings,
    columns=[f"ProtBERT_{i}" for i in range(protbert_embeddings.shape[1])]
)
protbert_df.insert(0, "Entry", df_full["Entry"].values)

print(f"ProtBERT embedding DataFrame shape: {protbert_df.shape}")
print(f"Sample ProtBERT columns: {protbert_df.columns[:5].tolist()}")

# ---  Merge ProtBERT embeddings ---

df_full = df_full.merge(protbert_df, on="Entry", how="left")
print(f"Shape after merging ProtBERT embeddings: {df_full.shape}")

# ---  Load ESM2 embeddings (training set) ---

esm2_path = "embed_models/esm2_train_full.npy"
esm2_embeddings = np.load(esm2_path)  # Should be shape (681, 1280)

# Convert to DataFrame and add Entry
esm2_df = pd.DataFrame(
    esm2_embeddings,
    columns=[f"ESM2_{i}" for i in range(esm2_embeddings.shape[1])]
)
esm2_df.insert(0, "Entry", df_full["Entry"].values)

print(f"ESM2 embedding DataFrame shape: {esm2_df.shape}")
print(f"Sample ESM2 columns: {esm2_df.columns[:5].tolist()}")

# ---  Merge ESM2 embeddings ---

df_full = df_full.merge(esm2_df, on="Entry", how="left")
print(f"Shape after merging ESM2 embeddings: {df_full.shape}")

# ---  Final Sanity Check ---

print("\nFinal columns after embedding merge:")
print(df_full.columns[:10])

# Build final modeling matrices
X_full = df_full.drop(columns=["Entry", "ProteinClass"])  # Only features
y_full = df_full[["Entry", "ProteinClass"]]                # Entry + Class label

print(f" Final X_full shape: {X_full.shape}")
print(f" Final y_full shape: {y_full.shape}")


ProtBERT embedding DataFrame shape: (681, 1025)
Sample ProtBERT columns: ['Entry', 'ProtBERT_0', 'ProtBERT_1', 'ProtBERT_2', 'ProtBERT_3']
Shape after merging ProtBERT embeddings: (681, 3944)
ESM2 embedding DataFrame shape: (681, 1281)
Sample ESM2 columns: ['Entry', 'ESM2_0', 'ESM2_1', 'ESM2_2', 'ESM2_3']
Shape after merging ESM2 embeddings: (681, 5224)

Final columns after embedding merge:
Index(['Entry', 'ProteinClass', 'base_A', 'base_C', 'base_D', 'base_E',
       'base_F', 'base_G', 'base_H', 'base_I'],
      dtype='object')
 Final X_full shape: (681, 5222)
 Final y_full shape: (681, 2)


In [16]:
# --- Step 5: Final Data Double-Check ---

# Check for missing values
n_missing_X = X_full.isnull().sum().sum()
n_missing_y = y_full.isnull().sum().sum()

print(f" Missing values in X_full: {n_missing_X}")
print(f"Missing values in y_full: {n_missing_y}")

assert n_missing_X == 0, " Found missing values in X_full after merging!"
assert n_missing_y == 0, " Found missing values in y_full after merging!"

print(" Double-check passed: No missing values.")

 Missing values in X_full: 0
Missing values in y_full: 0
 Double-check passed: No missing values.


In [17]:
df_full.head()

Unnamed: 0,Entry,ProteinClass,base_A,base_C,base_D,base_E,base_F,base_G,base_H,base_I,...,ESM2_1270,ESM2_1271,ESM2_1272,ESM2_1273,ESM2_1274,ESM2_1275,ESM2_1276,ESM2_1277,ESM2_1278,ESM2_1279
0,P21611,MHC,0.109244,0.016807,0.058824,0.042017,0.058824,0.058824,0.02521,0.016807,...,0.004602,-0.082712,-0.138449,-0.015294,-0.006374,-0.047119,0.072656,-0.077782,-0.014603,0.038721
1,Q66GT5,Phosphatase,0.108808,0.015544,0.020725,0.067358,0.025907,0.051813,0.041451,0.046632,...,-0.061771,-0.063909,-0.09209,-0.00935,0.042447,0.023802,0.101591,-0.167533,0.036926,0.024987
2,Q9Y006,Protease,0.028825,0.008869,0.04878,0.068736,0.079823,0.053215,0.015521,0.062084,...,0.183224,0.005093,-0.064172,0.07959,-0.049639,0.000521,0.14536,-0.155799,-0.135549,-0.052576
3,P05622,RTK,0.043716,0.017304,0.056466,0.069217,0.030055,0.06102,0.021858,0.050091,...,0.041743,0.00576,-0.123973,0.010424,0.021136,-0.079837,0.051444,-0.076187,0.03254,0.041431
4,P06343,MHC,0.041825,0.019011,0.022814,0.076046,0.038023,0.072243,0.030418,0.041825,...,0.032086,-0.010952,-0.108737,-0.008614,0.030564,0.006659,0.102912,-0.017738,0.003438,0.00277


# Construct Dataset with any of the extracted relevant fields

In [18]:
# This is an example of construction of the data with freq and red_freq, but you can add and make any field
Dataset = pd.concat([df, freq, red_freq, prop], axis=1)
Dataset_evaluation = pd.concat([df_evaluation, freq_evaluation, red_freq_evaluation, prop_evaluation], axis=1)

In [19]:
# Training Data with ProteinClass
Dataset.head()

Unnamed: 0,Entry,ProteinClass,A,C,D,E,F,G,H,I,...,D.1,MolecularWeight,IsoelectricPoint,Aromaticity,InstabilityIndex,Flexibility,Helix,Sheet,Turn,Gravy
0,P21611,MHC,0.109244,0.016807,0.058824,0.042017,0.058824,0.058824,0.02521,0.016807,...,0.10084,13041.7928,5.834204,0.109244,22.404202,0.996347,0.336134,0.277311,0.369748,-0.047059
1,Q66GT5,Phosphatase,0.108808,0.015544,0.020725,0.067358,0.025907,0.051813,0.041451,0.046632,...,0.088083,21942.2575,9.739448,0.082902,37.915078,0.992205,0.367876,0.186528,0.38342,-0.16114
2,Q9Y006,Protease,0.028825,0.008869,0.04878,0.068736,0.079823,0.053215,0.015521,0.062084,...,0.117517,51692.5641,8.043414,0.137472,37.618204,1.002078,0.305987,0.290466,0.43459,-0.249667
3,P05622,RTK,0.043716,0.017304,0.056466,0.069217,0.030055,0.06102,0.021858,0.050091,...,0.125683,122788.2434,4.99401,0.083789,47.564763,0.999045,0.281421,0.311475,0.385246,-0.203097
4,P06343,MHC,0.041825,0.019011,0.022814,0.076046,0.038023,0.072243,0.030418,0.041825,...,0.098859,29966.8989,8.99168,0.091255,46.750989,0.99569,0.262357,0.254753,0.39924,-0.303042


In [20]:
# Evaluation Data without ProteinClass
Dataset_evaluation.head()

Unnamed: 0,Entry,A,C,D,E,F,G,H,I,K,...,D.1,MolecularWeight,IsoelectricPoint,Aromaticity,InstabilityIndex,Flexibility,Helix,Sheet,Turn,Gravy
0,Q9LF79,0.081006,0.010242,0.054935,0.053073,0.031657,0.089385,0.01676,0.069832,0.065177,...,0.108007,116172.6688,7.86561,0.05959,33.680642,0.999017,0.323091,0.292365,0.384544,0.028026
1,P9WI81,0.108626,0.003195,0.073482,0.043131,0.023962,0.08147,0.019169,0.052716,0.028754,...,0.116613,66509.0117,5.223413,0.049521,28.408163,1.001449,0.263578,0.332268,0.34984,-0.173163
2,P04439,0.09863,0.013699,0.065753,0.063014,0.021918,0.079452,0.024658,0.035616,0.030137,...,0.128767,40840.2477,5.655275,0.090411,36.957014,1.000302,0.293151,0.265753,0.345205,-0.49863
3,Q16581,0.056017,0.03112,0.045643,0.03112,0.070539,0.045643,0.020747,0.051867,0.026971,...,0.076763,53863.6435,6.20218,0.114108,40.862697,0.989447,0.255187,0.307054,0.425311,0.204979
4,Q6QNK2,0.072082,0.020595,0.026316,0.040046,0.050343,0.064073,0.04119,0.050343,0.038902,...,0.066362,96528.7218,8.021753,0.110984,33.635481,0.990173,0.283753,0.28833,0.416476,0.15595


In [21]:
## To Do ##
# Construct your own dataset with these field and any other field that you see fit
# Make sure you do both the training and evaluation data as haveing the same fields


# === Build Correct Base Feature DataFrame for Evaluation ===

# 1. Concatenate all the extracted base feature blocks
base_features_eval = pd.concat(
    [freq_evaluation, dipep_evaluation, red_freq_evaluation, red_ngram_evaluation, prop_evaluation],
    axis=1
)

# 2. Prefix all base feature columns with 'base_'
base_features_eval.columns = [f"base_{col}" for col in base_features_eval.columns]

# 3. Merge base features with Entry metadata
df_base_evaluation = pd.concat([df_evaluation, base_features_eval], axis=1)

# 4. Preview
print(f"df_base_evaluation shape: {df_base_evaluation.shape}")
print("Sample columns:", df_base_evaluation.columns[:10].tolist())


df_base_evaluation shape: (171, 585)
Sample columns: ['Entry', 'base_A', 'base_C', 'base_D', 'base_E', 'base_F', 'base_G', 'base_H', 'base_I', 'base_K']


In [22]:
# === Load ProtLearn and Peptide descriptors for Evaluation ===

# Correct relative paths (same level)
protlearn_eval_df = pd.read_csv("protlearn_eval_features.csv")
peptide_eval_df = pd.read_csv("peptide_eval_descriptors.csv")

# Clean column names
protlearn_eval_df.columns = protlearn_eval_df.columns.str.strip()
peptide_eval_df.columns = peptide_eval_df.columns.str.strip()

# Preview to confirm
print("ProtLearn evaluation columns:", protlearn_eval_df.columns[:5].tolist())
print("Peptide evaluation columns:", peptide_eval_df.columns[:5].tolist())

# === Merge into base evaluation DataFrame ===

# Start from prefixed base features
df_evaluation_full = df_base_evaluation.copy()

# Merge ProtLearn and Peptide using 'Entry'
df_evaluation_full = df_evaluation_full.merge(protlearn_eval_df, on="Entry", how="left")
df_evaluation_full = df_evaluation_full.merge(peptide_eval_df, on="Entry", how="left")

print(f"Full merged evaluation feature matrix shape after adding ProtLearn and Peptides: {df_evaluation_full.shape}")

# X matrix only (no labels in evaluation set)
X_eval_full = df_evaluation_full.drop(columns=["Entry"])

print("X_eval_full shape (features only):", X_eval_full.shape)


ProtLearn evaluation columns: ['Entry', 'PL_AAIndex1_0', 'PL_AAIndex1_1', 'PL_AAIndex1_2', 'PL_AAIndex1_3']
Peptide evaluation columns: ['Entry', 'PEP_AF1', 'PEP_AF2', 'PEP_AF3', 'PEP_AF4']
Full merged evaluation feature matrix shape after adding ProtLearn and Peptides: (171, 2391)
X_eval_full shape (features only): (171, 2390)


In [23]:
# Write evaluation sequences to FASTA for Pfam scanning
with open("data/pfam/query_eval.fasta", "w") as fh:
    for entry, seq in zip(df_evaluation["Entry"], seq_evaluation["CleanSequence"]):
        fh.write(f">{entry}\n{seq}\n")

print("Evaluation FASTA for hmmscan prepared at: data/pfam/query_eval.fasta")


Evaluation FASTA for hmmscan prepared at: data/pfam/query_eval.fasta


In [24]:
#!hmmscan --cpu 8 --domtblout data/pfam/domtblout_eval.txt data/pfam/Pfam-A.hmm data/pfam/query_eval.fasta > data/pfam/hmmscan_eval.log


In [None]:
 
# Load Pfam matrix
pfam_eval_df = load_clean_pfam(Path("data/pfam/domtblout_eval.txt"), df_evaluation["Entry"])
pfam_eval_df = pfam_eval_df.reset_index()

# Drop wrong index name if it exists
pfam_eval_df.columns.name = None  # << KEY LINE

# Preview again
print("Cleaned Pfam matrix preview:")
print(pfam_eval_df.head())

print(f"Evaluation Pfam domain matrix shape: {pfam_eval_df.shape}")


Cleaned Pfam matrix preview:
    Entry  PF00001.24  PF00002.27  PF00003.25  PF00004.32  PF00005.30  \
0  Q9LF79           0           0           0           0           0   
1  P9WI81           0           0           0           0           0   
2  P04439           0           0           0           0           0   
3  Q16581           2           0           0           0           0   
4  Q6QNK2           0           1           0           0           0   

   PF00023.33  PF00026.26  PF00027.32  PF00036.35  ...  PF18861.4  PF19028.3  \
0           0           0           0           0  ...          0          0   
1           0           0           0           0  ...          0          0   
2           0           0           0           0  ...          0          0   
3           0           0           0           0  ...          0          0   
4           0           0           0           0  ...          0          0   

   PF19030.3  PF19035.3  PF19188.3  PF19285.2  PF19

In [26]:
# Merge Pfam Evaluation Matrix into df_evaluation_full

df_evaluation_full = df_evaluation_full.merge(pfam_eval_df, on="Entry", how="left").fillna(0)

print(f"Shape after merging Pfam domains: {df_evaluation_full.shape}")


Shape after merging Pfam domains: (171, 2651)


In [27]:
X_eval_full = df_evaluation_full.drop(columns=["Entry"])
print(f"X_eval_full shape (features only): {X_eval_full.shape}")


X_eval_full shape (features only): (171, 2650)


In [None]:

# === Load ProtBERT Evaluation Embeddings ===

protbert_eval_path = "embed_models/protbert_eval_full.npy"
protbert_eval_embeddings = np.load(protbert_eval_path)  # Should be (171, 1024)

# Convert to DataFrame and add Entry
protbert_eval_df = pd.DataFrame(
    protbert_eval_embeddings,
    columns=[f"ProtBERT_{i}" for i in range(protbert_eval_embeddings.shape[1])]
)
protbert_eval_df.insert(0, "Entry", df_evaluation_full["Entry"].values)

print(f"ProtBERT evaluation DataFrame shape: {protbert_eval_df.shape}")
print(f"Sample ProtBERT columns: {protbert_eval_df.columns[:5].tolist()}")

# Merge ProtBERT into df_evaluation_full
df_evaluation_full = df_evaluation_full.merge(protbert_eval_df, on="Entry", how="left")
print(f"Shape after merging ProtBERT embeddings: {df_evaluation_full.shape}")

# === Load ESM2 Evaluation Embeddings ===

esm2_eval_path = "embed_models/esm2_eval_full.npy"
esm2_eval_embeddings = np.load(esm2_eval_path)  # Should be (171, 1280)

# Convert to DataFrame and add Entry
esm2_eval_df = pd.DataFrame(
    esm2_eval_embeddings,
    columns=[f"ESM2_{i}" for i in range(esm2_eval_embeddings.shape[1])]
)
esm2_eval_df.insert(0, "Entry", df_evaluation_full["Entry"].values)

print(f"ESM2 evaluation DataFrame shape: {esm2_eval_df.shape}")
print(f"Sample ESM2 columns: {esm2_eval_df.columns[:5].tolist()}")

# Merge ESM2 into df_evaluation_full
df_evaluation_full = df_evaluation_full.merge(esm2_eval_df, on="Entry", how="left")
print(f"Shape after merging ESM2 embeddings: {df_evaluation_full.shape}")

# === Final Sanity Check ===

print("\nFinal columns after embedding merge:")
print(df_evaluation_full.columns[:10].tolist())

# Build final modeling matrix (no labels in evaluation set)
X_eval_full = df_evaluation_full.drop(columns=["Entry"])  # Only features

print(f"\nFinal X_eval_full shape (features only): {X_eval_full.shape}")


ProtBERT evaluation DataFrame shape: (171, 1025)
Sample ProtBERT columns: ['Entry', 'ProtBERT_0', 'ProtBERT_1', 'ProtBERT_2', 'ProtBERT_3']
Shape after merging ProtBERT embeddings: (171, 3675)
ESM2 evaluation DataFrame shape: (171, 1281)
Sample ESM2 columns: ['Entry', 'ESM2_0', 'ESM2_1', 'ESM2_2', 'ESM2_3']
Shape after merging ESM2 embeddings: (171, 4955)

Final columns after embedding merge:
['Entry', 'base_A', 'base_C', 'base_D', 'base_E', 'base_F', 'base_G', 'base_H', 'base_I', 'base_K']

Final X_eval_full shape (features only): (171, 4954)


In [29]:
print(f"Total final number of evaluation samples: {X_eval_full.shape[0]}")  # Should be 171
print(f"Total number of features: {X_eval_full.shape[1]}")  # Should match your training feature set


Total final number of evaluation samples: 171
Total number of features: 4954


In [30]:
# === Check Matching of Columns ===

# Check if columns match exactly
train_columns = X_full.columns.tolist()
eval_columns = X_eval_full.columns.tolist()

# Compare sets (order does NOT matter for set comparison)
columns_match = set(train_columns) == set(eval_columns)

print(f"\nDo training and evaluation feature columns match (ignoring order)? {columns_match}")

# === Force Same Order ===

if columns_match:
    # Align evaluation columns in the same order as training
    X_eval_full = X_eval_full[train_columns]
    print("Aligned evaluation columns to match training feature order.")
else:
    # If mismatch, print which columns are different
    print("\nMismatch detected! Columns missing or different:")
    print("In training but not in evaluation:", set(train_columns) - set(eval_columns))
    print("In evaluation but not in training:", set(eval_columns) - set(train_columns))



Do training and evaluation feature columns match (ignoring order)? False

Mismatch detected! Columns missing or different:
In training but not in evaluation: {'PF13492.9', 'PF12947.10', 'PF08344.14', 'PF10591.12', 'PF02393.19', 'PF00970.27', 'PF10609.12', 'PF00175.24', 'PF20141.2', 'PF12763.10', 'PF06365.15', 'PF13432.9', 'PF15494.9', 'PF06736.14', 'PF04389.20', 'PF00130.25', 'PF01034.23', 'PF01738.21', 'PF14670.9', 'PF16491.8', 'PF03717.18', 'PF01062.24', 'PF14559.9', 'PF14292.9', 'PF13087.9', 'PF12697.10', 'PF18651.4', 'PF02815.22', 'PF13479.9', 'PF03924.16', 'PF06512.16', 'PF01403.22', 'PF11933.11', 'PF08447.15', 'PF00431.23', 'PF03521.17', 'PF01278.23', 'PF00430.21', 'PF20067.2', 'PF12146.11', 'PF13428.9', 'PF17813.4', 'PF05409.16', 'PF09303.13', 'PF13903.9', 'PF01108.20', 'PF02724.17', 'PF16602.8', 'PF08716.13', 'PF01833.27', 'PF13176.9', 'PF11633.11', 'PF02191.19', 'PF13604.9', 'PF00654.23', 'PF00864.22', 'PF13426.10', 'PF13672.9', 'PF16526.8', 'PF00915.23', 'PF00073.23', 'PF009

In [31]:
# === Find full set of all columns in training and evaluation ===

full_feature_set = list(sorted(set(X_full.columns) | set(X_eval_full.columns)))

print(f"\nTotal unified feature set length: {len(full_feature_set)}")

# === Reindex X_full and X_eval_full to this full feature set ===

X_full = X_full.reindex(columns=full_feature_set, fill_value=0)
X_eval_full = X_eval_full.reindex(columns=full_feature_set, fill_value=0)

# ===  Confirm final match ===

print(f"\nX_full shape after reindexing: {X_full.shape}")
print(f"X_eval_full shape after reindexing: {X_eval_full.shape}")

# Final check
print("\nColumns match after reindexing?", set(X_full.columns) == set(X_eval_full.columns))



Total unified feature set length: 5254

X_full shape after reindexing: (681, 5254)
X_eval_full shape after reindexing: (171, 5254)

Columns match after reindexing? True


In [32]:
# Save full dataframes for safety
df_full.to_csv("X_train_full_with_labels.csv", index=False)
X_full.to_csv("X_train_full_features_only.csv", index=False)
y_full.to_csv("y_train_full_labels.csv", index=False)

df_evaluation_full.to_csv("X_eval_full_with_entry.csv", index=False)
X_eval_full.to_csv("X_eval_full_features_only.csv", index=False)

print("All datasets saved successfully.")


All datasets saved successfully.


In [33]:
# === Step 1: Define feature groups ===

base_cols = [col for col in df_full.columns if col.startswith("base_")]
pfam_cols_train = [col for col in df_full.columns if col.startswith("PF")]
pfam_cols_eval = [col for col in df_evaluation_full.columns if col.startswith("PF")]
common_pfam_cols = list(sorted(set(pfam_cols_train) & set(pfam_cols_eval)))

embed_cols = [col for col in df_full.columns if col.startswith("ProtBERT_") or col.startswith("ESM2_")]
embed_cols_eval = [col for col in df_evaluation_full.columns if col.startswith("ProtBERT_") or col.startswith("ESM2_")]

# Other descriptors (ProtLearn + Peptide) — optional in base
other_cols = [col for col in df_full.columns if col.startswith("PL_") or col.startswith("PEP_")]
other_cols_eval = [col for col in df_evaluation_full.columns if col.startswith("PL_") or col.startswith("PEP_")]

# === Step 2: Build Train and Evaluation Datasets ===

# Dataset: base only
Dataset = pd.concat([
    df[["Entry", "ProteinClass"]],
    df_full[base_cols]
], axis=1)

Dataset_eval = pd.concat([
    df_evaluation[["Entry"]],
    df_evaluation_full[base_cols]
], axis=1)

# Dataset_pfam: base + common Pfam
Dataset_pfam = pd.concat([
    df[["Entry", "ProteinClass"]],
    df_full[base_cols + common_pfam_cols]
], axis=1)

Dataset_pfam_eval = pd.concat([
    df_evaluation[["Entry"]],
    df_evaluation_full[base_cols + common_pfam_cols]
], axis=1)

# Dataset_embed: base + embeddings
Dataset_embed = pd.concat([
    df[["Entry", "ProteinClass"]],
    df_full[base_cols + embed_cols]
], axis=1)

Dataset_embed_eval = pd.concat([
    df_evaluation[["Entry"]],
    df_evaluation_full[base_cols + embed_cols]
], axis=1)

# Dataset_full: everything (base + pfam + protlearn + peptide + embeddings)
# But SAFELY — using intersection of available features
full_feature_cols_train = base_cols + pfam_cols_train + other_cols + embed_cols
full_feature_cols_eval = base_cols + pfam_cols_eval + other_cols_eval + embed_cols_eval

# Find the intersection
common_full_cols = list(sorted(set(full_feature_cols_train) & set(full_feature_cols_eval)))

Dataset_full = pd.concat([
    df[["Entry", "ProteinClass"]],
    df_full[common_full_cols]
], axis=1)

Dataset_full_eval = pd.concat([
    df_evaluation[["Entry"]],
    df_evaluation_full[common_full_cols]
], axis=1)

# === Step 3: Confirm Shapes ===

print(f"Dataset shape: {Dataset.shape}")
print(f"Dataset_eval shape: {Dataset_eval.shape}")

print(f"Dataset_pfam shape: {Dataset_pfam.shape}")
print(f"Dataset_pfam_eval shape: {Dataset_pfam_eval.shape}")

print(f"Dataset_embed shape: {Dataset_embed.shape}")
print(f"Dataset_embed_eval shape: {Dataset_embed_eval.shape}")

print(f"Dataset_full shape: {Dataset_full.shape}")
print(f"Dataset_full_eval shape: {Dataset_full_eval.shape}")


Dataset shape: (681, 586)
Dataset_eval shape: (171, 585)
Dataset_pfam shape: (681, 814)
Dataset_pfam_eval shape: (171, 813)
Dataset_embed shape: (681, 2890)
Dataset_embed_eval shape: (171, 2889)
Dataset_full shape: (681, 4924)
Dataset_full_eval shape: (171, 4923)


# Processing Steps

In [34]:
def processing_data(df):
    # Define X and y
    y = df['ProteinClass']
    X = df.drop(['Entry', 'ProteinClass'], axis=1)
    if 0:
        print("Feature sample:")
        print(X.head())
        print("\nTarget sample:")
        print(y.head())

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # Feature scaling (important for logistic regression, gradient boosting, MLP, etc.)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)


    # Defined labels
    labels = np.array(['ATPase', 'Aquaporin', 'Channel', 'GPCR', 'Integrin', 'MHC',
       'Phosphatase', 'Protease', 'RTK', 'Ser:Thr'])
    # Create a Label Encoder
    label_encoder = LabelEncoder()
    # Fit the label encoder on all possible labels, will make the labels [0, 1, 2, 3 ...]
    label_encoder.fit(labels)
    # Transform both training and testing labels
    y_label_encoded = label_encoder.transform(y)
    y_train_label_encoded = label_encoder.transform(y_train)
    y_test_label_encoded = label_encoder.transform(y_test)
    # print('first sample label encoded:', y_train_label_encoded[0])

    return (label_encoder, X.values, X_scaled, X_train, X_train_scaled, X_test, X_test_scaled,
            y, y_label_encoded, y_train, y_train_label_encoded, y_test, y_test_label_encoded, scaler)


def processing_data_evaluation(df,scaler):
    # Define X
    X = df.drop(['Entry'], axis=1)

    # Feature scaling (important for logistic regression, gradient boosting, MLP, etc.)
    X_scaled = scaler.fit_transform(X)

    return (X.values, X_scaled)


In [35]:
(label_encoder, X, X_scaled, X_train, X_train_scaled, X_test, X_test_scaled, y, y_label_encoded,
y_train, y_train_label_encoded, y_test, y_test_label_encoded, scaler)  = processing_data(Dataset)

# Evaluation data
X_evaluation, X_scaled_evaluation = processing_data_evaluation(Dataset_evaluation, scaler)

In [36]:
# === Define processing functions ===

def processing_data(df):
    """
    Process the training dataset:
    - Split into X and y
    - Train/test split
    - Standard scaling
    - Label encoding
    """
    y = df['ProteinClass']
    X = df.drop(['Entry', 'ProteinClass'], axis=1)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Feature scaling (important for logistic regression, gradient boosting, MLP, etc.)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define all possible labels
    labels = np.array([
        'ATPase', 'Aquaporin', 'Channel', 'GPCR', 'Integrin', 'MHC',
        'Phosphatase', 'Protease', 'RTK', 'Ser:Thr'
    ])

    # Create a Label Encoder
    label_encoder = LabelEncoder()
    label_encoder.fit(labels)

    # Encode labels
    y_label_encoded = label_encoder.transform(y)
    y_train_label_encoded = label_encoder.transform(y_train)
    y_test_label_encoded = label_encoder.transform(y_test)

    return (label_encoder, X.values, X_scaled,
            X_train, X_train_scaled, X_test, X_test_scaled,
            y, y_label_encoded, y_train, y_train_label_encoded, y_test, y_test_label_encoded, scaler)


In [37]:
def processing_data_evaluation(df, scaler):
    """
    Process the evaluation dataset:
    - Only X (no y)
    - Apply scaling based on training scaler
    """
    X = df.drop(['Entry'], axis=1)

    # Use the same scaler fitted on training data
    X_scaled = scaler.transform(X)

    return (X.values, X_scaled)

In [38]:
# ===  Apply processing to Dataset_full ===

(label_encoder,
 X_full_values, X_full_scaled,
 X_train, X_train_scaled, X_test, X_test_scaled,
 y_full, y_full_label_encoded, y_train, y_train_label_encoded, y_test, y_test_label_encoded,
 scaler_full) = processing_data(Dataset_full)

# === Apply processing to Dataset_full_eval ===

X_eval_values, X_eval_scaled = processing_data_evaluation(Dataset_full_eval, scaler_full)

# === Step 4: Confirm shapes ===

print("\nTraining X shape:", X_train.shape)
print("Training y shape:", y_train.shape)

print("Testing X shape:", X_test.shape)
print("Testing y shape:", y_test.shape)

print("Evaluation X shape:", X_eval_values.shape)


Training X shape: (544, 4922)
Training y shape: (544,)
Testing X shape: (137, 4922)
Testing y shape: (137,)
Evaluation X shape: (171, 4922)


# Variable and Model Tracker

This cell documents all major variables and models used in the tuning and evaluation process to ensure clarity and prevent confusion.

---

## Feature and Label Matrices
- **X_train** → Raw training features (not scaled)
- **X_train_scaled** → Scaled training features (using StandardScaler)
- **X_test** → Raw testing features (not scaled)
- **X_test_scaled** → Scaled testing features (using StandardScaler)

- **y_train** → Original training labels (not label-encoded)
- **y_train_label_encoded** → Encoded training labels (after LabelEncoder)
- **y_test** → Original testing labels (not label-encoded)
- **y_test_label_encoded** → Encoded testing labels (after LabelEncoder)

---

## Base (Untuned) Models
- **lr_model** → Logistic Regression model (before tuning)
- **rf_model** → Random Forest model (before tuning)
- **xgb_model** → XGBoost model (before tuning)

---

## Tuned (After RandomizedSearchCV) Models
- **best_lr_model** → Tuned Logistic Regression model
- **best_rf_model** → Tuned Random Forest model
- (Planned) **best_xgb_model** → Tuned XGBoost model

---

## Model Performance Variables
- **y_pred_lr** → Predictions from base Logistic Regression on test set
- **y_pred_rf** → Predictions from base Random Forest on test set
- **y_pred_xgb** → Predictions from base XGBoost on test set

- **y_pred_best_lr** → Predictions from tuned Logistic Regression
- **y_pred_best_rf** → Predictions from tuned Random Forest
- (Planned) **y_pred_best_xgb** → Predictions from tuned XGBoost

- **acc_lr** → Test accuracy of base Logistic Regression
- **acc_rf** → Test accuracy of base Random Forest
- **acc_xgb** → Test accuracy of base XGBoost

- **acc_best_lr** → Test accuracy of tuned Logistic Regression
- **acc_best_rf** → Test accuracy of tuned Random Forest
- (Planned) **acc_best_xgb** → Test accuracy of tuned XGBoost

---

# Important Notes:
- Always use **X_train_scaled** and **X_test_scaled** for models that require feature scaling, such as Logistic Regression.
- Random Forest and XGBoost typically perform better on raw (non-scaled) features, so **X_train** and **X_test** are used.
- Carefully distinguish between base models and tuned models, along with their corresponding predictions and accuracies.
- The RandomizedSearchCV results are stored in separate "best_" model variables to avoid overwriting the original models.

---

(Last Updated: [Fill in date])


# Creating Model

In [39]:
# === Model 1: Random Model (Dr. Yasin starter) ===

class random_model:
    def __init__(self):
        pass

    def fit(self, X, y):
        pass

    def predict(self, X):
        pred = []
        for x in X:
            pred.append(np.random.randint(10))  # Random class between 0–9
        return np.array(pred)

    def get_params(self, deep=True):
        return {}

# Initialize Random Model
my_model = random_model()

# === Model 2: Logistic Regression (Dr. Yasin starter) ===

from sklearn.linear_model import LogisticRegression

# Define the base logistic regression model 
lr_model = LogisticRegression(
    penalty="l2",
    solver="lbfgs",
    max_iter=1000,
    random_state=42
)


# === Model 3: Random Forest Classifier ===

from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=20,
    random_state=42,
    n_jobs=-1
)

# === Model 4: XGBoost Classifier ===

from xgboost import XGBClassifier

xgb_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multi:softprob",
    eval_metric="mlogloss",
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1
)

# === Confirm Initialization ===

print("Models initialized:")
print("- Random Model (baseline)")
print("- Logistic Regression")
print("- Random Forest")
print("- XGBoost")


Models initialized:
- Random Model (baseline)
- Logistic Regression
- Random Forest
- XGBoost


# Train Model

In [40]:
# Fit My Model to the training data
my_model.fit(X_train, y_train)

# Fit the model to the training data without tuning
lr_model.fit(X_train_scaled, y_train)

## To Do ##
# Fit your models to the training data you can use
# raw, scaled, or other forms

In [None]:
# === Baseline Fitting and Timing ===

# Random Model
start = time.time()
my_model.fit(X_train, y_train_label_encoded)
end = time.time()
print(f"My Random Model fit time: {end - start:.2f} seconds")

# Evaluate Random Model (FIX: use X_test.values)
y_pred_random = my_model.predict(X_test.values)  # FIXED
acc_random = accuracy_score(y_test_label_encoded, y_pred_random)
print(f"My Random Model baseline accuracy: {acc_random:.4f}")

# Logistic Regression
start = time.time()
lr_model.fit(X_train_scaled, y_train_label_encoded)
end = time.time()
print(f"Logistic Regression fit time: {end - start:.2f} seconds")

# Evaluate Logistic Regression
y_pred_lr = lr_model.predict(X_test_scaled)
acc_lr = accuracy_score(y_test_label_encoded, y_pred_lr)
print(f"Logistic Regression baseline accuracy: {acc_lr:.4f}")

# Random Forest
start = time.time()
rf_model.fit(X_train, y_train_label_encoded)
end = time.time()
print(f"Random Forest fit time: {end - start:.2f} seconds")

# Evaluate Random Forest
y_pred_rf = rf_model.predict(X_test)
acc_rf = accuracy_score(y_test_label_encoded, y_pred_rf)
print(f"Random Forest baseline accuracy: {acc_rf:.4f}")

# XGBoost
start = time.time()
xgb_model.fit(X_train, y_train_label_encoded)
end = time.time()
print(f"XGBoost fit time: {end - start:.2f} seconds")

# Evaluate XGBoost
y_pred_xgb = xgb_model.predict(X_test)
acc_xgb = accuracy_score(y_test_label_encoded, y_pred_xgb)
print(f"XGBoost baseline accuracy: {acc_xgb:.4f}")


My Random Model fit time: 0.00 seconds
My Random Model baseline accuracy: 0.0803
Logistic Regression fit time: 0.35 seconds
Logistic Regression baseline accuracy: 0.9781
Random Forest fit time: 1.97 seconds
Random Forest baseline accuracy: 0.9562


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost fit time: 148.22 seconds
XGBoost baseline accuracy: 0.9708


# Training and Tuning

In [None]:
# Define the parameter distribution for 'C'
param_dist_lr = {
    'C': loguniform(1e-3, 1e3),
    'penalty': ['l2'],
    'solver': ['lbfgs'],
    'max_iter': [500, 1000, 2000],
    'random_state': [42]
}

# Set up RandomizedSearchCV
random_search_lr = RandomizedSearchCV(
    estimator=lr_model,
    param_distributions=param_dist_lr,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1
)

# Fit the model to the training data with tuning
random_search_lr.fit(X_train_scaled, y_train)
# Get the best estimator
best_lr_model = random_search_lr.best_estimator_
print("Best parameters found for Logistic Regression:")
print(random_search_lr.best_params_)

## To Do ##
# Fit your models to the training data you can use
# raw, scaled, or other forms
# Make sure to do tuning this time

Best parameters found for Logistic Regression:
{'C': 0.04848496183873291, 'max_iter': 500, 'penalty': 'l2', 'random_state': 42, 'solver': 'lbfgs'}


## Random Forest Tuning Setup

- **rf_model**: Random Forest Classifier base model before tuning.
- **param_dist_rf**: Dictionary defining the hyperparameter search space for RandomizedSearchCV.
- **random_search_rf**: RandomizedSearchCV object to perform tuning on rf_model.
- **best_rf_model**: The best Random Forest model selected after tuning.


In [None]:
from sklearn.ensemble import RandomForestClassifier

# ----------------------------
# Define hyperparameter space
# ----------------------------

param_dist_rf = {
    'n_estimators': randint(100, 1000),    # Number of trees in the forest
    'max_depth': randint(3, 30),            # Maximum depth of each tree
    'min_samples_split': randint(2, 20),    # Minimum samples to split an internal node
    'min_samples_leaf': randint(1, 20),     # Minimum samples at a leaf node
    'max_features': ['sqrt', 'log2'],       # How many features to consider when looking for best split
    'bootstrap': [True, False],             # Whether bootstrap samples are used
    'random_state': [42]                    # For reproducibility
}

# ----------------------------
# Set up RandomizedSearchCV
# ----------------------------

random_search_rf = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist_rf,
    n_iter=50,             # Number of parameter settings to sample
    cv=5,                  # 5-fold cross-validation
    scoring='accuracy',    # Metric to optimize
    random_state=42,
    n_jobs=-1              # Use all available cores
)

# ----------------------------
# Fit the RandomizedSearchCV
# ----------------------------

random_search_rf.fit(X_train, y_train)

# ----------------------------
# Retrieve the best model
# ----------------------------

best_rf_model = random_search_rf.best_estimator_

print("Best parameters found for Random Forest:")
print(random_search_rf.best_params_)


Best parameters found for Random Forest:
{'bootstrap': False, 'max_depth': 26, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 7, 'n_estimators': 610, 'random_state': 42}


In [None]:
# Suppress any warning messages for clean output
warnings.filterwarnings('ignore')

# === Step 1: Define Helper Functions for Tracking ===

# Define a decorator to track and print how long functions take to run
def track_time(func):
    """
    Decorator to track the time a function takes.
    """
    def wrapper(*args, **kwargs):
        print(f"Starting: {func.__name__}")
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        print(f"Completed: {func.__name__} in {end_time - start_time:.2f} seconds.")
        return result
    return wrapper

# Define a function to save the best trained model to disk
def save_best_model(model, filename):
    """
    Save the best model to disk.
    """
    joblib.dump(model, filename)
    print(f"Model saved as {filename}")

# === Step 2: Define Parameter Space for Random Forest RandomizedSearch ===

# Set the hyperparameter space to sample from for Random Forest
rf_param_grid = {
    'n_estimators': randint(500, 700),           # Number of trees in the forest
    'max_depth': randint(20, 30),                # Maximum depth of each tree
    'min_samples_split': randint(2, 10),         # Minimum samples to split an internal node
    'min_samples_leaf': randint(1, 4),           # Minimum samples at a leaf node
    'max_features': ['sqrt', 'log2'],            # Strategy to choose the number of features at each split
    'bootstrap': [True, False],                  # Whether bootstrap samples are used when building trees
    'random_state': [42]                         # Random seed for reproducibility
}

# === Step 3: Define Randomized Search Tuning Function ===

@track_time
def randomized_search_rf(X_train, y_train, n_iter=50, cv_folds=5, random_state=42):
    """
    Perform RandomizedSearchCV for RandomForestClassifier.
    Searches across the hyperparameter space and identifies the best combination.
    """
    rf_base = RandomForestClassifier()
    random_search = RandomizedSearchCV(
        estimator=rf_base,
        param_distributions=rf_param_grid,
        n_iter=n_iter,              # Number of random combinations to try
        cv=cv_folds,                # Cross-validation folds
        scoring='accuracy',         # Evaluation metric
        random_state=random_state,  # For reproducibility
        n_jobs=-1,                  # Parallelize across all CPU cores
        verbose=1                   # Verbosity level to show progress
    )
    random_search.fit(X_train, y_train)
    print("\nBest Parameters Found:")
    print(random_search.best_params_)
    return random_search.best_estimator_

# === Step 4: Final 10-Fold Cross-Validation on Best Model ===

@track_time
def final_cross_validation(model, X_train, y_train, cv_folds=10):
    """
    Perform a final k-fold cross-validation on the best found model.
    Provides an unbiased estimate of the model's generalization performance.
    """
    scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring='accuracy', n_jobs=-1)
    print(f"\nCross-Validation Accuracy Scores: {scores}")
    print(f"Mean Accuracy: {np.mean(scores):.4f}")
    print(f"Standard Deviation: {np.std(scores):.4f}")
    return scores

# === Step 5: Full Pipeline Function ===

@track_time
def full_random_forest_pipeline(X_train, y_train):
    """
    Complete tuning pipeline for Random Forest:
    - Hyperparameter search
    - Save the best model
    - Evaluate with final 10-fold cross-validation
    """
    best_rf = randomized_search_rf(X_train, y_train)
    save_best_model(best_rf, "best_random_forest_model.joblib")
    print("\nPerforming Final 10-Fold Cross-Validation:")
    cv_scores = final_cross_validation(best_rf, X_train, y_train)
    return best_rf, cv_scores

# === Step 6: Run the Full Pipeline ===

# Execute the full Random Forest tuning and evaluation pipeline
best_rf_model, rf_cv_scores = full_random_forest_pipeline(X_train, y_train)


Starting: full_random_forest_pipeline
Starting: randomized_search_rf
Fitting 5 folds for each of 50 candidates, totalling 250 fits

Best Parameters Found:
{'bootstrap': False, 'max_depth': 27, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 7, 'n_estimators': 685, 'random_state': 42}
Completed: randomized_search_rf in 555.67 seconds.
Model saved as best_random_forest_model.joblib

Performing Final 10-Fold Cross-Validation:
Starting: final_cross_validation

Cross-Validation Accuracy Scores: [0.94545455 0.90909091 0.92727273 0.89090909 0.90740741 0.92592593
 0.94444444 0.94444444 0.94444444 0.94444444]
Mean Accuracy: 0.9284
Standard Deviation: 0.0188
Completed: final_cross_validation in 59.95 seconds.
Completed: full_random_forest_pipeline in 613.56 seconds.


In [49]:
# Suppress Warnings
warnings.filterwarnings('ignore')

# === Step 1: Define Helper Functions for Tracking ===

def track_time(func):
    """
    Decorator to track the time a function takes.
    """
    def wrapper(*args, **kwargs):
        print(f"Starting: {func.__name__}")
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        print(f"Completed: {func.__name__} in {end_time - start_time:.2f} seconds.")
        return result
    return wrapper

def save_best_model(model, filename):
    """
    Save the best model to disk.
    """
    joblib.dump(model, filename)
    print(f"Model saved as {filename}")

# === Step 2: Encode y_train to Integer Labels ===

# Only do this once at the beginning
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# === Step 3: Define Parameter Space for XGBoost RandomizedSearch ===

xgb_param_grid = {
    'n_estimators': randint(100, 700),
    'max_depth': randint(3, 12),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': uniform(0, 0.5),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(1, 4),
    'random_state': [42]
}

# === Step 4: Define Randomized Search Tuning Function ===

@track_time
def randomized_search_xgb(X_train, y_train, n_iter=50, cv_folds=5, random_state=42):
    """
    Perform RandomizedSearchCV for XGBClassifier with fast settings.
    """
    xgb_base = XGBClassifier(
        use_label_encoder=False,
        eval_metric='mlogloss',
        verbosity=0,
        tree_method='hist',             # <--- FAST SPLITTING
        predictor='cpu_predictor',       # <--- CPU PREDICTION
        n_jobs=-1                        # <--- Full CPU parallelism
    )
    random_search = RandomizedSearchCV(
        estimator=xgb_base,
        param_distributions=xgb_param_grid,
        n_iter=n_iter,
        cv=cv_folds,
        scoring='accuracy',
        random_state=random_state,
        n_jobs=-1,
        verbose=1
    )
    random_search.fit(X_train, y_train)
    print("Best Parameters Found:")
    print(random_search.best_params_)
    return random_search.best_estimator_

# === Step 5: Final 10-Fold Cross-Validation on Best Model ===

@track_time
def final_cross_validation(model, X_train, y_train, cv_folds=10):
    """
    Perform a final k-fold cross-validation.
    """
    scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring='accuracy', n_jobs=-1)
    print(f"Cross-Validation Accuracy Scores: {scores}")
    print(f"Mean Accuracy: {np.mean(scores):.4f}")
    print(f"Standard Deviation: {np.std(scores):.4f}")
    return scores

# === Step 6: Full Pipeline Function ===

@track_time
def full_xgboost_pipeline(X_train, y_train):
    """
    Complete tuning pipeline: Randomized Search + CV evaluation.
    """
    best_xgb = randomized_search_xgb(X_train, y_train)
    save_best_model(best_xgb, "best_xgboost_model.joblib")
    print("\nPerforming Final 10-Fold Cross-Validation:")
    cv_scores = final_cross_validation(best_xgb, X_train, y_train)
    return best_xgb, cv_scores

# === Step 7: Run the Pipeline ===

best_xgb_model, xgb_cv_scores = full_xgboost_pipeline(X_train, y_train_encoded)


Starting: full_xgboost_pipeline
Starting: randomized_search_xgb
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Parameters Found:
{'colsample_bytree': 0.6336559859980195, 'gamma': 0.08081435704730688, 'learning_rate': 0.2795662565581238, 'max_depth': 6, 'n_estimators': 353, 'random_state': 42, 'reg_alpha': 0.009197051616629648, 'reg_lambda': 1.4058861714641284, 'subsample': 0.8654007076432223}
Completed: randomized_search_xgb in 11067.81 seconds.
Model saved as best_xgboost_model.joblib

Performing Final 10-Fold Cross-Validation:
Starting: final_cross_validation
Cross-Validation Accuracy Scores: [0.98181818 0.94545455 0.92727273 0.94545455 0.88888889 0.96296296
 0.94444444 0.96296296 0.96296296 0.96296296]
Mean Accuracy: 0.9485
Standard Deviation: 0.0245
Completed: final_cross_validation in 243.57 seconds.
Completed: full_xgboost_pipeline in 11311.45 seconds.


In [None]:
"""
Run full hyperparameter tuning and evaluation for multiple models.
- Loads pre-trained model from disk if available (skips re-training).
- If not available, performs RandomizedSearchCV to find best hyperparameters.
- Performs final 10-fold cross-validation for evaluation.
- Tracks progress with a tqdm progress bar.

Returns:
    best_models (dict): Dictionary of {model_name: (best_model, cv_scores)}.
"""

# Suppress Warnings for Clean Output
warnings.filterwarnings('ignore')

# === Step 1: Define Helper Functions ===

def track_time(func):
    """
    Decorator to track and print the time a function takes to run.
    """
    def wrapper(*args, **kwargs):
        print(f"Starting: {func.__name__}")
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        print(f"Completed: {func.__name__} in {end_time - start_time:.2f} seconds.")
        return result
    return wrapper

def save_best_model(model, filename):
    """
    Save the best model to disk using joblib.
    """
    joblib.dump(model, filename)
    print(f"Model saved as {filename}")

def load_or_train_model(filename, train_function, X_train, y_train):
    """
    Load model from file if it exists, otherwise train and save it.
    """
    if os.path.exists(filename):
        print(f"\nLoading existing model from {filename}...")
        model = joblib.load(filename)
    else:
        print(f"\n{filename} not found. Training new model...")
        model = train_function(X_train, y_train)
        save_best_model(model, filename)
    return model

# === Step 2: Encode y_train Labels (Only Once) ===

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# === Step 3: Define Hyperparameter Spaces for Tuning ===

# Random Forest hyperparameter space
rf_param_grid = {
    'n_estimators': randint(500, 700),
    'max_depth': randint(20, 30),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 4),
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False],
    'random_state': [42]
}

# XGBoost hyperparameter space
xgb_param_grid = {
    'n_estimators': randint(100, 700),
    'max_depth': randint(3, 12),
    'learning_rate': uniform(0.01, 0.3),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': uniform(0, 0.5),
    'reg_alpha': uniform(0, 1),
    'reg_lambda': uniform(1, 4),
    'random_state': [42]
}

# === Step 4: Define Tuning Functions ===

@track_time
def randomized_search_rf(X_train, y_train, n_iter=50, cv_folds=5, random_state=42):
    """
    Perform RandomizedSearchCV for RandomForestClassifier.
    """
    rf_base = RandomForestClassifier()
    random_search = RandomizedSearchCV(
        estimator=rf_base,
        param_distributions=rf_param_grid,
        n_iter=n_iter,
        cv=cv_folds,
        scoring='accuracy',
        random_state=random_state,
        n_jobs=-1,
        verbose=1
    )
    random_search.fit(X_train, y_train)
    print("\nBest Parameters Found for Random Forest:")
    print(random_search.best_params_)
    return random_search.best_estimator_

@track_time
def randomized_search_xgb(X_train, y_train, n_iter=50, cv_folds=5, random_state=42):
    """
    Perform RandomizedSearchCV for XGBClassifier.
    """
    xgb_base = XGBClassifier(
        use_label_encoder=False,
        eval_metric='mlogloss',
        verbosity=0,
        tree_method='hist',            # Fast split finding
        predictor='cpu_predictor',      # Use CPU for prediction
        n_jobs=-1
    )
    random_search = RandomizedSearchCV(
        estimator=xgb_base,
        param_distributions=xgb_param_grid,
        n_iter=n_iter,
        cv=cv_folds,
        scoring='accuracy',
        random_state=random_state,
        n_jobs=-1,
        verbose=1
    )
    random_search.fit(X_train, y_train)
    print("\nBest Parameters Found for XGBoost:")
    print(random_search.best_params_)
    return random_search.best_estimator_

# === Step 5: Define Final Cross-Validation Function ===

@track_time
def final_cross_validation(model, X_train, y_train, cv_folds=10):
    """
    Perform 10-fold cross-validation and report mean accuracy.
    """
    scores = cross_val_score(model, X_train, y_train, cv=cv_folds, scoring='accuracy', n_jobs=-1)
    print(f"\nCross-Validation Accuracy Scores: {scores}")
    print(f"Mean Accuracy: {np.mean(scores):.4f}")
    print(f"Standard Deviation: {np.std(scores):.4f}")
    return scores

# === Step 6: Full Combined Pipeline ===

def full_combined_pipeline(X_train, y_train):
    """
    Run full tuning and evaluation for Random Forest and XGBoost with tqdm progress.
    Load model from disk if it already exists.
    """
    steps = [
        ("Random Forest", randomized_search_rf, "best_rf_model.joblib"),
        ("XGBoost", randomized_search_xgb, "best_xgboost_model.joblib")
    ]

    best_models = {}

    with tqdm(total=len(steps), desc="Combined Model Tuning Pipeline", ncols=100) as pbar:
        for model_name, tuning_function, filename in steps:
            print(f"\n--- {model_name} Tuning and Evaluation ---")
            model = load_or_train_model(filename, tuning_function, X_train, y_train)
            print(f"\nPerforming Final 10-Fold Cross-Validation for {model_name}:")
            scores = final_cross_validation(model, X_train, y_train)
            best_models[model_name] = (model, scores)
            pbar.update(1)

    return best_models

# === Step 7: Run Pipeline ===

best_models_dict = full_combined_pipeline(X_train, y_train_encoded)


# Evaluating model

In [44]:
def evaluate_model(model, X_test, y_test, model_name):

    y_pred = model.predict(X_test)

    # Evaluation
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n=== {model_name} ===")
    print(f"Accuracy for testing data: {accuracy:.4f}")
    return y_pred

def evaluate_model_testing(model, X_test, model_name):

    y_pred = model.predict(X_test)

    print(f"\n=== {model_name} ===")
    print("Output Evaluated")
    return y_pred

In [45]:
# Evaluate my model
y_pred_my_model = evaluate_model(my_model, X_test_scaled, y_test_label_encoded, model_name="My Random Model")
# Convert back to labels
y_pred_my_model = label_encoder.inverse_transform(y_pred_my_model)

# Evaluate Logistic Regression
y_pred_lr_model = evaluate_model(lr_model, X_test_scaled, y_test, model_name="Logistic Regression")


# Evaluate Logistic Regression Best estimator
y_pred_best_lr_model = evaluate_model(best_lr_model, X_test_scaled, y_test, model_name="Logistic Regression Best")



## To Do ##
# Evaluate all models



=== My Random Model ===
Accuracy for testing data: 0.1314

=== Logistic Regression ===
Accuracy for testing data: 0.0000

=== Logistic Regression Best ===
Accuracy for testing data: 0.9781


## Evaluate the provided data for the competition

In [46]:
# Evaluate my model
y_pred_my_model = evaluate_model_testing(my_model, X_evaluation, model_name="My Random Model")
# Convert back to labels
y_pred_my_model_evaluation = label_encoder.inverse_transform(y_pred_my_model)


# Evaluate Logistic Regression
y_pred_lr_model_evaluation = evaluate_model_testing(lr_model, X_scaled_evaluation, model_name="Logistic Regression")


# Evaluate Logistic Regression Best Model
y_pred_lr_best_model_evaluation = evaluate_model_testing(best_lr_model, X_scaled_evaluation, model_name="Logistic Regression Best")

## To Do ##
# Evaluate all the models with the competition data



=== My Random Model ===
Output Evaluated


ValueError: X has 34 features, but LogisticRegression is expecting 4922 features as input.

In [None]:
# Prediction of my model on testing data
print("\n=== My Model ===")
print(y_pred_my_model_evaluation[:5])

# Prediction of my LR on testing data
print("\n=== Logistic Regression ===")
print(y_pred_lr_model_evaluation[:5])

# Prediction of my LR on testing data
print("\n=== Logistic Regression Best ===")
print(y_pred_lr_best_model_evaluation[:5])

## To Do ##
# Print the first 5 examples for all model output for the competition data
# Make sure that the output is converted back into a list of string for the Protien class

# Creat csv file for Evaluation

In [None]:
# Function to creat csv output for uploading
def save_predictions(_fn, _y_pred, _df):
    import csv
    with open(_fn, 'w') as fout:
        writer = csv.writer(fout, delimiter=',', lineterminator='\n')
        writer.writerow(['Entry', 'ProteinClass'])
        for y, Entry in zip(_df['Entry'], _y_pred):
            writer.writerow([y, Entry])

In [None]:
# Saving My random model output
save_predictions('Student_name_attempt_1.csv', y_pred_my_model_evaluation, Dataset_evaluation )
# Saving LR model output
save_predictions('Student_name_attempt_2.csv', y_pred_lr_model_evaluation, Dataset_evaluation )
# Saving Best LR model output
save_predictions('Student_name_attempt_3.csv', y_pred_lr_best_model_evaluation, Dataset_evaluation )

## To Do ##
# Save the output of your models

In [None]:
## To Do ##
# Make sure to upload at least 3 attemps to the website
# Good Luck