# CAFA6 — Combined ProtBERT Embedding + Multi-label Classifier
**Author:** Wasiq Ali

**Goal:** Combine ideas from multiple CAFA6 notebooks (ProtBERT embeddings + classifiers) into a single unified approach.


In [None]:
!pip install Bio

## 1. import library

In [None]:
# 1. Setup
import os, gc, json, glob
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from Bio import SeqIO
from sklearn.model_selection import KFold
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import roc_auc_score
import numpy as np
import lightgbm as lgb
import gc, time
import torch
from transformers import AutoTokenizer, AutoModel

# Prefer the GPU whenever PyTorch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print('Device:', DEVICE)

## 2. load the dataset

In [None]:
import os

# Step 1: root folder of the Kaggle competition dataset
base_path = "/kaggle/input/cafa-6-protein-function-prediction"

# Step 2: walk the directory tree and print the full path of every file
for folder, _, names in os.walk(base_path):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)


In [None]:
import os
import pandas as pd
from Bio import SeqIO

# 1. Define paths
INPUT_DIR = '/kaggle/input/cafa-6-protein-function-prediction'
TRAIN_PATH = os.path.join(INPUT_DIR, 'Train/train_terms.tsv')
TEST_PATH = os.path.join(INPUT_DIR, 'Test/testsuperset.fasta')
TEST_PATHS = os.path.join(INPUT_DIR, 'Test/testsuperset-taxon-list.tsv')
SAMPLE_PATH = os.path.join(INPUT_DIR, 'sample_submission.tsv')

# 2. Load the training GO-term annotations (.tsv)
train_df = pd.read_csv(TRAIN_PATH, sep='\t')
print("‚úÖ Train data loaded:", train_df.shape)

# 3. Load the test-superset taxon list (.tsv).
#    The label and the shape now describe test_df; previously this line
#    re-printed the training frame under a "Train data loaded" label.
test_df = pd.read_csv(TEST_PATHS, sep='\t')
print("‚úÖ Test taxon list loaded:", test_df.shape)

# 4. Load test sequences (.fasta)
test_sequences = list(SeqIO.parse(TEST_PATH, "fasta"))
print("‚úÖ Test sequences loaded:", len(test_sequences))

# 5. Read sample_submission.tsv by hand, keeping only lines that split into
#    exactly three tab-separated fields (anything else is skipped).
sample_data = []
with open(SAMPLE_PATH, 'r') as file:
    for line in file:
        parts = line.strip().split('\t')
        if len(parts) == 3:
            sample_data.append(parts)

# Convert to DataFrame
sample_sub = pd.DataFrame(sample_data, columns=['ProteinID', 'GO_term', 'score'])
print("‚úÖ Sample submission loaded:", sample_sub.shape)

# Preview
print(sample_sub.head())


## 3. explore the dataset

In [None]:
# Show the first five annotation rows as a quick sanity check.
train_df.head(n=5)

In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Step 1: Assume train_df is already loaded and looks like:
# EntryID     term        aspect
# Q5W0B1      GO:0000785  C
# Q5W0B1      GO:0004842  F
# Q5W0B1      GO:0051865  P

# Step 2: collapse the per-term rows into one row per protein, storing that
# protein's GO terms as a single semicolon-separated string in 'labels'.
new_df = (
    train_df.groupby('EntryID')['term']
    .apply(';'.join)
    .reset_index(name='labels')
)

# ‚úÖ Step 3: Use your original label parsing + binarization code
def parse_labels(x):
    """Split a semicolon-separated label string into a list of clean terms.

    Missing values (anything ``pd.isna`` reports as NA) yield an empty list.
    Every other value is converted to ``str``, split on ``';'``, and each
    piece is stripped of surrounding whitespace; empty pieces are dropped.
    """
    if pd.isna(x):
        return []
    stripped = (piece.strip() for piece in str(x).split(';'))
    return [piece for piece in stripped if piece]

# Turn every semicolon-separated label string back into a Python list.
new_df['labels_list'] = new_df['labels'].map(parse_labels)

# One indicator column per distinct GO term, one row per protein.
mlb = MultiLabelBinarizer()
Y = mlb.fit_transform(new_df['labels_list'])

# Step 4: output confirmation
print('‚úÖ Num proteins:', len(new_df))
print('‚úÖ Num classes:', len(mlb.classes_))
print('‚úÖ Y shape:', Y.shape)
print(new_df.head())


In [None]:
# Step 1: open the training FASTA file
fasta_path = "/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta"
seq_records = SeqIO.parse(fasta_path, "fasta")

# Step 2: one dict per record (record id + sequence as plain text), then a DataFrame
seq_data = [
    {"EntryID": record.id, "sequence": str(record.seq)}
    for record in seq_records
]
seq_df = pd.DataFrame(seq_data)

# Step 3: attach the sequence to every annotation row; the left join keeps
# annotation rows even when no matching sequence exists.
train_df = train_df.merge(seq_df, on="EntryID", how="left")

# (Optional) report how many annotation rows ended up without a sequence
missing = train_df[train_df['sequence'].isna()]
print("Missing sequences:", len(missing))


## 4. Sequence

In [None]:
def simple_features(sequences):
    """Compute five simple composition features for each sequence.

    For every sequence the feature row is
    ``[length, frac('A'), frac('G'), frac('C'), frac('T')]`` where ``frac``
    is the letter count divided by the sequence length.

    Args:
        sequences: iterable of sequence strings.

    Returns:
        A float ``numpy`` array of shape ``(n_sequences, 5)``. Empty or
        non-string entries produce an all-zero row instead of raising
        ``ZeroDivisionError``, matching the guarded variant defined later in
        this notebook. An empty input yields an array of shape ``(0, 5)``.
    """
    features = []
    for seq in sequences:
        # Guard against division by zero (empty string) and non-text values.
        if not isinstance(seq, str) or not seq:
            features.append([0, 0, 0, 0, 0])
            continue
        length = len(seq)
        features.append(
            [length] + [seq.count(letter) / length for letter in 'AGCT']
        )
    # reshape keeps the column count at 5 even when no sequence was given.
    return np.array(features, dtype=float).reshape(-1, 5)

# Featurise the sequence attached to every row of train_df.
# NOTE(review): astype(str) appears to turn missing sequences into the
# literal text "nan" before featurisation — confirm this is intended.
train_emb = simple_features(list(train_df['sequence'].astype(str)))


## 5. merging

In [None]:
import os
import pandas as pd
from Bio import SeqIO
import numpy as np

# 1. Locations of the competition files, built from the dataset root
INPUT_DIR = '/kaggle/input/cafa-6-protein-function-prediction'
_TRAIN_DIR = os.path.join(INPUT_DIR, 'Train')
_TEST_DIR = os.path.join(INPUT_DIR, 'Test')
TRAIN_TERMS_PATH = os.path.join(_TRAIN_DIR, 'train_terms.tsv')
TRAIN_FASTA_PATH = os.path.join(_TRAIN_DIR, 'train_sequences.fasta')
TEST_FASTA_PATH = os.path.join(_TEST_DIR, 'testsuperset.fasta')
SAMPLE_SUB_PATH = os.path.join(INPUT_DIR, 'sample_submission.tsv')

# 2. Training annotations (tab-separated, header taken from the file)
train_df = pd.read_csv(TRAIN_TERMS_PATH, sep='\t')
print("‚úÖ Train data loaded:", train_df.shape)
print("Columns:", train_df.columns.tolist())

# 3. Sample submission, read line by line so that only lines splitting into
#    exactly three tab-separated fields are kept.
with open(SAMPLE_SUB_PATH, 'r') as f:
    split_lines = (line.strip().split('\t') for line in f)
    sample_data = [parts for parts in split_lines if len(parts) == 3]

sample_sub = pd.DataFrame(sample_data, columns=['ID', 'GO_term', 'score'])
print("‚úÖ Sample submission loaded:", sample_sub.shape)
print("Sample sub columns:", sample_sub.columns.tolist())

# 4. Load FASTA sequences into DataFrame
def load_fasta_to_df(fasta_path):
    """Read a FASTA file into a two-column DataFrame.

    Args:
        fasta_path: path of the FASTA file handed to ``SeqIO.parse``.

    Returns:
        DataFrame with columns ``id`` (the record id) and ``sequence`` (the
        record's sequence converted to ``str``), in file order.
    """
    pairs = [(rec.id, str(rec.seq)) for rec in SeqIO.parse(fasta_path, "fasta")]
    return pd.DataFrame({
        'id': [rec_id for rec_id, _ in pairs],
        'sequence': [seq for _, seq in pairs],
    })

train_seqs_df = load_fasta_to_df(TRAIN_FASTA_PATH)
test_seqs_df = load_fasta_to_df(TEST_FASTA_PATH)
print("‚úÖ Train sequences DF:", train_seqs_df.shape)
print("‚úÖ Test sequences DF:", test_seqs_df.shape)

# 5. Attach sequences to the training annotations. The key column is renamed
#    to 'id' so that it matches the FASTA frame; left join keeps every row.
train_df = (
    train_df.rename(columns={'EntryID': 'id'})
    .merge(train_seqs_df, on='id', how='left')
)
print("üîé After merge train_df:", train_df.shape)
print("Missing sequences in train:", train_df['sequence'].isna().sum())

# 6. Attach sequences to the sample-submission rows in the same way.
test_df = (
    sample_sub.rename(columns={'ID': 'id'})
    .merge(test_seqs_df, on='id', how='left')
)
print("üîé After merge test_df:", test_df.shape)
print("Missing sequences in test:", test_df['sequence'].isna().sum())

# 7. Feature extraction function
def simple_features(sequences):
    """Build ``[length, frac A, frac G, frac C, frac T]`` for each sequence.

    Entries that are not strings, or that are empty strings, contribute an
    all-zero row. The rows are returned stacked in a ``numpy`` array.
    """
    rows = []
    for seq in sequences:
        if isinstance(seq, str) and len(seq) != 0:
            n = len(seq)
            rows.append([n] + [seq.count(letter) / n for letter in 'AGCT'])
        else:
            rows.append([0, 0, 0, 0, 0])
    return np.array(rows)

# 8. Build features for train & test
# 8. Build features for train & test.
# Missing sequences (NaN left behind by the left joins) are replaced with ''
# before the string cast. A plain astype(str) would turn them into the text
# "nan", which would then be featurised as a 3-letter sequence instead of
# hitting simple_features' empty-sequence guard.
train_sequences = train_df['sequence'].fillna('').astype(str).tolist()
test_sequences_for_feats = test_df['sequence'].fillna('').astype(str).tolist()
train_emb = simple_features(train_sequences)
test_emb = simple_features(test_sequences_for_feats)
print("‚úÖ train_feats shape:", train_emb.shape)
print("‚úÖ test_feats shape:", test_emb.shape)

# 9. (Optional) Save features
np.save('train_feats.npy', train_emb)
np.save('test_feats.npy', test_emb)


In [None]:
# List the columns of each frame to identify the correct ID field.
for caption, frame in (
    ("train_df columns:", train_df),
    ("test_df columns:", test_df),
    ("train_seqs_df columns:", train_seqs_df),
    ("test_seqs_df columns:", test_seqs_df),
):
    print(caption, frame.columns.tolist())


In [None]:
from Bio import SeqIO

fasta_path = "/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta"

# Record ids in the order they appear in the FASTA file.
all_seq_ids = []
for rec in SeqIO.parse(fasta_path, "fasta"):
    all_seq_ids.append(rec.id)
print("üîπ Total IDs in fasta:", len(all_seq_ids))

# NOTE(review): `grouped` is created by a later cell of this notebook, so
# that cell has to be executed before this one.
terms_ids = set(grouped['seq_id'])
print("üîπ Total IDs in train_terms.tsv:", len(terms_ids))

# Ids present on both sides.
common_ids = terms_ids & set(all_seq_ids)
print("‚úÖ Common IDs between fasta & terms:", len(common_ids))


In [None]:
# Normalize both fasta and terms IDs (strip prefixes)
def clean_id(x):
    """Return the text after the last ``'|'`` in *x*, stripped and upper-cased.

    When *x* contains no ``'|'`` the whole string is used.
    """
    tail = x.rsplit('|', 1)[-1]
    return tail.strip().upper()

# Normalise both id collections with clean_id.
fasta_clean = list(map(clean_id, all_seq_ids))
terms_clean = list(map(clean_id, grouped['seq_id']))

# Map each cleaned FASTA id to its position (a later duplicate overwrites an
# earlier one).
id_to_idx = dict(zip(fasta_clean, range(len(fasta_clean))))

# Positions of the labelled ids that also occur in the FASTA file.
label_indices = [
    pos for pos in map(id_to_idx.get, terms_clean) if pos is not None
]
print("‚úÖ Found matching embeddings for", len(label_indices), "labeled sequences")

# Keep only the selected rows of train_red.
train_red = train_red[label_indices]
print("Filtered train_red shape:", train_red.shape)


In [None]:
# Show both shapes, then insist that features and labels have equal row counts.
for caption, matrix in (("train_red:", train_red), ("Y:", Y)):
    print(caption, matrix.shape)
assert Y.shape[0] == train_red.shape[0], "‚ùå Still mismatched!"


## 6. Reduce dimension


In [None]:
# 6. Reduce dimension: fit a 5-component truncated SVD on the training
# features and project the test features with the same fitted transform.
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(random_state=42, n_components=5)
train_red, test_red = svd.fit_transform(train_emb), svd.transform(test_emb)
print(*(matrix.shape for matrix in (train_red, test_red)))

In [None]:
from sklearn.decomposition import PCA
import numpy as np

print("Applying PCA compression to reduce embedding size...")

# Convert dtype
train_red = train_red.astype(np.float32)
test_red = test_red.astype(np.float32)

# Determine a valid number of components: at most 256, never more than the
# number of samples, and half the feature count. The max(1, ...) floor keeps
# at least one component; with a single feature `n_features // 2` is 0, which
# would otherwise produce zero-width matrices.
n_samples, n_features = train_red.shape
n_components = max(1, min(256, n_samples, n_features // 2))
print(f"Using PCA n_components={n_components} (auto-adjusted)")

# Fit on the training matrix only and reuse the fitted projection for test.
pca = PCA(n_components=n_components, random_state=42)
train_red = pca.fit_transform(train_red)
test_red = pca.transform(test_red)

print("‚úÖ PCA compression complete:", train_red.shape, test_red.shape)


In [None]:
# Report the current dimensions of the reduced training matrix.
print("train_red shape:", np.shape(train_red))


In [None]:
# Report the dimensions of the reduced test matrix (the label previously said
# "train_red" although the value printed is test_red's shape).
print("test_red shape:", test_red.shape)


In [None]:
# Cell A: Load / detect embeddings and inspect shapes
import os, sys, numpy as np, pandas as pd, gc
from pathlib import Path

# Try to use existing variables if present; otherwise load .npy files if available
def get_var(name, globals_):
    """Return ``globals_[name]`` when the key exists, otherwise ``None``."""
    return globals_[name] if name in globals_ else None

# Look the matrices up in the notebook's own namespace first.
g = globals()

train_red = get_var('train_red', g)
test_red  = get_var('test_red', g)

# Fall back to .npy files when either matrix is not defined in the session.
if train_red is None or test_red is None:
    # change these paths if your files are elsewhere
    train_path = '/kaggle/input/embeddings/train_embeddings.npy'
    test_path  = '/kaggle/input/embeddings/test_embeddings.npy'
    if not (os.path.exists(train_path) and os.path.exists(test_path)):
        raise FileNotFoundError("train_red/test_red not found in session and default .npy paths missing. "
                                "Either create variables train_red/test_red earlier or upload .npy embeddings to /kaggle/input/embeddings/")
    print("Loading embeddings from .npy files...")
    train_red = np.load(train_path)
    test_red  = np.load(test_path)

for caption, matrix in (("train_red shape:", train_red), ("test_red  shape:", test_red)):
    print(caption, matrix.shape, " dtype:", matrix.dtype)

# Cast to float32 to reduce memory; copy=False avoids a copy when the dtype
# already matches.
train_red = train_red.astype('float32', copy=False)
test_red  = test_red.astype('float32', copy=False)

gc.collect()


In [None]:
# Cell B: Create Y and mlb from train_terms.tsv (only run if you do not already have Y)
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

terms_path = '/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv'
seq_path   = '/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta'

# train_terms.tsv is read elsewhere in this notebook with its own header row
# and is accessed through the 'EntryID' and 'term' columns. Reading it with
# header=None and only two names would treat the header line as data and
# shift the columns (the protein id would become the index), so the columns
# are selected by name and the id column is renamed to 'seq_id'.
df_terms = pd.read_csv(terms_path, sep='\t', usecols=['EntryID', 'term'])
df_terms = df_terms.rename(columns={'EntryID': 'seq_id'})

# One row per protein id with the list of its GO terms.
grouped = df_terms.groupby('seq_id')['term'].apply(list).reset_index()
print("Unique train sequences with terms:", len(grouped))

# Dense indicator matrix: rows follow grouped's order, columns follow
# mlb.classes_.
mlb = MultiLabelBinarizer(sparse_output=False)
Y = mlb.fit_transform(grouped['term'].values)  # shape: (n_samples, n_classes)

print("Y shape:", Y.shape, "Num classes:", len(mlb.classes_))
# NOTE: train_red rows must follow the same order as grouped['seq_id'];
# if they do not, align them before training.


In [None]:
# Cell C: Adaptive PCA compression (safe)
from sklearn.decomposition import PCA
import numpy as np, gc

print("Running adaptive PCA...")

n_samples, n_features = train_red.shape
# Aim for 256 components, bounded by what the data allows.
target_components = 256
n_components = min(target_components, n_samples - 1, n_features)
# Fallback when the bound above is not positive (e.g. a single sample).
n_components = n_components if n_components > 0 else min(1, n_features)
print(f"Using PCA n_components = {n_components} (samples={n_samples}, features={n_features})")

if n_components >= n_features:
    # Nothing would be reduced, so the matrices are left as they are.
    print("Skipping PCA because n_components >= n_features")
else:
    pca = PCA(n_components=n_components, svd_solver='randomized', random_state=42)
    train_red = pca.fit_transform(train_red)
    test_red  = pca.transform(test_red)
    print("PCA done. Shapes:", train_red.shape, test_red.shape)

# Keep both matrices in float32 (no copy when they already are).
train_red = train_red.astype('float32', copy=False)
test_red  = test_red.astype('float32', copy=False)
gc.collect()


In [None]:
# # --- ALIGN EMBEDDINGS TO LABELLED SEQUENCES ---

# # 1. Load the sequence IDs in the same order as your embeddings were generated.
# #    (Replace this with the correct path to your fasta file.)
# from Bio import SeqIO

# fasta_path = "/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta"
# all_seq_ids = [rec.id for rec in SeqIO.parse(fasta_path, "fasta")]
# print("Total embeddings/sequences:", len(all_seq_ids))

# # 2. Create a mapping from seq_id ‚Üí row index in train_red
# id_to_idx = {sid: i for i, sid in enumerate(all_seq_ids)}

# # 3. Select only the embeddings whose seq_id appears in grouped['seq_id']
# label_indices = [id_to_idx[sid] for sid in grouped['seq_id'] if sid in id_to_idx]
# train_red = train_red[label_indices]

# print("Filtered train_red shape (aligned):", train_red.shape)


In [None]:
# --- REPLACEMENT for Cell 27: ALIGN EMBEDDINGS WITH LABELLED SEQUENCES (robust) ---
from Bio import SeqIO
import re, numpy as np, gc

fasta_path = "/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta"

def extract_uniprot_id(header):
    """
    Pull a UniProt-style identifier out of a FASTA header.

    Two shapes are recognised:
      >sp|Q9V3I5|PROT_DROME description...   -> text between the first pipe pair
      >Q9V3I5 description...                 -> first whitespace-separated token

    In the second case a leading '>' is removed and, if the token still
    contains '|', only the part after the last '|' is returned.
    """
    piped = re.search(r"\|([A-Z0-9]+)\|", header)
    if piped is not None:
        return piped.group(1)
    token = header.split()[0]
    return token.lstrip('>').rpartition('|')[2]

# Parse FASTA IDs (in the exact order they appear in the FASTA)
all_seq_ids = [extract_uniprot_id(rec.id) for rec in SeqIO.parse(fasta_path, "fasta")]
print("Total FASTA IDs (parsed):", len(all_seq_ids))

# `grouped` must already exist (it is built from train_terms.tsv in the cell
# that also creates Y). The original NameError is chained for easier debugging.
try:
    terms_ids = grouped['seq_id'].astype(str).str.strip().tolist()
except NameError as err:
    raise NameError("Variable `grouped` not found. Run the cell that creates `grouped`/`Y` from train_terms.tsv first (Cell 25).") from err

print("Total labelled IDs (from train_terms):", len(terms_ids))

# Build mapping and find matches
id_to_idx = {sid: i for i, sid in enumerate(all_seq_ids)}

# Boolean mask over the labelled ids: True where the id also exists in the
# FASTA file. It is used below to drop the same rows from Y.
matched_mask = np.array([sid in id_to_idx for sid in terms_ids], dtype=bool)
label_indices = [id_to_idx[sid] for sid in terms_ids if sid in id_to_idx]
print("Matched labelled embeddings:", len(label_indices), " / ", len(terms_ids))

if not label_indices:
    # helpful debug output
    print("Example fasta ids:", all_seq_ids[:5])
    print("Example label ids:", terms_ids[:5])
    raise ValueError("No matches found between train_terms IDs and FASTA IDs. Check formats; may need custom parsing.")

# Subset embeddings to only labelled rows (order follows grouped['seq_id'])
# NOTE(review): this indexes train_red by FASTA position, so it presumes
# train_red has one row per FASTA record in file order — confirm upstream.
train_red = train_red[label_indices]
print("Filtered train_red shape:", train_red.shape)
gc.collect()

# Keep Y in step with the filtered embeddings: when some labelled ids had no
# FASTA entry their rows are removed from Y as well, otherwise the row counts
# could never agree.
if 'Y' in globals():
    if Y.shape[0] == len(matched_mask) and not matched_mask.all():
        print("Dropping", int((~matched_mask).sum()), "label rows without a FASTA match")
        Y = Y[matched_mask]
    print("Y shape:", Y.shape)
    if train_red.shape[0] != Y.shape[0]:
        raise ValueError("train_red and Y row counts still mismatched!")
else:
    print("Y shape:", None)
    print("Warning: Y not found in namespace; ensure you built Y earlier.")


## 7. LogisticRegression baseline


In [None]:
# Cell D: Memory-safe LogisticRegression baseline (REPLACE 3rd-last cell)
import numpy as np, gc, time
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.multiclass import OneVsRestClassifier

# If train_red has extremely low dims (e.g., shape[1] <= 2), try to load richer embeddings if available
# Uncomment and edit paths if you have original embeddings stored as .npy
# richer_train_path = '/kaggle/input/embeddings/orig_train_embeddings.npy'
# richer_test_path  = '/kaggle/input/embeddings/orig_test_embeddings.npy'
# if train_red.shape[1] <= 2 and os.path.exists(richer_train_path):
#     print("Switching to richer embeddings for LR baseline.")
#     train_lr = np.load(richer_train_path).astype('float32')
#     test_lr  = np.load(richer_test_path).astype('float32')
# else:
train_lr = train_red
test_lr  = test_red

print("train_lr shape:", train_lr.shape, "test_lr shape:", test_lr.shape)

# Setup: 3-fold shuffled cross-validation with a fixed seed
n_folds = 3
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Y must already exist (Cell B). Only the top_k most frequent labels are
# trained to keep memory small.
label_freq = Y.sum(axis=0)
top_k = min(50, Y.shape[1])   # change to 20 or 10 if still heavy
top_label_idx = np.argsort(-label_freq)[:top_k]

# Column j of oof/preds belongs to label top_label_idx[j].
oof = np.zeros((len(train_lr), top_k), dtype=np.float32)
preds = np.zeros((len(test_lr), top_k), dtype=np.float32)

start = time.time()
auc_scores = []
for idx_pos, label_i in enumerate(top_label_idx):
    y_label = Y[:, label_i]
    print(f"\nTraining label {idx_pos+1}/{top_k}  (global label index {label_i})")
    fold_scores = []
    for fold, (tr, val) in enumerate(kf.split(train_lr)):
        X_tr, X_val = train_lr[tr], train_lr[val]
        y_tr, y_val = y_label[tr], y_label[val]

        # lightweight solver, smaller max_iter
        clf = LogisticRegression(max_iter=500, solver='liblinear')
        clf.fit(X_tr, y_tr)

        # Out-of-fold probability of the positive class; the test prediction
        # is averaged over the folds.
        oof[val, idx_pos] = clf.predict_proba(X_val)[:, 1]
        preds[:, idx_pos] += clf.predict_proba(test_lr)[:, 1] / n_folds

        # Evaluate the fold. roc_auc_score raises ValueError when the fold
        # holds a single class; report that instead of hiding every error.
        try:
            fold_scores.append(roc_auc_score(y_val, oof[val, idx_pos]))
        except ValueError as err:
            print(f"  Fold {fold}: AUC not computed ({err})")

        # cleanup memory
        del X_tr, X_val, y_tr, y_val, clf
        gc.collect()

    if fold_scores:
        mean_auc = np.mean(fold_scores)
        print(f"Label {idx_pos+1} mean AUC: {mean_auc:.4f}")
        auc_scores.append(mean_auc)

elapsed = time.time() - start
print(f"\nFinished LR baseline for top {top_k} labels. Time: {elapsed/60:.2f} min")
if auc_scores:
    print("Mean AUC across trained labels:", np.mean(auc_scores))


In [None]:
# --- REPLACEMENT for Cell 30: Memory-safe LogisticRegression baseline ---
import numpy as np, gc, time
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

# The baseline works directly on the reduced matrices.
train_lr = train_red
test_lr  = test_red

print("train_lr shape:", train_lr.shape, " test_lr shape:", test_lr.shape)
n_folds = 3
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Train only the top_k most frequent labels; column j of oof/preds belongs to
# label top_label_idx[j].
label_freq = Y.sum(axis=0)
top_k = min(50, Y.shape[1])   # reduce to 20 if kernel still dies
top_label_idx = np.argsort(-label_freq)[:top_k]

oof = np.zeros((len(train_lr), top_k), dtype=np.float32)
preds = np.zeros((len(test_lr), top_k), dtype=np.float32)

start = time.time()
auc_scores = []
for idx_pos, label_i in enumerate(top_label_idx):
    y_label = Y[:, label_i]
    print(f"\nTraining label {idx_pos+1}/{top_k}  (global label index {label_i})")
    fold_scores = []
    for fold, (tr, val) in enumerate(kf.split(train_lr)):
        X_tr, X_val = train_lr[tr], train_lr[val]
        y_tr, y_val = y_label[tr], y_label[val]

        clf = LogisticRegression(max_iter=500, solver='liblinear')
        clf.fit(X_tr, y_tr)

        # Positive-class probabilities: out-of-fold for validation rows,
        # fold-averaged for the test rows.
        oof[val, idx_pos] = clf.predict_proba(X_val)[:, 1]
        preds[:, idx_pos] += clf.predict_proba(test_lr)[:, 1] / n_folds

        # AUC is undefined (ValueError) when a fold contains one class only;
        # say so explicitly rather than swallowing every exception.
        try:
            fold_auc = roc_auc_score(y_val, oof[val, idx_pos])
        except ValueError as err:
            print(f"  Fold {fold}: AUC not computed ({err})")
        else:
            fold_scores.append(fold_auc)

        del X_tr, X_val, y_tr, y_val, clf
        gc.collect()

    if fold_scores:
        mean_auc = np.mean(fold_scores)
        print(f"Label {idx_pos+1} mean AUC: {mean_auc:.4f}")
        auc_scores.append(mean_auc)

elapsed = time.time() - start
print(f"\nFinished LR baseline for top {top_k} labels. Time: {elapsed/60:.2f} min")
if auc_scores:
    print("Mean AUC across trained labels:", np.mean(auc_scores))


In [None]:
# # 7. LogisticRegression baseline
# kf = KFold(n_splits=5, shuffle=True, random_state=42)
# oof = np.zeros((len(train_red), Y.shape[1]))
# preds = np.zeros((len(test_red), Y.shape[1]))

# # Generate splits based on the number of samples in the label matrix Y
# for fold, (tr, val) in enumerate(kf.split(Y)):  # Changed from kf.split(train_red) to kf.split(Y)
#     print('Fold', fold)
#     X_tr, X_val = train_red[tr], train_red[val]
#     y_tr, y_val = Y[tr], Y[val]
#     model_lr = OneVsRestClassifier(LogisticRegression(max_iter=1000))
#     model_lr.fit(X_tr, y_tr)
#     oof[val] = model_lr.predict_proba(X_val)
#     preds += model_lr.predict_proba(test_red) / kf.n_splits

# auc_scores = []
# for i in range(Y.shape[1]):
#     try: auc_scores.append(roc_auc_score(Y[:,i], oof[:,i]))
#     except: pass
# print('Mean AUC:', np.mean(auc_scores))

## 8. LightGBM

In [None]:
# 8. LightGBM stacking (Optimized Memory-Safe Version)
import numpy as np
import lightgbm as lgb
import gc, time

# Restrict LightGBM to the most frequent classes.
# `preds` (built by the LogisticRegression cell) has one column per entry of
# `top_label_idx`, ordered by decreasing label frequency. The classes are
# therefore taken from `top_label_idx` and addressed by their column position
# j; indexing `preds` with the global class index would point at the wrong
# column or fall outside the matrix. (train_df has no 'labels_list' column,
# so the class ranking cannot be derived from it.)
N_LGB_CLASSES = 10
n_lgb = min(N_LGB_CLASSES, len(top_label_idx), preds.shape[1])
top_idx = [int(ci) for ci in top_label_idx[:n_lgb]]
top_classes = [mlb.classes_[ci] for ci in top_idx]
class_counts = np.asarray(Y.sum(axis=0)).ravel()

# Parameters
NUM_BOOST_ROUND = 50
EARLY_STOP = 10

params = {
    'objective': 'binary',
    'metric': 'auc',
    'verbosity': -1,
    'num_threads': 2,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'force_col_wise': True
}

# Single train/validation split instead of all folds -> lower memory.
# The split does not depend on the class, so it is taken once.
tr, val = next(iter(kf.split(train_red)))

start_time = time.time()
for j, ci in enumerate(top_idx):
    y = Y[:, ci]
    print(f"\nüß† Training class {j+1}/{len(top_idx)} (index {ci})")

    try:
        X_tr = train_red[tr].astype(np.float32)
        X_val = train_red[val].astype(np.float32)
        y_tr = y[tr]
        y_val = y[val]

        dtrain = lgb.Dataset(X_tr, label=y_tr, free_raw_data=True)
        dval = lgb.Dataset(X_val, label=y_val, reference=dtrain, free_raw_data=True)

        # Early stopping and silent evaluation are passed as callbacks; the
        # `early_stopping_rounds` / `verbose_eval` keyword arguments are not
        # accepted by lgb.train in LightGBM 4.
        bst = lgb.train(
            params,
            dtrain,
            num_boost_round=NUM_BOOST_ROUND,
            valid_sets=[dval],
            callbacks=[
                lgb.early_stopping(EARLY_STOP, verbose=False),
                lgb.log_evaluation(period=0),
            ],
        )

        lgb_pred = bst.predict(test_red, num_iteration=bst.best_iteration)
        # Blend with the score already stored in this column by averaging;
        # adding the two probabilities could push the score above 1.
        preds[:, j] = 0.5 * (preds[:, j] + lgb_pred)

        # Memory cleanup
        del X_tr, X_val, y_tr, y_val, dtrain, dval, bst, lgb_pred
        gc.collect()

    except Exception as e:
        # Best effort by design: a failing class is reported and skipped so
        # the remaining classes are still trained.
        print(f"‚ö†Ô∏è Skipped class {ci} due to error: {e}")
        gc.collect()
        continue

elapsed = time.time() - start_time
print(f"\n‚úÖ LightGBM stacking done in {elapsed/60:.2f} minutes (safe mode)")


In [None]:
# NOTE: running this cell is likely to fail because the kernel dies.
# # 8. LightGBM stacking on top 50 classes
# class_counts = train_df['labels_list'].explode().value_counts()
# top_classes = class_counts.head(50).index.tolist()
# top_idx = [np.where(mlb.classes_ == c)[0][0] for c in top_classes]

# for j, ci in enumerate(top_idx):
#     y = Y[:, ci]
#     for fold, (tr, val) in enumerate(kf.split(train_red)):
#         X_tr, X_val = train_red[tr], train_red[val]
#         y_tr, y_val = y[tr], y[val]
#         dtrain = lgb.Dataset(X_tr, label=y_tr)
#         dval = lgb.Dataset(X_val, label=y_val)
#         params = {'objective': 'binary', 'metric': 'auc', 'verbosity': -1}
#         bst = lgb.train(params, dtrain, valid_sets=[dtrain, dval], num_boost_round=200, early_stopping_rounds=20, verbose_eval=False)
#         preds[:, ci] += bst.predict(test_red, num_iteration=bst.best_iteration) / kf.n_splits
# print('LightGBM stacking done')

## 9. submission

In [None]:
# 9. Create submission
# sample_sub only has the columns ID / GO_term / score, so looking for GO-term
# names among its columns never matched and no prediction was ever written.
# The submission is built in long format instead: one row per
# (protein id, GO term, score).
# Row r of `preds` belongs to row r of `test_df` (the features were computed
# from test_df['sequence']); column j belongs to class top_label_idx[j].
n_rows, n_cols = preds.shape
if len(test_df) != n_rows:
    raise ValueError(
        f"preds has {n_rows} rows but test_df has {len(test_df)}; "
        "cannot attach protein IDs to the predictions."
    )

pred_terms = np.asarray(mlb.classes_)[np.asarray(top_label_idx)[:n_cols]]
sub = pd.DataFrame({
    # reshape(-1) walks preds row by row, so ids are repeated per column and
    # the term names are tiled per row.
    'ID': np.repeat(test_df['id'].to_numpy(), n_cols),
    'GO_term': np.tile(pred_terms, n_rows),
    'score': np.clip(preds, 0.0, 1.0).reshape(-1),
})

# test_df may list the same protein on several rows; keep one row per
# (protein, term) pair with its highest score.
sub = sub.groupby(['ID', 'GO_term'], as_index=False, sort=False)['score'].max()
sub['score'] = sub['score'].round(3)
sub = sub[sub['score'] > 0]

# NOTE(review): written tab-separated without a header row, which is presumed
# to be the expected submission layout — confirm against the competition's
# evaluation page.
sub.to_csv('submission.tsv', sep='\t', header=False, index=False)
print('submission.tsv saved')