In [None]:
#Imports

import os, glob, gzip, re
import pandas as pd, numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from scipy import sparse
import math

!pip install -q biopython

# Smoke-test the freshly installed package right away: a failed import here
# usually means the kernel has to be restarted before Bio becomes importable.
try:
    from Bio import SeqIO
except Exception as import_error:
    print('Import failed — you may need to restart the kernel. Error:', import_error)
else:
    print('Biopython installed and import OK:', SeqIO.__name__)

In [None]:
#File Detection

INPUT_ROOT = '/kaggle/input'
FASTA_SUFFIXES = ('.fasta', '.fa', '.fasta.gz', '.fa.gz')
TABLE_SUFFIXES = ('.tsv', '.csv')
TERMS_KEYWORDS = ('term', 'label', 'annotation')

# glob returns entries in arbitrary order; sorting makes the "first match wins"
# selection below reproducible from one run to the next.
all_files = sorted(glob.glob(os.path.join(INPUT_ROOT, '**', '*'), recursive=True))

fasta_train = None
fasta_test = None
terms_tsv = None

# Pass 1: keyword-based detection over every path under INPUT_ROOT.
# The keywords are matched against the whole lower-cased path, not just the file name.
for p in all_files:
    low = p.lower()
    if low.endswith(FASTA_SUFFIXES):
        if 'train' in low and fasta_train is None:
            fasta_train = p
        elif 'test' in low and fasta_test is None:
            fasta_test = p
        elif fasta_train is None:
            # A FASTA file with no train/test hint is taken as the training set.
            fasta_train = p
    if low.endswith(TABLE_SUFFIXES) and terms_tsv is None:
        if any(keyword in low for keyword in TERMS_KEYWORDS):
            terms_tsv = p

# Pass 2: fall back to the known competition folder. Both the flat layout and the
# Train/ + Test/ sub-folder layout (the one the later cells read from) are tried.
comp_folder = os.path.join(INPUT_ROOT, 'cafa-6-protein-function-prediction')


def _first_existing(*relative_paths):
    """Return the first of `relative_paths` that exists under comp_folder, else None."""
    for rel in relative_paths:
        candidate = os.path.join(comp_folder, rel)
        if os.path.exists(candidate):
            return candidate
    return None


if os.path.exists(comp_folder):
    if fasta_train is None:
        fasta_train = _first_existing(
            'train_sequences.fasta',
            os.path.join('Train', 'train_sequences.fasta'),
        )
    if fasta_test is None:
        fasta_test = _first_existing(
            'test_sequences.fasta',
            os.path.join('Test', 'testsuperset.fasta'),
        )
    if terms_tsv is None:
        terms_tsv = _first_existing(
            'train_terms.tsv',
            os.path.join('Train', 'train_terms.tsv'),
        )

if not fasta_train or not fasta_test or not terms_tsv:
    print('Auto-detection failed. Found:')
    print(' fasta_train =', fasta_train)
    print(' fasta_test  =', fasta_test)
    print(' terms_tsv   =', terms_tsv)
    raise SystemExit('Attach the CAFA-6 dataset: Add data -> cafa-6-protein-function-prediction')

print('Using files:')
print(' train fasta:', fasta_train)
print(' test  fasta:', fasta_test)
print(' terms tsv :', terms_tsv)

In [None]:
#Defining Helpers

def read_fasta_ids(path, limit=None):
    """Return the record ids of a FASTA file, in file order.

    A path ending in ``.gz`` is opened with gzip, anything else as plain text.
    When ``limit`` is truthy, reading stops as soon as that many ids have been
    collected; ``None`` (or 0) reads the whole file.
    """
    opener = gzip.open if str(path).endswith('.gz') else open
    collected = []
    with opener(path, 'rt') as stream:
        for record in SeqIO.parse(stream, 'fasta'):
            collected.append(record.id)
            if not limit:
                continue
            if len(collected) >= limit:
                break
    return collected

def read_fasta_dict(path):
    """Load a FASTA file into a ``{record id: sequence string}`` dict.

    A path ending in ``.gz`` is opened with gzip, anything else as plain text.
    If an id occurs more than once, the last record wins.
    """
    if str(path).endswith('.gz'):
        opener = gzip.open
    else:
        opener = open
    with opener(path, 'rt') as stream:
        return {record.id: str(record.seq) for record in SeqIO.parse(stream, 'fasta')}

In [None]:
# Previewing and debugging: peek at the first 20 record ids of each FASTA file

train_sample_ids = read_fasta_ids(fasta_train, limit=20)
test_sample_ids = read_fasta_ids(fasta_test, limit=20)
for caption, sample_ids in (
    ('\nExample train FASTA ids (20):', train_sample_ids),
    ('Example test  FASTA ids (20):', test_sample_ids),
):
    print(caption, sample_ids[:20])

In [None]:
#read and detect terms_tsv format

# Show the first 12 raw lines (1-based numbering) so the column layout can be eyeballed.
open_fn = gzip.open if str(terms_tsv).endswith('.gz') else open
print('\n--- first 12 raw lines of terms_tsv ---')
with open_fn(terms_tsv, 'rt') as f:
    for line_no, line in enumerate(f, start=1):
        print(line_no, line.strip())
        if line_no >= 12:
            break

In [None]:
#trying with more rows 

# Read the whole terms file as untyped strings; header=None means columns are numbered 0..n-1.
df = pd.read_csv(
    terms_tsv,
    sep='\t',
    header=None,
    dtype=str,
    engine='python',
)
print('\nRead shape (terms file):', df.shape)
print('First rows:', df.head(6), sep='\n')

In [None]:
# Heuristic: decide which of the first two columns holds GO term ids ("GO:" followed by
# seven digits) and build terms_df with the columns (protein_id, term_id).
# In a raw string the digit class is written with a single backslash; the doubled
# form matched a literal backslash, so both counts were always 0.
# NOTE(review): df was read with header=None, so if the file starts with a header row
# that row is still present here as a data row — confirm whether it should be dropped.
GO_ID_PATTERN = r'^GO:\d{7}'

col0_is_go = df[0].astype(str).str.match(GO_ID_PATTERN).sum() if 0 in df.columns else 0
col1_is_go = df[1].astype(str).str.match(GO_ID_PATTERN).sum() if 1 in df.columns else 0
print(f'col0 GO-like count = {col0_is_go}, col1 GO-like count = {col1_is_go}')

# On a tie (including "no GO ids anywhere") the file is assumed to be (protein_id, term_id).
if col0_is_go > col1_is_go:
    print('Detected first column contains GO terms. Interpreting file as (term_id, protein_id) -> swapping.')
    terms_df = df.rename(columns={0:'term_id', 1:'protein_id'})[['protein_id','term_id']]
else:
    print('Detected first column contains protein ids. Using (protein_id, term_id).')
    terms_df = df.rename(columns={0:'protein_id', 1:'term_id'})[['protein_id','term_id']]

print('\nterms_df sample:')
print(terms_df.head(6))

In [None]:
#Normalization 

def norm_variants(s):
    """Return candidate normalised spellings of an identifier, most literal first.

    The candidates, in order, are: the string itself; for pipe-delimited ids every
    non-empty field, then the last field and the second field; the first
    whitespace-separated token; the part before the first '.'; the string with
    everything except letters, digits, '_' and '-' removed; and every pipe- or
    whitespace-separated token that is 4 to 12 characters long.

    Args:
        s: identifier; ``None`` is treated as the empty string, anything else is
           converted with ``str``.

    Returns:
        List of unique, non-empty candidates in first-seen order. Empty input
        yields an empty list (it used to raise ``IndexError``).
    """
    s = '' if s is None else str(s)
    out = [s]
    if '|' in s:
        parts = s.split('|')
        out.extend(p for p in parts if p)
        out.append(parts[-1])
        out.append(parts[1])  # a string containing '|' always splits into >= 2 fields
    # Guard: an empty or whitespace-only string has no first token.
    tokens = s.split()
    if tokens:
        out.append(tokens[0])
    if '.' in s:
        out.append(s.split('.')[0])
    out.append(re.sub(r'[^A-Za-z0-9_\-]', '', s))
    for t in re.split(r'[\|\s]', s):
        t = t.strip()
        if 4 <= len(t) <= 12:
            out.append(t)
    # dict.fromkeys keeps first-seen order while dropping duplicates; empties are skipped.
    return list(dict.fromkeys(v for v in out if v))

# Index: every normalised spelling of a label id -> the set of label ids producing it.
label_ids = pd.unique(terms_df['protein_id'])
norm2labels = {}
for label in label_ids:
    for variant in norm_variants(label):
        if variant not in norm2labels:
            norm2labels[variant] = set()
        norm2labels[variant].add(label)

In [None]:
# How many of the previewed train ids have at least one spelling known to norm2labels?
sample_hits = sum(
    1
    for tid in train_sample_ids
    if any(c in norm2labels for c in norm_variants(tid))
)
print('\nSample match count (train sample):', sample_hits, '/', len(train_sample_ids))


In [None]:
#Map train (fasta ids -> label ids)

train_all_ids = read_fasta_ids(fasta_train, limit=None)
mapped_train_to_label = {}
unmatched = []
for tid in train_all_ids:
    # Candidates are tried from most literal to most normalised; the first hit wins.
    for cand in norm_variants(tid):
        candidates = norm2labels.get(cand)
        if candidates:
            # Several label ids can share one spelling. Prefer an exact id match and
            # otherwise take the smallest id: picking list(set)[0] depends on string
            # hashing, which is randomised per process, so it was not reproducible.
            mapped_train_to_label[tid] = tid if tid in candidates else min(candidates, key=str)
            break
    else:
        # No candidate spelling of this FASTA id is known to the label table.
        unmatched.append(tid)

print('Total train sequences:', len(train_all_ids))
print('Mapped train->label count:', len(mapped_train_to_label))
print('Unmatched train sequences:', len(unmatched))
print('Example unmatched (10):', unmatched[:10])

if len(mapped_train_to_label) == 0:
    raise SystemExit('No mapping between fasta ids and label ids found automatically. Paste the first raw lines above here for further rule craft.')


In [None]:
#mapping terms_df

# Invert the FASTA-id -> label-id mapping: label id -> set of FASTA ids mapped onto it.
label_to_fasta = {}
for fasta_id, label_id in mapped_train_to_label.items():
    if label_id not in label_to_fasta:
        label_to_fasta[label_id] = set()
    label_to_fasta[label_id].add(fasta_id)

def map_label_row(lid):
    """Translate a label-table protein id into a training FASTA id.

    The id is looked up in ``label_to_fasta`` directly first, then through each of
    its ``norm_variants`` spellings. When several FASTA ids are registered for the
    matching key, the smallest one (by string comparison) is returned, so the result
    is the same on every run; taking ``list(set)[0]`` depended on per-process string
    hash randomisation.

    Returns:
        The FASTA id, or ``None`` when no spelling of ``lid`` is known.
    """
    if lid in label_to_fasta:
        return min(label_to_fasta[lid], key=str)
    for nv in norm_variants(lid):
        if nv in label_to_fasta:
            return min(label_to_fasta[nv], key=str)
    return None

# Attach the FASTA id to every label row, then collect each protein's terms into a list.
terms_df['mapped_fasta_id'] = terms_df['protein_id'].apply(map_label_row)
has_mapping = terms_df['mapped_fasta_id'].notnull()
mapped_count = int(has_mapping.sum())
print('Mapped label rows to fasta ids:', mapped_count, '/', len(terms_df))

mapped_df = terms_df[has_mapping].copy()
grouped = (
    mapped_df
    .groupby('mapped_fasta_id')['term_id']
    .apply(list)
    .reset_index()
    .rename(columns={'mapped_fasta_id': 'protein_id'})
)
print('Grouped unique proteins with labels (after mapping):', len(grouped))

In [None]:
#loading seq dict

# Load both sequence sets, keep only labelled proteins that have a training sequence,
# and attach that sequence as a new column.
train_seqs = read_fasta_dict(fasta_train)
test_seqs = read_fasta_dict(fasta_test)

has_sequence = grouped['protein_id'].isin(train_seqs.keys())
grouped = grouped[has_sequence].reset_index(drop=True)
print('Grouped after filtering to available sequences:', len(grouped))
if not len(grouped):
    raise SystemExit('No labelled proteins remain after mapping and filtering. Inspect unmatched examples above.')

grouped = grouped.assign(sequence=grouped['protein_id'].map(train_seqs))

In [None]:
#k-mers

K = 3
def kmers(seq, k=K):
    """Split a sequence into its overlapping length-``k`` substrings.

    Anything that is not a non-empty string yields ``['']``; a sequence shorter
    than ``k`` is returned whole as a single-element list.
    """
    if not (isinstance(seq, str) and len(seq) > 0):
        return ['']
    n_windows = len(seq) - k + 1
    if n_windows <= 0:
        return [seq]
    return [seq[start:start + k] for start in range(n_windows)]

# One "document" per protein: its k-mers joined by single spaces.
train_texts = [' '.join(kmers(s)) for s in grouped['sequence']]

# Tokens must be the whitespace-separated k-mers. In a raw string the whitespace class
# is written with one backslash; the doubled form excluded backslash and the letter
# 's' instead, so tokens were cut at 's' and spanned spaces.
vectorizer = HashingVectorizer(n_features=2**15, alternate_sign=False, token_pattern=r'\S+')  # 32k features
X_full = vectorizer.transform(train_texts)
# Always start from a fresh binarizer so the result never depends on an `mlb`
# left behind in the kernel by another cell.
mlb = MultiLabelBinarizer(sparse_output=True)
Y_sparse = mlb.fit_transform(grouped['term_id'])
print('X_full shape:', X_full.shape, 'Y shape:', Y_sparse.shape)

In [None]:
# training and saving artifact
import numpy as np
import pandas as pd
import gc
import pickle
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

# Root folder of the competition data; the cells below read Train/ files from it.
BASE_PATH = '/kaggle/input/cafa-6-protein-function-prediction'
# Folder the fitted vectorizer, label binarizer and feature/label matrices are saved to.
MODEL_DIR = '/kaggle/working' 
# Character n-gram sizes passed to the TF-IDF vectorizer as ngram_range.
NGRAM_RANGE = (3, 4)
# Passed to the TF-IDF vectorizer as max_features.
MAX_FEATURES = 20000

print("--- BẮT ĐẦU: TRAINING ---")

print("1. Đang tải dữ liệu Train...")
# Fast FASTA reader
def read_fasta(path):
    """Parse a plain-text FASTA file into ``{entry id: sequence}``.

    The entry id is the second '|'-separated field of the header line when the
    header contains a pipe, otherwise the first whitespace-separated token after
    the leading '>'. Sequence lines are stripped and concatenated.
    """
    sequences = {}
    entry_id, chunks = "", []
    with open(path, 'r') as handle:
        for raw in handle:
            text = raw.strip()
            if not text.startswith(">"):
                chunks.append(text)
                continue
            # New header: store the record collected so far, then start the next one.
            if entry_id:
                sequences[entry_id] = "".join(chunks)
            fields = text.split('|')
            if len(fields) > 1:
                entry_id = fields[1]
            else:
                entry_id = text[1:].split()[0]
            chunks = []
    # The last record has no following header to trigger its storage.
    if entry_id:
        sequences[entry_id] = "".join(chunks)
    return sequences

train_seqs = read_fasta(f"{BASE_PATH}/Train/train_sequences.fasta")
df_terms = pd.read_csv(f"{BASE_PATH}/Train/train_terms.tsv", sep='\t')

print("   -> Map nhãn vào sequence...")
# One list of terms per EntryID, turned into a plain dict for fast lookup.
grouped = df_terms.groupby('EntryID')['term'].apply(list).reset_index()
grouped_map = dict(zip(grouped['EntryID'], grouped['term']))

# Sequences and labels are kept as parallel lists in FASTA order.
train_ids = list(train_seqs)
train_sentences = list(train_seqs.values())
train_labels = [grouped_map.get(pid, []) for pid in train_ids]

# Drop proteins without any term; the same mask is applied to both lists.
mask = [bool(labels) for labels in train_labels]
train_sentences = [seq for seq, keep in zip(train_sentences, mask) if keep]
train_labels = [labels for labels, keep in zip(train_labels, mask) if keep]

print(f"   -> Số lượng mẫu Train hợp lệ: {len(train_sentences)}")
# Release the large intermediates before vectorising.
del train_seqs, df_terms, grouped
gc.collect()

print("2. Training Vectorizer (TF-IDF)...")
tfidf_options = dict(
    analyzer='char',
    ngram_range=NGRAM_RANGE,
    max_features=MAX_FEATURES,
    dtype=np.float32,
    sublinear_tf=True,
)
vectorizer = TfidfVectorizer(**tfidf_options)
X_train = vectorizer.fit_transform(train_sentences)

print("3. Binarizing Labels...")
mlb = MultiLabelBinarizer(sparse_output=True)
Y_train = mlb.fit_transform(train_labels)

print(f"4. Lưu model xuống {MODEL_DIR}...")

# Fitted objects are pickled; the sparse matrices use scipy's .npz format.
for artifact_name, artifact in (('vectorizer', vectorizer), ('mlb', mlb)):
    with open(f'{MODEL_DIR}/{artifact_name}.pkl', 'wb') as f:
        pickle.dump(artifact, f)

for matrix_name, matrix in (('X_train', X_train), ('Y_train', Y_train)):
    sparse.save_npz(f'{MODEL_DIR}/{matrix_name}.npz', matrix)

print("✅ ĐÃ TRAIN VÀ LƯU XONG! BẠN CÓ THỂ RESET KERNEL ĐỂ CHẠY PHẦN 2.")

In [None]:
#prediction
import numpy as np
import pandas as pd
import pickle
import gc
import os
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity

# Root folder of the competition data (Train/ and Test/ files are read from it).
BASE_PATH = '/kaggle/input/cafa-6-protein-function-prediction'
# Folder the pickled vectorizer/binarizer and the .npz matrices are loaded from.
MODEL_DIR = '/kaggle/working'
# Output file; it is deleted if present and then appended to batch by batch.
SUBMISSION_FILE = 'submission.tsv'
# Number of test proteins vectorised and scored per loop iteration.
BATCH_SIZE = 1000
# Number of most-similar training proteins kept for each test protein.
TOP_K = 25             
# Multiplier applied to the similarity of neighbours whose TaxID equals the test protein's.
TAXON_BONUS = 1.3       

print("--- BẮT ĐẦU: BIO-ENHANCED PREDICTION ---")

print("1. Loading Models...")
# NOTE: pickle.load can execute arbitrary code — only load files written by the training cell.
loaded_artifacts = {}
for artifact_name in ('vectorizer', 'mlb'):
    with open(f'{MODEL_DIR}/{artifact_name}.pkl', 'rb') as f:
        loaded_artifacts[artifact_name] = pickle.load(f)
vectorizer = loaded_artifacts['vectorizer']
mlb = loaded_artifacts['mlb']

X_train = sparse.load_npz(f'{MODEL_DIR}/X_train.npz')
Y_train = sparse.load_npz(f'{MODEL_DIR}/Y_train.npz')
all_terms = mlb.classes_
print(f"   -> Model loaded. X_train shape: {X_train.shape}")

print("2. Loading Taxonomy & Lengths (Train)...")
# Two-column, header-less table: entry id and its TaxID.
train_tax_df = pd.read_csv(
    f"{BASE_PATH}/Train/train_taxonomy.tsv",
    sep='\t',
    header=None,
    names=['ID', 'TaxID'],
)
train_tax_map = dict(zip(train_tax_df['ID'], train_tax_df['TaxID']))

def load_train_metadata():
    """Build per-protein TaxID and sequence-length arrays for the training set.

    Re-reads the training FASTA with the same header rule as the training cell
    (second '|' field, else first token after '>'), keeps only entries that occur
    in train_terms.tsv, and preserves FASTA order.

    Returns:
        (tax_arr, len_arr): int32 TaxIDs (0 when the id is missing from
        ``train_tax_map``) and float32 sequence lengths, index-aligned.
    """
    # Sequence length per entry id.
    lengths = {}
    entry_id, chunks = "", []
    with open(f"{BASE_PATH}/Train/train_sequences.fasta", 'r') as handle:
        for raw in handle:
            text = raw.strip()
            if not text.startswith(">"):
                chunks.append(text)
                continue
            if entry_id:
                lengths[entry_id] = len("".join(chunks))
            fields = text.split('|')
            entry_id = fields[1] if len(fields) > 1 else text[1:].split()[0]
            chunks = []
        if entry_id:
            lengths[entry_id] = len("".join(chunks))

    # Only entries that have at least one term row are kept.
    df_terms = pd.read_csv(f"{BASE_PATH}/Train/train_terms.tsv", sep='\t')
    labelled_ids = set(df_terms['EntryID'].unique())
    kept_ids = [pid for pid in lengths if pid in labelled_ids]

    tax_arr = np.array([train_tax_map.get(pid, 0) for pid in kept_ids], dtype=np.int32)
    len_arr = np.array([lengths[pid] for pid in kept_ids], dtype=np.float32)
    return tax_arr, len_arr

train_tax_arr, train_len_arr = load_train_metadata()
print(f"   -> Metadata loaded. Count: {len(train_tax_arr)}")

# The metadata arrays are indexed with row numbers of X_train further down, so the
# two must have the same length; a mismatch is only reported, not raised.
rows_aligned = len(train_tax_arr) == X_train.shape[0]
if not rows_aligned:
    print(f"⚠️ CẢNH BÁO: Số lượng metadata ({len(train_tax_arr)}) khác số lượng vector ({X_train.shape[0]}).")
    print("   -> Điều này sẽ gây lỗi lệch hàng. Hãy đảm bảo Logic lọc ở Part 1 và Part 2 giống hệt nhau!")

print("3. Loading Test Data...")
def read_test_fasta(path):
    """Parse the test FASTA into a list of ``(id, sequence, taxon, length)`` tuples.

    The id is the first whitespace-separated token after '>'. The taxon is the
    second token when it consists of digits only, otherwise 0. Records are
    returned in file order.
    """
    records = []
    entry_id, taxon, chunks = "", 0, []
    with open(path, 'r') as handle:
        for raw in handle:
            text = raw.strip()
            if not text.startswith(">"):
                chunks.append(text)
                continue
            # New header: emit the record collected so far, then start the next one.
            if entry_id:
                sequence = "".join(chunks)
                records.append((entry_id, sequence, taxon, len(sequence)))
            fields = text[1:].split()
            entry_id = fields[0]
            has_numeric_taxon = len(fields) > 1 and fields[1].isdigit()
            taxon = int(fields[1]) if has_numeric_taxon else 0
            chunks = []
    # The last record has no following header to trigger its emission.
    if entry_id:
        sequence = "".join(chunks)
        records.append((entry_id, sequence, taxon, len(sequence)))
    return records

test_data = read_test_fasta(f"{BASE_PATH}/Test/testsuperset.fasta")
# Split the records into parallel columns for batch processing.
test_ids = [entry_id for entry_id, _seq, _tax, _len in test_data]
test_seqs = [seq for _id, seq, _tax, _len in test_data]
test_tax = np.array([tax for _id, _seq, tax, _len in test_data], dtype=np.int32)
test_lens = np.array([length for _id, _seq, _tax, length in test_data], dtype=np.float32)

del test_data
gc.collect()

print("4. Running Prediction with Bio-Heuristics...")
n_test = len(test_ids)
n_batches = int(np.ceil(n_test / BATCH_SIZE))

# Tunables of the voting step (previously inline literals; values unchanged).
MAX_TERMS_PER_PROTEIN = 50   # at most this many highest-scoring terms are considered per protein
MIN_TERM_SCORE = 0.01        # a term is written only when its score is above this
MIN_TOTAL_WEIGHT = 0.001     # a protein whose neighbour weights sum to this or less is skipped

# np.argpartition raises when asked for more elements than a row holds, so never
# request more neighbours than there are training rows.
n_neighbors = min(TOP_K, X_train.shape[0])

# Start from a clean file: every batch below appends to it.
if os.path.exists(SUBMISSION_FILE): os.remove(SUBMISSION_FILE)

for b in range(n_batches):
    start = b * BATCH_SIZE
    end = min(n_test, (b + 1) * BATCH_SIZE)

    b_ids = test_ids[start:end]
    b_seqs = test_seqs[start:end]
    b_tax = test_tax[start:end]
    b_lens = test_lens[start:end]

    # Similarity of every test protein in the batch to every training protein.
    X_test = vectorizer.transform(b_seqs)
    sim_matrix = cosine_similarity(X_test, X_train)

    batch_rows = []

    for i in range(len(b_ids)):
        # Top-K most similar training rows (unordered within the top K).
        row_sims = sim_matrix[i]
        best_indices = np.argpartition(row_sims, -n_neighbors)[-n_neighbors:]
        scores = row_sims[best_indices]  # index-array selection copies, so in-place edits are safe
        neighbor_indices = best_indices  # alias

        # Heuristic 1: taxonomy bonus — boost neighbours from the same taxon.
        current_tax = b_tax[i]
        if current_tax != 0:
            neighbor_taxs = train_tax_arr[neighbor_indices]
            # Mask of the neighbours whose TaxID matches the test protein's.
            tax_match = (neighbor_taxs == current_tax)
            scores[tax_match] *= TAXON_BONUS

        # Heuristic 2: length penalty — scale by shorter/longer sequence length.
        current_len = b_lens[i]
        neighbor_lens = train_len_arr[neighbor_indices]

        min_l = np.minimum(neighbor_lens, current_len)
        max_l = np.maximum(neighbor_lens, current_len)
        ratio = min_l / (max_l + 1e-5)  # epsilon avoids division by zero

        scores *= ratio

        sum_sim = np.sum(scores)
        if sum_sim > MIN_TOTAL_WEIGHT:
            # Weighted vote: each neighbour contributes its weight to every term it
            # carries; dividing by the total weight normalises the scores.
            neighbor_labels_mat = Y_train[neighbor_indices]
            w_scores = sparse.csr_matrix(scores)

            pred_scores = w_scores.dot(neighbor_labels_mat).toarray().flatten() / sum_sim

            # Highest-scoring terms, in ascending score order.
            top_term_indices = np.argsort(pred_scores)[-MAX_TERMS_PER_PROTEIN:]

            for idx in top_term_indices:
                s = pred_scores[idx]
                if s > MIN_TERM_SCORE:
                    batch_rows.append(f"{b_ids[i]}\t{all_terms[idx]}\t{s:.3f}")

    with open(SUBMISSION_FILE, 'a') as f:
        if batch_rows:
            f.write('\n'.join(batch_rows) + '\n')

    if (b+1) % 10 == 0:
        print(f"   Batch {b+1}/{n_batches} xong.")
        gc.collect()

print(f"✅ HOÀN TẤT! File kết quả: {SUBMISSION_FILE}")