In [None]:
# `Bio.SeqIO` is provided by the `biopython` distribution; install it directly.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
%pip install -q biopython

In [None]:
import gc
import numpy as np
import pandas as pd

from Bio import SeqIO
from collections import *
from sklearn.neighbors import NearestNeighbors

In [None]:
# Load the training annotations; columns are named p (protein id), t (term)
# and o (third column, not used below).
# NOTE(review): `names=` is given without `header=`, so if the TSV ships with a
# header line it is read as an ordinary data row. Its "term" value would occur
# once and be dropped by the frequency filter below — confirm against the file.
train = pd.read_csv("/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv",
                   sep='\t', names=['p','t','o'], dtype={'p': 'category', 't': 'category'})

# Keep only terms that are annotated at least MIN_TERM_FREQ times.
term_freq = Counter(train['t'])
MIN_TERM_FREQ = 10
filtered_terms = {t for t, cnt in term_freq.items() if cnt >= MIN_TERM_FREQ}

# protein id -> set of retained terms.
# Iterating the two columns directly avoids DataFrame.iterrows(), which
# materialises a Series object for every row and is very slow on large frames.
prot2terms = defaultdict(set)
for pid, term in zip(train['p'], train['t']):
    if term in filtered_terms:
        prot2terms[pid].add(term)

# The raw frame is no longer needed; release it before loading sequences.
del train
gc.collect()

def load_fasta(f, max_seqs=None):
    """Parse a FASTA file into a ``{protein_id: sequence}`` dictionary.

    The protein id is the second ``|``-separated field of the record id when
    the id has at least two fields, otherwise the first (only) field.

    Args:
        f: Path or handle handed to ``SeqIO.parse``.
        max_seqs: Stop once this many records have been read. ``None`` or any
            other falsy value (including ``0``) means "no limit".

    Returns:
        dict mapping protein id to the sequence as a ``str``. A later record
        with the same id overwrites an earlier one.
    """
    id_to_seq = {}

    for n_read, record in enumerate(SeqIO.parse(f, 'fasta')):
        # n_read is the number of records already stored before this one.
        if max_seqs and n_read >= max_seqs:
            break

        fields = record.id.split('|')
        key = fields[0] if len(fields) < 2 else fields[1]
        id_to_seq[key] = str(record.seq)

    return id_to_seq

# Training sequences, restricted to proteins that kept at least one term.
train_fasta = {
    pid: seq
    for pid, seq in load_fasta(
        "/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta"
    ).items()
    if pid in prot2terms
}

# Test sequences are kept in full.
test_fasta = load_fasta("/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset.fasta")

In [None]:
def kmer_vocab(sequences, top_k=500, k_sizes=[3, 4]):
    """Return the ``top_k`` most frequent k-mers found in ``sequences``.

    All window sizes in ``k_sizes`` are counted into one shared tally, so the
    result can mix k-mers of different lengths.

    Args:
        sequences: Iterable of sequence strings.
        top_k: Maximum number of k-mers to return.
        k_sizes: Window lengths to count.

    Returns:
        List of k-mer strings ordered from most to least frequent.
    """
    counts = Counter()

    for sequence in sequences:
        for width in k_sizes:
            # Every overlapping window of this width; empty when the
            # sequence is shorter than the window.
            counts.update(
                sequence[start:start + width]
                for start in range(len(sequence) - width + 1)
            )

    ranked = counts.most_common(top_k)
    return [kmer for kmer, _ in ranked]


# Shared k-mer vocabulary built from the training sequences, plus a
# k-mer -> column index lookup used when vectorising sequences.
vocab = kmer_vocab(train_fasta.values(), top_k=500, k_sizes=[3, 4])
kmers = vocab
k2i = dict(zip(vocab, range(len(vocab))))

def _residue_fraction(seq, residues):
    """Fraction of characters in ``seq`` that occur in ``residues`` (``seq`` must be non-empty)."""
    return sum(a in residues for a in seq) / len(seq)


def vec(s):
    """Turn one sequence into an L2-normalised feature vector.

    Layout of the returned vector (length ``len(vocab) + 6``):

    * ``[:len(vocab)]`` — counts of the 3-mers and 4-mers of ``s`` that are in
      the module-level ``k2i`` lookup, divided by the total number of matched
      k-mers (left at zero when nothing matches).
    * ``[-6:]`` — six composition fractions: hydrophobic, charged, polar,
      share of residues in ``'ILFVWYMR'``, hydrophobic share of the first 15
      residues, hydrophobic share of the last 15 residues.

    The whole vector is then divided by its Euclidean norm when that norm is
    non-zero.

    Args:
        s: Sequence string.

    Returns:
        ``np.ndarray`` of dtype ``float32``; all zeros for an empty sequence.
    """
    n_kmer_features = len(vocab)
    v = np.zeros(n_kmer_features + 6, dtype=np.float32)

    if len(s) == 0:
        return v

    # Count vocabulary hits over every overlapping 3-mer and 4-mer window.
    total_kmers = 0
    for k_size in (3, 4):
        for i in range(len(s) - k_size + 1):
            idx = k2i.get(s[i:i + k_size])
            if idx is not None:
                v[idx] += 1
                total_kmers += 1

    if total_kmers > 0:
        v[:n_kmer_features] /= total_kmers

    hydrophobic_set = 'AILMFWYV'

    # Slicing already clamps to the sequence length, so short sequences use
    # the whole sequence for both termini.
    n_term = s[:15]
    c_term = s[-15:]

    v[-6:] = [
        _residue_fraction(s, hydrophobic_set),       # hydrophobic (A, I, L, M, F, W, Y, V)
        _residue_fraction(s, 'DEKRH'),               # charged (D, E, K, R, H)
        _residue_fraction(s, 'NQSTC'),               # polar (N, Q, S, T, C)
        # Share of residues in a fixed subset; presumably meant as a rough
        # proxy for heavier residues — not an actual molecular weight.
        _residue_fraction(s, 'ILFVWYMR'),
        _residue_fraction(n_term, hydrophobic_set),  # N-terminal hydrophobic share
        _residue_fraction(c_term, hydrophobic_set),  # C-terminal hydrophobic share
    ]

    # L2 normalization
    norm = np.linalg.norm(v)
    if norm > 0:
        v /= norm

    return v

def build_matrix_batch(sequences, batch_size=2000):
    """Vectorise every sequence of a dict into one ``float32`` matrix.

    Rows follow the iteration order of ``sequences.values()``; each row is the
    result of ``vec`` for that sequence.

    Args:
        sequences: Mapping whose values are sequence strings.
        batch_size: Step used to walk over the rows in chunks.

    Returns:
        ``np.ndarray`` of shape ``(len(sequences), len(vocab) + 6)``.
    """
    total = len(sequences)
    out = np.zeros((total, len(vocab) + 6), dtype=np.float32)
    ordered = list(sequences.values())

    for start in range(0, total, batch_size):
        stop = min(start + batch_size, total)
        for row in range(start, stop):
            out[row] = vec(ordered[row])

    return out

# Row -> protein id lookups; the matrices below are built from the same dicts,
# so row order and id order agree.
train_pids = list(train_fasta.keys())
test_pids = list(test_fasta.keys())

X_train = build_matrix_batch(train_fasta)
X_test = build_matrix_batch(test_fasta)

# The raw sequences are not used again after vectorisation.
del train_fasta, test_fasta
gc.collect()

neighbors = 25
SIMILARITY_THRESHOLD = 0.3

print(f"Fitting KNN with {neighbors} neighbors...")
knn = NearestNeighbors(
    n_neighbors=neighbors,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)
knn.fit(X_train)

# I holds, per test row, the indices of its neighbours in X_train / train_pids.
distances, I = knn.kneighbors(X_test)
similarities = 1 - distances  # cosine similarity

In [None]:
MAX_TERMS_PER_PROTEIN = 400

# protein id -> [(term, score rounded to 3 decimals), ...]
# Scores are collected as numbers directly instead of being formatted to text
# and parsed back before sorting.
res = defaultdict(list)
for i, pid in enumerate(test_pids):
    # term -> highest similarity among the neighbours that carry the term
    term_scores = {}

    for sim, train_idx in zip(similarities[i], I[i]):
        if sim < SIMILARITY_THRESHOLD:
            continue

        neighbor_terms = prot2terms.get(train_pids[train_idx], set())
        for term in neighbor_terms:
            term_scores[term] = max(term_scores.get(term, 0), sim)

    if not term_scores:
        # No neighbour passed the threshold: this protein gets no rows.
        continue

    max_score = max(term_scores.values())
    min_score = min(term_scores.values())

    if max_score > min_score:
        # Rescale this protein's scores linearly onto [0.1, 1.0].
        span = max_score - min_score
        for term, score in term_scores.items():
            normalized_score = 0.1 + 0.9 * (score - min_score) / span
            res[pid].append((term, round(float(normalized_score), 3)))
    else:
        # All terms share one similarity; give them a constant score.
        for term in term_scores:
            res[pid].append((term, 0.5))

final = []
for p, items in res.items():
    # Highest score first; ties are broken by term id so that the cut at
    # MAX_TERMS_PER_PROTEIN does not depend on set iteration order.
    items.sort(key=lambda x: (-x[1], x[0]))
    final.extend(f"{p}\t{t}\t{s:.3f}\n" for t, s in items[:MAX_TERMS_PER_PROTEIN])

with open('/kaggle/working/submission.tsv', 'w') as f:
    f.writelines(final)

In [None]:
# Read the file back from the exact path it was written to, so this cell does
# not depend on the kernel's current working directory.
submission = pd.read_csv("/kaggle/working/submission.tsv", sep='\t', names=['p','t','o'])
submission.head()

In [None]:
# Last rows of the written submission, as a quick sanity check.
submission.tail(5)

In [None]:
# Random spot check of up to 50 rows. The sample size is capped at the frame
# length (sampling more rows than exist raises), and the seed is fixed so a
# re-run shows the same rows.
submission.sample(min(50, len(submission)), random_state=42)