# **GoID Fit & Predict: RandomForest**

In [None]:
!pip install biopython

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO
import os
import gc

# Process Train Data

In [None]:
# Path of the training FASTA file parsed by this cell.
file_path = '/kaggle/input/cafa-6-protein-function-prediction/Train/train_sequences.fasta'

if os.path.exists(file_path):
    # Parse the whole file up front; the resulting list is iterated again below.
    sequences = list(SeqIO.parse(file_path, 'fasta'))
    n_sequences = len(sequences)
    print(f"âœ… Total number of sequences loaded: {n_sequences}")

    # Residue count of every record, as a plain list and as a one-column frame.
    lengths = [len(rec.seq) for rec in sequences]
    df_lengths = pd.DataFrame(lengths, columns=['Length'])
else:
    # Nothing is loaded in this case; `sequences` is left undefined by this cell.
    print(f"Error: File not found at {file_path}")

# Build the per-protein table only when FASTA records are present and non-empty.
if 'sequences' in locals() and sequences:
    # 1. Data Extraction: one dict per FASTA record.
    data = []
    for record in sequences:
        # A three-field id is read as type|accession|name; any other shape is
        # kept whole as the accession, with 'N/A' placeholders for the rest.
        parts = record.id.split('|')
        if len(parts) == 3:
            accession_type, accession_id, protein_name = parts
        else:
            accession_type, accession_id, protein_name = 'N/A', record.id, 'N/A'

        data.append({
            'Full_EntryID': record.id,
            'Length': len(record.seq),
            'Accession_Type': accession_type,
            'Accession_ID': accession_id,
            'Protein_Name': protein_name,
            # Only the first 32 residues are stored, not the full sequence.
            # NOTE(review): the test-data cell keeps the first 320 residues;
            # confirm which prefix length the training features should use.
            'Sequence': str(record.seq)[:32]
        })

    # 2. DataFrame Creation: preview the table and persist it as fasta.csv.
    df = pd.DataFrame(data)
    display(df.head())
    df.to_csv('fasta.csv', index=False)

    # EntryID/Sequence lookup derived from the records parsed above.
    # NOTE(review): this value is replaced just below by the copy read from
    # the pre-built CSV, so it never reaches later cells - confirm intent.
    fasta1 = df[['Accession_ID', 'Sequence']].rename(columns={'Accession_ID': 'EntryID'})

    # Same two-column lookup, loaded from the pre-built dataset instead.
    fasta0 = pd.read_csv('/kaggle/input/fasta-protein-data-length-and-sequence/fasta.csv')
    fasta1 = fasta0[['Accession_ID', 'Sequence']].rename(columns={'Accession_ID': 'EntryID'})
    display(fasta1.head(3))
else:
    print("Error: The 'sequences' list is not found or is empty. Please ensure the FASTA file loading was successful.")

In [None]:
# Load the training annotations and report how many rows were read.
df0 = pd.read_csv('/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv', sep='\t')
print(len(df0))

# Keep only the protein id and its term, exposing the term column as 'GoID'.
df1 = df0[['EntryID', 'term']].rename(columns={'term': 'GoID'})

# Left join: every annotation row is kept; Sequence is NaN where fasta1 has
# no row for that EntryID.
df2 = df1.merge(fasta1, on='EntryID', how='left')
display(df2)

# Work on the first 40,000 annotation rows only (author's note: max=300000, n=540000).
# The explicit .copy() makes `df` an independent frame, so adding the 'value'
# column below no longer assigns into a slice of df2 (SettingWithCopyWarning).
df = df2[['EntryID', 'Sequence', 'GoID']].iloc[0:40000].copy()
df['value'] = 1

# One row per (EntryID, Sequence), one column per GoID; cells without an
# annotation are filled with 0.
df_pivot = df.pivot_table(index=['EntryID', 'Sequence'], columns='GoID', values='value', fill_value=0)
display(df_pivot)
gc.collect()

# Fit

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import jaccard_score, hamming_loss, classification_report

# 1. Define the k-mer function (k=3)
def get_kmers(sequence, k=3):
    """Return all overlapping windows of length ``k`` from ``sequence``.

    Windows are produced left to right with a step of one position. When the
    sequence is shorter than ``k`` the result is an empty list.
    """
    window_count = len(sequence) - k + 1
    kmers = []
    for start in range(window_count):
        kmers.append(sequence[start:start + k])
    return kmers

# 2. Labels (Y) and raw sequences are both read from df_pivot, so row i of Y
# belongs to the i-th sequence.
Y = df_pivot.values  # Target matrix: one column per GoID
sequences = df_pivot.index.get_level_values('Sequence').astype(str)

# 3. Tokenise: every sequence becomes one space-separated string of k-mers,
# the form CountVectorizer splits into words in the next step.
k = 3
X_kmers = [' '.join(kmers) for kmers in (get_kmers(seq, k=k) for seq in sequences)]

# The sequence index is not needed once the k-mer strings exist.
del sequences
gc.collect()

# 4. Count k-mer occurrences per sequence; each distinct k-mer is one feature.
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1))
X = vectorizer.fit_transform(X_kmers)

print(f"âœ… Feature Matrix (X) Shape: {X.shape}")
print(f"âœ… Target Matrix (Y) Shape: {Y.shape}")

In [None]:
# 5. Hold out 10% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

# The unsplit matrices are dropped as soon as the split exists.
# Note: because X and Y are deleted here, this cell cannot be re-run without
# first re-running the feature-building cell.
del X, Y
gc.collect()
print(f"Data Split: Train samples={X_train.shape[0]}, Test samples={X_test.shape[0]}")
print("-" * 50)

from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import jaccard_score, hamming_loss, classification_report
import numpy as np

# --- 6. Initialize and Train the Multi-Output Classifier ---
print("## 6. Model Training (Binary Relevance / MultiOutputClassifier)")

# Base estimator: a random forest of 40 trees, seeded for reproducibility,
# with n_jobs=-1 so tree building may use every available CPU core.
rf_estimator = RandomForestClassifier(n_estimators=40, random_state=42, n_jobs=-1)

# Binary-relevance wrapper: the base estimator is fitted once per column of Y.
multi_rf_model = MultiOutputClassifier(rf_estimator)

print("Starting model training... (Training a separate RF for each GO Term)")
multi_rf_model.fit(X_train, Y_train)
print("Training complete! ðŸŽ‰")
print("-" * 50)
gc.collect()

In [None]:
# --- 7. Prediction (Predicting Probabilities) ---
import numpy as np

print("## 7. Predicting Probabilities & Defining Y_pred_thresholded")

Y_proba_list = []

for idx, model in enumerate(multi_rf_model.estimators_):
    # Class probabilities for the current GO term; one column per entry of
    # model.classes_, in that order.
    proba_output = model.predict_proba(X_test)

    # Find the column that belongs to label 1 through classes_ instead of
    # assuming it sits at position 1. An estimator fitted on a single class
    # returns only one column: if that class is 1 its column (all 1.0) is the
    # right answer, and only when label 1 was never seen is P(class=1) zero.
    positive_cols = np.nonzero(model.classes_ == 1)[0]
    if positive_cols.size:
        Y_proba_list.append(proba_output[:, positive_cols[0]])
    else:
        Y_proba_list.append(np.zeros(proba_output.shape[0]))

    # Periodic collection to release the temporary probability arrays.
    if (idx + 1) % 1000 == 0:
        gc.collect()


# Combine the per-term vectors into one (n_samples, n_labels) array.
Y_proba = np.stack(Y_proba_list, axis=1)
print(f"Y_proba (Probabilities) successfully generated. Shape: {Y_proba.shape}")

# Binarise the probabilities at 0.5; the evaluation step reads Y_pred_thresholded.
Y_pred_thresholded = (Y_proba >= 0.5).astype(int)
print("Binary predictions (Y_pred_thresholded) created using Threshold = 0.5.")
print("-" * 50)


# --- 8. Evaluation ---
print("## 8. Evaluation Metrics (Using Threshold 0.5)")
from sklearn.metrics import jaccard_score, hamming_loss, classification_report

# Score the thresholded 0/1 predictions against the held-out labels.
jaccard = jaccard_score(Y_test, Y_pred_thresholded, average='samples')
hamming = hamming_loss(Y_test, Y_pred_thresholded)

# Summary lines, emitted in one call with a newline between them.
print(
    "Multi-Label Classification Results (Threshold 0.5):",
    f"-> Jaccard Index (Similarity): {jaccard:.4f}",
    f"-> Hamming Loss (Error Rate): {hamming:.4f}",
    sep="\n",
)

# Detailed per-class report for the first label column only (index 0).
print("\nClassification Report for the first GO Term:")
print(classification_report(Y_test[:, 0], Y_pred_thresholded[:, 0], zero_division=0))

---

# Process Test Data and Predict

In [None]:
# ========== DISK-OPTIMIZED: Sparse predictions + Compression ==========

import numpy as np
import pandas as pd
from Bio import SeqIO
import gc
import gzip

# Raw GO-term annotation table.
# NOTE(review): go_terms_raw is not read again in the cells visible here.
go_terms_raw = pd.read_csv('/kaggle/input/cafa-6-protein-function-prediction/Train/train_terms.tsv', sep='\t')

# GO ids are the column labels of the training pivot table, i.e. the label
# columns the model was fitted on.
unique_go_ids = df_pivot.columns.values
print(f"Total trained GO Terms: {len(unique_go_ids)}")
print(f"Number of trained models: {len(multi_rf_model.estimators_)}")

# One fitted estimator is expected per GO id; stop here if the counts differ.
assert len(unique_go_ids) == len(multi_rf_model.estimators_), "Mismatch between GO terms and models!"

# Parse every test record into memory.
tsequences = list(SeqIO.parse('/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset.fasta', 'fasta'))
print(f'n_test_sequences: {len(tsequences)}')

# Two parallel columns: the record id and the first 320 residues of its sequence.
data = {
    'EntryID': [rec.id for rec in tsequences],
    'Sequence': [str(rec.seq)[:320] for rec in tsequences]
}

df_TEST = pd.DataFrame(data)
print(f"Test DataFrame shape: {df_TEST.shape}")
print("-" * 50)

# ===== Apply preprocessing =====
print("Applying k-mer vectorization to test data...")

def get_kmers(sequence, k=3):
    """Split ``sequence`` into its overlapping substrings of length ``k``.

    Substrings are returned in order of their start position; a sequence
    shorter than ``k`` gives an empty list.
    """
    last_start = len(sequence) - k
    return [sequence[start:start + k] for start in range(last_start + 1)]

# Tokenise the test sequences with k=3, one space-separated k-mer string per protein.
k = 3
X_kmers_test = [
    ' '.join(kmers)
    for kmers in (get_kmers(seq, k=k) for seq in df_TEST['Sequence'].astype(str))
]

# transform (not fit_transform): the already-fitted vectorizer is reused, so
# the test matrix has the same feature columns as the training matrix.
X_TEST_vectorized = vectorizer.transform(X_kmers_test)

# The intermediate strings are no longer needed once the matrix exists.
del X_kmers_test
gc.collect()

print(f"âœ… Test Feature Matrix Shape: {X_TEST_vectorized.shape}")
print("-" * 50)

# ===== DISK-EFFICIENT: Process + write TSV, filtering low scores =====
print("Processing predictions and writing to TSV file...")

submission_file_path = 'submission.tsv'
batch_size = 500          # number of GO terms predicted per batch
n_go_terms = len(unique_go_ids)
score_threshold = 0.01    # only scores >= threshold are written (reduces file size)

# Loop-invariant lookups, hoisted out of the batch loop.
test_entry_ids = df_TEST['EntryID'].values
estimators = multi_rf_model.estimators_

with open(submission_file_path, 'w') as f:
    for batch_start in range(0, n_go_terms, batch_size):
        batch_end = min(batch_start + batch_size, n_go_terms)

        # Predict P(label == 1) for each GO term of this batch.
        batch_predictions = []
        for idx in range(batch_start, batch_end):
            model = estimators[idx]
            proba_output = model.predict_proba(X_TEST_vectorized)

            # Pick the column of label 1 through classes_ rather than by
            # position: a single-class estimator returns one column, which is
            # the positive one when the only class seen in training was 1.
            # Only when label 1 was never seen is the probability zero.
            positive_cols = np.nonzero(model.classes_ == 1)[0]
            if positive_cols.size:
                proba = proba_output[:, positive_cols[0]]
            else:
                proba = np.zeros(proba_output.shape[0])

            batch_predictions.append(proba)

        # Shape: (n_test_sequences, GO terms in this batch).
        Y_batch = np.stack(batch_predictions, axis=1)
        del batch_predictions

        # GO ids matching the columns of Y_batch.
        go_ids_batch = unique_go_ids[batch_start:batch_end]

        # Sparse output: select the (row, column) pairs that reach the
        # threshold in one vectorised pass. np.nonzero reports them in
        # row-major order, i.e. protein by protein and, within a protein,
        # GO term by GO term - the same order a nested loop would write.
        row_idx, col_idx = np.nonzero(Y_batch >= score_threshold)
        f.writelines(
            f"{entry_id}\t{go_id}\t{score:.6f}\n"
            for entry_id, go_id, score in zip(
                test_entry_ids[row_idx],
                go_ids_batch[col_idx],
                Y_batch[row_idx, col_idx],
            )
        )

        del Y_batch
        gc.collect()

        print(f"  Processed GO Terms {batch_end} / {n_go_terms}")

print(f"âœ… Submission file saved: {submission_file_path}")
print(f"   Predictions written with score >= {score_threshold}")
print(f"   File format: Tab-separated values (EntryID, GO_ID, Score)")