# In [None]:
# https://www.kaggle.com/competitions/cafa-6-protein-function-prediction/discussion/613138
import gzip

def _read_test_proteins(fasta_path):
    """Return the set of protein IDs found in the header lines of a FASTA file.

    For a header containing ``|`` the ID is the second ``|``-separated part;
    otherwise it is the first whitespace-separated token of the header.
    """
    proteins = set()
    with open(fasta_path, 'r', encoding='utf-8') as fasta:
        for line in fasta:
            if not line.startswith('>'):
                continue
            header = line[1:].strip()
            if not header:
                # A bare ">" carries no identifier; skipping it avoids an
                # IndexError from header.split()[0] below.
                continue
            parts = header.split('|')
            proteins.add(parts[1] if len(parts) >= 2 else header.split()[0])
    return proteins


def _collect_go_terms(gaf_path, test_proteins):
    """Scan a gzipped GAF file and map each test protein to its GO term set.

    Returns a ``(protein_go_map, line_count)`` tuple, where ``line_count`` is
    the total number of lines read (comments and skipped lines included).
    """
    progress_every = 100_000_000
    protein_go_map = {}
    line_count = 0

    # Stream the file line by line so it is never held in memory as a whole.
    with gzip.open(gaf_path, 'rt', encoding='utf-8') as gaf:
        for line_count, line in enumerate(gaf, start=1):
            if line_count % progress_every == 0:
                print(f"Processed {line_count:,} lines...")

            # Skip comment lines
            if line.startswith('!'):
                continue

            # Only the first five tab-separated fields are used, so bound the
            # split instead of breaking up every remaining column.
            fields = line.strip().split('\t', 5)
            if len(fields) < 5:
                continue

            protein_id, qualifier, go_id = fields[1], fields[3], fields[4]

            # Skip if not in test set or has NOT qualifier
            if protein_id not in test_proteins or 'NOT' in qualifier:
                continue

            protein_go_map.setdefault(protein_id, set()).add(go_id)

    return protein_go_map, line_count


def _write_submission(output_path, protein_go_map):
    """Write one ``protein_id<TAB>go_term<TAB>1.0`` row per collected GO term.

    Rows are ordered by protein ID, then by GO term, so repeated runs on the
    same inputs produce byte-identical files.
    """
    with open(output_path, 'w', encoding='utf-8') as out:
        for protein_id in sorted(protein_go_map):
            # Assign probability 1.0 to all predictions for baseline
            out.writelines(
                f"{protein_id}\t{go_term}\t1.0\n"
                for go_term in sorted(protein_go_map[protein_id])
            )


def create_submission_pipeline(
    test_fasta_path="/kaggle/input/cafa-6-protein-function-prediction/Test/testsuperset.fasta",
    gaf_path="/kaggle/input/gaf-gz/temp_gaf.gz",
    output_path="submission.tsv",
):
    """Build a baseline submission TSV from GO annotations in a GAF file.

    The pipeline reads the protein IDs from the test FASTA headers, scans the
    gzipped GAF file for rows whose protein ID is in that set and whose
    qualifier does not contain ``NOT``, and writes every resulting
    (protein, GO term) pair with probability ``1.0``. Progress and summary
    statistics are printed to stdout.

    Args:
        test_fasta_path: Path to the FASTA file listing the test proteins.
        gaf_path: Path to the gzip-compressed, tab-separated GAF file.
        output_path: Path of the TSV file to write (overwritten if present).

    Returns:
        None. The result is the file written to ``output_path``.

    Raises:
        OSError: If an input file cannot be read or the output cannot be
            written.
    """
    print("Step 1: Reading test proteins...")
    test_proteins = _read_test_proteins(test_fasta_path)

    print(f"Found {len(test_proteins)} test proteins")
    print("Sample protein IDs:", sorted(test_proteins)[:5])

    print("Step 2: Processing GAF file line by line...")
    protein_go_map, line_count = _collect_go_terms(gaf_path, test_proteins)
    print(f"Finished processing {line_count:,} lines")

    print("Step 3: Creating submission file in correct format...")
    _write_submission(output_path, protein_go_map)

    # Print stats
    total_predictions = sum(len(go_terms) for go_terms in protein_go_map.values())
    proteins_with_preds = len(protein_go_map)

    print("\nSubmission completed!")
    print(f"Total proteins: {len(test_proteins)}")
    print(f"Proteins with predictions: {proteins_with_preds}")
    print(f"Total predictions: {total_predictions}")
    print(f"Submission saved to: {output_path}")

# Run the pipeline only when this file is executed as a script, so that
# importing it as a module does not start the file processing.
if __name__ == "__main__":
    create_submission_pipeline()