In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from pathlib import Path

# Column order of the frame returned by parse_colabfold_results.
_RESULT_COLUMNS = ['job_name', 'nanobody', 'antigen',
                   'mean_plddt', 'max_ptm', 'max_iptm', 'binding_score']

# Keys that must all appear on a line for it to be treated as a metrics line.
_METRIC_KEYS = ('mean_plddt', 'max_ptm', 'max_iptm')

# A float literal, optionally wrapped in a NumPy scalar repr such as
# ``np.float64(0.85)``. Only the bare number is captured.
_NUMBER = (r"(?:np\.\w+\(\s*)?"
           r"([-+]?(?:(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?|nan|inf))")

# One pattern per metric so the keys may appear in any order on the line.
_METRIC_PATTERNS = {
    key: re.compile(r"['\"]" + key + r"['\"]\s*:\s*" + _NUMBER)
    for key in _METRIC_KEYS
}
_BINDING_SCORE_PATTERN = re.compile(r"Binding score:\s*" + _NUMBER)


def _find_number(pattern, line):
    """Return the number captured by *pattern* in *line* as a float, or None."""
    match = pattern.search(line)
    return float(match.group(1)) if match else None


def parse_colabfold_results(output_text):
    """Parse a ColabFold run log into one DataFrame row per scored job.

    The log is scanned line by line for three kinds of lines:

    * ``▶ Processing <job>...`` starts a new job and clears the metrics
      collected so far.
    * A line containing ``mean_plddt``, ``max_ptm`` and ``max_iptm`` supplies
      those three metrics. The keys may come in any order and the values may
      be plain numbers or NumPy scalar reprs like ``np.float64(0.85)``.
    * ``Binding score: <number>`` supplies the score and, if a job name and
      all three metrics are known, emits a row.

    The job name is split on ``_``: the first two parts form the nanobody
    name (e.g. ``nbGFP_6xzf``) and the third is the antigen (e.g. ``GFP``).

    Lines whose numbers cannot be read, and jobs whose name has fewer than
    three ``_``-separated parts, are skipped with a warning instead of
    aborting the whole parse.

    Args:
        output_text: Full text of the log.

    Returns:
        A DataFrame with the columns ``job_name``, ``nanobody``, ``antigen``,
        ``mean_plddt``, ``max_ptm``, ``max_iptm`` and ``binding_score``.
        The columns are present even when no job was found.
    """
    import warnings  # local import: the file's import block does not have it

    results = []
    current_job = None
    current_metrics = {}

    for raw_line in output_text.split('\n'):
        line = raw_line.strip()

        if '▶ Processing' in line:
            # New job: drop anything collected for the previous one.
            current_job = line.partition('▶ Processing')[2].split('...')[0].strip()
            current_metrics = {}

        elif all(key in line for key in _METRIC_KEYS):
            values = {key: _find_number(_METRIC_PATTERNS[key], line)
                      for key in _METRIC_KEYS}
            if None in values.values():
                warnings.warn(f"Skipping unreadable metrics line: {line!r}",
                              stacklevel=2)
                continue
            current_metrics.update(values)

        elif 'Binding score:' in line:
            score = _find_number(_BINDING_SCORE_PATTERN, line)
            if score is None:
                warnings.warn(f"Skipping unreadable binding score line: {line!r}",
                              stacklevel=2)
                continue
            current_metrics['binding_score'] = score

            # The binding score is the last piece; emit only complete records.
            if not current_job or len(current_metrics) != len(_METRIC_KEYS) + 1:
                continue

            parts = current_job.split('_')
            if len(parts) < 3:
                warnings.warn(
                    f"Skipping job {current_job!r}: expected a name like "
                    "'<nanobody>_<pdb>_<antigen>'",
                    stacklevel=2,
                )
                continue

            results.append({
                'job_name': current_job,
                'nanobody': parts[0] + '_' + parts[1],  # like nbGFP_6xzf
                'antigen': parts[2],  # like GFP
                'mean_plddt': current_metrics['mean_plddt'],
                'max_ptm': current_metrics['max_ptm'],
                'max_iptm': current_metrics['max_iptm'],
                'binding_score': current_metrics['binding_score'],
            })

    return pd.DataFrame(results, columns=_RESULT_COLUMNS)

def filter_and_rank_colabfold_results(df):
    """Return a ranked copy of *df* with quality-flag columns added.

    Added columns:

    * ``high_confidence``: ``max_iptm > 0.7`` and ``max_ptm > 0.7``
    * ``good_interface``: ``max_iptm > 0.5``
    * ``good_structure``: ``mean_plddt > 80``
    * ``rank``: 1-based position after sorting by ``binding_score``,
      highest first

    The input frame is left untouched; all columns are added to a copy.
    Rows with equal binding scores keep their input order, so ranks are
    reproducible between runs.

    Args:
        df: Frame with ``max_iptm``, ``max_ptm``, ``mean_plddt`` and
            ``binding_score`` columns.

    Returns:
        A new DataFrame sorted by descending ``binding_score``. The original
        index labels are preserved.
    """
    # Copy first so the caller's frame does not gain columns as a side effect.
    ranked = df.copy()

    # Quality flags based on confidence thresholds
    ranked['high_confidence'] = (ranked['max_iptm'] > 0.7) & (ranked['max_ptm'] > 0.7)
    ranked['good_interface'] = ranked['max_iptm'] > 0.5
    ranked['good_structure'] = ranked['mean_plddt'] > 80

    # Rank by binding score - higher is better. Mergesort is stable, which
    # makes the order of tied scores deterministic.
    ranked = ranked.sort_values('binding_score', ascending=False, kind='mergesort')
    ranked['rank'] = range(1, len(ranked) + 1)

    return ranked

def _expected_pair_cells(nanobodies, antigens):
    """Return the (column, row) heatmap cells of the expected binding pairs.

    A nanobody is matched to an antigen by name: the ``nb`` prefix and
    everything from the first ``_`` on are dropped, and the remainder is
    compared case-insensitively with the antigen name (``nbGFP_6xzf`` matches
    ``GFP``). If no name matches at all, the positional diagonal is returned
    instead.
    """
    column_of = {name.upper(): col for col, name in enumerate(antigens)}
    cells = []
    for row, nanobody in enumerate(nanobodies):
        base = nanobody[2:] if nanobody.startswith('nb') else nanobody
        col = column_of.get(base.split('_')[0].upper())
        if col is not None:
            cells.append((col, row))
    if not cells:
        # Naming scheme not recognised: fall back to the i-th/i-th pairing.
        cells = [(i, i) for i in range(min(len(nanobodies), len(antigens)))]
    return cells


def create_binding_matrix_visualization(df, score_column='binding_score',
                                        title_suffix='AlphaFold2 Multimer Score',
                                        output_path="binding_matrix_alphafold2_multimer.png"):
    """Draw a nanobody x antigen heatmap of *score_column* and save it.

    Rows (nanobodies) and columns (antigens) are sorted alphabetically,
    ignoring case. When a nanobody/antigen pair occurs more than once, only
    its first row in *df* is used. Each cell is annotated with the score and
    a "±" value, and the cells of the expected binding pairs are outlined in
    red.

    The "±" value is the Euclidean distance of that pair's
    (``max_iptm``, ``max_ptm``, ``mean_plddt / 100``) from the means of those
    three metrics over the de-duplicated rows. It is a rough spread
    indicator, not a statistical standard deviation of the score.

    Args:
        df: Frame with ``nanobody``, ``antigen``, ``max_iptm``, ``max_ptm``,
            ``mean_plddt`` and *score_column* columns. It is not modified.
        score_column: Column whose values fill the heatmap.
        title_suffix: Text used in the plot title and the colorbar label.
        output_path: Where the figure is saved at 300 dpi. Pass ``None`` to
            skip saving.

    Returns:
        The pivoted score matrix (index: nanobody, columns: antigen), or
        ``None`` when *df* is empty.
    """
    if len(df) == 0:
        print("No data to plot")
        return None

    # Sort the names alphabetically (case insensitive)
    nanobodies = sorted(df['nanobody'].unique(), key=str.lower)
    antigens = sorted(df['antigen'].unique(), key=str.lower)

    # pivot() needs unique (nanobody, antigen) pairs. Copy after
    # de-duplicating so the column added below lands on an independent frame.
    df = df.drop_duplicates(subset=['nanobody', 'antigen'], keep='first').copy()
    matrix = df.pivot(index='nanobody', columns='antigen', values=score_column)
    matrix = matrix.reindex(index=nanobodies, columns=antigens)

    # Make the plot bigger
    plt.figure(figsize=(14, 12))

    # viridis: higher scores (better for AlphaFold) are drawn brighter
    cmap = 'viridis'

    # Spread proxy shown after "±": distance of each pair's confidence
    # metrics from their means (pLDDT is rescaled to 0-1 to match the others).
    df['score_std'] = ((df['max_iptm'] - df['max_iptm'].mean())**2 +
                       (df['max_ptm'] - df['max_ptm'].mean())**2 +
                       (df['mean_plddt']/100 - (df['mean_plddt']/100).mean())**2)**0.5

    std_matrix = df.pivot(index='nanobody', columns='antigen', values='score_std')
    std_matrix = std_matrix.reindex(index=nanobodies, columns=antigens)

    # Create annotation labels with score ± spread
    annot_labels = matrix.round(3).astype(str) + '\n±' + std_matrix.round(3).astype(str)

    ax = sns.heatmap(
        matrix,
        annot=annot_labels,
        fmt='',  # labels are already formatted strings
        cmap=cmap,
        square=True,
        linewidths=0.5,
        cbar_kws={'label': f'{title_suffix}'},
        annot_kws={'size': 16}  # smaller font to fit both lines
    )

    # Make colorbar label bigger
    cbar = ax.collections[0].colorbar
    cbar.set_label(f'{title_suffix}', size=20)

    # Outline the expected binding pairs. Heatmap cell (row r, column c) has
    # its corner at x=c, y=r.
    for col, row in _expected_pair_cells(nanobodies, antigens):
        ax.add_patch(plt.Rectangle((col, row), 1, 1, fill=False, edgecolor='red', lw=5))

    plt.title(f'Nanobody-Antigen Binding Matrix ({title_suffix})', fontsize=22, pad=20)
    plt.xlabel('Antigens', fontsize=22)
    plt.ylabel('Nanobodies', fontsize=22)
    plt.xticks(rotation=45, ha='right', fontsize=18)
    plt.yticks(rotation=0, fontsize=18)
    plt.tight_layout()
    if output_path is not None:
        plt.savefig(output_path, dpi=300)

    return matrix

def _nanobody_target(nanobody):
    """Return the upper-cased antigen name a nanobody is named after.

    Only a leading ``nb`` is removed (not every "nb" in the name), then
    everything from the first ``_`` on (the PDB ID) is dropped:
    ``nbGFP_6xzf`` -> ``GFP``.
    """
    base = nanobody[2:] if nanobody.startswith('nb') else nanobody
    return base.split('_')[0].upper()


def _format_pair_line(row):
    """Format one result row as an indented, confidence-flagged report line."""
    # "low_conf " carries a trailing space so both flags have the same width.
    flag = "high_conf" if row['high_confidence'] else "low_conf "
    return (f"  {flag} {row['nanobody']} + {row['antigen']}: "
            f"{round(row['binding_score'], 3)!s} "
            f"(ipTM: {round(row['max_iptm'], 3)!s}, pTM: {round(row['max_ptm'], 3)!s})")


def analyze_colabfold_results(df):
    """Print a text report of ColabFold results in a HADDOCK-like format.

    The report contains the experiment counts, how many rows pass each
    quality flag, the five highest binding scores, the expected binding
    pairs (rows whose nanobody is named after their antigen) and summary
    statistics of the score and confidence metrics.

    Args:
        df: Frame with ``nanobody``, ``antigen``, ``binding_score``,
            ``max_iptm``, ``max_ptm``, ``mean_plddt`` and the boolean columns
            ``high_confidence``, ``good_interface`` and ``good_structure``.

    Returns:
        None. Everything is written to standard output. An empty frame only
        prints an error line.
    """
    print("=== AlphaFold2 Multimer Results Analysis ===")
    print()

    if df.empty:
        print("ERROR: No data found - check file format")
        return

    total = len(df)
    print("Total experiments:", total)
    print("Number of nanobodies:", df['nanobody'].nunique())
    print("Number of antigens:", df['antigen'].nunique())
    print()

    # Check quality of results
    print("Quality check:")
    print("  High confidence (ipTM>0.7, pTM>0.7):", f"{df['high_confidence'].sum()}/{total}")
    print("  Good interface (ipTM>0.5):", f"{df['good_interface'].sum()}/{total}")
    print("  Good structure (pLDDT>80):", f"{df['good_structure'].sum()}/{total}")
    print()

    # Best binding pairs based on binding score
    print("Top 5 binding pairs (highest confidence score):")
    for _, row in df.nlargest(5, 'binding_score').iterrows():
        print(_format_pair_line(row))
    print()

    # Expected pairs: the nanobody's base name equals the antigen name.
    expected_pairs = [
        row for _, row in df.iterrows()
        if _nanobody_target(row['nanobody']) == row['antigen'].upper()
    ]

    if expected_pairs:
        print("Expected binding pairs (diagonal matches):")
        for row in expected_pairs:
            print(_format_pair_line(row))
        print()

    # Basic stats
    print("Summary statistics:")
    print("  Binding Score - Mean:", round(df['binding_score'].mean(), 3),
          "Std:", round(df['binding_score'].std(), 3),
          "Range:", round(df['binding_score'].min(), 3), "to", round(df['binding_score'].max(), 3))
    print("  ipTM - Mean:", round(df['max_iptm'].mean(), 3),
          "Std:", round(df['max_iptm'].std(), 3),
          "Range:", round(df['max_iptm'].min(), 3), "to", round(df['max_iptm'].max(), 3))
    print("  pTM - Mean:", round(df['max_ptm'].mean(), 3),
          "Range:", round(df['max_ptm'].min(), 3), "to", round(df['max_ptm'].max(), 3))
    print("  pLDDT - Mean:", round(df['mean_plddt'].mean(), 1),
          "Range:", round(df['mean_plddt'].min(), 1), "to", round(df['mean_plddt'].max(), 1))

# Script entry point: read the log, parse it, then plot, report and export
if __name__ == "__main__":
    # Change this path to your file
    filename = "alphafold2_multimer_v3_combined_information.txt"
    log_path = Path(filename)

    try:
        # Read as UTF-8 to avoid platform-dependent default encodings
        file_content = log_path.read_text(encoding='utf-8')
    except UnicodeDecodeError:
        # Not clean UTF-8: read again and drop the bytes that cannot be decoded
        file_content = log_path.read_text(encoding='utf-8', errors='ignore')
    except FileNotFoundError:
        # Continue with empty content so the "no data" branch below reports it
        print(f"File not found: {filename}")
        print("Please paste your ColabFold output below or update the filename")
        file_content = ""

    # Parse the file
    df = parse_colabfold_results(file_content)

    if len(df) == 0:
        print("No data found - check file format")
    else:
        df = filter_and_rank_colabfold_results(df)

        # Heatmap first, then the printed report
        matrix = create_binding_matrix_visualization(df, 'binding_score')
        analyze_colabfold_results(df)

        # Export the ranked table and, if a plot was made, the score matrix
        df.to_csv('alphafold2_results.csv', index=False)
        if matrix is not None:
            matrix.to_csv('alphafold2_binding_matrix.csv')

    print("Done!")
