# In [None]:  (notebook cell marker, kept as a comment so the file parses as Python)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
from pathlib import Path

# Metric lines recognised besides the HADDOCK score, as
# (keyword, result key, compiled pattern, converter).
# Order matters: the first keyword found in a line decides how it is parsed.
_HADDOCK_METRICS = (
    ('Cluster size', 'cluster_size',
     re.compile(r'Cluster size\s+(\d+)'), int),
    ('Van der Waals energy', 'vdw_energy',
     re.compile(r'Van der Waals energy\s+([0-9.-]+)\s+\+/-\s+([0-9.-]+)'), float),
    ('Electrostatic energy', 'electrostatic_energy',
     re.compile(r'Electrostatic energy\s+([0-9.-]+)\s+\+/-\s+([0-9.-]+)'), float),
    ('Desolvation energy', 'desolvation_energy',
     re.compile(r'Desolvation energy\s+([0-9.-]+)\s+\+/-\s+([0-9.-]+)'), float),
    ('Restraints violation energy', 'restraints_energy',
     re.compile(r'Restraints violation energy\s+([0-9.-]+)\s+\+/-\s+([0-9.-]+)'), float),
    ('Buried Surface Area', 'bsa',
     re.compile(r'Buried Surface Area\s+([0-9.-]+)\s+\+/-\s+([0-9.-]+)'), float),
    ('Z-Score', 'z_score',
     re.compile(r'Z-Score\s+(-?[0-9.]+)'), float),
    ('RMSD', 'rmsd',
     re.compile(r'RMSD.*?([0-9.-]+)\s+\+/-\s+([0-9.-]+)'), float),
)


def parse_haddock_results(file_content):
    """Parse a text dump of HADDOCK results into a DataFrame.

    The text is a sequence of experiments separated by two or more blank
    lines. Within an experiment the first line is the job name, written as
    ``<nanobody>_<antigen>`` (everything after the first underscore is the
    antigen), the second line is skipped (a URL), and the remaining lines
    carry the metrics.

    Args:
        file_content: Full text of the results file. LF, CRLF and CR line
            endings are all accepted.

    Returns:
        A DataFrame with one row per experiment that has a parsable HADDOCK
        score. Columns are ``job_name``, ``nanobody``, ``antigen`` plus one
        column per metric found. Experiments without an underscore in the
        job name, or without a HADDOCK score, are dropped. The frame is
        empty (no columns) when nothing could be parsed.
    """
    # Normalise line endings so CRLF/CR input splits exactly like LF input.
    text = file_content.replace('\r\n', '\n').replace('\r', '\n').strip()

    results = []
    # Blank separator lines may carry stray spaces or tabs, so match them
    # with a pattern rather than the literal '\n\n\n'.
    for exp in re.split(r'\n(?:[ \t]*\n){2,}', text):
        lines = exp.strip().split('\n')
        if len(lines) < 2:
            continue

        job_name = lines[0].strip()
        nanobody, sep, antigen = job_name.partition('_')
        if not sep:
            continue

        data = {}
        for raw_line in lines[2:]:  # lines[0] is the job name, lines[1] the URL
            line = raw_line.strip()
            if not line:
                continue

            if 'HADDOCK score' in line:
                parts = line.split()
                try:
                    data['haddock_score'] = float(parts[2])
                    data['haddock_score_std'] = float(parts[4])
                except (IndexError, ValueError):
                    print("Could not parse HADDOCK score from line:", line)
                continue

            for keyword, key, pattern, convert in _HADDOCK_METRICS:
                if keyword not in line:
                    continue
                match = pattern.search(line)
                if match:
                    try:
                        data[key] = convert(match.group(1))
                    except ValueError:
                        # The loose character classes also match tokens such
                        # as '1.2.3' or '-'; skip them instead of aborting
                        # the whole parse.
                        print("Could not parse", keyword, "from line:", line)
                break  # only the first matching keyword is considered

        # only keep the experiment if we got the main score
        if 'haddock_score' in data:
            results.append({
                'job_name': job_name,
                'nanobody': nanobody,
                'antigen': antigen,
                **data,
            })

    return pd.DataFrame(results)

def filter_and_rank_results(df):
    """Add quality flags and a rank column, returning a new sorted DataFrame.

    The input frame is left untouched; all columns are added to a copy.

    Args:
        df: DataFrame with ``haddock_score``, ``restraints_energy`` and
            ``z_score`` columns.

    Returns:
        A copy of ``df`` sorted by ascending ``haddock_score`` (lower is
        better) with these extra columns:

        * ``reliable``: restraints energy < 500 and Z-score < -1.0
        * ``good_zscore``: Z-score < -1.5
        * ``reasonable_restraints``: restraints energy < 100
        * ``rank``: 1-based position in the sorted order

        Comparisons against missing (NaN) values evaluate to False.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    # Work on a copy so the caller's DataFrame is not mutated as a side effect.
    flagged = df.copy()

    restraints = flagged['restraints_energy']
    z_score = flagged['z_score']
    flagged['reliable'] = (restraints < 500) & (z_score < -1.0)
    flagged['good_zscore'] = z_score < -1.5
    flagged['reasonable_restraints'] = restraints < 100

    # rank by haddock score - lower is better
    ranked = flagged.sort_values('haddock_score')
    ranked['rank'] = range(1, len(ranked) + 1)

    return ranked

def create_binding_matrix_visualization(df, score_column='haddock_score', title_suffix='HADDOCK 2.4 Score',
                                        output_path="binding_matrix_haddock_2_4.png"):
    """Draw a nanobody x antigen heatmap of one score column and save it.

    Args:
        df: One row per (nanobody, antigen) pair, with ``nanobody``,
            ``antigen`` and ``score_column`` columns. ``haddock_score_std``
            is optional.
        score_column: Column whose values fill the heatmap cells.
        title_suffix: Text used in the plot title and the colorbar label.
        output_path: Where the figure is written (300 dpi). Pass ``None`` to
            skip saving.

    Returns:
        The pivoted score matrix (nanobodies as rows, antigens as columns,
        both sorted case-insensitively), or ``None`` when ``df`` has no rows.

    Raises:
        ValueError: Propagated from ``DataFrame.pivot`` when the same
            (nanobody, antigen) pair appears more than once.
    """
    if len(df) == 0:
        print("No data to plot")
        return None

    # sort the names alphabetically (case insensitive)
    nanobodies = sorted(df['nanobody'].unique(), key=str.lower)
    antigens = sorted(df['antigen'].unique(), key=str.lower)

    def pivot_sorted(values):
        # One cell per pair, laid out in the sorted row/column order.
        table = df.pivot(index='nanobody', columns='antigen', values=values)
        return table.reindex(index=nanobodies, columns=antigens)

    matrix = pivot_sorted(score_column)

    # The "± std" line describes the HADDOCK score only, so it is shown only
    # when that score is plotted and its std column is available.
    if score_column == 'haddock_score' and 'haddock_score_std' in df.columns:
        std_matrix = pivot_sorted('haddock_score_std')
        score_text = matrix.round(1).astype(str)
        annot = (score_text + '\n±' + std_matrix.round(1).astype(str)).where(
            std_matrix.notna(), score_text  # no "±nan" when the std is missing
        )
        fmt = ''
    else:
        annot = True
        fmt = '.1f'

    # make the plot bigger
    plt.figure(figsize=(14, 12))

    # use different colormap for haddock score since lower=better
    cmap = 'viridis' if score_column == 'haddock_score' else 'YlOrRd'

    ax = sns.heatmap(
        matrix,
        annot=annot,
        fmt=fmt,
        cmap=cmap,
        square=True,
        linewidths=0.5,
        cbar_kws={'label': f'{title_suffix}'},
        annot_kws={'size': 18},
    )

    # make colorbar label bigger
    cbar = ax.collections[0].colorbar
    cbar.set_label(f'{title_suffix}', size=20)

    # highlight the diagonal - these should be the real binding pairs
    # NOTE(review): this assumes the i-th sorted nanobody pairs with the i-th
    # sorted antigen - confirm against the naming scheme.
    for i in range(min(len(nanobodies), len(antigens))):
        ax.add_patch(plt.Rectangle((i, i), 1, 1, fill=False, edgecolor='red', lw=5))

    plt.title(f'Nanobody-Antigen Binding Matrix ({title_suffix})', fontsize=22, pad=20)
    plt.xlabel('Antigens', fontsize=22)
    plt.ylabel('Nanobodies', fontsize=22)
    plt.xticks(rotation=45, ha='right', fontsize=18)
    plt.yticks(rotation=0, fontsize=18)
    plt.tight_layout()
    if output_path is not None:
        plt.savefig(output_path, dpi=300)
    return matrix

def _format_number(value, digits=None):
    """Format a metric for display; missing values become 'n/a'.

    With ``digits=None`` the value is truncated to an int, otherwise it is
    rounded to that many decimals.
    """
    if pd.isna(value):
        return "n/a"
    return int(value) if digits is None else round(value, digits)


def _describe_pair(row):
    """Build the one-line summary printed for a single docking result."""
    flag = "reliable" if row['reliable'] else "unreliable "
    return (
        f"  {flag} {row['nanobody']} + {row['antigen']}: "
        f"{_format_number(row['haddock_score'], 1)}"
        f" (Z-score: {_format_number(row.get('z_score'), 1)}"
        f", restraints: {_format_number(row.get('restraints_energy'))})"
    )


def _print_summary(label, df, column, digits=None, include_std=False):
    """Print mean, optional std and range of one column, tolerating gaps."""
    # A column that was never parsed is treated like one that is all-missing.
    series = df[column] if column in df.columns else pd.Series(dtype=float)
    fields = [f"  {label} - Mean:", _format_number(series.mean(), digits)]
    if include_std:
        fields += ["Std:", _format_number(series.std(), digits)]
    fields += ["Range:", _format_number(series.min(), digits),
               "to", _format_number(series.max(), digits)]
    print(*fields)


def analyze_results(df):
    """Print a text report about parsed and flagged HADDOCK results.

    The report covers experiment counts, quality-flag counts, the five
    lowest HADDOCK scores, the "diagonal" pairs (nanobody name with every
    'nb' removed equals the antigen name, compared in upper case) and
    summary statistics. Missing metric values are printed as ``n/a``.

    Args:
        df: DataFrame with ``nanobody``, ``antigen`` and ``haddock_score``
            columns plus the ``reliable``, ``good_zscore`` and
            ``reasonable_restraints`` flags added by
            ``filter_and_rank_results``. ``z_score``, ``restraints_energy``
            and ``bsa`` may be absent or contain NaN.

    Returns:
        None. Everything is written to standard output.
    """
    print("=== HADDOCK Results Analysis ===")
    print()

    total = len(df)
    if total == 0:
        print("ERROR: No data found - check file format")
        return

    print("Total experiments:", total)
    print("Number of nanobodies:", df['nanobody'].nunique())
    print("Number of antigens:", df['antigen'].nunique())
    print()

    # check quality of results
    print("Quality check:")
    print("  Reliable results (Z<-1.0, restraints<500):", f"{df['reliable'].sum()}/{total}")
    print("  Good Z-scores (<-1.5):", f"{df['good_zscore'].sum()}/{total}")
    print("  Low restraints violations (<100):", f"{df['reasonable_restraints'].sum()}/{total}")
    print()

    # best binding pairs based on haddock score
    print("Top 5 binding pairs (lowest HADDOCK score):")
    for _, row in df.nsmallest(5, 'haddock_score').iterrows():
        print(_describe_pair(row))
    print()

    # check diagonal pairs - these should be the correct matches
    diagonal_pairs = [
        row for _, row in df.iterrows()
        if row['nanobody'].replace('nb', '').upper() == row['antigen'].upper()
    ]
    if diagonal_pairs:
        print("Expected binding pairs (diagonal matches):")
        for pair in diagonal_pairs:
            print(_describe_pair(pair))
        print()

    # basic stats
    print("Summary statistics:")
    _print_summary("HADDOCK Score", df, 'haddock_score', digits=1, include_std=True)
    _print_summary("Z-Score", df, 'z_score', digits=2, include_std=True)
    _print_summary("Restraints violations", df, 'restraints_energy')
    _print_summary("BSA", df, 'bsa')

# Script entry point: parse the results file, then plot, report and export.
if __name__ == "__main__":
    # change this path to your file
    filename = "haddock2_4_webserver_evaluation_experiment.txt"

    # read the whole results dump in text mode
    file_content = Path(filename).read_text()

    # turn the raw text into one row per experiment
    df = parse_haddock_results(file_content)

    if len(df) == 0:
        print("No data found - check file format")
    else:
        df = filter_and_rank_results(df)

        # heatmap first, then the printed report
        matrix = create_binding_matrix_visualization(df, 'haddock_score')
        analyze_results(df)

        # export the table and, when a plot was made, the pivoted matrix
        df.to_csv('haddock_results.csv', index=False)
        if matrix is not None:
            matrix.to_csv('binding_matrix.csv')

    print("Done!")