In [28]:
import pandas as pd

def calculate_averages_from_csv(csv_path):
    """Return the column means of 'scTM Value' and 'RMSD Value' in a CSV.

    Args:
        csv_path: Passed unchanged as the first argument of ``pd.read_csv``.

    Returns:
        A 2-tuple ``(scTM_mean, RMSD_mean)``, in that order, computed over
        every row of the file.
    """
    table = pd.read_csv(csv_path)

    # Column order here fixes the order of the returned pair.
    return tuple(table[column].mean() for column in ('scTM Value', 'RMSD Value'))


# CATH4.2 ProteinMPNN

In [29]:
# Whole-file means of the 'scTM Value' and 'RMSD Value' columns of
# cath42mpnn_2.csv (every row counts; no filtering by sequence length).
# NOTE(review): absolute, user-specific path — the cell only runs where this file exists.
csv_file_path = '/home/zhengsun/code/protein/ProteinInvBench/results/scTM_RMSD/cath42mpnn_2.csv'
scTM_avg, RMSD_avg = calculate_averages_from_csv(csv_file_path)
print(f"Average scTM Value: {scTM_avg}, Average RMSD Value: {RMSD_avg}")

Average scTM Value: 0.8411653614787888, Average RMSD Value: 3.042430913785047


# CATH4.2 PiFold

In [30]:
# Whole-file means of the 'scTM Value' and 'RMSD Value' columns of
# cath42pifold_2.csv (every row counts; no filtering by sequence length).
# Rebinds csv_file_path, scTM_avg and RMSD_avg, so any earlier values are overwritten.
# NOTE(review): absolute, user-specific path — the cell only runs where this file exists.
csv_file_path = '/home/zhengsun/code/protein/ProteinInvBench/results/scTM_RMSD/cath42pifold_2.csv'
scTM_avg, RMSD_avg = calculate_averages_from_csv(csv_file_path)
print(f"Average scTM Value: {scTM_avg}, Average RMSD Value: {RMSD_avg}")

Average scTM Value: 0.841311563310198, Average RMSD Value: 3.1138291064836445


In [31]:
from Bio import SeqIO

def integrate_csv_fasta(csv_path, fasta_path, output_csv_path):
    """Attach FASTA sequences and their lengths to a results CSV.

    Rows of the CSV are matched to FASTA records by a shared key: the text
    before the first '.' in the CSV's 'Reference PDB' value and in the FASTA
    record id (so '1xcc.pdb' matches a record whose id is '1xcc').

    Args:
        csv_path: CSV to read; must contain a 'Reference PDB' column.
        fasta_path: FASTA file parsed with ``SeqIO.parse(..., "fasta")``.
        output_csv_path: Where the augmented table is written (no index column).

    Returns:
        The DataFrame that was written, i.e. the input table plus two columns:
        'Sequence' (the matched sequence, or '' when there is no match) and
        'Sequence Length' (its length, 0 when there is no match).

    Raises:
        KeyError: If the CSV has no 'Reference PDB' column.
    """
    csv_df = pd.read_csv(csv_path)

    # If several records share a key, the last one in the file wins.
    sequences = {
        record.id.split('.')[0]: str(record.seq)
        for record in SeqIO.parse(fasta_path, "fasta")
    }

    # Vectorized key derivation; a missing 'Reference PDB' value stays NaN here
    # and simply finds no sequence below instead of raising on .split().
    pdb_ids = csv_df['Reference PDB'].str.split('.').str[0]

    # dict.get with a '' default keeps unmatched rows as empty strings.
    csv_df['Sequence'] = pdb_ids.map(lambda pdb_id: sequences.get(pdb_id, ''))
    csv_df['Sequence Length'] = csv_df['Sequence'].str.len().astype('int64')

    csv_df.to_csv(output_csv_path, index=False)

    return csv_df

In [6]:
# Join the cath42pifold_2.csv results with the sequences in CATH4.2_PiFold.fasta
# and write the combined table to cath42pifold_with_seq.csv.
# The by-length averages cell further down reads that output file, so this
# cell has to be run before it.
# NOTE(review): absolute, user-specific paths; the saved execution count of this
# cell is also out of sequence with the cells around it.
csv_file_path = '/home/zhengsun/code/protein/ProteinInvBench/results/scTM_RMSD/cath42pifold_2.csv'
fasta_file_path = '/home/zhengsun/code/protein/ProteinInvBench/results/fasta_files/CATH4.2_PiFold.fasta'
output_csv_file_path = '/home/zhengsun/code/protein/ProteinInvBench/results/scTM_RMSD/cath42pifold_with_seq.csv'

# Bare last expression: the returned DataFrame is rendered as the cell output.
integrate_csv_fasta(csv_file_path, fasta_file_path, output_csv_file_path)

Unnamed: 0,Reference PDB,ESMF PDB,scTM Value,RMSD Value,Sequence,Sequence Length
0,1xcc.pdb,1xcc.pdb,0.973317,0.756776,SLKIGDTFPNFSTEASGIDGKFNFYDAIHNKWAILFSFPLMFTPLD...,219
1,2jua.pdb,2jua.pdb,0.805052,2.383319,ASGKVSELASALEKALEKAKSNIKGGEEAKAEIKEKLSSVASALGA...,102
2,4kpn.pdb,4kpn.pdb,0.976995,0.718400,SPKLVIMNVSPGVLESMAIFLLLRSPSLELIAVTTVYGHVETPTAT...,318
3,4agh.pdb,4agh.pdb,0.909531,0.902830,GVEIDADGNPYVKISENTRIGVTEENDKKYVVIREYYEEGGKWLPG...,80
4,2es7.pdb,2es7.pdb,0.572770,11.979181,YEDLRQALLDRGWKPVSAELGYGVILLSGDPESPEAPEKVEALIEE...,103
...,...,...,...,...,...,...
851,5ddt.pdb,5ddt.pdb,0.924187,1.564970,MSFDVVIPTAGQGAKQAAGKCTGFRKLKGEPIIVWTLKVFDNHESC...,232
852,1wjz.pdb,1wjz.pdb,0.762537,3.752668,GSSGSSGLAKEETFKKDYFSILGVEPDASYEELKQKYKKHIDQYDP...,94
853,1fo8.pdb,1fo8.pdb,0.985043,0.644112,SEVIPILVLAADRTSVKLTLDQILKYRPSAEKFPVIVCEDAGAEAT...,330
854,3kkg.pdb,3kkg.pdb,0.949403,0.847771,GTALTPQEKANCETVLKLFTEGHGSVEGFEDVWRSTFTDDFRLVYD...,144


In [8]:
# Join the cath42mpnn_2.csv results with the sequences in
# CATH4.2_ProteinMPNN.fasta and write the combined table to
# cath42mpnn_with_seq.csv.
# The by-length averages cell further down reads that output file, so this
# cell has to be run before it.
# NOTE(review): absolute, user-specific paths; the saved execution count of this
# cell is also out of sequence with the cells around it.
csv_file_path = '/home/zhengsun/code/protein/ProteinInvBench/results/scTM_RMSD/cath42mpnn_2.csv'
fasta_file_path = '/home/zhengsun/code/protein/ProteinInvBench/results/fasta_files/CATH4.2_ProteinMPNN.fasta'
output_csv_file_path = '/home/zhengsun/code/protein/ProteinInvBench/results/scTM_RMSD/cath42mpnn_with_seq.csv'

# Bare last expression: the returned DataFrame is rendered as the cell output.
integrate_csv_fasta(csv_file_path, fasta_file_path, output_csv_file_path)

Unnamed: 0,Reference PDB,ESMF PDB,scTM Value,RMSD Value,Sequence,Sequence Length
0,1xcc.pdb,1xcc.pdb,0.978239,0.711377,XGLKIGDTFPDFKTEASGIDGWFNLYELIKDKWAIYISFPSTFTPL...,220
1,2jua.pdb,2jua.pdb,0.765421,2.174414,MSGEFEEIEEELREVLEEALEHVKGGEEKAEELKELLDEVVEVIRE...,102
2,4kpn.pdb,4kpn.pdb,0.981509,0.685358,XXXXXXXXXXXXXXXXXXXXXXXXXXXPPQKVIASVSPGVDEAWAI...,345
3,4agh.pdb,4agh.pdb,0.885824,1.107018,XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXGVRIDADGN...,158
4,2es7.pdb,2es7.pdb,0.894444,1.346238,XXXXXXFEALLEALKERGWEPIKEEXXXXXXXXMGDAVILLLGDPE...,142
...,...,...,...,...,...,...
851,5ddt.pdb,5ddt.pdb,0.921041,1.513016,MDFDVVIPTAGAGAKQAAGQDTLFRELEGEPIIVHTLKVFDNYKNC...,232
852,1wjz.pdb,1wjz.pdb,0.732512,4.608592,GSSGSSGEIEEKTFNWKAFEVLGVDPDASREELEARYKENVKKFDP...,94
853,1fo8.pdb,1fo8.pdb,0.968752,0.911202,SRVIPVLVMACDRVSVRLTLDVLLKYRPSDELYPIIVGQACGAEAT...,343
854,3kkg.pdb,3kkg.pdb,0.969839,0.654078,MRALTPAERENVETVMKLFTEGHGSVPGWREVWESTCTDDFELVIE...,146


In [32]:
def calculate_averages_from_csv_by_length(csv_path, start_length, end_length):
    """Average 'scTM Value' and 'RMSD Value' over rows in a length window.

    Only rows whose 'Sequence Length' lies in ``[start_length, end_length]``
    (both bounds inclusive) contribute to the means.

    Args:
        csv_path: Passed unchanged as the first argument of ``pd.read_csv``.
        start_length: Smallest sequence length to keep.
        end_length: Largest sequence length to keep.

    Returns:
        A 2-tuple ``(scTM_mean, RMSD_mean)`` for the selected rows.
    """
    table = pd.read_csv(csv_path)

    # between() is inclusive on both ends by default.
    in_window = table['Sequence Length'].between(start_length, end_length)
    selected = table[in_window]

    return selected['scTM Value'].mean(), selected['RMSD Value'].mean()


# CATH4.2 PiFold

In [35]:
# Means of 'scTM Value' / 'RMSD Value' restricted to rows whose
# 'Sequence Length' is between 200 and 500, both ends inclusive.
# Reads cath42pifold_with_seq.csv, the file written by the PiFold
# integrate_csv_fasta cell, so that cell must have been run first.
csv_file_path = '/home/zhengsun/code/protein/ProteinInvBench/results/scTM_RMSD/cath42pifold_with_seq.csv'
start_length = 200
end_length = 500
scTM_avg, RMSD_avg = calculate_averages_from_csv_by_length(csv_file_path, start_length, end_length)
print(f"Average scTM Value: {scTM_avg}, Average RMSD Value: {RMSD_avg}")

Average scTM Value: 0.895158427337027, Average RMSD Value: 2.715483436262136


# CATH4.2 ProteinMPNN

In [36]:
# Means of 'scTM Value' / 'RMSD Value' restricted to rows whose
# 'Sequence Length' is between 200 and 500, both ends inclusive.
# Reads cath42mpnn_with_seq.csv, the file written by the ProteinMPNN
# integrate_csv_fasta cell, so that cell must have been run first.
csv_file_path = '/home/zhengsun/code/protein/ProteinInvBench/results/scTM_RMSD/cath42mpnn_with_seq.csv'
start_length = 200
end_length = 500
scTM_avg, RMSD_avg = calculate_averages_from_csv_by_length(csv_file_path, start_length, end_length)
print(f"Average scTM Value: {scTM_avg}, Average RMSD Value: {RMSD_avg}")

Average scTM Value: 0.8995894489973522, Average RMSD Value: 2.6481970929361696
