In [1]:
import os
import gemmi
import numpy as np

In [2]:
def read_csv(file):
    """
    Load a text file and return its lines as a list of strings.

    Every line has its leading/trailing whitespace (newline included)
    removed; blank lines are kept as empty strings. No CSV parsing is
    done - each line is returned as one string.
    """
    path = file
    with open(path, 'r') as handle:
        return [raw_line.strip() for raw_line in handle.readlines()]

In [3]:
def parse_pdb(filepath):
    """
    Return the lines of a PDB file that start with one of the kept record names.

    Matching is a plain prefix test, so e.g. 'HET' also keeps any line that
    merely begins with those three letters. Lines are returned unchanged
    (line endings included) and in file order; everything else (TER, END, ...)
    is dropped.
    """
    kept_prefixes = ('ATOM', 'HETATM', 'HEADER', 'COMPND', 'SOURCE', 'EXPDTA', 'REMARK', 'SEQRES', 'HET', 'HETNAM', 'FORMUL', 'LINK', 'CRYST1', 'ANISOU')
    kept_records = []
    with open(filepath, 'r') as handle:
        for record in handle:
            if record.startswith(kept_prefixes):
                kept_records.append(record)
    return kept_records


In [4]:
def calculate_ave_b(pdb_path):
    """Calculate the average B factor for all atoms of a given pdb file.

    Parameters
    ----------
    pdb_path : str
        Path of the PDB file; it is read with ``gemmi.read_structure``.

    Returns
    -------
    float
        Mean ``b_iso`` over every atom of the first non-empty model,
        rounded to 2 decimals.

    Raises
    ------
    ValueError
        If the file contains no non-empty model, or that model has no atoms
        (an average over zero atoms would otherwise silently be NaN).
    """
    # Read the file that was actually requested (previously a fixed
    # '6ym8.pdb' was loaded here, so every entry got the same average).
    structure = gemmi.read_structure(pdb_path)

    # Consider the first model that is not empty (sometimes the first model is empty).
    model = next((candidate for candidate in structure if len(candidate) > 0), None)
    if model is None:
        raise ValueError("Can't read valid model from the input PDB file!")

    # One value per atom, kept in a list: a dict keyed by atom name would
    # overwrite entries because names repeat across residues.
    all_b = [cra.atom.b_iso for cra in model.all()]
    if not all_b:
        raise ValueError(f"No atoms found in the input PDB file: {pdb_path}")

    # Calculate the average B factor for all atoms.
    return round(np.average(all_b), 2)
    

In [6]:
def process_water_pdb_Average_B_HP(water_pdb_lines, average_b):
    """For HydraProt output, rescale the B factor of every water record.

    For each ``ATOM``/``HETATM`` line the chain ID (column 22) is set to ``S``,
    the residue number (columns 23-26) is set to the line's index within
    ``water_pdb_lines`` and the B factor is replaced by
    ``raw_b * average_b + 50``. Lines of any other record type are dropped.

    The B factor is read from and written to the tempFactor field of the PDB
    format (columns 61-66, ``%6.2f``). Writing a fixed-width field keeps all
    following columns (element, charge) in place whatever the magnitude of the
    new value, and reading the whole field keeps the tens digit of input
    values >= 10.

    Parameters
    ----------
    water_pdb_lines : list of str
        Lines of the water PDB file.
    average_b : float
        Average B factor of the protein atoms.

    Returns
    -------
    list of str
        The modified ``ATOM``/``HETATM`` lines, each with its original line ending.

    Raises
    ------
    ValueError
        If an atom line has no parsable number in the B-factor field.
    """
    new_lines = []
    for line_index, line in enumerate(water_pdb_lines):
        if not line.startswith(('ATOM', 'HETATM')):
            continue
        # Work on the record without its line ending so slices never pick up '\n'.
        record = line.rstrip('\r\n')
        line_ending = line[len(record):]
        raw_b = record[60:66]  # tempFactor field, columns 61-66.
        # Multiply by average B factor of protein then plus 50.
        new_b = float(raw_b.strip()) * average_b + 50
        new_lines.append(
            f"{record[:21]}S{line_index:4d}{record[26:60]}{new_b:6.2f}{record[66:]}{line_ending}"
        )
    return new_lines


In [7]:
def process_water_pdb_Average_B_SW(water_pdb_lines, average_b):
    """For SuperWater output, assign every water record a B factor of 1.5 * average_b.

    SuperWater outputs zero B factors, so the existing value is ignored. For
    each ``ATOM``/``HETATM`` line the chain ID (column 22) is set to ``S``, the
    residue number (columns 23-26) is set to the line's index within
    ``water_pdb_lines`` and the B factor is overwritten. Lines of any other
    record type are dropped.

    The B factor is written to the tempFactor field of the PDB format
    (columns 61-66, ``%6.2f``), so all following columns (element, charge)
    stay in place whatever the magnitude of the value.

    Parameters
    ----------
    water_pdb_lines : list of str
        Lines of the water PDB file.
    average_b : float
        Average B factor of the protein atoms.

    Returns
    -------
    list of str
        The modified ``ATOM``/``HETATM`` lines, each with its original line ending.
    """
    # Same value for every water, so format it once outside the loop.
    new_b_field = f"{1.5 * average_b:6.2f}"

    new_lines = []
    for line_index, line in enumerate(water_pdb_lines):
        if not line.startswith(('ATOM', 'HETATM')):
            continue
        # Split off the line ending and pad short records so the B factor
        # always lands in columns 61-66 rather than after a newline.
        record = line.rstrip('\r\n')
        line_ending = line[len(record):]
        record = record.ljust(66)
        new_lines.append(
            f"{record[:21]}S{line_index:4d}{record[26:60]}{new_b_field}{record[66:]}{line_ending}"
        )
    return new_lines


In [15]:
def merge_pdb_files_B(nowat_pdb_path, water_pdb_path, output_pdb_path, average_b, model_type):
    """
    Generate combined PDB file with water lines modified with new B factors.
    Depending on the model_type (SuperWater/HydraProt), B factors are updated in different ways.

    The output is: kept records of the water-free PDB, ``TER``, the modified
    water records, ``END``.

    Parameters
    ----------
    nowat_pdb_path : str
        PDB file without waters.
    water_pdb_path : str
        PDB file holding the predicted waters.
    output_pdb_path : str
        Where the merged PDB file is written (its directory must exist).
    average_b : float
        Average B factor of the protein atoms, used to derive water B factors.
    model_type : str
        'SW' (SuperWater) or 'HP' (HydraProt).

    Raises
    ------
    ValueError
        If ``model_type`` is neither 'SW' nor 'HP'. Raised before any file is
        read or written.
    """
    # Fail fast: previously an unknown type only printed a message and then
    # crashed on an unassigned variable further down.
    if model_type not in ('SW', 'HP'):
        raise ValueError(
            f"Need input of model type to correct B factors: expected 'SW' or 'HP', got {model_type!r}."
        )

    nowat_pdb_lines = parse_pdb(nowat_pdb_path)
    water_pdb_lines = parse_pdb(water_pdb_path)

    # For different model output, B factors are updated in different ways.
    if model_type == 'SW':
        modified_water_lines = process_water_pdb_Average_B_SW(water_pdb_lines, average_b)
    else:
        modified_water_lines = process_water_pdb_Average_B_HP(water_pdb_lines, average_b)

    def _newline_terminated(lines):
        # A final record without '\n' would otherwise fuse with TER/END.
        return [text if text.endswith('\n') else text + '\n' for text in lines]

    merged_lines = (
        _newline_terminated(nowat_pdb_lines)
        + ['TER\n']
        + _newline_terminated(modified_water_lines)
        + ['END\n']
    )
    with open(output_pdb_path, 'w') as output_file:
        output_file.writelines(merged_lines)

    print(f"Merged PDB file saved as: {output_pdb_path}")

In [10]:
pwd

'/Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/pipeline'

In [11]:
# Input directories (absolute paths on the author's machine; adjust before running elsewhere).
# File layout inside each directory, as used by the merge loop below:
#   nowat_dir: '{id}.pdb'
#   HP_dir:    '{id}_final_waters.pdb'
#   SW_dir:    '{id}/{id}_centroid.pdb'
nowat_dir='/Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/pipeline/PDBs_test_SW_nowat' # generated by grep (Ahmed's code)/ '{id}.pdb'
HP_dir='/Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/pipeline/waterPDBs_test_HP' # "{id}_final_waters.pdb"
SW_dir='/Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/pipeline/waterPDBs_test_SW_whitney' # "./{id}/{id}_centroid.pdb"

In [12]:
# Entry IDs to process: one whitespace-stripped string per line of test_split.txt.
test_ids=read_csv('/Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/test_split.txt')

In [12]:
# Output directory for merged HydraProt files.
# The mkdir call is left commented out, so the directory is expected to exist already.
# NOTE(review): output_dir is not referenced by the other cells visible here - confirm it is still needed.
# os.mkdir('/Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/pipeline/output_HP_merged')
output_dir='/Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/pipeline/output_HP_merged'

In [17]:
# Output directory for HydraProt merged files with corrected B factors ('{id}_hp_merged.pdb').
# The mkdir call is left commented out, so the directory is expected to exist already.
# os.mkdir('/Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/pipeline/output_HP_merged_B')
output_dir_HP_B='/Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/pipeline/output_HP_merged_B'

In [18]:
# Output directory for SuperWater merged files with corrected B factors ('{id}_sw_merged.pdb').
# The mkdir call is left commented out, so the directory is expected to exist already;
# the merge step opens its output file inside this directory for writing.
# os.mkdir('/Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/pipeline/output_SW_merged_B')
output_dir_SW_B='/Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/pipeline/output_SW_merged_B'

In [19]:
# Merge every test entry for which all three input files are present.
for pdb in test_ids:
    candidate_inputs = (
        os.path.join(nowat_dir, f"{pdb}.pdb"),
        os.path.join(HP_dir, f"{pdb}_final_waters.pdb"),
        os.path.join(SW_dir, pdb, f"{pdb}_centroid.pdb"),
    )
    # Entries with any missing input are skipped silently.
    if not all(os.path.exists(candidate) for candidate in candidate_inputs):
        continue

    nowat_pdb_path, hydraprot_pdb_path, superwater_pdb_path = candidate_inputs
    superwater_output_pdb_path = os.path.join(output_dir_SW_B, f"{pdb}_sw_merged.pdb")
    hydraprot_output_pdb_path = os.path.join(output_dir_HP_B, f"{pdb}_hp_merged.pdb")

    # SuperWater merge: water B factors are derived from the protein average.
    print(f"Merging {nowat_pdb_path} and {superwater_pdb_path} into {superwater_output_pdb_path}")
    average_b = calculate_ave_b(nowat_pdb_path)
    merge_pdb_files_B(nowat_pdb_path, superwater_pdb_path, superwater_output_pdb_path, average_b, model_type='SW')

    # HydraProt merge (currently disabled):
    # print(f"Merging {nowat_pdb_path} and {hydraprot_pdb_path} into {hydraprot_output_pdb_path}")
    # average_b = calculate_ave_b(nowat_pdb_path)
    # merge_pdb_files_B(nowat_pdb_path, hydraprot_pdb_path, hydraprot_output_pdb_path, average_b, model_type='HP')

Merging /Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/pipeline/PDBs_test_SW_nowat/6ym8.pdb and /Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/pipeline/waterPDBs_test_SW_whitney/6ym8/6ym8_centroid.pdb into /Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/pipeline/output_SW_merged_B/6ym8_sw_merged.pdb
Merged PDB file saved as: /Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/pipeline/output_SW_merged_B/6ym8_sw_merged.pdb
Merging /Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/pipeline/PDBs_test_SW_nowat/5pu7.pdb and /Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/pipeline/waterPDBs_test_SW_whitney/5pu7/5pu7_centroid.pdb into /Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/pipeline/output_SW_merged_B/5pu7_sw_merged.pdb
Merged PDB file saved as: /Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/pipeline/output_SW_merged_B/5pu7_sw_merged.pdb
Merging /Users/mingbin/Desktop/Wankowicz_lab_Mac