In [1]:
import sys
import numpy as np
import pandas as pd
import simtk.openmm
import simtk.unit as unit
import os
import glob
import shutil
import time
import MDAnalysis as mda
import math
pd.set_option("display.precision", 10)

# Root of the CA_SBM_3SPN2C_OPENMM checkout. The CA_SBM_3SPN2C_OPENMM_PATH
# environment variable overrides the default, so the notebook can run on
# another machine without editing this cell.
ca_sbm_3spn_openmm_path = os.environ.get(
    'CA_SBM_3SPN2C_OPENMM_PATH',
    '/Users/smliu/Documents/Projects/CA_SBM_3SPN2C_OPENMM')
# put the checkout first on sys.path so the openSMOG3SPN2 imports below resolve to it
sys.path.insert(0, ca_sbm_3spn_openmm_path)

import openSMOG3SPN2.open3SPN2.ff3SPN2 as ff3SPN2
import openSMOG3SPN2.calphaSMOG.ffCalpha as ffCalpha
import openSMOG3SPN2.openFiber as openFiber
import openSMOG3SPN2.rigid

# set some global parameters
n_nucl = 2 # selects the chromatin-<n_nucl>mer data set; one histone pdb is loaded per index 1..n_nucl
scale_factor = 2.5 # scale factor for all the SBM related potentials
run_smog = False # if True, run smog2 on the histones and split smog.top into per-section .dat files
apply_rigid_body = False # if True, the atoms listed in group_rigid.txt are grouped into rigid bodies
compare_with_lammps = True # if True, protein native pairs are taken out of the protein exclusion list
run_sim = False # if True, minimize the energy and run a short test simulation
dna_seq_file = None # optional DNA sequence file; when set, the CG DNA sequence is updated from it
ffCalpha_xml_path = '%s/openSMOG3SPN2/calphaSMOG/ffCalpha.xml' % ca_sbm_3spn_openmm_path
smog_dir = '/Users/administrator/Documents/Tools/smog-2.2' # the directory where smog is installed
# smog_dir does not matter if run_smog == False
histone_dna_data_dir = '%s/data/chromatin-%dmer/separate-%dmer-output' % (ca_sbm_3spn_openmm_path, n_nucl, n_nucl) # the path that saves the input pdb structures for all the dna and histones (each histone is saved in one pdb file)
group_rigid_txt_path = '%s/data/chromatin-%dmer/chromatin-%dmer-rigid-group/group_rigid.txt' % (ca_sbm_3spn_openmm_path, n_nucl, n_nucl) # group_rigid.txt file with atom index starts from 1 (lammps format)
main_output_dir = '%s/output-files/chromatin-%dmer' % (ca_sbm_3spn_openmm_path, n_nucl) # the main output directory
smog_output_dir = '%s/smog' % main_output_dir # smog output directory
openmm_files_dir = '%s/openmm-files' % main_output_dir
platform_name = 'CPU' # 'Reference', 'CPU', 'CUDA', 'OpenCL'
sim_output_dir = '%s/sim-test-%s' % (openmm_files_dir, platform_name)
init_system_state_dir = '%s/init-system-state' % main_output_dir

# build the output directories
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race
for output_dir in (main_output_dir, smog_output_dir, openmm_files_dir, sim_output_dir, init_system_state_dir):
    os.makedirs(output_dir, exist_ok=True)

# trajectory file that provides the coordinates used as the starting configuration
dcd_path = '%s/data/chromatin-%dmer/chromatin-%dmer-snapshots/relaxed1.dcd' % (ca_sbm_3spn_openmm_path, n_nucl, n_nucl)

  from .autonotebook import tqdm as notebook_tqdm


# 1 Build the CG model for the chromatin

## 1.1 Load PDB structures

In [2]:
# read the pdb file of every histone (files are numbered from 1), then the dna
all_histone_fix_list = [
    ff3SPN2.fixPDB('%s/histone-%d.pdb' % (histone_dna_data_dir, histone_id))
    for histone_id in range(1, n_nucl + 1)
]
dna_fix = ff3SPN2.fixPDB('%s/dna.pdb' % histone_dna_data_dir)

# turn each structure into a pandas table holding all the atom information
# pandas tables are used because there is no length limit for the entries
all_histone_atom_tables = [ff3SPN2.pdb2table(histone_fix) for histone_fix in all_histone_fix_list]
dna_atom_table = ff3SPN2.pdb2table(dna_fix)

# update serial (resSeq is left unchanged) for each histone and the dna
all_histone_atom_tables = [
    openFiber.change_serial_resSeq(histone_table, change_resSeq=False)
    for histone_table in all_histone_atom_tables
]
dna_atom_table = openFiber.change_serial_resSeq(dna_atom_table, change_resSeq=False)

# merge everything into one table: histones first, in order, and the dna last
complex_table = all_histone_atom_tables[0]
for next_table in all_histone_atom_tables[1:] + [dna_atom_table]:
    complex_table = openFiber.combine_molecules(complex_table, next_table, add_resSeq=False)

# store the all-atom table as a csv file
all_atom_output_dir = '%s/all-atom-fiber' % main_output_dir
if not os.path.exists(all_atom_output_dir):
    os.makedirs(all_atom_output_dir)
all_atom_csv_path = '%s/chromatin-%dmer.csv' % (all_atom_output_dir, n_nucl)
complex_table.to_csv(all_atom_csv_path, index=False)


## 1.2 Apply SMOG to histones

In [3]:
# raw and cleaned histone pdb files both live in the smog output directory
input_pdb_path = '%s/histones.pdb' % smog_output_dir
output_pdb_path = '%s/histones_clean.pdb' % smog_output_dir

# dump all the histones of complex_table into a single pdb file
ffCalpha.writePDB_protein(complex_table, input_pdb_path)

# produce the cleaned copy that smog reads (TER/END added and OXT removed, per the helper's name)
openFiber.add_ter_end_and_remove_OXT_for_pdb(input_pdb_path, output_pdb_path)

In [4]:
if run_smog:
    # perform smog on the clean protein pdb file
    # the commands are chained in one shell call so that the environment set up by
    # configure.smog2 and the working directory are still in effect when smog2 runs
    cmd = 'source %s/configure.smog2; ' % smog_dir
    cmd = cmd + 'cd %s; ' % smog_output_dir
    sbm_aa_path = '%s/share/templates/SBM_AA' % smog_dir
    sbm_calpha_gaussian_path = '%s/share/templates/SBM_calpha+gaussian' % smog_dir
    cmd = cmd + 'smog2 -i histones_clean.pdb -t %s -tCG %s' % (sbm_aa_path, sbm_calpha_gaussian_path)
    #print(cmd)
    # the exit status is the one of the last chained command (smog2); stop here on
    # failure so that later cells do not load missing or stale smog output
    if os.system(cmd) != 0:
        raise RuntimeError('smog2 failed, see the shell output above')

    # pick out sections from smog.top
    # each section is cut from its own "[ keyword ]" header up to the header of the next keyword,
    # so the last keyword ('system') only serves as the end marker
    cmd = 'cd %s; ' % smog_output_dir
    py_get_section_script_path = '%s/openSMOG3SPN2/getSection.py' % ca_sbm_3spn_openmm_path
    key_word_list = ['atoms', 'bonds', 'angles', 'dihedrals', 'pairs', 'exclusions', 'system']
    for keyword1, keyword2 in zip(key_word_list[:-1], key_word_list[1:]):
        cmd = cmd + 'python %s ./smog.top %s.dat "[ %s ]" "[ %s ]"; ' % (py_get_section_script_path, keyword1, keyword1, keyword2)
    #print(cmd)
    # only the status of the last getSection.py call is visible through the ';' chain
    if os.system(cmd) != 0:
        raise RuntimeError('extracting sections from smog.top failed, see the shell output above')


## 1.3 Load DNA and histone CG models separately and then combine them

In [5]:
# generate DNA and protein CG model from complex_table
cg_dna = ff3SPN2.DNA.CoarseGrain(complex_table)
cg_proteins = ffCalpha.Protein.CoarseGrain(complex_table)

# update the sequence for cg_dna (only when a sequence file is configured)
if dna_seq_file is not None:
    n_bp, target_dna_seq = openFiber.load_dna_seq_file(dna_seq_file)
    cg_dna = openFiber.update_cg_dna_seq(cg_dna, target_dna_seq)

# combine CG histones and DNA, proteins first
cg_fiber = pd.concat([cg_proteins, cg_dna], sort=False)
# renumber index and serial from 0 so that they are continuous over the whole fiber
cg_fiber.index = list(range(len(cg_fiber.index)))
cg_fiber['serial'] = list(range(len(cg_fiber.index)))
n_cg_atoms = cg_fiber.shape[0]

# change the chainID of the chromatin fiber
cg_fiber_unique_chainID = openFiber.change_unique_chainID(cg_fiber)

# save protein sequence
# exist_ok=True keeps the cell safe to re-run
os.makedirs('%s/cg-fiber' % main_output_dir, exist_ok=True)
protein_seq_path = '%s/cg-fiber/protein_seq.txt' % main_output_dir
ffCalpha.save_protein_sequence(cg_fiber_unique_chainID, sequence_file=protein_seq_path)

# write cg_fiber to pdb format, which will later be loaded by openmm
# note we convert cg_fiber instead of cg_fiber_unique_chainID to pdb format, since cg_fiber_unique_chainID may have chainID length beyond the limit of pdb format
cg_fiber_pdb_path = '%s/cg-fiber/cg_fiber.pdb' % main_output_dir
#print(cg_fiber)
ffCalpha.writePDB(cg_fiber, cg_fiber_pdb_path)
cg_fiber.to_csv('%s/cg-fiber/cg_fiber.csv' % main_output_dir, index=False)

# also save cg_fiber_unique_chainID.csv
cg_fiber_unique_chainID.to_csv('%s/cg-fiber/cg_fiber_unique_chainID.csv' % main_output_dir, index=False)

# 2 Set up OpenMM simulations

## 2.1 Set up the system, protein and dna objects

In [6]:
# the CG pdb file written earlier; the working directory is moved next to it
cg_fiber_pdb_path = '%s/cg-fiber/cg_fiber.pdb' % main_output_dir
os.chdir(os.path.dirname(cg_fiber_pdb_path))

# topology and coordinates stored in the pdb file
pdb = simtk.openmm.app.PDBFile(cg_fiber_pdb_path)
coord_pdb = pdb.getPositions(asNumpy=True)
top = pdb.getTopology()

# coordinates taken from the dcd snapshot; these are the ones passed on to the simulation
coord = openFiber.load_coord_from_dcd(cg_fiber_pdb_path, dcd_path)

# build the openmm system from the protein and the DNA force field definitions
forcefield = simtk.openmm.app.ForceField(ffCalpha_xml_path, ff3SPN2.xml)
s = forcefield.createSystem(top)

In [7]:
# DNA object, built from the table with unique chain IDs
dna = ff3SPN2.DNA.fromCoarsePandasDataFrame(pd_df=cg_fiber_unique_chainID, dna_type='B_curved')

# protein object; its sequence is the first line of the saved protein sequence file
with open(protein_seq_path, 'r') as ps:
    protein_seq = ps.readlines()[0].rstrip()
protein = ffCalpha.Protein.fromCoarsePandasDataFrame(pd_df=cg_fiber_unique_chainID, sequence=protein_seq)

# switch periodicity off for both objects
for molecule in (dna, protein):
    molecule.periodic = False


use the new set of base step geometry parameters
the DNA sequence is W-C paired!
use the sequence of the first ssDNA as the input sequence for x3dna


In [8]:
# write the dna bonds, angles, stackings and dihedrals tables to cg-fiber/dna_<name>.csv
# dna bonds, angles and dihedral equilibrium values are based on template built by x3dna
for table_name in ('bonds', 'angles', 'stackings', 'dihedrals'):
    dna_table = getattr(dna, table_name)
    dna_table.to_csv('%s/cg-fiber/dna_%s.csv' % (main_output_dir, table_name), index=False)

In [9]:
# get DNA sequence
# only the first half of the joined sequence is kept (presumably the first of the two strands)
dna_seq = dna.getFullSequences()
dna_seq = ''.join(dna_seq.values)
dna_seq = dna_seq[:len(dna_seq) // 2]

# compare with target sequence
# a dedicated name is used for the reference file: overwriting the dna_seq_file setting
# here would change what the sequence-update step does when cells are re-run
target_dna_seq_file = '%s/data/chromatin-%dmer/dnaSeq.txt' % (ca_sbm_3spn_openmm_path, n_nucl)
n_bp, target_dna_seq = openFiber.load_dna_seq_file(target_dna_seq_file)
if dna_seq != target_dna_seq:
    print('DNA sequence is not consistent with input sequence!')


## 2.2 Set up forces

### 2.2.1 Set up rigid body list and chain list

In [10]:
# create rigid identity list for the fiber
# ndmin=2 keeps one row per rigid body even when group_rigid.txt has a single line
rigid_body_array = np.loadtxt(group_rigid_txt_path, dtype=int, ndmin=2) - 1 # atom index starts from 0
print('n_cg_atoms = %d' % n_cg_atoms)

# map every atom index to the first rigid body that lists it;
# the map stays empty when rigid bodies are switched off
atom_to_rigid_body = {}
if apply_rigid_body:
    for j in range(n_nucl):
        for atom_index in rigid_body_array[j]:
            atom_to_rigid_body.setdefault(int(atom_index), j)

# one entry per CG atom: the rigid body index, or None for atoms that are not in any rigid body
fiber_rigid_identity = [atom_to_rigid_body.get(i) for i in range(n_cg_atoms)]

# save fiber_rigid_identity
if apply_rigid_body:
    fiber_rigid_identity_output_path = '%s/cg-fiber/rigid_identity.dat' % main_output_dir
    openFiber.write(fiber_rigid_identity, fiber_rigid_identity_output_path)

#print(fiber_rigid_identity)
histones_chains = openFiber.get_single_fiber_histones_chains(n_nucl)

n_cg_atoms = 3830


### 2.2.2 Set up forces for histones and dna

In [11]:
# paths of the per-section force parameter files given by smog
(smog_atoms_file_path,
 smog_bonds_file_path,
 smog_angles_file_path,
 smog_dihedrals_file_path,
 smog_exclusions_file_path,
 smog_pairs_file_path) = [
    '%s/%s.dat' % (smog_output_dir, section)
    for section in ('atoms', 'bonds', 'angles', 'dihedrals', 'exclusions', 'pairs')
]

# load every section that is used to build the protein forces
smog_bonds_data = openFiber.load_smog_bonds(smog_bonds_file_path)
smog_angles_data = openFiber.load_smog_angles(smog_angles_file_path)
smog_dihedrals_data = openFiber.load_smog_dihedrals(smog_dihedrals_file_path)
smog_exclusions_data = openFiber.load_smog_exclusions(smog_exclusions_file_path)
smog_pairs_data = openFiber.load_smog_pairs(smog_pairs_file_path)

# drop protein-protein native pairs with at least one atom inside a histone tail,
# and bring smog_exclusions_data in line with the reduced smog_pairs_data
smog_pairs_data, smog_exclusions_data = openFiber.remove_IDR_pairs_exclusions(smog_pairs_data, smog_exclusions_data)

# dihedrals with at least one atom inside a histone tail are dropped as well
smog_dihedrals_data = openFiber.remove_IDR_dihedrals(smog_dihedrals_data)

# keep a copy of the reduced pairs, exclusions and dihedrals next to the smog output
openFiber.write_smog_pairs(smog_pairs_data, '%s/pairs_IDR_removed.dat' % smog_output_dir)
openFiber.write_smog_exclusions(smog_exclusions_data, '%s/exclusions_IDR_removed.dat' % smog_output_dir)
openFiber.write_smog_dihedrals(smog_dihedrals_data, '%s/dihedrals_IDR_removed.dat' % smog_output_dir)

# everything the protein forces need, keyed by interaction type
smog_data = {
    'bonds': smog_bonds_data,
    'angles': smog_angles_data,
    'dihedrals': smog_dihedrals_data,
    'pairs': smog_pairs_data,
}

# set force dictionary
forces = {}

# set exclusions list and save
dna_exclusions_list = ff3SPN2.buildDNANonBondedExclusionsList(dna)
dna_exclusions_list_output_path = '%s/cg-fiber/dna_exclusions.dat' % main_output_dir
openFiber.write_exclusions_list(dna_exclusions_list, dna_exclusions_list_output_path)
protein_exclusions_list = ffCalpha.buildProteinNonBondedExclusionsList(protein, smog_exclusions_data)

print('total number of protein native pairs is %d' % smog_exclusions_data.shape[0])

# for comparison with lammps, we do not put protein native pairs in protein_exclusions_list
if compare_with_lammps:
    print('Compare with lammps, include MJ and electrostatic interactions for protein native pairs')
    print('Warning, in principle MJ and electrostatic interactions should be removed for protein native pairs!')
    print('remove protein native pairs from protein_exclusions_list')
    print('Before removing protein native pairs, total number of exclusions between protein atoms is %d' % len(protein_exclusions_list))
    # native pairs as (smaller index, larger index) tuples, the form they are compared in below
    smog_exclusions_list = []
    for i in range(smog_exclusions_data.shape[0]):
        i1, i2 = int(smog_exclusions_data[i, 0]), int(smog_exclusions_data[i, 1])
        if i1 > i2:
            i1, i2 = i2, i1
        smog_exclusions_list.append((i1, i2))
    # a set gives constant-time membership tests instead of scanning the list for every exclusion
    smog_exclusions_set = set(smog_exclusions_list)
    protein_exclusions_list = [each for each in protein_exclusions_list if each not in smog_exclusions_set]
    print('After removing protein native pairs, total number of exclusions between protein atoms is %d' % len(protein_exclusions_list))
    protein_exclusions_list_output_path = '%s/cg-fiber/protein_exclusions_compare_with_lammps.dat' % main_output_dir
else:
    print('total number of exclusions between protein atoms is %d' % len(protein_exclusions_list))
    protein_exclusions_list_output_path = '%s/cg-fiber/protein_exclusions.dat' % main_output_dir
openFiber.write_exclusions_list(protein_exclusions_list, protein_exclusions_list_output_path)

print('total number of exclusions between DNA atoms is %d' % len(dna_exclusions_list))

# add DNA-DNA, protein-DNA and protein-protein interactions
# the added force objects are collected in the forces dictionary, keyed by force name
openFiber.add_protein_dna_forces(s, forces, dna, protein, smog_data, dna_exclusions_list, protein_exclusions_list, fiber_rigid_identity, scale_factor)

# save force names and groups as pandas dataframe
# the table is built in one go so that the group column is an integer column from the start
df_forces_name_group = pd.DataFrame(
    [(int(force.getForceGroup()), force_name) for force_name, force in forces.items()],
    columns=['group', 'name'])
df_forces_name_group.to_csv('%s/forces_name_group.csv' % main_output_dir, index=False)


total number of protein native pairs is 4692
Compare with lammps, include MJ and electrostatic interactions for protein native pairs
remove protein native pairs from protein_exclusions_list
Before removing protein native pairs, total number of exclusions between protein atoms is 10440
After removing protein native pairs, total number of exclusions between protein atoms is 5748
total number of exclusions between DNA atoms is 57486
Add protein-protein forces
Protein-protein forces are derived from SMOG output parameters
adding force BondProtein takes 0.004823 seconds
adding force AngleProtein takes 0.005684 seconds
adding force DihedralProtein takes 0.010776 seconds
adding force NativePairProtein takes 0.017713 seconds
adding force NonbondedMJ takes 0.099360 seconds
adding force ElectrostaticsProteinProtein takes 0.042306 seconds
Add DNA-DNA forces
DNA-DNA forces are based on 3SPN2 forcefield
force name Bond is updated to BondDNA
adding force BondDNA takes 0.211109 seconds
force name Ang

## 2.3 Set up rigid body

In [12]:
if apply_rigid_body:
    # one row of atom indices per rigid body; ndmin=2 keeps that layout
    # even when group_rigid.txt has a single line
    rigid_body_array = np.loadtxt(group_rigid_txt_path, dtype=int, ndmin=2) - 1 # atom index starts from 0
    # plain python ints, one list per rigid body
    rigid_body_list = [[int(atom_index) for atom_index in rigid_body_array[i]] for i in range(n_nucl)]
    # NOTE(review): this relies on openFiber exposing the rigid module as an attribute;
    # openSMOG3SPN2.rigid is also imported directly at the top of the notebook - confirm which is intended
    openFiber.rigid.createRigidBodies(s, coord, rigid_body_list)

## 2.4 Run the simulation

In [13]:
# Langevin integrator: 300 K, friction 1/ps, time step 10 fs
temperature = 300*unit.kelvin
integrator = simtk.openmm.LangevinIntegrator(temperature, 1/unit.picosecond, 10*unit.femtoseconds)
platform = simtk.openmm.Platform.getPlatformByName(platform_name)
simulation = simtk.openmm.app.Simulation(top, s, integrator, platform)

# total potential energy at the coordinates loaded from the dcd snapshot
simulation.context.setPositions(coord)
energy_unit = unit.kilocalories_per_mole
energy_symbol = energy_unit.get_symbol()
state = simulation.context.getState(getEnergy=True)
energy = state.getPotentialEnergy().value_in_unit(energy_unit)
print("The overall potential energy is %.6f %s" % (energy, energy_symbol))

# energy of each force group, using the name/group table saved when the forces were added
df_forces_name_group = pd.read_csv('%s/forces_name_group.csv' % main_output_dir)
for raw_group, force_name in zip(df_forces_name_group['group'], df_forces_name_group['name']):
    group = int(raw_group)
    state = simulation.context.getState(getEnergy=True, groups={group})
    energy = state.getPotentialEnergy().value_in_unit(energy_unit)
    print('Group: %d, force name: %s, energy = %.6f %s' % (group, force_name, energy, energy_symbol))

The overall potential energy is -5387.071675 kcal/mol
Group: 1, force name: BondProtein, energy = 572.086984 kcal/mol
Group: 2, force name: AngleProtein, energy = 413.327471 kcal/mol
Group: 3, force name: DihedralProtein, energy = 110.061952 kcal/mol
Group: 4, force name: NativePairProtein, energy = -2066.112534 kcal/mol
Group: 5, force name: NonbondedMJ, energy = -337.756852 kcal/mol
Group: 6, force name: ElectrostaticsProteinProtein, energy = 81.529240 kcal/mol
Group: 7, force name: BondDNA, energy = 280.981968 kcal/mol
Group: 8, force name: AngleDNA, energy = 632.934250 kcal/mol
Group: 9, force name: Stacking, energy = -1692.943675 kcal/mol
Group: 10, force name: DihedralDNA, energy = -1861.648828 kcal/mol
Group: 11, force name: BasePair, energy = -1030.840282 kcal/mol
Group: 12, force name: CrossStacking, energy = -229.274822 kcal/mol
Group: 13, force name: ExclusionDNADNA, energy = 2.795626 kcal/mol
Group: 14, force name: ElectrostaticsDNADNA, energy = 100.661733 kcal/mol
Group: 1

In [14]:
# snapshot of the current context: positions, velocities, forces, energy and parameters
state = simulation.context.getState(
    getPositions=True,
    getVelocities=True,
    getForces=True,
    getEnergy=True,
    getParameters=True,
    enforcePeriodicBox=False,
)

system_xml_path = '%s/system.xml' % init_system_state_dir
state_xml_path = '%s/state.xml' % init_system_state_dir

# system.xml is the serialized system, which holds all of the force field parameters
with open(system_xml_path, 'w') as f:
    system_xml = simtk.openmm.XmlSerializer.serialize(s)
    f.write(system_xml)

# state.xml is the serialized state requested above
with open(state_xml_path, 'w') as f:
    state_xml = simtk.openmm.XmlSerializer.serialize(state)
    f.write(state_xml)



In [15]:
# Optional test run: minimize the energy, report the energies again, then run a short
# simulation with trajectory and state-data reporters. Skipped unless run_sim is True.
if run_sim:
    # do the energy minimization
    start_time = time.time()
    simulation.minimizeEnergy()
    end_time = time.time()
    delta_time = end_time - start_time
    # wall-clock time of the minimization, in seconds
    print('energy minimization time cost: %.2f' % delta_time)
    energy_unit = unit.kilocalories_per_mole
    state = simulation.context.getState(getEnergy=True)
    energy = state.getPotentialEnergy().value_in_unit(energy_unit)
    print('energy minimized')
    print("The overall potential energy is %.6f %s" % (energy, energy_unit.get_symbol()))
    # per-group energies after minimization, in the same format as the report before it
    for index, row in df_forces_name_group.iterrows():
        group, force_name = int(row['group']), row['name']
        state = simulation.context.getState(getEnergy=True, groups={group})
        energy = state.getPotentialEnergy().value_in_unit(energy_unit)
        print('Group: %d, force name: %s, energy = %.6f %s' % (group, force_name, energy, energy_unit.get_symbol()))

    # run simulation
    # velocities are drawn for the integrator temperature before the first step
    simulation.context.setVelocitiesToTemperature(temperature)

    # add simulation reporters
    # both reporters are given an interval of 10 steps; the trajectory goes to sim_output_dir,
    # the state data goes to stdout
    dcd_reporter = simtk.openmm.app.DCDReporter('%s/output.dcd' % sim_output_dir, 10)
    energy_reporter = simtk.openmm.app.StateDataReporter(sys.stdout, 10, step=True, time=True, potentialEnergy=True, kineticEnergy=True, totalEnergy=True, temperature=True, speed=True)
    simulation.reporters.append(dcd_reporter)
    simulation.reporters.append(energy_reporter)
    start_time = time.time()
    n_steps = 100
    simulation.step(n_steps)
    end_time = time.time()
    delta_time = end_time - start_time
    print('simulation takes %.2f seconds for %d steps' % (delta_time, n_steps))
