In [1]:
import sys
import numpy as np
import pandas as pd
import simtk.openmm
import simtk.unit as unit
import os
import glob
import shutil
import time
import MDAnalysis as mda
import math
pd.set_option("display.precision", 10)

# Root of the CA_SBM_3SPN2C_OPENMM checkout. The original local path is kept as the default,
# but it can be overridden with the CA_SBM_3SPN2C_OPENMM_PATH environment variable so the
# notebook can run on another machine without editing this cell.
ca_sbm_3spn_openmm_path = os.environ.get('CA_SBM_3SPN2C_OPENMM_PATH',
                                         '/Users/smliu/Documents/Projects/CA_SBM_3SPN2C_OPENMM')
# put the checkout first on sys.path so the openSMOG3SPN2 imports below resolve to it
sys.path.insert(0, ca_sbm_3spn_openmm_path)

import openSMOG3SPN2.open3SPN2.ff3SPN2 as ff3SPN2
import openSMOG3SPN2.calphaSMOG.ffCalpha as ffCalpha
import openSMOG3SPN2.openFiber as openFiber
import openSMOG3SPN2.rigid

# ---- global switches -------------------------------------------------------
n_nucl_each_fiber = 2  # nucleosomes per chromatin fiber
n_fibers = 2  # chromatin fibers in the system
scale_factor = 2.5  # scaling applied to all SBM related potentials
apply_rigid_body = False
compare_with_lammps = True
run_sim = False

# ---- input files of the single chromatin fiber -----------------------------
ffCalpha_xml_path = '{}/openSMOG3SPN2/calphaSMOG/ffCalpha.xml'.format(ca_sbm_3spn_openmm_path)
single_fiber_dcd_path = '{0}/data/chromatin-{1:d}mer/chromatin-{1:d}mer-snapshots/relaxed1.dcd'.format(
    ca_sbm_3spn_openmm_path, n_nucl_each_fiber)
# group_rigid.txt uses atom indices starting from 1 (lammps format)
single_fiber_group_rigid_txt_path = '{0}/data/chromatin-{1:d}mer/chromatin-{1:d}mer-rigid-group/group_rigid.txt'.format(
    ca_sbm_3spn_openmm_path, n_nucl_each_fiber)

# ---- output directories ----------------------------------------------------
# main output directory of a single chromatin fiber
single_fiber_main_output_dir = '{}/output-files/chromatin-{:d}mer'.format(ca_sbm_3spn_openmm_path, n_nucl_each_fiber)
# main output directory of the multi-fiber system
fibers_main_output_dir = '{}/output-files/chromatin-{:d}x{:d}mers'.format(ca_sbm_3spn_openmm_path, n_fibers, n_nucl_each_fiber)
# smog output directory of the single chromatin fiber
single_fiber_smog_output_dir = single_fiber_main_output_dir + '/smog'
openmm_files_dir = fibers_main_output_dir + '/openmm-files'

# ---- simulation platform ---------------------------------------------------
platform_name = 'CPU'  # one of 'Reference', 'CPU', 'CUDA', 'OpenCL'
sim_output_dir = '{}/sim-test-{}'.format(openmm_files_dir, platform_name)

# The single-fiber directories are inputs of this notebook: only report when they are missing.
for required_dir in (single_fiber_main_output_dir, single_fiber_smog_output_dir):
    if not os.path.exists(required_dir):
        print('%s does not exist!' % required_dir)

# The multi-fiber directories are outputs of this notebook: create them when absent.
for output_dir in ('%s/cg-fibers' % fibers_main_output_dir, sim_output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

  from .autonotebook import tqdm as notebook_tqdm


# 1 Build multiple fiber system from single fiber

## 1.1 Load the structure of single chromatin fiber

In [2]:
# Read the two single-fiber structure tables (with and without unique chainID) from csv.
single_cg_fiber_dir = '%s/cg-fiber' % single_fiber_main_output_dir
single_cg_fiber_unique_chainID = pd.read_csv(single_cg_fiber_dir + '/cg_fiber_unique_chainID.csv')
single_cg_fiber = pd.read_csv(single_cg_fiber_dir + '/cg_fiber.csv')

# one row per coarse-grained atom, so the row count is the atom count of one fiber
n_cg_atoms_each_fiber = len(single_cg_fiber_unique_chainID.index)

## 1.2 Build the structure for multiple chromatin fibers

In [3]:
# Assemble the multi-fiber structure by replicating the single fiber.
# Two dataframes are built side by side:
#   * cg_fibers_unique_chainID: with unique chainID and resSeq
#   * cg_fibers: without unique chainID or resSeq; it is written as pdb and later loaded by openmm
delta_x, delta_y, delta_z = -20, 100, 0
delta_r = np.array([delta_x, delta_y, delta_z])*unit.angstrom


def _shifted_fiber_copy(fiber_df, fiber_index):
    """Return a copy of fiber_df translated by fiber_index*(delta_x, delta_y, delta_z)."""
    shifted = fiber_df.copy()
    for axis, delta in zip(('x', 'y', 'z'), (delta_x, delta_y, delta_z)):
        shifted[axis] += fiber_index*delta
    return shifted


cg_fibers_unique_chainID = single_cg_fiber_unique_chainID.copy()
cg_fibers = single_cg_fiber.copy()
# fiber 0 is the untranslated single fiber; every further fiber is shifted by a multiple of delta
for fiber_index in range(1, n_fibers):
    cg_fibers_unique_chainID = openFiber.combine_molecules(
        cg_fibers_unique_chainID,
        _shifted_fiber_copy(single_cg_fiber_unique_chainID, fiber_index),
        add_resSeq=False)
    cg_fibers = openFiber.combine_molecules(
        cg_fibers,
        _shifted_fiber_copy(single_cg_fiber, fiber_index),
        add_serial=False, add_resSeq=False)

# move center to (0, 0, 0)
cg_fibers = openFiber.move_complex_to_center(cg_fibers)
cg_fibers_unique_chainID = openFiber.move_complex_to_center(cg_fibers_unique_chainID)

cg_fibers_unique_chainID = openFiber.change_unique_chainID(cg_fibers_unique_chainID)

# renumber the rows of both tables as 0..N-1
cg_fibers_unique_chainID.index = list(range(cg_fibers_unique_chainID.shape[0]))
cg_fibers.index = list(range(cg_fibers.shape[0]))

n_cg_atoms = cg_fibers.shape[0]

# replace NaN with ''
cg_fibers_unique_chainID = cg_fibers_unique_chainID.fillna('')
cg_fibers = cg_fibers.fillna('')

# write the pdb (no unique chainID) and the csv (unique chainID) of the multi-fiber system
cg_fibers_output_dir = '%s/cg-fibers' % fibers_main_output_dir
cg_fibers_pdb_path = cg_fibers_output_dir + '/cg_fibers.pdb'
ffCalpha.writePDB(cg_fibers, cg_fibers_pdb_path)
cg_fibers_unique_chainID.to_csv(cg_fibers_output_dir + '/cg_fibers_unique_chainID.csv', index=False)

# 2 Set up OpenMM simulations

## 2.1 Set up the system, protein and dna objects

In [4]:
# work from the cg-fibers output directory of the multi-fiber system
cg_fibers_dir = '%s/cg-fibers' % fibers_main_output_dir
os.chdir(cg_fibers_dir)

# the topology is taken from the multi-fiber pdb written earlier
pdb = simtk.openmm.app.PDBFile(cg_fibers_pdb_path)
top = pdb.getTopology()

# the coordinates are not taken from the pdb: load the single-fiber coordinates from the dcd file ...
single_cg_fiber_pdb_path = single_fiber_main_output_dir + '/cg-fiber/cg_fiber.pdb'
single_fiber_coord = openFiber.load_coord_from_dcd(single_cg_fiber_pdb_path, single_fiber_dcd_path)
# ... and extend them to multiple fibers using the same displacement delta_r
fibers_coord = openFiber.get_fibers_coord_from_single_fiber_coord(single_fiber_coord, n_fibers, delta_r)

# save the coordinates of the multi-fiber system as an xyz file
xyz_file = cg_fibers_dir + '/fibers_coord_openmm.xyz'
openFiber.write_openmm_coord_xyz(fibers_coord, cg_fibers, xyz_file)

# create the openmm system from the C-alpha and 3SPN2 force field xml files
forcefield = simtk.openmm.app.ForceField(ffCalpha_xml_path, ff3SPN2.xml)
s = forcefield.createSystem(top)

In [5]:
# Build the DNA object without letting ff3SPN2 derive the topology itself (compute_topology=False).
# Per the original author's note, the stock open3SPN2 route builds one long DNA from the sequence of
# all the bases, which is convenient but may lead to some boundary effects. The DNA bonds, angles,
# stackings and dihedrals are therefore loaded from the single-fiber csv files and extended to all fibers.
dna = ff3SPN2.DNA.fromCoarsePandasDataFrame(pd_df=cg_fibers_unique_chainID, dna_type='B_curved', compute_topology=False, parse_config=True)

single_fiber_cg_dir = '%s/cg-fiber' % single_fiber_main_output_dir
single_fiber_dna_bonds = pd.read_csv(single_fiber_cg_dir + '/dna_bonds.csv')
single_fiber_dna_angles = pd.read_csv(single_fiber_cg_dir + '/dna_angles.csv')
single_fiber_dna_stackings = pd.read_csv(single_fiber_cg_dir + '/dna_stackings.csv')
single_fiber_dna_dihedrals = pd.read_csv(single_fiber_cg_dir + '/dna_dihedrals.csv')
single_fiber_dna_topo_dict = {
    'bond': single_fiber_dna_bonds,
    'angle': single_fiber_dna_angles,
    'stacking': single_fiber_dna_stackings,
    'dihedral': single_fiber_dna_dihedrals,
}
openFiber.add_topo_to_fibers_from_single_fiber_dna(dna, single_fiber_dna_topo_dict, n_fibers, n_cg_atoms_each_fiber)

# the protein sequence of the whole system is the single-fiber sequence (first line of the file)
# repeated once per fiber
single_fiber_protein_seq_path = single_fiber_cg_dir + '/protein_seq.txt'
with open(single_fiber_protein_seq_path, 'r') as ps:
    first_line = ps.readlines()[0]
single_fiber_protein_seq = first_line.rstrip()
fibers_protein_seq = single_fiber_protein_seq*n_fibers

protein = ffCalpha.Protein.fromCoarsePandasDataFrame(pd_df=cg_fibers_unique_chainID, sequence=fibers_protein_seq)

# both objects are flagged as non-periodic
dna.periodic = False
protein.periodic = False


use the new set of base step geometry parameters


In [6]:
# create rigid identity list for the fiber
if apply_rigid_body:
    # Building the rigid identity list is not implemented yet. Previously this branch was a bare
    # `pass`, which left fibers_rigid_identity undefined and only failed later with a NameError
    # when the forces or rigid bodies were set up. Fail here with an explicit message instead.
    raise NotImplementedError('apply_rigid_body=True is not supported yet: '
                              'building fibers_rigid_identity is still to be implemented')
else:
    # no rigid bodies: no atom is assigned to any rigid body
    fibers_rigid_identity = [None]*n_cg_atoms

# get exclusions list
# NOTE(review): the single-fiber DNA exclusions are loaded but not used below in this cell;
# the multi-fiber DNA exclusions are rebuilt from the dna object instead.
single_fiber_dna_exclusions_list = openFiber.load_exclusions_list('%s/cg-fiber/dna_exclusions.dat' % single_fiber_main_output_dir)
if compare_with_lammps:
    # if compare with lammps, then openmm needs to compute electrostatic and MJ potential for protein native pairs
    print('Compare with lammps, so we need to compute electrostatic and MJ potential for protein native pairs')
    single_fiber_protein_exclusions_list_file = '%s/cg-fiber/protein_exclusions_compare_with_lammps.dat' % single_fiber_main_output_dir
else:
    single_fiber_protein_exclusions_list_file = '%s/cg-fiber/protein_exclusions.dat' % single_fiber_main_output_dir
# only the file name depends on compare_with_lammps, so load once after the branch
single_fiber_protein_exclusions_list = openFiber.load_exclusions_list(single_fiber_protein_exclusions_list_file)

# since there are exclusions between W-C paired basepairs, we cannot simply generalize exclusions from single fiber DNA exclusions
fibers_dna_exclusions_list = ff3SPN2.buildDNANonBondedExclusionsList(dna)
# protein exclusions are extended from the single-fiber list to all fibers
fibers_protein_exclusions_list = openFiber.extend_exclusions(single_fiber_protein_exclusions_list, n_fibers, n_cg_atoms_each_fiber)

print('total number of exclusions between DNA atoms is %d' % len(fibers_dna_exclusions_list))
print('total number of exclusions between protein atoms is %d' % len(fibers_protein_exclusions_list))


Compare with lammps, so we need to compute electrostatic and MJ potential for protein native pairs
total number of exclusions between DNA atoms is 215168
total number of exclusions between protein atoms is 11496


## 2.2 Set up forces for histones and dna

In [7]:
# scale_factor (scale factor for all the SBM related potentials) is taken from the global
# parameters in the first cell. It used to be re-assigned here, which silently overrode any
# change made in the configuration cell.

# set force dictionary
forces = {}

# load smog data
single_fiber_smog_bonds_file_path = '%s/bonds.dat' % single_fiber_smog_output_dir
single_fiber_smog_angles_file_path = '%s/angles.dat' % single_fiber_smog_output_dir
single_fiber_smog_dihedrals_file_path = '%s/dihedrals_IDR_removed.dat' % single_fiber_smog_output_dir
single_fiber_smog_exclusions_file_path = '%s/exclusions_IDR_removed.dat' % single_fiber_smog_output_dir
single_fiber_smog_pairs_file_path = '%s/pairs_IDR_removed.dat' % single_fiber_smog_output_dir

single_fiber_smog_bonds_data = openFiber.load_smog_bonds(single_fiber_smog_bonds_file_path)
single_fiber_smog_angles_data = openFiber.load_smog_angles(single_fiber_smog_angles_file_path)
single_fiber_smog_dihedrals_data = openFiber.load_smog_dihedrals(single_fiber_smog_dihedrals_file_path)
single_fiber_smog_exclusions_data = openFiber.load_smog_exclusions(single_fiber_smog_exclusions_file_path)
single_fiber_smog_pairs_data = openFiber.load_smog_pairs(single_fiber_smog_pairs_file_path)

# extend every single-fiber smog table to the multi-fiber system
fibers_smog_bonds_data = openFiber.extend_single_fiber_to_fibers_bonds(single_fiber_smog_bonds_data, n_fibers, n_cg_atoms_each_fiber)
fibers_smog_angles_data = openFiber.extend_single_fiber_to_fibers_angles(single_fiber_smog_angles_data, n_fibers, n_cg_atoms_each_fiber)
fibers_smog_dihedrals_data = openFiber.extend_single_fiber_to_fibers_dihedrals(single_fiber_smog_dihedrals_data, n_fibers, n_cg_atoms_each_fiber)
# NOTE(review): fibers_smog_exclusions_data is computed but not included in fibers_smog_data below;
# confirm whether that is intentional.
fibers_smog_exclusions_data = openFiber.extend_single_fiber_to_fibers_exclusions(single_fiber_smog_exclusions_data, n_fibers, n_cg_atoms_each_fiber)
fibers_smog_pairs_data = openFiber.extend_single_fiber_to_fibers_pairs(single_fiber_smog_pairs_data, n_fibers, n_cg_atoms_each_fiber)

fibers_smog_data = dict(bonds=fibers_smog_bonds_data, 
                        angles=fibers_smog_angles_data, 
                        dihedrals=fibers_smog_dihedrals_data, 
                        pairs=fibers_smog_pairs_data)

# add dna, protein interactions to the system s; the forces dictionary is passed in to be filled
openFiber.add_protein_dna_forces(s, forces, dna, protein, fibers_smog_data, fibers_dna_exclusions_list, fibers_protein_exclusions_list, fibers_rigid_identity, scale_factor)


Add protein-protein forces
Protein-protein forces are derived from SMOG output parameters
adding force BondProtein takes 0.007968 seconds
adding force AngleProtein takes 0.012203 seconds
adding force DihedralProtein takes 0.045069 seconds
adding force NativePairProtein takes 0.042465 seconds
adding force NonbondedMJ takes 0.277008 seconds
adding force ElectrostaticsProteinProtein takes 0.171374 seconds
Add DNA-DNA forces
DNA-DNA forces are based on 3SPN2 forcefield
force name Bond is updated to BondDNA
adding force BondDNA takes 0.477162 seconds
force name Angle is updated to AngleDNA
adding force AngleDNA takes 0.601867 seconds
adding force Stacking takes 0.156779 seconds
force name Dihedral is updated to DihedralDNA
adding force DihedralDNA takes 0.664953 seconds
adding force BasePair takes 0.907913 seconds
adding force CrossStacking takes 5.704732 seconds
force name Exclusion is updated to ExclusionDNADNA
adding force ExclusionDNADNA takes 1.677835 seconds
force name Electrostatics 

## 2.3 Set up rigid body

In [8]:
if apply_rigid_body:
    # one (initially empty) list of atom indices per nucleosome in the whole system
    rigid_body_list = [[] for _ in range(n_nucl_each_fiber*n_fibers)]

    # fibers_rigid_identity[i] is the index of the rigid body that atom i belongs to,
    # or None when atom i is not part of any rigid body
    for atom_index, rigid_body_index in enumerate(fibers_rigid_identity):
        if rigid_body_index is not None:  # identity test: the idiomatic way to compare against None
            rigid_body_list[rigid_body_index].append(atom_index)

    # NOTE(review): this relies on openFiber exposing the rigid module as an attribute - confirm
    openFiber.rigid.createRigidBodies(s, fibers_coord, rigid_body_list)

## 2.4 Run the simulation

In [9]:
# Langevin dynamics settings
temperature = 300*simtk.openmm.unit.kelvin
friction_coeff = 1/simtk.openmm.unit.picosecond
timestep = 10*simtk.openmm.unit.femtoseconds

integrator = simtk.openmm.LangevinIntegrator(temperature, friction_coeff, timestep)
platform = simtk.openmm.Platform.getPlatformByName(platform_name)

# on CUDA, request double precision through the platform properties
if platform_name == 'CUDA':
    properties = {'Precision':'double'}
    simulation_args = (top, s, integrator, platform, properties)
else:
    simulation_args = (top, s, integrator, platform)
simulation = simtk.openmm.app.Simulation(*simulation_args)

simulation.context.setPositions(fibers_coord)

# total potential energy of the starting configuration, reported in kcal/mol
energy_unit = simtk.openmm.unit.kilocalories_per_mole
state = simulation.context.getState(getEnergy=True)
energy = state.getPotentialEnergy().value_in_unit(energy_unit)
print("The overall energy is %.6f %s" % (energy, energy_unit.get_symbol()))

# Energy of each force group for the same starting configuration (no simulation step has been
# taken yet); useful to double check SBM pair, nonbonded, and electrostatic interactions.
# The group <-> force name table is read from the single-fiber output directory.
df_forces_name_group = pd.read_csv('%s/forces_name_group.csv' % single_fiber_main_output_dir)
for group, force_name in zip(df_forces_name_group['group'], df_forces_name_group['name']):
    group = int(group)
    state = simulation.context.getState(getEnergy=True, groups={group})
    energy = state.getPotentialEnergy().value_in_unit(energy_unit)
    print('Group: %d, force name: %s, energy = %.6f %s' % (group, force_name, energy, energy_unit.get_symbol()))

The overall energy is -10595.154567 kcal/mol
Group: 1, force name: BondProtein, energy = 1144.172831 kcal/mol
Group: 2, force name: AngleProtein, energy = 826.655314 kcal/mol
Group: 3, force name: DihedralProtein, energy = 220.123851 kcal/mol
Group: 4, force name: NativePairProtein, energy = -4132.224792 kcal/mol
Group: 5, force name: NonbondedMJ, energy = -675.513580 kcal/mol
Group: 6, force name: ElectrostaticsProteinProtein, energy = 342.054816 kcal/mol
Group: 7, force name: BondDNA, energy = 561.964043 kcal/mol
Group: 8, force name: AngleDNA, energy = 1265.868105 kcal/mol
Group: 9, force name: Stacking, energy = -3385.887155 kcal/mol
Group: 10, force name: DihedralDNA, energy = -3723.297452 kcal/mol
Group: 11, force name: BasePair, energy = -2061.680631 kcal/mol
Group: 12, force name: CrossStacking, energy = -458.549766 kcal/mol
Group: 13, force name: ExclusionDNADNA, energy = 5.591238 kcal/mol
Group: 14, force name: ElectrostaticsDNADNA, energy = 201.328318 kcal/mol
Group: 15, for

In [10]:
# Snapshot the current context (positions, velocities, forces, energy and parameters are requested).
state = simulation.context.getState(getPositions=True, getVelocities=True, getForces=True, getEnergy=True,
                                    getParameters=True, enforcePeriodicBox=False)

# Serialize the system, the integrator and the state to xml files in sim_output_dir:
#   system.xml     - the system, which contains all of the force field parameters
#   integrator.xml - the configuration for the integrator
#   state.xml      - the state captured above
xml_outputs = (
    ('system.xml', s),
    ('integrator.xml', integrator),
    ('state.xml', state),
)
for xml_name, xml_object in xml_outputs:
    with open('%s/%s' % (sim_output_dir, xml_name), 'w') as f:
        f.write(simtk.openmm.XmlSerializer.serialize(xml_object))


In [11]:
if run_sim:
    # do the energy minimization and time it
    start_time = time.time()
    simulation.minimizeEnergy()
    end_time = time.time()
    delta_time = end_time - start_time
    print('energy minimization takes %.2f seconds' % delta_time)

    # total potential energy after minimization
    energy_unit = simtk.openmm.unit.kilocalories_per_mole
    state = simulation.context.getState(getEnergy=True)
    energy = state.getPotentialEnergy().value_in_unit(energy_unit)
    print('energy minimized')
    print('energy = %.6f %s' % (energy, energy_unit.get_symbol()))

    # per-force energies after minimization, keyed by force name
    # (the dictionary was never initialised before, so this loop raised a NameError)
    energies = {}
    for force_name, force in forces.items():
        group = force.getForceGroup()
        state = simulation.context.getState(getEnergy=True, groups={group})
        energies[force_name] = state.getPotentialEnergy().value_in_unit(energy_unit)
        print('force group %d, force name %s, energy %.6f %s' % (group, force_name, energies[force_name], energy_unit.get_symbol()))

    # run simulation: start from velocities drawn at the target temperature
    simulation.context.setVelocitiesToTemperature(temperature)

    # add simulation reporters: trajectory to output.dcd and state data to stdout
    report_interval = 500  # number of steps between two reports
    dcd_reporter = simtk.openmm.app.DCDReporter('%s/output.dcd' % sim_output_dir, report_interval)
    energy_reporter = simtk.openmm.app.StateDataReporter(sys.stdout, report_interval, step=True, time=True, potentialEnergy=True, kineticEnergy=True, totalEnergy=True, temperature=True, speed=True)
    simulation.reporters.append(dcd_reporter)
    simulation.reporters.append(energy_reporter)

    # production run, timed
    start_time = time.time()
    n_steps = 50000
    simulation.step(n_steps)
    end_time = time.time()
    delta_time = end_time - start_time
    print('%d steps takes %.2f seconds' % (n_steps, delta_time))
