# Analyze the Fraction of Nucleosomes Bound by HP1

In [1]:
import os
import sys

import numpy as np
import pandas as pd

In [2]:
cwd = os.getcwd()
parent_dir = cwd + "/../.."
sys.path.insert(1, parent_dir)

### Identify simulation output directory

In [3]:
output_dir = "../../output"
simulation_dirs = os.listdir(output_dir)
print(simulation_dirs)

['simulations.csv', '.DS_Store', 'sim_4', 'sim_3', 'sim_2', 'sim_5', 'sim_14', 'sim_13', 'sim_12', 'sim_15', 'semiflexible_100000.ipynb', 'sim_9', 'sim_7', 'sim_6', 'sim_1', 'sim_8', 'sim_10', 'sim_17', 'sim_18', 'sim_16', 'sim_11']


In [4]:
simulation_dirs = [dir for dir in simulation_dirs if os.path.isdir(f"{output_dir}/{dir}")]
print(simulation_dirs)

['sim_4', 'sim_3', 'sim_2', 'sim_5', 'sim_14', 'sim_13', 'sim_12', 'sim_15', 'sim_9', 'sim_7', 'sim_6', 'sim_1', 'sim_8', 'sim_10', 'sim_17', 'sim_18', 'sim_16', 'sim_11']


### Load simulation file paths

In [5]:
snapshots = {}
dir_paths = {}
binders = {}
num_equilibration = 2
for dir in simulation_dirs:
    dir_path = f"{output_dir}/{dir}"
    dir_paths[dir] = dir_path
    files = os.listdir(dir_path)
    files_filtered = []
    for file in files:
        file_path = f"{dir_path}/{file}"
        if  os.path.isfile(file_path):
            if file.startswith("Chr-") and file.endswith(".csv"):
                if int(file.split("-")[2].split(".")[0]) >= num_equilibration:
                    files_filtered.append(file_path)
            if file == "binders":
                binders[dir] = dir_path + "/" + file
    snapshots[dir] = files_filtered

### Load reader proteins and chemical potentials

In [12]:
binder_names = {}
chemical_potentials = {}
for sim in binders.keys():
    binder_names_lst = []
    chemical_potentials_lst = []
    binder_path = binders[sim]
    binder_data = pd.read_csv(binder_path, header=0, index_col=0)

    for i in range(binder_data.shape[0]):
        binder_names_lst.append(binder_data.name.to_numpy()[i])
        chemical_potentials_lst.append(str(binder_data.chemical_potential.to_numpy()[i]))
        binder_names[sim] = binder_names_lst
        chemical_potentials[sim] = chemical_potentials_lst

print(binder_names)
print(chemical_potentials)

{'sim_4': ['HP1'], 'sim_3': ['null_mark'], 'sim_2': ['HP1'], 'sim_5': ['HP1'], 'sim_14': ['null_mark'], 'sim_13': ['null_mark'], 'sim_12': ['null_mark'], 'sim_15': ['null_mark'], 'sim_9': ['null_mark'], 'sim_7': ['null_mark'], 'sim_6': ['HP1'], 'sim_1': ['null_mark'], 'sim_8': ['null_mark'], 'sim_10': ['null_mark'], 'sim_17': ['null_mark'], 'sim_18': ['null_mark'], 'sim_16': ['null_mark'], 'sim_11': ['null_mark']}
{'sim_4': ['-1.0'], 'sim_3': ['0.0'], 'sim_2': ['-1.0'], 'sim_5': ['-1.0'], 'sim_14': ['0.0'], 'sim_13': ['0.0'], 'sim_12': ['0.0'], 'sim_15': ['0.0'], 'sim_9': ['0.0'], 'sim_7': ['0.0'], 'sim_6': ['-1.0'], 'sim_1': ['0.0'], 'sim_8': ['0.0'], 'sim_10': ['0.0'], 'sim_17': ['0.0'], 'sim_18': ['0.0'], 'sim_16': ['0.0'], 'sim_11': ['0.0']}


### Load Sequence of Binding States

In [13]:
for sim in binder_names.keys():
    for j, binder in enumerate(binder_names[sim]):
        chemical_potential = chemical_potentials[sim][j]
        for i, snapshot in enumerate(snapshots[sim]):
            if i == 0:
                binding_pattern = pd.read_csv(snapshot, skiprows=1, header=0, usecols=[binder]).to_numpy().flatten()
            else:
                binding_pattern += pd.read_csv(snapshot, skiprows=1, header=0, usecols=[binder]).to_numpy().flatten()
        avg_binding_pattern = binding_pattern / len(snapshots[sim])
        save_path = f"{dir_paths[sim]}/average_binding_states_{binder}_{chemical_potential.replace('-', 'm').replace('.', 'd')}.csv"
        np.savetxt(save_path, avg_binding_pattern)

  if __name__ == '__main__':
  if __name__ == '__main__':


ValueError: Usecols do not match columns, columns expected but not found: ['HP1']