## Synthesis Pipeline

Testing the synthesis pipeline to make sure it works as expected.

In [12]:

# Container for the synthesis models built further down in the notebook.
synthesis_models = []

# Simulation parameters
# Alternative sweep: coupling_rates = [0.9, 0.925, 0.95, 0.975, 0.99]
coupling_rates = [0.99]    # one pair of models is built per rate and repeat
sim_repeats = 1            # number of times each coupling rate is repeated
strand_length = 200        # forwarded to the models as strand_length
strand_repeats = 100_000   # forwarded to the models as repeats


In [13]:

from datetime import datetime
import os

# Every run gets its own directory under runs/, named after its start time.
# Colons are replaced with dots, presumably because they are not allowed in
# Windows paths.
timestamp = str(datetime.now()).replace(':', '.')
preceeding_path = os.path.join('runs', timestamp)
# makedirs (unlike mkdir) also creates the parent 'runs' directory when it is
# missing, so the notebook works from a fresh checkout. It still raises if the
# run directory itself already exists.
os.makedirs(preceeding_path)

synthesized_strands_write_path = os.path.join(preceeding_path, 'synthesized.fasta')
original_strand_write_path = os.path.join(preceeding_path, 'original_strands.txt')
parameters_path = os.path.join(preceeding_path, 'parameters.txt')

# Start both strand files empty; later cells append to / write into them.
for empty_file_path in (original_strand_write_path, synthesized_strands_write_path):
    with open(empty_file_path, 'w') as f:
        f.write("")

# Record the parameters this run was launched with next to its outputs.
with open(parameters_path, 'w') as f:
    f.write(f"\nRun on {timestamp}\n")
    f.write(f"Coupling Rates = {coupling_rates}\n")
    f.write(f"Simulation Repeats = {sim_repeats}\n")
    f.write(f"Strand repeats = {strand_repeats}\n")
    f.write(f"Strand length = {strand_length}\n")

In [14]:

from synthesis import NaiveSynthesisModel

# Build all the synthesis models: for every coupling rate and every repeat,
# one model with capping enabled followed by one with capping disabled.
# The list is rebuilt from scratch (rather than appended to) so that
# re-running this cell does not accumulate duplicate models.
synthesis_models = [
    NaiveSynthesisModel(
        coupling_rate, strand_length=strand_length, repeats=strand_repeats,
        capping=capping, write_file=False)
    for coupling_rate in coupling_rates
    for _ in range(sim_repeats)
    for capping in (True, False)
]

### Writing original strands to the file

In [15]:
# Append every model's original strand to the originals file.
# The file is opened once for the whole loop instead of once per model.
with open(original_strand_write_path, 'a') as f:
    for model in synthesis_models:
        # Entry layout: "<strand_id> <coupling_rate> <capping>", then the
        # strand on its own line, then a blank separator line.
        f.write(
            f'{model.strand_id} {model.coupling_rate} {model.capping}\n{model.strand}\n\n')

### Writing synthesized strands to the file

In [16]:
# Synthesise strands and write them
from tqdm import tqdm

# Maximum number of strands written to a single chunk file.
strands_per_file = 9000
# PBSim does not accept strands that are less than 100 bases long.
min_strand_length = 100

# One entry per model: the strands returned by that model's simulation.
synthesized_strands_arr = []

for model in tqdm(synthesis_models):
    synthesized_strands = model.simulate_synthesis(synthesized_strands_write_path)
    strand_id = model.strand_id

    # Keep the strands so the validation cells below have data to inspect.
    synthesized_strands_arr.append(synthesized_strands)

    # Each model's strands are split over several chunk files. Stepping across
    # the full length means the final, possibly shorter, chunk is written too
    # instead of being silently dropped.
    chunk_starts = range(0, len(synthesized_strands), strands_per_file)

    for i, start in enumerate(chunk_starts):
        strands_ = synthesized_strands[start:start + strands_per_file]
        write_path = os.path.join(preceeding_path, f'synthesized_{strand_id}_{i}.fasta')
        with open(write_path, 'w') as f:
            for strand in strands_:
                # Skip strands PBSim would reject for being too short.
                if len(strand) < min_strand_length:
                    continue
                f.write(f">{strand_id}\n")
                f.write(strand + '\n\n')

100%|██████████| 2/2 [00:14<00:00,  7.48s/it]


### Validating Synthesis

In [None]:
# For each entry of synthesized_strands_arr, the length of every strand in it.
strand_lengths_arr = [list(map(len, strands)) for strands in synthesized_strands_arr]

In [55]:

import matplotlib.pyplot as plt
import numpy as np

# For each list of strand lengths, print its mean, maximum and standard
# deviation, one value per line and in that order.
for lengths in strand_lengths_arr:
    for statistic in (np.mean, np.max, np.std):
        print(statistic(lengths))



3.883
34
4.48991213722496
160.084
175
5.625739418067638
5.509
52
6.274704694246575
170.261
185
5.085162632600849
9.008
65
9.274262019158183
180.256
193
4.330180596695709
19.248
144
19.492370199644782
189.999
199
3.0677351580604215
82.844
200
65.7524422664284
197.964
200
1.4285321137447349


## Running PBSim

In [6]:
# Shell escape: runs the `pbsim` command with no arguments. The command must
# be resolvable by the shell; the recorded output below shows it was not
# found on the machine where this notebook was last run.
!pbsim

'pbsim' is not recognized as an internal or external command,
operable program or batch file.
