In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! pip install -q condacolab
! pip install biopython

Collecting biopython
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.83


In [3]:
import condacolab
import os
from Bio import SeqIO
import random
import numpy as np

In [4]:
n_ref = 10
dataset = 'zymo'

In [None]:
condacolab.install()

✨🍰✨ Everything looks OK!


In [None]:
!conda --version

conda 23.11.0


In [None]:
!conda create --name simlord_env python=3.6.5

In [None]:
%%bash
source activate simlord_env

In [None]:
! conda install -c bioconda simlord

Channels:
 - bioconda
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): - \ | / - \ | / - \ | / done
Solving environment: \ | / - done


    current version: 23.11.0
    latest version: 24.1.2

Please update conda by running

    $ conda update -n base -c conda-forge conda



# All requested packages already installed.



In [None]:
! mkdir ./data
! cp -r /content/drive/MyDrive/FYP/FYP/datasets/zymo\ reference/* ./data

mkdir: cannot create directory ‘./data’: File exists


In [None]:
ref_files = os.listdir('./data')
ref_files

['Staphylococcus_aureus_complete_genome.fasta',
 'Bacillus_subtilis_complete_genome.fasta',
 'Escherichia_coli_complete_genome.fasta',
 'Salmonella_enterica_complete_genome.fasta',
 'Cryptococcus_neoformans_JEC21_genome.fasta',
 'Pseudomonas_aeruginosa_complete_genome.fasta',
 'Lactobacillus_fermentum_complete_genome.fasta',
 'Saccharomyces_cerevisiae_YJM1307_genome.fasta',
 'Listeria_monocytogenes_complete_genome.fasta',
 'Enterococcus_faecalis_complete_genome.fasta']

In [None]:
!mkdir ./output

mkdir: cannot create directory ‘./output’: File exists


In [None]:
import subprocess

# Set the values for the loop
for ref in ref_files:  # Adjust the range as
    genome = ref.split('.')[0]
    ref_fasta = f"./data/{str(genome)[::-6]}"
    ref_ = f"./output/sim-{genome}-{n_ref}"

    # Build the command
    command = [
        "simlord",
        "--no-sam",
        "-rr",
        ref_fasta,
        "-n",
        "50000",
        "-fl",
        "5000",
        "-pi",
        "0.11",
        "-pd",
        "0.04",
        "-ps",
        "0.01",
        ref_
    ]

    # Run the command
    subprocess.run(command)

In [None]:
# ! simlord --no-sam -rr msa_combined.fasta -n 500000 -fl 5000 -pi 0.11 -pd 0.04 -ps 0.01  sim-15

In [7]:
! cp -r /content/drive/MyDrive/FYP/FYP/datasets/simlord/output/sim_10 ./output

In [8]:
output_files= os.listdir('./output')
output_files

['sim-Salmonella_enterica_complete_genome-10.fastq',
 'sim-Escherichia_coli_complete_genome-10.fastq',
 'sim-Saccharomyces_cerevisiae_YJM1307_genome-10.fastq',
 'sim-Staphylococcus_aureus_complete_genome-10.fastq',
 'sim-Bacillus_subtilis_complete_genome-10.fastq',
 'sim-Cryptococcus_neoformans_JEC21_genome-10.fastq',
 'sim-Enterococcus_faecalis_complete_genome-10.fastq',
 'sim-Pseudomonas_aeruginosa_complete_genome-10.fastq',
 'sim-Listeria_monocytogenes_complete_genome-10.fastq',
 'sim-Lactobacillus_fermentum_complete_genome-10.fastq']

In [9]:
shuffled_records = []

# Read records from each fastq file and shuffle
for fastq_file in output_files:
    if not fastq_file.startswith('sim'): continue
    records = list(SeqIO.parse(f'./output/{fastq_file}', "fastq"))
    random.shuffle(records)
    shuffled_records.extend(records)

random.shuffle(shuffled_records)

In [10]:
len(shuffled_records)

500000

In [12]:
fasta_file = f"combined_sim_{n_ref}.fasta"
labels = []

with open(fasta_file, "w") as fasta_output:
    for index, record in enumerate(shuffled_records, start=1):
        fasta_output.write(f">seq{index}\n{record.seq}\n")
        labels.append(record.description.split(';')[3].split('=')[-1])

In [13]:
np.save(f'sim_{n_ref}_labels.npy',np.array(labels))

In [16]:
labels_np = np.load(f'sim_{n_ref}_labels.npy')
labels_np.shape

(500000,)

In [None]:
import shutil
# shutil.copy('/content/drive/MyDrive/fyp code/msa_combined_reference.fasta','msa_combined.fasta')
shutil.move(f"combined_sim_{n_ref}.fasta",f'/content/drive/MyDrive/fyp code/sim-{n_ref}.fasta')

'/content/drive/MyDrive/fyp code/sim-15.fastq'

In [15]:
! cp -r ./combined_sim_10.fasta /content/drive/MyDrive/FYP/FYP/datasets/simlord/output/sim_10