In [1]:
# Mount Google Drive at /content/drive; every dataset and output path below lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! pip install pysam

In [None]:
import subprocess
import os

import pysam

import pandas as pd


In [2]:
# --- Experiment configuration -------------------------------------------------
dataset = 'simlord'        # dataset name
filename = 'sim_10.fasta'  # reads file inside the dataset folder

version = 2                # experiment version; use 1 when there is no versioning
bin_count = 10             # number of bins (simulated datasets only); None when not applicable

# Only the simulated 'simlord' dataset is organised per bin count; any other dataset has no bin folder.
bins = None
if dataset == 'simlord':
    bins = bin_count

# Setup and run minimap2

In [6]:
# Drive folder holding the input reads: datasets/<dataset>/bin_<bins>/v<version>
# (the bin segment collapses to a bare '/' when bins is None).
bin_segment = '/' if bins is None else '/bin_' + str(bins)
input_path = f"/content/drive/MyDrive/FYP/FYP/datasets/{dataset}{bin_segment}/v{version}"

In [7]:
# Copy the selected reads file from Drive into the current directory under the fixed name reads.fasta.
! cp $input_path/$filename reads.fasta

In [8]:
!mkdir ref
!cp -r /content/drive/MyDrive/FYP/FYP/datasets/zymo\ reference/* ./ref

In [None]:
# Clone the minimap2 source into ./minimap2; it is compiled with make in a following cell.
!git clone https://github.com/lh3/minimap2

Cloning into 'minimap2'...
remote: Enumerating objects: 5563, done.[K
remote: Counting objects: 100% (1648/1648), done.[K
remote: Compressing objects: 100% (325/325), done.[K
remote: Total 5563 (delta 1449), reused 1399 (delta 1322), pack-reused 3915[K
Receiving objects: 100% (5563/5563), 1.77 MiB | 6.69 MiB/s, done.
Resolving deltas: 100% (4017/4017), done.


In [None]:
# Enter the cloned repository so the build in the next cell runs there.
%cd minimap2

/content/minimap2


In [None]:
# Compile minimap2 in the current directory (the minimap2 checkout entered above).
!make

cc -c -g -Wall -O2 -Wc++-compat  -DHAVE_KALLOC  main.c -o main.o
cc -c -g -Wall -O2 -Wc++-compat  -DHAVE_KALLOC  kthread.c -o kthread.o
cc -c -g -Wall -O2 -Wc++-compat  -DHAVE_KALLOC  kalloc.c -o kalloc.o
cc -c -g -Wall -O2 -Wc++-compat  -DHAVE_KALLOC  misc.c -o misc.o
cc -c -g -Wall -O2 -Wc++-compat  -DHAVE_KALLOC  bseq.c -o bseq.o
cc -c -g -Wall -O2 -Wc++-compat  -DHAVE_KALLOC  sketch.c -o sketch.o
cc -c -g -Wall -O2 -Wc++-compat  -DHAVE_KALLOC  sdust.c -o sdust.o
cc -c -g -Wall -O2 -Wc++-compat  -DHAVE_KALLOC  options.c -o options.o
cc -c -g -Wall -O2 -Wc++-compat  -DHAVE_KALLOC  index.c -o index.o
[01m[Kindex.c:[m[K In function ‘[01m[Kmm_idx_load[m[K’:
  520 |                 [01;35m[Kfread(&l, 1, 1, fp)[m[K;
      |                 [01;35m[K^~~~~~~~~~~~~~~~~~~[m[K
  523 |                         [01;35m[Kfread(s->name, 1, l, fp)[m[K;
      |                         [01;35m[K^~~~~~~~~~~~~~~~~~~~~~~~[m[K
  526 |                 [01;35m[Kfread(&s->len, 4, 1,

In [None]:
# Leave the minimap2 checkout and enter ref/: the next cells list and concatenate the genomes
# using paths relative to the current directory.
%cd ..
%cd ref

/content
/content/ref


In [None]:
# List every entry of the current directory (ref/); these names are concatenated into the reference next.
genomes = os.listdir()
genomes

In [None]:

# Build the single multi-FASTA reference. It is named combined_ref.fasta because that is the
# path handed to minimap2 (ref/combined_ref.fasta).
combined_name = "combined_ref.fasta"

# Skip the output file itself (it is in the listing when the cells are re-run) and anything that
# is not a regular file, so the reference is never concatenated into itself.
source_files = [g for g in genomes if g != combined_name and os.path.isfile(g)]

# Plain byte-for-byte concatenation (same result as `cat`), done in Python so file names with
# spaces need no shell quoting and any I/O error raises instead of being silently ignored.
with open(combined_name, "wb") as combined:
    for genome in source_files:
        with open(genome, "rb") as src:
            combined.write(src.read())

In [None]:
# Return to the parent directory; the alignment cell below uses paths relative to it
# (./minimap2/minimap2, ref/..., reads.fasta).
%cd ..

/content


In [None]:
# Align reads.fasta against the combined reference using the map-ont preset (-x map-ont) and
# write the alignments in SAM format (-a) to result.sam.
# NOTE(review): the dataset is named 'simlord'; confirm map-ont is the intended preset for these simulated reads.
! ./minimap2/minimap2 -ax map-ont ref/combined_ref.fasta reads.fasta > result.sam

[M::mm_idx_gen::2.847*1.13] collected minimizers
[M::mm_idx_gen::3.700*1.29] sorted minimizers
[M::main::3.700*1.29] loaded/built the index for 42 target sequence(s)
[M::mm_mapopt_update::3.922*1.28] mid_occ = 40
[M::mm_idx_stat] kmer size: 15; skip: 10; is_hpc: 0; #seq: 42
[M::mm_idx_stat::4.068*1.26] distinct minimizers: 10278012 (91.52% are singletons); average occurrences: 1.146; average spacing: 5.348; total length: 62975088
[M::worker_pipeline::414.639*1.17] mapped 100000 sequences
[M::worker_pipeline::812.756*1.16] mapped 100000 sequences
[M::worker_pipeline::1239.596*1.14] mapped 100000 sequences
[M::worker_pipeline::1655.892*1.13] mapped 100000 sequences
[M::worker_pipeline::2075.416*1.12] mapped 100000 sequences
[M::main] Version: 2.26-r1190-dirty
[M::main] CMD: ./minimap2/minimap2 -ax map-ont ref/combined_ref.fasta reads.fasta
[M::main] Real time: 2075.547 sec; CPU: 2319.365 sec; Peak RSS: 1.697 GB


In [None]:
# Base Drive folder for minimap2 outputs.
# Note: output_path is reassigned further down to the full per-run folder (dataset / bin / version).
output_path = '/content/drive/MyDrive/FYP/FYP/Tools Outputs/minimap2'

In [None]:
!cp /content/result.sam "$output_path"/$dataset/v$version/result.sam

In [None]:
def sam_to_paf(sam_file_path, output_paf_path):
    """
    Convert a SAM file into a tab-separated, PAF-like text file (one line per alignment record).

    The output is NOT the standard 12-column PAF layout. Each line holds these 9 fields, in order:
        1. read (query) name
        2. query length as reported by pysam (``query_length``)
        3. alignment start on the reference, shifted to 1-based
        4. alignment end on the reference (pysam ``reference_end``)
        5. strand: '+' for forward, '-' for reverse-strand alignments
        6. reference sequence name
        7. pysam ``reference_length`` of the alignment
        8. CIGAR string
        9. mapping quality

    Any value pysam reports as None (e.g. the reference name of a record with no reference)
    is written as the literal text 'None'.

    Progress is printed every 100000 records.

    Parameters: sam_file_path (str) - path to the SAM file
                output_paf_path (str) - path to the output PAF file
    Returns: None
    """
    with pysam.AlignmentFile(sam_file_path, 'r') as samfile, \
            open(output_paf_path, 'w') as paf_file:
        # until_eof=True streams every record in file order and does not require an index.
        for count, alignment in enumerate(samfile.fetch(until_eof=True), start=1):
            if count % 100000 == 0:
                print(f"Processed {count} alignments")

            reference_name = samfile.get_reference_name(alignment.reference_id)
            # Report the real strand instead of a constant '+'.
            strand = '-' if alignment.is_reverse else '+'

            fields = (
                alignment.query_name,
                alignment.query_length,
                alignment.reference_start + 1,  # pysam positions are 0-based; emit 1-based
                alignment.reference_end,
                strand,
                reference_name,
                alignment.reference_length,
                alignment.cigarstring,
                alignment.mapping_quality,
            )
            paf_file.write('\t'.join(str(field) for field in fields) + '\n')

# Convert the minimap2 SAM in the current directory into the PAF-like file output.paf.
sam_input_path = 'result.sam'
paf_output_path = 'output.paf'

sam_to_paf(sam_input_path, paf_output_path)


Processed 100000 alignments
Processed 200000 alignments
Processed 300000 alignments
Processed 400000 alignments
Processed 500000 alignments


In [None]:
# Labels for the 9 tab-separated fields that sam_to_paf writes, in file order.
# NOTE(review): labels are kept unchanged for compatibility, but going by sam_to_paf the two
# '*_alignment_*' fields hold reference coordinates, 'flag' holds the strand symbol,
# 'reference_length' holds alignment.reference_length and '_' holds the CIGAR string.
cols = [
    'seq_id', 'query_length', 'query_alignment_start', 'query_alignment_end',
    'flag', 'name', 'reference_length', '_', 'mapping_quality',
]

# Generate dataframe using minimap2 output

In [None]:
# Load the PAF-like file written by sam_to_paf.
# keep_default_na=False keeps the literal text 'None' (written when pysam reports no value, e.g. no
# reference name) as a string. Recent pandas versions otherwise parse 'None' as NaN, which would
# drop those records from value_counts() and blank them in the exported CSV.
df_ = pd.read_csv('output.paf', sep='\t', header=None, names=cols, keep_default_na=False)

In [None]:
# Work on a copy so that later changes to df leave the freshly parsed frame df_ unchanged.
df = df_.copy()

In [None]:
# (rows, columns) of the parsed alignment table — one row per line of output.paf.
df.shape

(532408, 9)

In [None]:
# Preview the first rows to check that the fields landed in the expected columns.
df.head()

Unnamed: 0,seq_id,query_length,query_alignment_start,query_alignment_end,flag,name,reference_length,_,mapping_quality
0,seq1,5000,1070049,1074944,+,Lactobacillus_fermentum_complete_genome,4896,14M1I3M1I29M1D103M1I78M1I4M1I34M1D35M1I8M1I41M...,60
1,seq2,5000,1866191,1871049,+,Staphylococcus_aureus_chromosome,4859,49M1I46M1D47M1I14M1I36M1I32M1I42M1I47M1I34M1I7...,60
2,seq3,5000,1494110,1498898,+,Staphylococcus_aureus_chromosome,4789,4M1D34M1I31M1I3M1I15M1I13M1I12M1D6M1I60M1I6M1D...,60
3,seq4,5000,342507,347426,+,Lactobacillus_fermentum_complete_genome,4920,153M1I7M1I5M1I34M1I45M1I1M1I14M1I8M1I49M2I42M1...,60
4,seq5,5000,4219587,4224428,+,Salmonella_enterica_complete_genome,4842,91M1I18M1D2M1I105M1I18M1I2M1I17M1I114M1I31M1I2...,60


In [None]:
# Distinct values of the 'name' column (reference names as written by sam_to_paf), before any relabelling.
df['name'].unique()

array(['Lactobacillus_fermentum_complete_genome',
       'Staphylococcus_aureus_chromosome',
       'Salmonella_enterica_complete_genome',
       'BS.pilon.polished.v3.ST170922',
       'Listeria_monocytogenes_complete_genome',
       'Enterococcus_faecalis_complete_genome', 'CP006167.2',
       'CP005636.2', 'CP006287.1',
       'Pseudomonas_aeruginosa_complete_genome', 'NC_006682.1',
       'NC_006692.1', 'Escherichia_coli_chromosome', 'CP006391.1',
       'NC_006686.1', 'CP005542.2', 'NC_006694.1',
       'Escherichia_coli_plasmid', 'NC_006685.1', 'CP006075.2',
       'CP005440.1', 'NC_006684.1', 'CP005242.2', 'CP005144.1',
       'NC_006683.1', 'NC_006693.1', 'CP004803.2', 'NC_006680.1',
       'NC_006679.1', 'NC_006687.1', 'CP004708.2', 'CP005031.2',
       'CP004640.2', 'NC_006670.1', 'NC_006691.1', 'CP004458.2',
       'CP005339.2', 'CP004938.2', 'NC_006681.1',
       'Staphylococcus_aureus_plasmid1', 'None'], dtype=object)

In [None]:
# Number of alignment records per reference name, before any relabelling.
df['name'].value_counts()

Listeria_monocytogenes_complete_genome     51517
BS.pilon.polished.v3.ST170922              51313
Enterococcus_faecalis_complete_genome      51063
Staphylococcus_aureus_chromosome           50941
Lactobacillus_fermentum_complete_genome    50894
Salmonella_enterica_complete_genome        50849
Pseudomonas_aeruginosa_complete_genome     50340
Escherichia_coli_chromosome                49725
CP006391.1                                 31428
CP004708.2                                  6062
NC_006670.1                                 6024
NC_006685.1                                 5730
NC_006686.1                                 4727
NC_006684.1                                 4396
CP005242.2                                  4212
NC_006687.1                                 4102
NC_006691.1                                 3910
CP005636.2                                  3827
NC_006692.1                                 3535
CP005440.1                                  3509
CP006167.2          

In [None]:
# Partial accession -> species table. The next cell reassigns `mapping` with the complete table
# (which contains both of these entries), so this definition is superseded before `mapping` is used.
mapping = {
    'CP005144.1': 'Saccharomyces cerevisiae',
    'CP005031.2': 'Saccharomyces cerevisiae'
}

In [None]:
# Accession -> species table used to collapse per-sequence accessions into one label per organism.
# Grouped by species: every CP* accession maps to Saccharomyces cerevisiae and every NC_* accession
# maps to Cryptococcus neoformans.
mapping = {
    **dict.fromkeys(
        [
            'CP005144.1', 'CP006391.1', 'CP006287.1', 'CP005242.2',
            'CP006075.2', 'CP004803.2', 'CP005031.2', 'CP005440.1',
            'CP005542.2', 'CP004640.2', 'CP006167.2', 'CP005636.2',
            'CP004708.2', 'CP005339.2', 'CP004938.2', 'CP004458.2',
        ],
        'Saccharomyces cerevisiae',
    ),
    **dict.fromkeys(
        [
            'NC_006687.1', 'NC_006680.1', 'NC_006686.1', 'NC_006691.1',
            'NC_006683.1', 'NC_006670.1', 'NC_006693.1', 'NC_006694.1',
            'NC_006692.1', 'NC_006679.1', 'NC_006682.1', 'NC_006685.1',
            'NC_006684.1', 'NC_006681.1',
        ],
        'Cryptococcus neoformans',
    ),
}

In [None]:
# Relabel accession names with their species; names that are not keys of `mapping` stay as they are.
df = df.assign(name=df['name'].replace(mapping))

In [None]:
# Distinct labels after the accession -> species replacement; accessions listed in `mapping` should be gone.
df['name'].unique()

array(['Lactobacillus_fermentum_complete_genome',
       'Staphylococcus_aureus_chromosome',
       'Salmonella_enterica_complete_genome',
       'BS.pilon.polished.v3.ST170922',
       'Listeria_monocytogenes_complete_genome',
       'Enterococcus_faecalis_complete_genome',
       'Saccharomyces cerevisiae',
       'Pseudomonas_aeruginosa_complete_genome',
       'Cryptococcus neoformans', 'Escherichia_coli_chromosome',
       'Escherichia_coli_plasmid', 'Staphylococcus_aureus_plasmid1',
       'None'], dtype=object)

In [None]:
# Number of alignment records per label after the accession -> species replacement.
df['name'].value_counts()

Saccharomyces cerevisiae                   72542
Listeria_monocytogenes_complete_genome     51517
BS.pilon.polished.v3.ST170922              51313
Cryptococcus neoformans                    51236
Enterococcus_faecalis_complete_genome      51063
Staphylococcus_aureus_chromosome           50941
Lactobacillus_fermentum_complete_genome    50894
Salmonella_enterica_complete_genome        50849
Pseudomonas_aeruginosa_complete_genome     50340
Escherichia_coli_chromosome                49725
Escherichia_coli_plasmid                    1093
None                                         774
Staphylococcus_aureus_plasmid1               121
Name: name, dtype: int64

In [None]:
# Keep only the read id and its label for export.
# NOTE(review): df holds one row per alignment record (the recorded run has more rows than mapped
# reads), so a read id can occur several times here — confirm whether one label per read is expected.
df_final = df.loc[:, ['seq_id', 'name']]

In [None]:
# Final Drive folder for this run: Tools Outputs/minimap2/<dataset>/bin_<bins>/v<version>
# (the bin segment collapses to a bare '/' when bins is None).
bin_dir = '/' if bins is None else '/bin_' + str(bins)
output_path = f"/content/drive/MyDrive/FYP/FYP/Tools Outputs/minimap2/{dataset}{bin_dir}/v{version}"

# Create the folder up front so the CSV export and the cp commands that follow cannot fail on a
# missing directory.
os.makedirs(output_path, exist_ok=True)
output_path

'/content/drive/MyDrive/FYP/FYP/Tools Outputs/minimap2/simlord/bin_10/v1'

In [None]:
# Write the read -> label table as CSV (without the index column) into the run's output folder.
df_final.to_csv(f'{output_path}/combined_minimap_v{version}.csv',index=False)

In [None]:
# Archive the intermediate PAF-like file and the raw SAM next to the CSV.
# The quotes around $output_path are needed because the path contains a space ("Tools Outputs").
! cp output.paf "$output_path"/output.paf
! cp result.sam "$output_path"/result.sam