<a href="https://colab.research.google.com/github/jasonwong-lab/HKU-Practical-Bioinformatics/blob/main/NGS_sequence_alignment_command_line.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### BBMS3004 2026 - Targeted sequencing in acute myeloid leukaemia

*by Alvin Ip*

This practical aims to demonstrate how next-generation sequencing data is processed to yield clinically useful information.


## *** Package installation and downloads for workshop (~ 10 minutes)

1.   conda (for simple installation of packages)
2.   FastQC (for reads quality check)
3.   bwa (tools for sequence alignment)
4.   samtools (tools for processing sam & bam files)  

**IMPORTANT: Every time you connect to Google Colab, you have to perform these setup steps again.**

In [None]:
# Mount your own Google Drive so the notebook can work inside it (~ 1 min)
from google.colab import drive

# All paths used later in this notebook start with this mount point.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

In [None]:
# Install igv-notebook (<1 min)
# NOTE: igv-notebook is installed again further down (after the conda setup),
# that time through the kernel's own interpreter (sys.executable).
!pip install igv-notebook

In [None]:
# Install conda (~ 1 min).
# There will be a message saying that the session has crashed, but don't worry about this:
# the session restarts after conda has been installed.
!pip install -q condacolab
import condacolab
condacolab.install()

In [None]:
# Install fastqc (~ 2 mins)
!conda install -c bioconda fastqc

In [None]:
# Install bwa (~ 1 min)
!conda install -c bioconda bwa

In [None]:
# Install samtools (~1 min)
# Disabled: the next cell installs samtools (pinned to 1.20) together with the other tools.
#!conda install -c bioconda samtools

In [None]:
# Install samtools, bcftools, htslib, bedtools, bamtools (~ 2 mins)
# All versions are pinned except bamtools; -y skips conda's confirmation prompt.
!conda install -y -c conda-forge -c bioconda \
  samtools=1.20 bcftools=1.20 htslib=1.20 bedtools=2.31.1 bamtools

In [None]:
# Check installation ran correctly (only the first line of the version output is shown)
!samtools --version | head -1

In [None]:
# Install GATK in a new environment (to specify the exact openjdk and python version) ~1.5 mins
!conda create -y -n gatkenv python=3.10
# NOTE(review): `--env` targets the currently active environment's config, which may not be
# gatkenv here — confirm that strict channel priority ends up where it is intended.
!conda config --env --set channel_priority strict
!conda install -y -n gatkenv -c conda-forge -c bioconda gatk4=4.6.2.0 openjdk=17

In [None]:
# Check that gatk is installed properly (run inside the gatkenv environment created above)
!conda run -n gatkenv gatk --version

In [None]:
# Install igv-notebook
import sys
# Show which Python the kernel is running
print(sys.version, sys.executable)
# {sys.executable} is interpolated by IPython, so pip runs under the kernel's own interpreter
!{sys.executable} -m pip install -U igv-notebook
# Importing here confirms the package is usable from this kernel
import igv_notebook

## Set working directory

By default, the working directory will be My Drive/PB_course

In [None]:
import os

# Course folder inside the mounted Drive; written once so the directory that is
# created and the directory that is entered cannot drift apart.
course_dir = "/content/gdrive/My Drive/PB_course"         # change this path if necessary

try:
  os.mkdir(course_dir)
except FileExistsError:
  # The folder is left over from an earlier session; nothing to do.
  print("directory already exist. OK to continue")

os.chdir(course_dir)

## Download the pre-prepared files for analysis

In [None]:
# Download reference sequence
# Double check that we are in the right directory (~ 30s)
import os
os.chdir("/content/gdrive/MyDrive/PB_course")                     # change this path if necessary

import os
if os.path.isfile("/content/gdrive/MyDrive/PB_course/DB_trunc/chr2.fa"):    # check if the file exist
  print("reference file already exist, OK to continue.")
else:
  !pip install gdown
  !gdown -O DB_trunc.zip https://drive.google.com/uc?id=1aRJVznjy5WLQ5Dc0DT9c6NiXw64HdoKr # download if file not exist
  # unzip fasta file
  !unzip -o DB_trunc.zip
  # remove the zip file after extraction
  !rm DB_trunc.zip

!ls -l ./DB_trunc/

In [None]:
# Download sample sequences
import os
os.chdir("/content/gdrive/My Drive/PB_course/")
if os.path.isfile("/content/gdrive/MyDrive/PB_course/Datasets/ChIP-seq_H3K27ac_example.fq.gz"):    # check if the file exist
  print("file already exist, OK to continue.")
else:
 !wget -O Datasets.zip https://github.com/jasonwong-lab/HKU-Practical-Bioinformatics/raw/main/files/Datasets.zip    # download necessary file
 !unzip -o Datasets.zip   #unzip file
 !rm Datasets.zip

## NGS_alignment command line

1.1. Quality control

1.2. Sequence alignment using Burrows–Wheeler Aligner (BWA)

1.3. Viewing SAM files

1.4. Align the WGS paired-end file

In [None]:
# Check what datasets we have downloaded
# First list the course directory...
%cd /content/gdrive/MyDrive/PB_course
!ls -l

# ...then move into Datasets/ and list it; the following cells use file names relative to this directory
%cd Datasets/
!ls -l

In [None]:
# Look at the content of the ChIP-seq fastq file (head -n 12 prints the first 12 lines):
!zcat < ChIP-seq_H3K27ac_example.fq.gz | head -n 12

In [None]:
# How many reads are there in the file? Type your code below:


### Quality control

In [None]:
## Run FastQC on the ChIP-seq reads
# The next cell downloads the resulting report, ChIP-seq_H3K27ac_example_fastqc.html
!fastqc ChIP-seq_H3K27ac_example.fq.gz

In [None]:
# Check the output html file
!ls

# Download the html file and check it on your local browser
from google.colab import files
files.download('ChIP-seq_H3K27ac_example_fastqc.html')

You can also download the file **ChIP-seq_H3K27ac_example_fastqc.html** from "gdrive/MyDrive/PB_course/Datasets" on the left side.

### Burrows–Wheeler Aligner

In [None]:
# Look at the options for bwa and bwa mem
# (bwa is run without arguments here so that it prints its usage information)
!bwa

In [None]:
# Same for the mem subcommand: run without arguments to see its options
!bwa mem

In [None]:
# Let's take a look at the database files
# (DB_trunc holds chr2.fa, the reference used for the alignments below)
%cd /content/gdrive/My Drive/PB_course/DB_trunc

!ls -l

In [None]:
# Get ready to run BWA: First go into the Datasets directory
# (the commands below use paths relative to it: ./BAM and ../DB_trunc)
%cd /content/gdrive/My Drive/PB_course/Datasets

In [None]:
# Make a directory to store the output file
!mkdir BAM

In [None]:
# Do sequence alignment with the default options
# Arguments: the reference (../DB_trunc/chr2.fa), then the reads; the output is redirected into a SAM file in ./BAM/
!bwa mem ../DB_trunc/chr2.fa ./ChIP-seq_H3K27ac_example.fq.gz > ./BAM/ChIP-seq_H3K27ac_example.sam

In [None]:
# Check the result (first 20 lines of the SAM file)
!head -n 20 ./BAM/ChIP-seq_H3K27ac_example.sam

### Working with SAM files

In [None]:
# Check the samtools command (run without arguments to see what it offers)
!samtools

In [None]:
# Check out some stats about our aligned file (flagstat is run directly on the SAM file)
!samtools flagstat ./BAM/ChIP-seq_H3K27ac_example.sam

In [None]:
# Prepare the file for viewing on genome browser:
# Step 1 – convert SAM to BAM (-b asks samtools view for BAM output, redirected into a .bam file)
!samtools view -b ./BAM/ChIP-seq_H3K27ac_example.sam > ./BAM/ChIP-seq_H3K27ac_example.bam

In [None]:
# Step 2 – sort BAM file (the sorted output is redirected into a new *_sorted.bam file)
!samtools sort ./BAM/ChIP-seq_H3K27ac_example.bam > ./BAM/ChIP-seq_H3K27ac_example_sorted.bam

In [None]:
# Step 3 – index BAM file (the IGV cell below loads the index as *_sorted.bam.bai)
!samtools index ./BAM/ChIP-seq_H3K27ac_example_sorted.bam

In [None]:
# Look at the files that we have created in the BAM directory
!ls -l ./BAM/

### IGV browser

In [None]:
# Load track from local paths
import igv_notebook

igv_notebook.init()

# Browser settings: reference genome and the region to display
browser_config = {
    "genome": "hg38",
    "locus": "chr2:47,782,081-47,807,953",
}
b = igv_notebook.Browser(browser_config)

# Alignment track for the sorted ChIP-seq BAM and its index
# (relative paths: this cell is meant to be run from the Datasets directory)
chipseq_track = {
    "name": "ChIP-seq_H3K27ac",
    "path": "./BAM/ChIP-seq_H3K27ac_example_sorted.bam",
    "indexPath": "./BAM/ChIP-seq_H3K27ac_example_sorted.bam.bai",
    "format": "bam",
    "type": "alignment",
}
b.load_track(chipseq_track)

### Align the WGS paired-end file

In [None]:
# Check that we have the WGS fastq files (WGS_example_1.fq.gz and WGS_example_2.fq.gz)
%cd /content/gdrive/My Drive/PB_course/Datasets
!ls -l

In [None]:
# If you don't have WGS_example_1.fq.gz or WGS_example_2.fq.gz, run this cell to redownload the files
import os
os.chdir("/content/gdrive/My Drive/PB_course/")

import os
if os.path.isfile("/content/gdrive/MyDrive/PB_course/Datasets/WGS_example_1.fq.gz"):    # check if the file exist
  print("reference file already exist, OK to continue.")
else:
 !wget -O Datasets.zip https://github.com/jasonwong-lab/HKU-Practical-Bioinformatics/raw/main/files/Datasets.zip
 !unzip -o Datasets.zip   #unzip file
 !rm Datasets.zip

# Current directory should still be ~/Datasets
%cd Datasets/
!ls -l

In [None]:
# Look at the first line of each paired-end file:
!zcat < WGS_example_1.fq.gz |head -1
!zcat < WGS_example_2.fq.gz |head -1

In [None]:
# Align the WGS paired-end file:( ~ 1 mins)
# Step 1 – Aligning paired-end file using bwa mem:
# (the reference is followed by both read files; the output is redirected into one SAM file)
!bwa mem ../DB_trunc/chr2.fa ./WGS_example_1.fq.gz ./WGS_example_2.fq.gz > ./BAM/WGS_example.sam


In [None]:
# Step 2 – Output sorted BAM, this time use piping to skip one step:
# (samtools view -b is piped straight into samtools sort, so no unsorted .bam file is written)
!samtools view -b ./BAM/WGS_example.sam | samtools sort >./BAM/WGS_example_sorted.bam

In [None]:
# Step 3 – Index sorted bam file (the IGV cell below loads the index as WGS_example_sorted.bam.bai)
!samtools index ./BAM/WGS_example_sorted.bam

In [None]:
# Check stats about the aligned WGS file (this time flagstat is run on the sorted BAM)
!samtools flagstat ./BAM/WGS_example_sorted.bam

In [None]:
## IGV
# Load track from local paths
import igv_notebook


def bam_track(name, bam_path):
    """Return an alignment-track config for a BAM file whose index is stored as <bam_path>.bai."""
    return {
        "name": name,
        "path": bam_path,
        "indexPath": bam_path + ".bai",
        "format": "bam",
        "type": "alignment",
    }


igv_notebook.init()

# Browser settings: reference genome and the region to display
browser_config = {
    "genome": "hg38",
    "locus": "chr2:47,781,678-47,809,690",
}
b = igv_notebook.Browser(browser_config)

# Show the ChIP-seq and WGS alignments as two tracks in the same browser
b.load_track(bam_track("ChIP-seq H3K27ac", "./BAM/ChIP-seq_H3K27ac_example_sorted.bam"))
b.load_track(bam_track("WGS", "./BAM/WGS_example_sorted.bam"))