# <span style="color:green">Formation South Green 2022</span> - Structural Variants Detection by using short and long reads 

# __DAY 2 : SNP calling__

Created by C. Tranchant (DIADE-IRD), J. Orjuela (DIADE-IRD), F. Sabot (DIADE-IRD) and A. Dereeper (PHIM-IRD)

## __1. Preparing the working environment__ 

### First, go to the dedicated working folder

In [None]:
# Move to the notebook's working directory and list what it contains
cd /home/jovyan/work/
ls

### We are going to use only one clone to check that everything works before running all the samples!

In [None]:
## Declare the variables used for the single-clone test run
#   i        : number of the clone being processed
#   REF_DIR  : directory holding the reference genome
#   REF      : reference genome (FASTA)
#   ONT      : long-read FASTQ of clone $i
#   ILL_R1/2 : R1 / R2 short-read FASTQ files of clone $i
#   BAM      : sorted short-read BAM of clone $i
i=10
REF_DIR="/home/jovyan/work/SV_DATA/REF/"
REF="/home/jovyan/work/SV_DATA/REF/reference.fasta"
ONT="/home/jovyan/work/SV_DATA/LONG_READS/Clone${i}.fastq.gz"
ILL_R1="/home/jovyan/work/SV_DATA/SHORT_READS/Clone${i}_R1.fastq.gz"
ILL_R2="/home/jovyan/work/SV_DATA/SHORT_READS/Clone${i}_R2.fastq.gz"
# The clone directory follows $i (it used to be hard-coded to dirClone10),
# so changing i above is enough to test another clone.
BAM="/home/jovyan/work/MAPPING-ILL/dirClone${i}/Clone${i}.SORTED.bam"

In [None]:
# Move into the short-read (SR) mapping results directory of clone $i
cd /home/jovyan/work/MAPPING-ILL/dirClone${i}

In [None]:
# List the files present in the mapping directory of clone $i
ls /home/jovyan/work/MAPPING-ILL/dirClone${i}

### Mark duplicates

In [None]:
# Run GATK MarkDuplicates on the BAM of clone $i, then index the result.
# The duplicate-marked BAM and the metrics file are written in the current directory.
echo -e "\nMarkDuplicates in Clone$i"
gatk MarkDuplicates \
    -I $BAM \
    -M "duplicates.${i}.metrics" \
    -O "Clone${i}.SORTED.MD.bam"
samtools index "Clone${i}.SORTED.MD.bam"

In [None]:
# List the directory to check that the MD (MarkDuplicates) BAM and its index are there
ls -l

### Indexing reference with GATK

In [None]:
# Build the reference indexes needed by GATK, inside the reference directory:
#   - samtools faidx writes the FASTA index (${REF}.fai)
#   - gatk CreateSequenceDictionary writes the sequence dictionary (.dict)
cd "$REF_DIR"
samtools faidx "$REF"
# CreateSequenceDictionary aborts when its output file already exists, so the
# dictionary is only built when missing; this keeps the cell re-runnable.
# ${REF%.*}.dict is the reference path with its extension replaced by .dict.
if [ ! -s "${REF%.*}.dict" ]; then
    gatk CreateSequenceDictionary -R "$REF" -O "${REF%.*}.dict"
fi

### SNP Calling for a Clone

In [None]:
# Move to the BAM directory of clone $i
cd "/home/jovyan/work/MAPPING-ILL/dirClone${i}"
# Point BAM at the duplicate-marked (MD) file; the directory follows $i
# instead of being hard-coded to dirClone10
BAM="/home/jovyan/work/MAPPING-ILL/dirClone${i}/Clone${i}.SORTED.MD.bam"
# Launch GATK HaplotypeCaller in GVCF mode (-ERC GVCF) with a 4 GB Java heap
# and 4 pair-HMM threads; the g.vcf is written in the current directory.
echo -e "\nCalling Clone$i"
gatk --java-options "-Xmx4g" HaplotypeCaller \
    --native-pair-hmm-threads 4 \
    -I "$BAM" \
    -O "Clone${i}.g.vcf" \
    -R "$REF" \
    -ERC GVCF

In [None]:
# Look at the first lines of the g.vcf requested from HaplotypeCaller for clone $i
head Clone$i.g.vcf

### The error produced here is expected! The read group (RG) must be set at the mapping step!

Yes, you would have to relaunch the mapping for all samples! BUT ...

Don't worry: BAM files produced with the correct `-R "@RG\tID:Clone${i}\tSM:Clone${i}"` parameter in the bwa-mem2 command are available for download.

In [None]:
# Download the BAM_ILL archive into the working directory, unpack it and
# list the extracted directory.
cd /home/jovyan/work/
# -nH and --cut-dirs=1 drop the host name and the first URL directory, so the
# archive is saved directly in the current directory.
# NOTE(review): --no-check-certificate turns off TLS certificate validation;
# kept from the original command - confirm it is still needed for this server.
# The stray "wget" word that used to precede the URL was removed: wget treated
# it as a second URL to fetch.
wget --no-check-certificate -rm -nH --cut-dirs=1 --reject="index.html*" https://itrop.ird.fr/sv-training/BAM_ILL.tar.gz
# Delete the archive only once it has been extracted successfully
tar zxvf BAM_ILL.tar.gz && rm BAM_ILL.tar.gz
BAM_ILL="/home/jovyan/work/BAM_ILL"
ls "$BAM_ILL"

In [None]:
# Disabled cell: same download / extract / list steps as for BAM_ILL, applied
# to the BAM_ONT archive (presumably the long-read BAM files - confirm).
# Uncomment the lines below to run it.
#wget --no-check-certificat -rm -nH --cut-dirs=1 --reject="index.html*" wget https://itrop.ird.fr/sv-training/BAM_ONT.tar.gz
#tar zxvf BAM_ONT.tar.gz
#BAM_ONT="/home/jovyan/work/BAM_ONT"
#rm BAM_ONT.tar.gz
#ls $BAM_ONT

## Calling all samples into one raw VCF with the correct BAM files

In [None]:
# Directory holding the BAM_ILL files (re-declared here so this section can be
# started without re-running the download cell), then list its content
BAM_ILL="/home/jovyan/work/BAM_ILL"
ls $BAM_ILL

In [None]:
# Directory that receives the SNP calling results; created when missing
VCF_DIR="/home/jovyan/work/VCF"
mkdir -p "$VCF_DIR"

### MarkDuplicates

In [None]:
# For each clone from 1 to 20: index the sorted BAM, run GATK MarkDuplicates
# on it, then index the duplicate-marked BAM. All files stay in $BAM_ILL.
cd $BAM_ILL
for i in $(seq 1 20); do
    samtools index "Clone${i}.SORTED.bam"
    echo -e "\nMarkDuplicates in Clone$i"
    gatk MarkDuplicates \
        -I "Clone${i}.SORTED.bam" \
        -M "duplicates.${i}.metrics" \
        -O "Clone${i}.SORTED.MD.bam"
    samtools index "Clone${i}.SORTED.MD.bam"
done

### HaplotypeCaller

In [None]:
# Run GATK HaplotypeCaller in GVCF mode on the duplicate-marked BAM of
# clones 1 to 9 and write one g.vcf per clone into $VCF_DIR.
cd $BAM_ILL
for i in $(seq 1 9); do
    echo -e "\nCalling Clone$i"
    gatk --java-options "-Xmx4g" HaplotypeCaller \
        --native-pair-hmm-threads 4 \
        -I "Clone${i}.SORTED.MD.bam" \
        -O "${VCF_DIR}/Clone${i}.g.vcf" \
        -R $REF \
        -ERC GVCF
done

### CombineGVCFs

In [None]:
# Paths used by this step, declared again so the cell does not depend on earlier ones
BAM_ILL="/home/jovyan/work/BAM_ILL"
VCF_DIR="/home/jovyan/work/VCF"
REF="/home/jovyan/work/SV_DATA/REF/reference.fasta"

# Work from the directory that holds the per-clone g.vcf files
cd "$VCF_DIR"

# Build the list of --variant arguments, one per clone (1 to 9)
OPTION=""
for i in $(seq 1 9); do
    OPTION+=" --variant Clone${i}.g.vcf"
done
echo $OPTION

# Merge the per-clone g.vcf files into rawSNP.vcf.
# $OPTION stays unquoted on purpose: the shell must split it into separate arguments.
gatk CombineGVCFs -R "$REF" $OPTION -O rawSNP.vcf

### Have a look at the CombineGVCFs output

In [None]:
# Show the last 10 of the first 1000 lines of rawSNP.vcf, to look past the top of the file
head -n 1000 rawSNP.vcf | tail

### Compute the Genotypes

In [None]:
# Run GATK GenotypeGVCFs on the combined file (rawSNP.vcf) with a 4 GB Java heap;
# the result is written to output.vcf in the current directory.
gatk --java-options "-Xmx4g" GenotypeGVCFs \
    -R $REF \
    -V rawSNP.vcf \
    -O output.vcf

In [None]:
# Look at the first lines of the VCF written by GenotypeGVCFs
head output.vcf

### Compute the SNP density along the chromosomes

In [None]:
# Compute the coverage of the genome by the variants of output.vcf.
# The bedtools genome file (sequence name <TAB> length) is taken from the
# FASTA index of $REF instead of a hard-coded "Reference / 1000000" pair, so
# it always matches the reference and has no trailing blank line.
[ -s "${REF}.fai" ] || samtools faidx "$REF"
cut -f1,2 "${REF}.fai" > genome.txt
# -bga reports every interval, including those with zero coverage.
# The output is tab-separated despite the .csv extension.
bedtools genomecov -bga -split -i output.vcf -g genome.txt > density.csv

In [None]:
# Look at the first lines of the bedtools genomecov output
head density.csv

### Count the number of variants with `bcftools stats`

In [None]:
# Write the bcftools statistics of output.vcf to output.vcf.stats
bcftools stats output.vcf > output.vcf.stats

In [None]:
# Show the first 100 lines of the statistics report
head -n 100 output.vcf.stats