# What is the use of polishing?

* Polishing uses short Illumina reads, mapped onto their respective assemblies (constructed from long reads), to improve the overall quality of the genome assemblies (contiguity, completeness, ...).

* Before polishing, we need to index the long-read assemblies obtained from Flye and map the short Illumina reads of each isolate onto its respective assembly. We used bwa-mem2 for this purpose.

# BWA

## Indexing

In [None]:
#!/bin/bash

# Copy every Flye assembly into the polishing tree and build its BWA-MEM2 index.
#   Input : /bigvol/omion/04-Flye/<method>/<type>/<sample>/assembly.fasta
#   Output: /bigvol/omion/05-Polishing/BWA-MEM2/Indexing/<method>/<type>/<sample>/
# Failures are reported on stderr and the affected assembly is skipped; the
# loop never aborts, so one bad sample does not block the others.

# Create folders to store the assemblies and their indexes
mkdir -p /bigvol/omion/05-Polishing/BWA-MEM2/Indexing/{Dorado,Guppy}/{modbasecalling,basecalling}

for i in /bigvol/omion/04-Flye/*/*/*/assembly.fasta; do
    # An unmatched glob stays literal, so skip anything that is not a real file
    [ -f "$i" ] || continue

    directory=$(dirname "$i")
    gd_part=$(basename "$directory")

    # The two directories above the sample folder give the basecalling type
    # (modbasecalling or basecalling) and the basecalling method (Dorado or Guppy)
    type_dir=$(dirname "$directory")
    basecalling_type=$(basename "$type_dir")
    basecalling_method=$(basename "$(dirname "$type_dir")")

    # Mirror the Flye layout under the Indexing directory
    output_dir="/bigvol/omion/05-Polishing/BWA-MEM2/Indexing/${basecalling_method}/${basecalling_type}/${gd_part}"

    if ! mkdir -p "${output_dir}"; then
        echo "Could not create ${output_dir}; skipping $i" >&2
        continue
    fi

    # Copy the assembly.fasta to the output directory
    if ! cp "$i" "${output_dir}/assembly.fasta"; then
        echo "Could not copy $i to ${output_dir}; skipping" >&2
        continue
    fi

    # Run BWA-MEM2 indexing from inside the output directory. The subshell keeps
    # the caller's working directory untouched, and the index is only attempted
    # when the cd succeeded (otherwise it would run in the wrong directory).
    if ! ( cd "${output_dir}" && bwa-mem2 index assembly.fasta ); then
        echo "bwa-mem2 index failed for ${output_dir}/assembly.fasta" >&2
    fi
done


## Mapping

In [None]:
#!/bin/bash

# Map the paired-end Illumina reads of each isolate onto its indexed assembly
# with BWA-MEM2. One SAM file is written per assembly:
#   /bigvol/omion/05-Polishing/BWA-MEM2/Mapping/<method>/<type>/<sample>/bwa_mapping_<sample>.sam
# The Illumina folder is located by the number that follows "Gd" in the sample
# folder name.

# Root directory that is searched for the Illumina read folders
reads_root="/bigvol/shared_FunAdapt/00-illumina_Nanopore/illumina_shortreads"

# Create folders to store the mapping files
mkdir -p /bigvol/omion/05-Polishing/BWA-MEM2/Mapping/{Dorado,Guppy}/{modbasecalling,basecalling}

for i in /bigvol/omion/05-Polishing/BWA-MEM2/Indexing/*/*/Gd*/assembly.fasta; do
    # An unmatched glob stays literal, so skip anything that is not a real file
    [ -f "$i" ] || continue

    directory=$(dirname "$i")
    gd_part=$(basename "$directory")

    # The two directories above the sample folder give the basecalling type
    # (modbasecalling or basecalling) and the basecalling method (Dorado or Guppy)
    type_dir=$(dirname "$directory")
    basecalling_type=$(basename "$type_dir")
    basecalling_method=$(basename "$(dirname "$type_dir")")

    # Create the output directory path
    output_dir="/bigvol/omion/05-Polishing/BWA-MEM2/Mapping/${basecalling_method}/${basecalling_type}/${gd_part}"

    mkdir -p "${output_dir}"

    # Digits that directly follow "Gd" in the folder name (first match only)
    gd_number=$(echo "$gd_part" | grep -oP 'Gd\K[0-9]+' | head -n 1)

    echo "Processing $gd_part (number: $gd_number)"

    if [ -z "$gd_number" ]; then
        # Without a number the search pattern below would degrade to "Gd_**" and
        # pick an arbitrary isolate's reads, so refuse to guess.
        echo "No isolate number found in $gd_part; skipping" >&2
    else
        # Find the corresponding folders containing the Illumina reads.
        # The first hit is used, exactly like `find ... | head -n 1`.
        mapfile -t illumina_folders < <(find "$reads_root" -type d -name "Gd_*${gd_number}*")
        illumina_folder=${illumina_folders[0]:-}

        if [ "${#illumina_folders[@]}" -gt 1 ]; then
            # The pattern is a substring match, so e.g. number 1 also matches 10 or 21.
            echo "Warning: ${#illumina_folders[@]} Illumina folders match Gd_*${gd_number}*; using the first one" >&2
        fi

        if [ -z "$illumina_folder" ]; then
            echo "No matching Illumina folder found for $gd_part"
        else
            echo "Found Illumina folder: $illumina_folder"

            # List contents in the Illumina folders
            echo "Contents of $illumina_folder:"
            ls -l "$illumina_folder"

            # Find the Illumina files (forward = R1/_1, reverse = R2/_2)
            illumina_file1=$(find "$illumina_folder" -type f \( -name "*_R1.fastq.gz" -o -name "*_1.fq.gz" -o -name "*_1.fastq" -o -name "*_1.fastq.gz" \) | head -n 1)
            illumina_file2=$(find "$illumina_folder" -type f \( -name "*_R2.fastq.gz" -o -name "*_2.fq.gz" -o -name "*_2.fastq" -o -name "*_2.fastq.gz" \) | head -n 1)

            if [ -f "$illumina_file1" ] && [ -f "$illumina_file2" ]; then
                echo "Found Illumina files:"
                echo "File 1: $illumina_file1"
                echo "File 2: $illumina_file2"

                sam_file="${output_dir}/bwa_mapping_${gd_part}.sam"

                # Run BWA-MEM2 mapping
                echo "Running BWA-MEM2 mapping..."
                if bwa-mem2 mem -t 80 "$i" "$illumina_file1" "$illumina_file2" > "$sam_file"; then
                    echo "Mapping completed. Output in ${sam_file}"
                else
                    # Do not leave a truncated SAM behind for the conversion step
                    echo "BWA-MEM2 mapping failed for $gd_part; removing incomplete ${sam_file}" >&2
                    rm -f -- "$sam_file"
                fi
            else
                echo "Illumina files not found in $illumina_folder"
                echo "Searched for patterns: *_R1.fastq.gz, *_R2.fastq.gz, *_1.fq.gz, *_2.fq.gz, *_1.fastq, *_2.fastq, *_1.fastq.gz, *_2.fastq.gz"
            fi
        fi
    fi

    echo "----------------------------------------"
done

# Conversion SAM to BAM

bwa-mem2 outputs SAM files, which need to be converted to BAM for further processing. The BAM files were then sorted so that genome size and coverage could be extracted from them.

In [None]:
## Conversion SAM to BAM
# Each SAM file from the mapping step gets a BAM file with the same base name
# written next to it.
for file in /bigvol/omion/05-Polishing/BWA-MEM2/Mapping/*/*/Gd*/bwa_mapping_Gd*.sam; do
    # An unmatched glob stays literal, so skip anything that is not a real file
    [ -f "$file" ] || continue

    output="${file%.sam}.bam"
    if ! samtools view -bS --threads 80 "$file" > "$output"; then
        # Do not leave a partial BAM behind for the sorting step
        echo "SAM to BAM conversion failed for $file" >&2
        rm -f -- "$output"
    fi
done


## Sorting
# Sorted files are named
#   sorted_bwa_mapping_<sample>_<basecaller>_<mode>.bam
# where <basecaller> and <mode> are the two directory levels below Mapping/.
for basecaller in /bigvol/omion/05-Polishing/BWA-MEM2/Mapping/*; do
    basecaller_name=$(basename "$basecaller")
    for mode in "$basecaller"/*; do
        mode_name=$(basename "$mode")
        for file in "$mode"/Gd*/bwa_mapping_Gd*.bam; do
            [ -f "$file" ] || continue

            dir=$(dirname "$file")
            # Third "_"-separated field of the file name; the .bam suffix is
            # stripped first so it cannot end up inside the sample name.
            sample=$(basename "$file" .bam | cut -d'_' -f3)
            output_file="$dir/sorted_bwa_mapping_${sample}_${basecaller_name}_${mode_name}.bam"

            if samtools sort --threads 80 -o "$output_file" "$file"; then
                echo "Sorted and renamed: $output_file"
            else
                echo "Sorting failed for $file" >&2
            fi
        done
    done
done

In [None]:
## Estimation of genome size and coverage

#!/bin/bash

# Prints a tab-separated table (FileName, size, coverage) with one row per
# sorted BAM file:
#   size     = sum of the LN: values of all @SQ header lines
#   coverage = sum of the per-position depths / size

# Reads a SAM header on stdin and prints the sum of all @SQ LN: values.
# The LN: tag is looked up by name, so sequence names containing ":" and extra
# @SQ tags do not disturb the result. Prints 0 when no @SQ line is present.
sum_sq_lengths() {
    awk -F'\t' '
        $1 == "@SQ" {
            for (i = 2; i <= NF; i++) {
                if ($i ~ /^LN:/) total += substr($i, 4)
            }
        }
        END { printf "%.0f\n", total }
    '
}

# Reads `samtools depth` output on stdin and prints the sum of column 3.
# printf "%.0f" keeps large sums as plain integers (no exponent notation).
sum_depth() {
    awk '{ total += $3 } END { printf "%.0f\n", total }'
}

# Output header
echo -e "FileName\tsize\tcoverage"

# Process each BAM file
for bamfile in /bigvol/omion/05-Polishing/BWA-MEM2/Mapping/*/*/Gd*/sorted_bwa_mapping_Gd*.bam; do
    # An unmatched glob stays literal, so skip anything that is not a real file
    [[ -f "$bamfile" ]] || continue

    # Extract the file name without extension and remove the prefix
    filename=$(basename "$bamfile" .bam)
    filename=${filename#sorted_bwa_mapping_}

    # Calculate the genome size
    size=$(samtools view -H --threads 80 "$bamfile" | sum_sq_lengths)
    if [[ -z "$size" || "$size" == "0" ]]; then
        # Dividing by an empty or zero size would only produce a bc error
        echo "Skipping $bamfile: no @SQ lengths found in its header" >&2
        continue
    fi

    # Calculate coverage
    depth_sum=$(samtools depth --threads 80 "$bamfile" | sum_depth)
    coverage=$(echo "${depth_sum:-0} / $size" | bc -l)

    # Print the isolate name, the genome size, and the coverage
    echo -e "$filename\t$size\t$coverage"
done


A table was created from this output. The first column contains the isolate name, the second the genome size, and the third the coverage estimate.

# HyPo : Polishing of Flye Assembly using short Illumina reads

In [None]:
#!/bin/bash

# Polish each assembly with HyPo, using its sorted BWA-MEM2 BAM file, the
# paired Illumina reads of the isolate, and the genome size / coverage taken
# from the tab-separated data file (columns: name, size, coverage).
# Samples with any missing input are reported and skipped.

# Path to the data file containing genome size + coverage
data_file="/bigvol/omion/05-Polishing/genome_size_coverage.txt"

# Directories containing input files
fq_dir="/bigvol/shared_FunAdapt/00-illumina_Nanopore/illumina_shortreads"
index_dir="/bigvol/omion/05-Polishing/BWA-MEM2/Indexing/"
mapping_dir="/bigvol/omion/05-Polishing/BWA-MEM2/Mapping/"

# Number of threads used
threads=40

# Read the data file into an associative array: name -> "size coverage"
declare -A data_map
if [[ -r "$data_file" ]]; then
    while IFS=$'\t' read -r filename size coverage; do
        # Skip blank lines and the table header, if present
        [[ -z "$filename" || "$filename" == "FileName" ]] && continue
        data_map["$filename"]="$size $coverage"
    done < "$data_file"
else
    echo "Cannot read data file $data_file" >&2
fi

echo -e "FileName\tValue\tAverage"

# Process each fasta and bam file
# NOTE(review): the indexing step of this notebook writes "assembly.fasta",
# while this loop expects "assembly_<identifier>.fasta" — confirm the
# assemblies were renamed before this cell is run.
for fasta in "${index_dir%/}"/*/*/Gd*/assembly_Gd*.fasta; do
    # An unmatched glob stays literal, so skip anything that is not a real file
    [[ -f "$fasta" ]] || continue

    # Extract the identifier (file name without "assembly_" and ".fasta")
    identifier=$(basename "$fasta" .fasta | sed 's/assembly_//')

    # Find the corresponding BAM file (first hit only, so the variable can
    # never hold several newline-separated paths)
    bam_file=$(find "${mapping_dir%/}"/*/*/Gd*/ -name "sorted_bwa_mapping_${identifier}.bam" | head -n 1)
    if [[ -z "$bam_file" ]]; then
        echo "No BAM file found for identifier $identifier"
        continue
    fi

    directory=$(dirname "$fasta")
    gd_part=$(basename "$directory")
    gd_number=$(echo "$gd_part" | grep -oP 'Gd\K[0-9]+' | head -n 1)
    if [[ -z "$gd_number" ]]; then
        # Without a number the folder pattern below would match any isolate
        echo "No isolate number found in $gd_part; skipping" >&2
        continue
    fi

    # Find the correct folder for FASTQ files
    illumina_folder=$(find "${fq_dir}" -type d -name "Gd_*${gd_number}*" | head -n 1)
    if [[ -z "$illumina_folder" ]]; then
        echo "No matching Illumina folder found for $gd_part"
        continue
    fi

    echo "Found Illumina folder: $illumina_folder"
    echo "Contents of $illumina_folder:"
    ls -l "$illumina_folder"

    # Find the Illumina files (forward = R1/_1, reverse = R2/_2)
    illumina_file1=$(find "$illumina_folder" -type f \( -name "*_R1.fastq.gz" -o -name "*_1.fq.gz" -o -name "*_1.fastq" -o -name "*_1.fastq.gz" \) | head -n 1)
    illumina_file2=$(find "$illumina_folder" -type f \( -name "*_R2.fastq.gz" -o -name "*_2.fq.gz" -o -name "*_2.fastq" -o -name "*_2.fastq.gz" \) | head -n 1)

    if [[ -z "$illumina_file1" || -z "$illumina_file2" ]]; then
        echo "Illumina files not found for $identifier"
        continue
    fi

    echo "Found Illumina files:"
    echo "File 1: $illumina_file1"
    echo "File 2: $illumina_file2"

    # Extract corresponding genome size and coverage
    if [[ -n "${data_map[$identifier]:-}" ]]; then
        read -r size coverage <<< "${data_map[$identifier]}"
    else
        echo "No data found for identifier $identifier"
        continue
    fi

    # HyPo's -r option takes one argument. Several read files are passed as a
    # list file (one path per line) prefixed with "@"; a second bare path after
    # -r would not be part of the option.
    reads_list=$(mktemp) || { echo "mktemp failed for $identifier" >&2; continue; }
    printf '%s\n' "$illumina_file1" "$illumina_file2" > "$reads_list"

    # Run HyPo
    # NOTE(review): "m" is appended to the size, which HyPo reads as megabases —
    # confirm the size column of the data file is in Mb and not in bases.
    # NOTE(review): HyPo documents -c as an integer — confirm the coverage
    # column holds whole numbers.
    if ! hypo -r "@${reads_list}" \
              -d "$fasta" \
              -b "$bam_file" \
              -c "$coverage" \
              -s "${size}m" \
              -t "$threads"; then
        echo "HyPo failed for $identifier" >&2
    fi

    rm -f -- "$reads_list"
done
