# BUSCO

## What is BUSCO ?
BUSCO provides a quantitative assessment of the completeness of a genome assembly, in terms of expected gene content, using a database of orthologous genes.

# Running BUSCO


In [None]:
$ busco --list-datasets ## List all the available datasets; the ones of interest here are the fungal sets (fungi_odb10), more precisely ascomycota_odb10 and leotiomycetes_odb10

## Create a conda environment for BUSCO

In [None]:
# Create a conda environment named "busco" containing BUSCO 5.6.0,
# resolved from the conda-forge and bioconda channels.
$ conda create -n busco -c conda-forge -c bioconda busco=5.6.0
# Re-read ~/.bashrc in the current shell (presumably to pick up conda's shell setup; confirm).
$ source ~/.bashrc
# Switch the current shell to the newly created environment.
$ conda activate busco

In [None]:
#!/bin/bash

# Run BUSCO (genome mode, leotiomycetes_odb10 lineage) on every assembly FASTA
# found in /bigvol/omion/All_assemblies/Gd*/ and keep one result folder per
# assembly:
#   /bigvol/omion/08-QualityAssemblies/BUSCO/<isolate>/busco_<isolate>_<assembly>/

# `conda activate` needs conda's shell functions, which are not loaded in a
# non-interactive script; source them explicitly before activating.
conda_base=$(conda info --base) || {
  echo "conda not found: cannot activate the busco environment" >&2
  exit 1
}
source "${conda_base}/etc/profile.d/conda.sh"
conda activate busco || exit 1

# Create directory for BUSCO results
busco_root="/bigvol/omion/08-QualityAssemblies/BUSCO"
mkdir -p "$busco_root"

# Loop through all assembly files
for fasta in /bigvol/omion/All_assemblies/Gd*/*.fasta; do
  # When the glob matches nothing the literal pattern is returned: skip it.
  [[ -f "$fasta" ]] || continue

  # Isolate folder name (e.g. Gd1111) and assembly file name without extension
  gd_part=$(basename "$(dirname "$fasta")")
  assembly_name=$(basename "$fasta" .fasta)

  # One parent directory per isolate
  output_dir="${busco_root}/${gd_part}"
  mkdir -p "$output_dir"

  # --out_path is the parent folder, -o the run name. The run name contains the
  # assembly name so that the several assemblies of one isolate no longer
  # overwrite each other (-f deletes an existing run of the same name), and it
  # keeps the "busco_Gd*" prefix used when collecting the short summaries.
  busco -i "$fasta" -m genome -l leotiomycetes_odb10 \
    --out_path "$output_dir" -o "busco_${gd_part}_${assembly_name}" -f
done

# Deactivate the conda environment
conda deactivate


# Creation of BUSCO plots

## Code to move the short summaries from BUSCO into their respective busco_summaries folders for the creation of BUSCO plots

In [None]:
#!/bin/bash

# Gather the BUSCO short summaries of every run directory into
# ./busco_summaries/<run directory name>/ so that they can be plotted.

# Copy every short_summary*.txt located in or below a BUSCO run directory.
# Arguments: $1 - BUSCO run directory to search
#            $2 - destination directory (created when missing)
# Returns:   non-zero when the destination cannot be created
copy_short_summaries() {
  local run_dir=$1
  local dest_dir=$2

  mkdir -p "$dest_dir" || return
  # find searches at any depth (including the run directory itself); a plain
  # "**" only does so when the globstar option is enabled, otherwise it acts
  # like "*" and looks exactly one level down.
  find "$run_dir" -type f -name 'short_summary*.txt' \
    -exec cp -- {} "$dest_dir/" \;
}

# Create a folder to put all the busco short summaries
mkdir -p busco_summaries

# Only directories named busco_Gd* are matched (trailing slash), so the BUSCO
# "logs" folders are never visited and need no explicit exclusion.
for dir in /bigvol/omion/08-QualityAssemblies/BUSCO/Gd*/busco_Gd*/; do
  # When the glob matches nothing the literal pattern is returned: skip it.
  [[ -d "$dir" ]] || continue

  dir_name=$(basename "$dir")
  echo "Processing directory $dir_name..."

  # Copy all short_summary*.txt files to busco_summaries/main_directory_name/
  copy_short_summaries "$dir" "./busco_summaries/$dir_name"
done


## Code to create BUSCO plots

In [None]:
#!/bin/bash

# Every entry matching ./busco_summaries/busco_Gd* is handed in turn to
# BUSCO's generate_plot.py as its working directory (-wd).
summary_dirs=(./busco_summaries/busco_Gd*)

for summary_dir in "${summary_dirs[@]}"
do
  generate_plot.py -wd "$summary_dir"
done


# Obtain length of contigs

QUAST does not output the lengths of the contigs by default; they are only available in the HTML viewer, one contig at a time rather than as a group. To obtain the lengths of all the contigs of a sample directly, we can execute:

In [None]:
#!/bin/bash

# Write one TSV row per contig (isolate, contig header, contig length) for
# every polished HyPo assembly, appending to contig_lengths.tsv.

# Print the isolate identifier (e.g. Gd1111) contained in a file name.
# Arguments: $1 - path to a FASTA file
# Outputs:   the first "Gd..." token of the file name (up to the next "_" or
#            "."), or the file name without ".fasta" when there is none
sample_id() {
  local base=${1##*/} # work on the file name only, never on the directories
  local id_regex='Gd[^_.]+'

  if [[ $base =~ $id_regex ]]; then
    printf '%s\n' "${BASH_REMATCH[0]}"
  else
    printf '%s\n' "${base%.fasta}"
  fi
}

# Print "<prefix> TAB <header without '>'> TAB <sequence length>" for each
# record of a FASTA file. Records without any sequence are not printed.
# Arguments: $1 - path to the FASTA file
#            $2 - value of the first column
print_contig_lengths() {
  local fasta=$1
  local prefix=$2

  awk -v prefix="$prefix" '
    /^>/ {
      # A new header closes the previous record: report it first.
      if (seq != "") {
        print prefix "\t" substr(name, 2) "\t" length(seq)
      }
      name = $0
      seq = ""
      next
    }
    {
      # Sequence lines of one record are concatenated before measuring.
      seq = seq $0
    }
    END {
      # Report the last record of the file.
      if (seq != "") {
        print prefix "\t" substr(name, 2) "\t" length(seq)
      }
    }
  ' "$fasta"
}

# Loop over each file matching the pattern hypo_Gd*.fasta
for file in /bigvol/omion/05-Polishing/HyPo/Gd*/hypo_Gd*.fasta; do
  # When the glob matches nothing the literal pattern is returned: skip it.
  [[ -f "$file" ]] || continue

  # Extract the isolate part of the filename (Gd1111)
  prefix=$(sample_id "$file")

  print_contig_lengths "$file" "$prefix" >> contig_lengths.tsv
done


# QUAST


## What is QUAST ?
QUAST provides an analysis of the quality of assemblies, notably in terms of contiguity (N50, N90, number of contigs, largest contig, total length in bp, ...).

# Running QUAST


In [None]:
#!/bin/bash
# Place yourself in the folder /bigvol/omion/All_assemblies before running:
# the reports are written to ./Quast/<isolate>/ relative to the current directory.

# Run QUAST on a set of assemblies of one isolate.
# Arguments: $1   - isolate name (e.g. Gd1111)
#            $2.. - assembly names without ".fasta"; each name is used both as
#                   the QUAST label and as the file name inside
#                   /bigvol/omion/All_assemblies/<isolate>/
# Outputs:   QUAST report in ./Quast/<isolate>/
# Returns:   exit status of quast
run_quast() {
  local isolate=$1
  shift

  local -r assemblies_root="/bigvol/omion/All_assemblies"
  local output_dir="./Quast/${isolate}/"
  local -a fasta_files=()
  local name labels

  # Labels and file paths come from the same list so they cannot get out of order.
  for name in "$@"; do
    fasta_files+=("${assemblies_root}/${isolate}/${name}.fasta")
  done
  # QUAST expects the labels as a single comma-separated string.
  labels=$(IFS=,; printf '%s' "$*")

  # Create QUAST output directory for the current isolate
  mkdir -p "$output_dir"

  # Run QUAST and store output in the specified directory
  quast -o "$output_dir" -l "$labels" "${fasta_files[@]}" \
    --fungus --fragmented --k-mer-stats --threads 40
}

# Isolates with modified-basecalling assemblies only
isolates=("Gd1111" "Gd2185" "Gd2407" "Gd267" "Gd442" "Gd4985" "Gd4986" "Gd614" "Gd708" "Gd994")

for isolate in "${isolates[@]}"; do
  run_quast "$isolate" \
    "assembly_${isolate}_Guppy_modbasecalling" \
    "assembly_${isolate}_dorado_modbasecalling" \
    "hypo_${isolate}_Guppy_modbasecalling" \
    "hypo_${isolate}_Dorado_modbasecalling" \
    "final${isolate#Gd}" \
    "filtered_hypo_${isolate}_Dorado_modbasecalling"
done

# Isolates that also have plain (non-modified) basecalling assemblies
isolates=("Gd293" "Gd45")

for isolate in "${isolates[@]}"; do
  run_quast "$isolate" \
    "assembly_${isolate}_Guppy_basecalling" \
    "assembly_${isolate}_Guppy_modbasecalling" \
    "assembly_${isolate}_dorado_basecalling" \
    "assembly_${isolate}_dorado_modbasecalling" \
    "hypo_${isolate}_Guppy_basecalling" \
    "hypo_${isolate}_Guppy_modbasecalling" \
    "hypo_${isolate}_Dorado_basecalling" \
    "hypo_${isolate}_Dorado_modbasecalling" \
    "final${isolate#Gd}" \
    "filtered_hypo_${isolate}_Dorado_modbasecalling"
done
