# Result outputs of the freshwater sponge resistome analysis

1. Statistics of reads and contigs, bins and MAGs
- (OPEN) SL_Fasta_Files_stats_modified_jobname.pl
    - Reads: RAW, EC, Contig_min500, Contig_PacBio
        - Contigs_min500: StatCTG_Illumina.xls
        - Prokaryotic contigs (after DMC): StatCTG_PROKS.xls
        - Contig_PacBio: StatCTG_metaFlye.xls
    - Bin, MAG
        - MetaBAT2 bin Illumina: StatMAG_Illumina.xls
        - MetaBAT2 bin PacBio: StatMAG_PacBio.xls
        - dREP MAG: StatMAG_ASG.xls
- Reads: FastQC, MultiQC
- Bins and MAGs: CheckM2
    - MAG_Illumina: MAG_Illumina_quality_report.tsv
    - MAG_ASG: MAG_ASG_quality_report.tsv
    - MAG_metaFLYE: MAG_metaFlye_quality_report.tsv
    - MAG_drep: MAG_dRep_ALL_quality_report.tsv

In [None]:
# Path to fasta stat files (contigs, bins, MAG)
/gxfs_work/geomar/smomw681/DATA/MAG_ALL/QC_ALL/Stats_Fasta

# Path to multiQC results
/gxfs_work/geomar/smomw681/DATA/QC_RESULTS/MULTIQC
/gxfs_work/geomar/smomw681/DATA/MAG_ALL/QC_ALL/QC_RAW_ALL

# Path to CheckM2 results
/gxfs_work/geomar/smomw681/DATA/MAG_Illumina/METABAT2/CheckM2/Backups_w.o_66/MAG_Illumina_quality_report.tsv
/gxfs_work/geomar/smomw681/DATA/MAG_PacBio/CheckM2_PacBio
/gxfs_work/geomar/smomw681/DATA/MAG_ALL/CheckM_ALL
/gxfs_work/geomar/smomw681/DATA/MAG_ALL/QC_ALL/CheckM_MAG  # all collected


# Submit the FASTA statistics jobs (STATS.xls files) through Slurm.
# The directory handed to the Perl script is relative, so the jobs are
# submitted from the MAG_Illumina directory.
cd /gxfs_work/geomar/smomw681/DATA/MAG_Illumina/

# Perl script that is run inside each job
stats_script=/gxfs_work/geomar/smomw681/DATA/MAG_Files/SL_Fasta_Files_stats_modified_jobname.pl

## CTG_Illumina: statistics for the directory CONTIGs_renamed
sbatch --cpus-per-task=3 --partition=base --mem=50G --job-name=StatCTG_Illumina \
     --wrap="perl ${stats_script} \
     CONTIGs_renamed"

## CTG_PROKS: statistics for the directory CLASS_CONTIGs/PROKS
sbatch --cpus-per-task=3 --partition=base --mem=50G --job-name=StatCTG_PROKS \
     --wrap="perl ${stats_script} \
     CLASS_CONTIGs/PROKS"
##

2. Taxonomic identification/Classification with DMC
- GTDBTK
    - Archaea: gtdbtk.ar53.summary.tsv
    - Bacteria: gtdbtk.bac120.summary.tsv

- 

In [None]:
# Path to GTDB-TK summary files
/gxfs_work/geomar/smomw681/DATA/MAG_ALL/GTDBTK_ALL/*tsv

# DMC Stats
/gxfs_work/geomar/smomw681/DATA/MAG_ALL/DeepMicroClass_ALL/Stats_DMC_drep.tsv

# DMC statistics:
#!/bin/bash

#######################################
# Count, for every *.fasta_pred_onehot_hybrid.tsv file in the current
# directory, how many contigs are assigned to each class and write one
# row per file to a tab-separated table.
#
# Each data row (the first line of a file is skipped as header) is assigned
# to whichever of columns 2-6 holds the highest score; on a tie the leftmost
# column wins. Columns 2-6 are counted as Eukaryote, EukaryoteVirus, Plasmid,
# Prokaryote and ProkaryoteVirus, in that order.
#
# Arguments: $1 - output table (created or overwritten)
# Outputs:   header line plus one line per prediction file in $1; only the
#            header is written when no prediction file is present
#######################################
summarize_dmc_predictions() {
  local output=$1
  local file base

  printf 'File\tEukaryote\tEukaryoteVirus\tPlasmid\tProkaryote\tProkaryoteVirus\n' > "$output"

  for file in *.fasta_pred_onehot_hybrid.tsv; do
    # An unmatched glob stays literal; skip it instead of writing a bogus row
    [[ -e "$file" ]] || continue
    base=$(basename "$file" .fasta_pred_onehot_hybrid.tsv)

    awk -v fname="$base" '
      BEGIN { FS = OFS = "\t" }
      NR > 1 {
        # index of the highest-scoring column among 2..6
        best = 2
        for (i = 3; i <= 6; i++)
          if ($i > $best) best = i
        count[best]++
      }
      END {
        # "+ 0" prints 0 for classes that never occurred
        print fname, count[2] + 0, count[3] + 0, count[4] + 0, count[5] + 0, count[6] + 0
      }' "$file" >> "$output"
  done
}

# Illumina predictions. The table is only written when the cd succeeds, so a
# wrong or unset $WORK can no longer drop the file into an unrelated directory.
# For the dRep table, cd into the matching prediction directory and pass
# "Stats_DMC_drep.tsv" instead.
cd "$WORK/DATA/MAG_Illumina/DEEPMicroClass/" &&
  summarize_dmc_predictions "Stats_DMC_Illumina.tsv"


3. Relative abundance of prokaryotes and ARG in samples 
- CoverM: drep_MAG in samples
    - Summarize the coverm tsv output results
- bbmap or CoverM: ARG in samples

In [None]:
# Path to coverM outputs
/gxfs_work/geomar/smomw681/DATA/MAG_ALL/CoverM_ALL/*.coverm_proks_dRepMAGs.tsv

 #######################################
 # Sort every *.coverm_proks_dRepMAGs.tsv table in the current directory and
 # write the result to CoverM_sorted/<name>_sorted.tsv.
 #
 # The first line of each table is kept in place as header; the remaining
 # lines are sorted with "sort -k1 -n". Each file is now passed to the tools
 # explicitly and the output name is built by the shell (the earlier awk
 # one-liner had no input file and its $i / ${base} were never expanded).
 #
 # Outputs: one sorted file per input table; CoverM_sorted/ is created when
 #          at least one input table exists
 #######################################
 sort_coverm_tables() {
   local i base
   for i in ./*.coverm_proks_dRepMAGs.tsv; do
     # An unmatched glob stays literal; nothing to do in that case
     [[ -e "$i" ]] || continue
     mkdir -p CoverM_sorted
     base=$(basename "$i" ".tsv")
     {
       head -n 1 "$i"
       tail -n +2 "$i" | sort -k1 -n
     } > "CoverM_sorted/${base}_sorted.tsv"
   done
 }

 sort_coverm_tables

 #!/bin/bash

#######################################
# Merge the sorted CoverM tables into one wide table.
#
# Reads CoverM_sorted/*.coverm_proks_dRepMAGs_sorted.tsv. The output starts
# with a header line "MAG_ID<TAB><sample>..." (sample = file name without the
# ".coverm_proks_dRepMAGs_sorted.tsv" suffix), followed by column 1 of the
# first non-empty table and column 2 of every table, pasted side by side.
# The header lines of the input tables are pasted through as the first data
# row. Rows are combined by position only.
# NOTE(review): this presumes every table lists the same MAGs in the same
# order - confirm for the actual inputs.
#
# Columns are pasted in exactly the order used for the header, and all
# scratch files live in a private mktemp directory, so unrelated *.tmp files
# in the working directory are neither merged nor deleted.
#
# Arguments: $1 - output file (default: MAG_coverM_hitlist.tsv)
# Returns:   1 if no input table is found or the scratch directory cannot
#            be created
#######################################
build_coverm_hitlist() {
  local output_file=${1:-MAG_coverM_hitlist.tsv}
  local header="MAG_ID"
  local tmp_dir i base col_file
  local -a inputs=() columns=()

  # Collect the input tables once; this order is used for header and columns
  for i in CoverM_sorted/*.coverm_proks_dRepMAGs_sorted.tsv; do
    if [[ -e "$i" ]]; then
      inputs+=("$i")
    fi
  done
  if (( ${#inputs[@]} == 0 )); then
    echo "No CoverM_sorted/*.coverm_proks_dRepMAGs_sorted.tsv files found" >&2
    return 1
  fi

  tmp_dir=$(mktemp -d) || return 1

  for i in "${inputs[@]}"; do
    base=$(basename "$i" ".coverm_proks_dRepMAGs_sorted.tsv")

    # First column is taken from the first table that yields any content
    if [[ ! -s "$tmp_dir/first_col" ]]; then
      cut -f1 "$i" > "$tmp_dir/first_col"
    fi

    # Second column of this table, stored under its position in the list
    col_file="$tmp_dir/col_${#columns[@]}"
    cut -f2 "$i" > "$col_file"
    columns+=("$col_file")

    header+=$'\t'"$base"
  done

  printf '%s\n' "$header" > "$output_file"
  paste "$tmp_dir/first_col" "${columns[@]}" >> "$output_file"

  rm -rf -- "$tmp_dir"
}

build_coverm_hitlist "MAG_coverM_hitlist.tsv"


4. DeepARG results
- wc, awk
- SL_Summarize_deeparg_out_mapping_ARG_with_InOutOptions_modified.pl script
- ARG_hitlist
    - ARG on MAG_drep ORF 
    - ARG on MetaBAT2 bin ORF (PacBio, Illumina, ASG)

In [None]:
# Path to DeepARG outputs
# DeepARG on MAG_drep ORF:
/gxfs_work/geomar/smomw681/DATA/MAG_ALL/DeepARG_ALL/DeepARG_ORF
/gxfs_work/geomar/smomw681/DATA/MAG_ALL/DeepARG_ALL/DeepARGs_BIN_hits_perSample.txt
/gxfs_work/geomar/smomw681/DATA/MAG_ALL/DeepARG_ALL/DeepARG_BIN_Summary

# DeepARG on bin ORF:
/gxfs_work/geomar/smomw681/DATA/MAG_ALL/DeepARG_ALL/DeepARG_ORF
/gxfs_work/geomar/smomw681/DATA/MAG_ALL/DeepARG_ALL/DeepARG_ORF/DeepARGs_hits_perSample.tsv
/gxfs_work/geomar/smomw681/DATA/MAG_ALL/DeepARG_ALL/DeepARG_ORF_Summary

# Stats script in 5_1_3


In [None]:

# Unique ARGs in column 1
# NOTE: the cd lines and the output_file assignments below are alternatives;
# executed top to bottom, only the last of each takes effect. Keep the one
# that matches the data set being summarised.
cd /gxfs_work/geomar/smomw681/DATA/MAG_ALL/DeepARG_ALL/DeepARG_ALL
cd /gxfs_work/geomar/smomw681/DATA/MAG_ALL/DeepARG_ALL/DeepARG_ALL/DeepARG_ES


##!/bin/bash

# Output file
output_file="DeepARG_CTG_all_unique.tsv"
output_file="DeepARG_CTG_FS_unique.tsv"
output_file="DeepARG_CTG_ES_unique.tsv"
output_file="DeepARG_MAG_all_unique.tsv"

#######################################
# Tabulate how often each value of column 1 occurs across all *.mapping.ARG
# files in the current directory (the first line of every file is skipped as
# header).
#
# The output holds a header "ARG_Name<TAB>Count", one line per distinct value
# sorted by descending count, then the total number of hits and the number of
# distinct values. Both totals are derived from the same collected values as
# the table; the distinct count previously re-read every *.ARG file, which
# also matches files other than *.mapping.ARG.
#
# Arguments: $1 - output file (created or overwritten)
# Returns:   1 if no *.mapping.ARG file is found or no temp file can be made
#######################################
count_unique_args() {
  local output_file=$1
  local tmp_file file total unique_ARG
  local -a inputs=()

  for file in *.mapping.ARG; do
    if [[ -e "$file" ]]; then
      inputs+=("$file")
    fi
  done
  if (( ${#inputs[@]} == 0 )); then
    echo "No *.mapping.ARG files found in $PWD" >&2
    return 1
  fi

  # Temporary file collecting column 1 of every input file
  tmp_file=$(mktemp) || return 1
  for file in "${inputs[@]}"; do
    awk 'NR > 1 {print $1}' "$file" >> "$tmp_file"
  done

  # Per-value counts, most frequent first
  printf 'ARG_Name\tCount\n' > "$output_file"
  sort "$tmp_file" | uniq -c | sort -nr | awk '{print $2 "\t" $1}' >> "$output_file"

  # Total number of hits
  total=$(awk 'END {print NR}' "$tmp_file")
  printf '\nTotal ARG hits:\t%s\n' "$total" >> "$output_file"

  # Number of distinct values
  unique_ARG=$(sort -u "$tmp_file" | awk 'END {print NR}')
  printf '\nUnique ARGs:\t%s\n' "$unique_ARG" >> "$output_file"

  rm -- "$tmp_file"
}

count_unique_args "$output_file"

## re-do the statistics for only freshwater and estuarine sponges
# DeepARG_CTG_FS_unique.tsv

# cd /gxfs_work/geomar/smomw681/DATA/MAG_ALL/DeepARG_ALL/DeepARG_ALL/DeepARG_ES
# DeepARG_CTG_ES_unique.tsv


In [None]:
## ARG class in column 5
# NOTE: the cd lines and the output_file assignments below are alternatives;
# executed top to bottom, only the last of each takes effect. Keep the one
# that matches the data set being summarised.
cd /gxfs_work/geomar/smomw681/DATA/MAG_ALL/DeepARG_ALL/DeepARG_ALL
cd /gxfs_work/geomar/smomw681/DATA/MAG_ALL/DeepARG_ALL/DeepARG_ALL/DeepARG_ES

# Output file
output_file="DeepARG_CTG_all_unique_class.tsv"
output_file="DeepARG_CTG_FS_unique_class.tsv"
output_file="DeepARG_CTG_ES_unique_class.tsv"
output_file="DeepARG_MAG_all_unique_class.tsv"

#######################################
# Tabulate how often each value of column 5 occurs across all *.mapping.ARG
# files in the current directory (the first line of every file is skipped as
# header).
#
# The output holds a header "ARG_Class<TAB>Count", one line per distinct
# value sorted by descending count, then the total number of hits and the
# number of distinct values. The distinct count is taken from the collected
# column-5 values; it previously counted column 1 of every *.ARG file, i.e.
# names instead of classes.
#
# Arguments: $1 - output file (created or overwritten)
# Returns:   1 if no *.mapping.ARG file is found or no temp file can be made
#######################################
count_arg_classes() {
  local output_file=$1
  local tmp_file file total unique_class
  local -a inputs=()

  for file in *.mapping.ARG; do
    if [[ -e "$file" ]]; then
      inputs+=("$file")
    fi
  done
  if (( ${#inputs[@]} == 0 )); then
    echo "No *.mapping.ARG files found in $PWD" >&2
    return 1
  fi

  # Temporary file collecting column 5 of every input file
  tmp_file=$(mktemp) || return 1
  for file in "${inputs[@]}"; do
    awk 'NR > 1 {print $5}' "$file" >> "$tmp_file"
  done

  # Per-class counts, most frequent first
  printf 'ARG_Class\tCount\n' > "$output_file"
  sort "$tmp_file" | uniq -c | sort -nr | awk '{print $2 "\t" $1}' >> "$output_file"

  # Total number of hits
  total=$(awk 'END {print NR}' "$tmp_file")
  printf '\nTotal class hits:\t%s\n' "$total" >> "$output_file"

  # Number of distinct classes
  unique_class=$(sort -u "$tmp_file" | awk 'END {print NR}')
  printf '\nTotal_Class:\t%s\n' "$unique_class" >> "$output_file"

  rm -- "$tmp_file"
}

count_arg_classes "$output_file"

5. DeepBGC
- 

6. AntiSMASH 
- 
- ARG cluster on MAG_drep: 
    - in AntiSMASH_ALL 
    - align.daa.tsv, out.mapping.ARG
- ARG cluster on MetaBAT2 bin (PacBio, Illumina, ASG)
    - in AntiSMASH_BIN

In [None]:
# Path to AntiSMASH results
/gxfs_work/geomar/smomw681/DATA/MAG_ALL/AntiSMASH_ALL/
/gxfs_work/geomar/smomw681/DATA/MAG_ALL/AntiSMASH_ALL/AntiSMASH_ALL
/gxfs_work/geomar/smomw681/DATA/MAG_ALL/AntiSMASH_ALL/AntiSMASH_BIN

## Summarise the antiSMASH results
# Load the toolchain and activate the conda environment named AntiSMASH
module load gcc12-env/12.3.0
module load miniconda3/24.11.1
conda activate AntiSMASH

# count_regions.py is called with a result directory and an output table
count_regions=/gxfs_work/geomar/smomw681/.conda/envs/AntiSMASH/bin/multismash/workflow/scripts/count_regions.py
antismash_root=/gxfs_work/geomar/smomw681/DATA/MAG_ALL/AntiSMASH_ALL

# AntiSMASH on MAG_drep (output path is relative to the current directory)
python "${count_regions}" \
    "${antismash_root}/AntiSMASH_ALL/" \
    MAG_ALL/AntiSMASH_ALL/AntiSMASH_ALL/AntiSMASH_FS_MAG_drep_ALL.tsv

# AntiSMASH on MetaBAT2 bin (output path is relative to the current directory)
python "${count_regions}" \
    "${antismash_root}/AntiSMASH_BIN/" \
    MAG_ALL/AntiSMASH_ALL/AntiSMASH_BIN/AntiSMASH_BIN_METABAT2.tsv

# AntiSMASH on contigs (absolute output path)
python "${count_regions}" \
    "${antismash_root}/AntiSMASH_CTG" \
    "${antismash_root}/AntiSMASH_CTG/AntiSMASH_CTG_ALL.tsv"

# The script tabulate_regions.py can also be used
