# Pipeline to execute RepeatModeler2 + EDTA --> MCHelper --> RepeatMasker (with the help of Alba Marino)

TEs were identified with RepeatModeler2 and EDTA. Both tools generate comprehensive TE libraries, which may include redundant sequences. All libraries were clustered with CD-HIT-EST (-d 0 -aS 0.98 -c 0.95 -G 0 -g 1 -b 500), and redundant sequences were filtered out by MCHelper using a minimum identity of 95% and a minimum coverage of 98% (-outfmt 6 -perc_identity 95). We ran MCHelper in semi-automatic mode to avoid having to verify the detected TEs one by one.
Once repeats were identified, they were annotated with RepeatMasker (-xsmall). 
To do so, we used a pipeline designed by Alba Marino, a PhD student in the lab (see code below). We ran it separately for each P. destructans isolate and for the outgroup. The TE libraries from all P. destructans isolates were then merged. The final TE annotation was performed with RepeatMasker using the merged P. destructans TE library. 

In [None]:
#!/bin/bash

# TE annotation pipeline

# TO DO
# add help message
# add vm mode for softwares running in docker
# change docker with singularity

# Usage for a complete run (consensi discovery, automatic curation, genome annotation):
# ./script.sh --species Drosophila_melanogaster --output ~/TEannotation_pipeline/TEannotation_benchmarking/results --threads 6 --run-dnapt2x --reads ~/TEannotation_pipeline/Drosophila_pectinifera_sample.fastq.gz --genome-size 1000000 --run-rm2 --run-edta --assembly ~/TEannotation_pipeline/GCA_000001215.4_sample.fa --run-mchelper --busco-lineage diptera --container-mode

#############################
# TO EDIT
# Fill up the variables with programs full paths. If a program is in $PATH or running in Docker replace the path with ""
# Keep the trailing slash on non-empty paths: the tool paths are concatenated
# directly with the program name further down (e.g. "${EDTA_PATH}"EDTA.pl).

RM2_PATH=""
DNAPT_PATH=/bigvol/alba/bin/pipeline_dnapipe/
RM_PATH=""
EDTA_PATH=/bigvol/alba/bin/EDTA-2.0.1/
MCHELPER_PATH=/bigvol/alba/bin/MCHelper/
CONDA_PATH=${HOME}/miniconda3/
#############################
# DO NOT EDIT BELOW THIS LINE

# Abort the whole pipeline as soon as one command fails
set -e

# Defaults, overridden by --threads and --sampling-size
THREADS=1
SAMPLING_SIZE="0.25"

# Parse command-line options.
# Options that take a value consume two positional parameters (name + value);
# plain flags consume one. Flags are stored as "1" and are otherwise left unset,
# which is what the later `[[ -v ... ]]` tests rely on.
while [[ $# -gt 0 ]]; do
	case $1 in
		--use-rm2-output) # use previous RM2 result instead of running RM2 from scratch (optional) - supply RM2 output directory previously generated
		RM2_OUTPUT="$2"
		shift # past argument
		shift # past value
		;;
		--species) # MANDATORY argument for species name: used to name the parent directory with all the analyses outputs and as files prefix
		SPECIES="$2"
		shift
		shift
		;;
		--assembly) # MANDATORY argument for assembly name
		ASSEMBLY="$2"
		shift
		shift
		;;
		--reads) # fastq file (mandatory if --run-dnapt2x is used)
		READS="$2"
		shift
		shift
		;;
		--output) # MANDATORY argument for main output directory
		OUT="$2"
		shift
		shift
		;;
		--threads) # number of threads to use (optional; default 1)
		THREADS="$2"
		shift
		shift
		;;
		--run-rm2) # run RM2 from scratch (optional)
		RUN_RM2="1"
		shift
		;;
		--use-dnapt2x-output) # use previous dnaPT contigs instead of running dnaPT2x from scratch (optional) - supply dnaPipeTE directory previously generated
		DNAPT2X_OUTPUT="$2"
		shift
		shift
		;;
		--run-dnapt2x) # run dnaPT2X from scratch (optional)
		RUN_DNAPT2X="1"
		shift
		;;
		--genome-size) # supply genome size for dnaPT (mandatory if --run-dnapt2x is used)
		GENOME_SIZE="$2"
		shift
		shift
		;;
		--sampling-size) # coverage used by dnaPT (optional)
		SAMPLING_SIZE="$2"
		shift
		shift
		;;
		--run-edta) # run EDTA from scratch (optional)
		RUN_EDTA="1"
		shift
		;;
		--use-edta-output) # use previous EDTA result instead of running it from scratch (optional) - supply EDTA directory previously generated
		EDTA_OUTPUT="$2"
		shift
		shift
		;;
		--run-mchelper) # run automated curation of all produced libraries altogether (optional)
		RUN_MCHELPER="1"
		shift
		;;
		--busco-lineage) # choose the busco dataset (all lowercase) used by MCHelper for false positive detection (mandatory if --run-mchelper is used) - see https://busco-data.ezlab.org/v5/data/lineages/ for available lineages
		BUSCO_LINEAGE="$2"
		shift
		shift
		;;
		--run-mask) # run RepeatMasker with the final library (optional)
		RUN_MASK="1"
		shift
		;;
		--container-mode) # use this option if RepeatModeler2 and RepeatMasker are used in the te-tools docker image
		CONTAINER_MODE="1"
		shift
		;;
		*) # unknown argument: without this branch nothing is shifted and the loop never terminates
		echo "Unknown option: $1" >&2
		exit 1
		;;
	esac
done

# handle missing mandatory arguments
# Every check prints a message and exits with status 1 when a required option is absent.

if [[ ! -v SPECIES ]] || [[ ! -v OUT ]]; then
	echo "Either --species or --output are missing"
	exit 1
fi

if [[ -v RUN_DNAPT2X ]] && [[ ! -v GENOME_SIZE ]]; then
	echo "Please supply --genome-size to run dnaPipeTE"
	exit 1
fi

if [[ -v RUN_DNAPT2X ]] && [[ ! -v READS ]]; then
	echo "Please supply --reads to run dnaPipeTE"
	exit 1
fi

# All of these steps read the assembly: RepeatModeler2, EDTA and RepeatMasker take it
# as input, MCHelper receives it through -g, and --use-edta-output renames the assembly
# to rebuild the name of the EDTA library file.
if [[ -v RUN_RM2 || -v RUN_EDTA || -v EDTA_OUTPUT || -v RUN_MCHELPER || -v RUN_MASK ]] && [[ ! -v ASSEMBLY ]]; then
	echo "Please supply --assembly for TE discovery, automated curation, and/or masking"
	exit 1
fi

if [[ -v RUN_MCHELPER ]] && [[ ! -v BUSCO_LINEAGE ]]; then
	echo "Please supply --busco-lineage to run MCHelper"
	exit 1
fi

# set environment variables
# Path expansions are quoted so that directories or file names containing spaces
# or glob characters are passed to the tools as a single argument.

# Per-species directory holding the output of every module
OUT_SP=${OUT}/${SPECIES}

if [[ -v ASSEMBLY ]]; then

	BASENAME_ASSEMBLY=$(basename "${ASSEMBLY}")

fi

# Both modules run inside their own conda environment, so `conda activate` must be available
if [[ -v RUN_MCHELPER ]] || [[ -v RUN_EDTA ]]; then

	source "${CONDA_PATH}/etc/profile.d/conda.sh"

fi

if [[ -v RUN_EDTA ]] || [[ -v EDTA_OUTPUT ]]; then

	# Replace every header with ctg_<record number>; the renamed file is written next to the assembly
	seqkit replace -p '.+' -r 'ctg_{nr}' "$ASSEMBLY" > "${ASSEMBLY}_rename.fasta"
	RENAMED_ASSEMBLY=$(readlink -f "${ASSEMBLY}_rename.fasta") # genome with reduced headers to use with EDTA
	BASENAME_RENAMED_ASSEMBLY=$(basename "${RENAMED_ASSEMBLY}")

fi

# Set up te-tools container if --vm-mode is on

#if [[ -v VM_MODE ]]; then

#	echo "RepeatModeler2 and/or RepeatMasker will be run inside docker container"
#	singularity exec dfam-tetools-latest.sif
#	docker stop $(docker ps -a -q) && docker rm $(docker ps -a -q)
#	docker run --name tetools -d -i -t dfam/tetools
#	docker cp $ASSEMBLY tetools:/opt/src/
#	docker exec -it tetools bash
#	BuildDatabase -name ${SPECIES} -engine ncbi $ASSEMBLY
#	RepeatModeler -engine ncbi -threads $THREADS -database ${RM2_OUTPUT}/${SPECIES}
#	RM2_LIB=$(readlink -f ${SPECIES}\-families.fa)

#fi

# TE discovery step

## RM2 submodule: obtain de novo families from RepeatModeler2

# NOTE(review): VM_MODE is never assigned in this script (--container-mode sets
# CONTAINER_MODE), so the container branch below is currently unreachable and
# RepeatModeler2 always runs through the first branch - confirm the intended behaviour.
if [ "$RUN_RM2" = "1" ] && [ ! -v VM_MODE ]; then

	RM2_OUTPUT=${OUT_SP}/RepeatModeler2
	mkdir -p "$RM2_OUTPUT"
	# Make the path absolute: RepeatModeler is started after cd'ing into the output
	# directory, where a relative --output would no longer point at the database.
	RM2_OUTPUT=$(readlink -f "$RM2_OUTPUT")
	echo "Running RepeatModeler2"
	"${RM2_PATH}"BuildDatabase -name "${RM2_OUTPUT}/${SPECIES}" -engine ncbi "$ASSEMBLY"
	(
		# subshell: the directory change does not leak into the rest of the pipeline
		cd "$RM2_OUTPUT"
		"${RM2_PATH}"RepeatModeler -engine ncbi -threads "$THREADS" -LTRStruct -database "${RM2_OUTPUT}/${SPECIES}"
	)

elif [ "$RUN_RM2" = "1" ] && [ -v VM_MODE ]; then

	echo "Running RepeatModeler2 in te-tools container"

# run RM2 in container

else

	echo "Skipping RepeatModeler2"

fi

# RM2 check: *families.fa should exist if one of the two arguments was used
# RM2_LIB is set here (in the main shell) so that the merging step can read it.

if [[ -v RUN_RM2 ]] || [[ -v RM2_OUTPUT ]]; then

	if [[ ! -f "${RM2_OUTPUT}/${SPECIES}-families.fa" ]]; then

		echo "Could not find RepeatModeler2 families at ${RM2_OUTPUT}"
		exit 1

	else

		RM2_LIB=$(readlink -f "${RM2_OUTPUT}/${SPECIES}-families.fa")

	fi
fi

## dnaPipeTE submodule: run two rounds of dnaPipeTE and keep the contigs produced by the 2nd round ("quick & clean")

if [[ "$RUN_DNAPT2X" != "1" ]]; then

	echo "Skipping dnaPipeTE 2x module"

else

	DNAPT2X_OUTPUT=${OUT_SP}/dnaPipeTE2x
	mkdir -p ${DNAPT2X_OUTPUT}
	echo "Running dnaPipeTE 2x module"

	(
		# snakemake is launched from the dnaPipeTE pipeline directory, so every path
		# handed to it is made absolute first; the subshell keeps the cd local
		ACC_NUM=$(basename ${READS%fastq.gz})
		ABS_READS=$(readlink -f $READS)
		ABS_DNAPT2X_OUTPUT=$(readlink -f $DNAPT2X_OUTPUT)
		cd $DNAPT_PATH && snakemake all --use-conda -j $THREADS -C genome_size=$GENOME_SIZE sampling_size=$SAMPLING_SIZE out_dir=${ABS_DNAPT2X_OUTPUT} short_reads=$ABS_READS species=$SPECIES acc_num=${ACC_NUM}
	)

fi

## dnaPipeTE output check: the Trinity contigs must be present whenever dnaPipeTE was run or a previous output directory was supplied

if [[ -v RUN_DNAPT2X || -v DNAPT2X_OUTPUT ]]; then

	if [[ -f ${DNAPT2X_OUTPUT}/final_dnapipete_output/Trinity.fasta ]]; then

		DNAPT2X_LIB=$(readlink -f ${DNAPT2X_OUTPUT}/final_dnapipete_output/Trinity.fasta)

	else

		echo "Could not find dnaPipeTE contigs at ${DNAPT2X_OUTPUT}/final_dnapipete_output/"
		exit 1

	fi
fi

## EDTA submodule

if [[ "$RUN_EDTA" != "1" ]]; then

	echo "Skipping EDTA module"

else

	EDTA_OUTPUT=${OUT_SP}/EDTA
	mkdir -p ${EDTA_OUTPUT}
	echo "Running EDTA"
	conda activate EDTA
	(
		# EDTA is started from inside its output directory; the subshell keeps the cd local
		cd $EDTA_OUTPUT
		"${EDTA_PATH}"EDTA.pl --genome $RENAMED_ASSEMBLY --threads $THREADS
	)
	conda deactivate

fi

## EDTA output check: the TE library must be present whenever EDTA was run or a previous output directory was supplied

if [[ -v RUN_EDTA || -v EDTA_OUTPUT ]]; then

	if [[ -f "${EDTA_OUTPUT}"/"${BASENAME_RENAMED_ASSEMBLY}".mod.EDTA.TElib.fa ]]; then

		EDTA_LIB=$(readlink -f "${EDTA_OUTPUT}"/"${BASENAME_RENAMED_ASSEMBLY}".mod.EDTA.TElib.fa)

	else

		echo "Could not find EDTA output at ${EDTA_OUTPUT}"
		exit 1

	fi
fi

# Concatenate libraries and handle consensi duplicates in case of pipeline rerun
# (output from one module is not added to mergedlibs.fa if families from the same tool are already present)
# Each tool's sequence ids receive a tool prefix (RM2_, dnaPT_, EDTA_); the presence of
# that prefix at the start of a header is what marks a tool as "already merged".

LIBS_OUTPUT=${OUT_SP}/mergedlibs_precuration
mkdir -p "${LIBS_OUTPUT}"

# Defined here because the messages below mention it; same location as used by the curation step
MCHELPER_OUTPUT=${OUT_SP}/MCHelper

if [[ -v RUN_RM2 ]] || [[ -v RM2_OUTPUT ]]; then

	if [[ -f "${LIBS_OUTPUT}/mergedlibs.fa" ]] && grep -q "^>RM2_" "${LIBS_OUTPUT}/mergedlibs.fa"; then

		echo -e "${RM2_LIB} is not being added to ${LIBS_OUTPUT}/mergedlibs.fa, as families from RepeatModeler2 are already in there.\n\nIf instead you want to replace them, remove sequences with 'RM2_' prefix from mergedlibs.fa and append ${RM2_LIB} to it, then delete ${MCHELPER_OUTPUT} if needed and rerun the pipeline with --run-mchelper option."

	else

		# add tool prefix to seqid (anchored so that only the leading '>' of a header is touched)
		sed 's/^>/>RM2_/' "$RM2_LIB" >> "${LIBS_OUTPUT}/mergedlibs.fa"
	fi
fi

if [[ -v RUN_DNAPT2X ]] || [[ -v DNAPT2X_OUTPUT ]]; then

	if [[ -f "${LIBS_OUTPUT}/mergedlibs.fa" ]] && grep -q "^>dnaPT_" "${LIBS_OUTPUT}/mergedlibs.fa"; then

		echo -e "${DNAPT2X_LIB} is not being added to ${LIBS_OUTPUT}/mergedlibs.fa, as dnaPipeTE contigs are already in there.\n\nIf instead you want to replace them, remove sequences with 'dnaPT_' prefix from mergedlibs.fa and append ${DNAPT2X_LIB} to it, then delete ${MCHELPER_OUTPUT} if needed and rerun the pipeline with --run-mchelper option."

	else

		# prefix the seqid and append the '#Unknown' classification to every header
		sed -e 's/^>/>dnaPT_/' -e '/^>/ s/$/#Unknown/' "$DNAPT2X_LIB" >> "${LIBS_OUTPUT}/mergedlibs.fa"
	fi
fi

if [[ -v RUN_EDTA ]] || [[ -v EDTA_OUTPUT ]]; then

	if [[ -f "${LIBS_OUTPUT}/mergedlibs.fa" ]] && grep -q "^>EDTA_" "${LIBS_OUTPUT}/mergedlibs.fa"; then

		echo -e "${EDTA_LIB} is not being added to ${LIBS_OUTPUT}/mergedlibs.fa, as families from EDTA are already in there.\n\nIf instead you want to replace them, remove sequences with 'EDTA_' prefix from mergedlibs.fa and append ${EDTA_LIB} to it, then delete ${MCHELPER_OUTPUT} if needed and rerun the pipeline with --run-mchelper option."

	else

		sed 's/^>/>EDTA_/' "$EDTA_LIB" >> "${LIBS_OUTPUT}/mergedlibs.fa"

	fi
fi


# Libraries curation step: run MCHelper
# Downloads the BUSCO HMM profiles of the requested lineage, concatenates them into a
# single .hmm file and runs MCHelper on the merged library inside the MCHelper conda env.

if [ "$RUN_MCHELPER" = "1" ]; then

	MCHELPER_OUTPUT=${OUT_SP}/MCHelper
	mkdir -p "${MCHELPER_OUTPUT}"
	BUSCO_OUTPUT=${OUT_SP}/busco_profile
	mkdir -p "${BUSCO_OUTPUT}"
	echo "Running MCHelper"
	conda activate MCHelper

## download busco hmm profiles

	wget -O "${BUSCO_OUTPUT}/lineages.html" https://busco-data.ezlab.org/v5/data/lineages/
	# Select the archive of exactly this lineage: the opening quote and the "_odb10." suffix
	# keep e.g. "bacteria" from also matching "cyanobacteria" or archives of another odb release.
	TARNAME=$(grep -F "\"${BUSCO_LINEAGE}_odb10." "${BUSCO_OUTPUT}/lineages.html" | head -n 1 | cut -d'"' -f2)
	if [[ -z "$TARNAME" ]]; then
		echo "Could not find a ${BUSCO_LINEAGE}_odb10 archive at https://busco-data.ezlab.org/v5/data/lineages/"
		exit 1
	fi
	ABS_BUSCO_PREF=${BUSCO_OUTPUT}/${BUSCO_LINEAGE}
	wget -O "${ABS_BUSCO_PREF}.tar.gz" "https://busco-data.ezlab.org/v5/data/lineages/${TARNAME}"
	tar -xf "${ABS_BUSCO_PREF}.tar.gz" -C "$BUSCO_OUTPUT"
	# merge all profiles into one file, then drop the archive and the extracted directory
	cat "${ABS_BUSCO_PREF}_odb10"/hmms/*hmm > "${ABS_BUSCO_PREF}.hmm" && rm -r "${ABS_BUSCO_PREF}.tar.gz" "${ABS_BUSCO_PREF}_odb10"


	python3 "${MCHELPER_PATH}"MCHelper.py -r A -t "$THREADS" -l "${LIBS_OUTPUT}/mergedlibs.fa" -o "$MCHELPER_OUTPUT" -g "$ASSEMBLY" --input_type fasta -b "${ABS_BUSCO_PREF}.hmm" -a F

	conda deactivate

else

	echo "Skipping MCHelper module"

fi

# Masking step: use curated library to mask the genome assembly
# Requires ${OUT_SP}/MCHelper/curated_sequences_NR.fa; exits with status 1 when it is missing.

if [ "$RUN_MASK" = "1" ]; then

	MCHELPER_OUTPUT=${OUT_SP}/MCHelper
	if [[ -f "${MCHELPER_OUTPUT}/curated_sequences_NR.fa" ]]; then

		RM_OUTPUT=${OUT_SP}/RepeatMasker
		mkdir -p "${RM_OUTPUT}"
		# RepeatMasker is started after a cd, so relative --output / --assembly paths must be
		# made absolute first. The assembly path keeps its symlinks (directory resolved with
		# pwd, not readlink -f) so that its directory is the one the user supplied.
		ABS_CURATED_LIB=$(readlink -f "${MCHELPER_OUTPUT}/curated_sequences_NR.fa")
		ABS_ASSEMBLY=$(cd "$(dirname "$ASSEMBLY")" && pwd)/$(basename "$ASSEMBLY")
		(
			cd "$RM_OUTPUT"
			echo "Running RepeatMasker with ${MCHELPER_OUTPUT}/curated_sequences_NR.fa"
			# NOTE(review): no -dir option is given, so RepeatMasker presumably writes its
			# outputs next to the assembly rather than in ${RM_OUTPUT} - confirm.
			"${RM_PATH}"RepeatMasker -lib "$ABS_CURATED_LIB" -a -gff -pa "$THREADS" "$ABS_ASSEMBLY"
		)
	else

		echo "No library to mask with at ${MCHELPER_OUTPUT}"
		exit 1

	fi

fi

## To loop on multiple genomes 

In [None]:
#!/bin/bash

# Automatically run the TEannot pipeline on every assembly found in the current directory
# NOTE: If you want to add or remove a pipeline brick, directly change the arguments in the bash commands of this script. Note that the change will apply in the same way to all the selected assemblies. Only exception is dnaPipeTE which can be run only if reads are available (as per mapfile).

# Requirements:
## sratoolkit should be installed and in $PATH (only needed by the commented-out read download)
## 1. mapfile (only used by the commented-out mapfile mode): a tab-separated table containing <genus_species> in 1st column, <SRaccession> in 2nd column, <assemblysize> in 3rd column. If SR is not available fill in NA.
## 2. all the assemblies for which we want to run the pipeline in the current directory and named as <prefix>_modbasecalling.fasta
## 3. the TEannot script.sh to run

# usage: bash auto_run_teannot.sh 14 leotiomycetes
## Arguments in order: number of threads, busco lineage to be used by MCHelper for false positive filtering
## (the commented-out mapfile mode expects instead: mapfile name, number of threads, busco lineage)

if [[ $# -lt 2 ]]; then
	echo "usage: bash auto_run_teannot.sh <threads> <busco_lineage>" >&2
	exit 1
fi

THREADS=$1
BUSCO_LINEAGE=$2

# Without nullglob an unmatched pattern is kept literally and the pipeline would be
# started on a file called "*_modbasecalling.fasta"
shopt -s nullglob

for GENOME in *_modbasecalling.fasta; do

	# Species name = file name without the "filtered_hypo_" prefix and the "_Dorado_modbasecalling.fasta" suffix
	SPECIES=${GENOME%_Dorado_modbasecalling.fasta}
	SPECIES=${SPECIES#filtered_hypo_}

#	if grep -Fwq "$SPECIES" $1; then

#		SRA=$(grep $SPECIES $1 | cut -f2)
#		GS=$(grep $SPECIES $1 | cut -f3)

#		if [ $SRA = "NA" ]; then

#			echo "Annotating TEs for $SPECIES without dnaPipeTE: reads not available"
			#change here settings to run with the pipeline; can't add dnapt2x options here
#			bash /bigvol/alba/bin/TEannot_script_correct.sh --species $SPECIES --output ${PWD}/TEannotation_results --threads $2 --run-edta --assembly ${PWD}/$GENOME --use-rm2-output ${PWD}/TEannotation_results/${SPECIES}/RepeatModeler2 --run-mchelper --busco-lineage $3 --run-mask

#		else

	echo "Annotating TEs for $SPECIES with all the required options"
	# The following commands fetch reads, convert to zipped fastq and rename if needed: comment them if dnapt2x is not run by script.sh!
#			mkdir tmpfastq
#			(
#				cd tmpfastq
#				prefetch $SRA --max-size 100GB -f yes -p
#				fasterq-dump --outdir $PWD --mem 1G --split-3 --threads $THREADS --skip-technical ${SRA}/${SRA}.sra
#				pigz *fastq
#				COUNT=$(ls ${SRA}.fastq.gz | wc -l)
#				if [ $COUNT = 1 ]; then mv ${SRA}.fastq.gz ${SRA}_1.fastq.gz; fi
#			)

	#change here settings to run with the pipeline
	bash /bigvol/alba/bin/TEannot_script_correct.sh --species "$SPECIES" --output "${PWD}/TEannotation_results" --threads "$THREADS" --run-edta --assembly "${PWD}/$GENOME" --run-mchelper --run-rm2 --run-mask --busco-lineage "$BUSCO_LINEAGE" # --run-dnapt2x --reads ${PWD}/tmpfastq/${SRA}_1.fastq.gz --genome-size $GS
#			mv tmpfastq/* dropfastq/ && rm -r tmpfastq



#		echo "$SPECIES is not in the mapfile. Skipping..."

#	fi


done

* RepeatMasker outputs a .align file that contains the alignments between the consensus TE sequences identified by RepeatMasker and the actual TE sequences from our *P. destructans* assemblies.  
  
* From the .align file we obtain metrics such as the Kimura divergence.
The Kimura divergence estimates the evolutionary divergence between each TE copy and its consensus sequence.

* RepeatMasker uses the following formula to calculate the Kimura divergence: $K = -\frac{1}{2}\ln(1 - 2p - q) - \frac{1}{4}\ln(1 - 2q)$, with $p$ the proportion of transitions (A↔G or C↔T) and $q$ the proportion of transversions (A↔T, A↔C, G↔T, G↔C).

* RepeatMasker uses a CpG-adjusted Kimura divergence, in which two transitions at a CpG site are counted as a single event and one transition at a CpG site is counted as 1/10th of a standard transition. Transversions at CpG sites are counted normally. This adjustment accounts for the hypermutability of CpG sites due to methylation, which would otherwise lead to an overestimation of the divergence.


## Obtain a table with TEs family, start, end position, contig, and kimura divergence

In [None]:
import re
import sys

# Parallel lists: entry i of each list describes the i-th TE copy found in the file
contigs = []
starts = []
ends = []
families = []
kimura_values = []

# Line holding the contig name, the start position, the end position, a number in
# parentheses and the next word (presumably the strand flag or the start of the repeat name)
header_pattern = re.compile(r'(\w+)\s+(\d+)\s+(\d+)\s+\((\d+)\)\s+(\w+)')

# Open and read the file (TODO: loop over every Gd* file)
with open('filtered_hypo_Gd267_Dorado_modbasecalling.fasta.align', 'r') as file:
    for line in file:
        match = header_pattern.search(line)
        if match:
            contigs.append(match.group(1))
            starts.append(int(match.group(2)))
            ends.append(int(match.group(3)))

            # The TE family is the word that follows '#' on the same line; it is taken
            # directly from this line so that it always belongs to this copy. When the
            # line has no '#', the word captured after the parentheses is kept instead.
            if '#' in line:
                families.append(line.split('#')[1].split()[0])
            else:
                families.append(match.group(5))

        # Kimura divergence of the current copy
        if 'Kimura (with divCpGMod)' in line:
            kimura_values.append(float(line.split('=')[1].strip()))

# zip() stops at the shortest list: report a mismatch instead of silently dropping rows
if len(kimura_values) != len(contigs):
    print(
        f"Warning: {len(contigs)} TE copies but {len(kimura_values)} Kimura values; "
        "rows may be truncated or misaligned",
        file=sys.stderr,
    )

# Build the table
table = list(zip(contigs, starts, ends, families, kimura_values))

# Print the table without a header line
# TODO: write directly to a .txt file instead of printing
for contig, start, end, family, kimura in table:
    print(f"{contig:<15} {start:<10} {end:<10} {family:<30} {kimura:<15}")

## Sort by TEs families (in column 4)

In [None]:
# Sort the table on the TE family column (field 4 onwards). '>' replaces the output
# file, so running the cell again does not append a second copy of every row.
# TODO: run this in a for loop over all isolates as well
sort -k4 test.txt > sorted_Gd267.txt

## Histogram of kimura divergence for each TEs families (Rstudio)

In [None]:
library(ggplot2)
library(tidyr)
library(dplyr)
library(gridExtra)

# Define the working directory (every file name below is relative to it)
setwd("/home/stagiaire/Documents/R_script_table/Age_TEs")

# Get the list of .txt files in the directory
# `pattern` is a regular expression, not a shell glob: match a literal ".txt" suffix
file_list <- list.files(pattern = "\\.txt$")

# Define lineage mapping: sorted tables of the isolates of each lineage
lineage1_files <- c("sorted_Gd293.txt", "sorted_Gd1111.txt", "sorted_Gd2407.txt", "sorted_Gd442.txt", "sorted_Gd994.txt", "sorted_Gd4985.txt")
lineage2_files <- c("sorted_Gd45.txt", "sorted_Gd614.txt", "sorted_Gd708.txt", "sorted_Gd2185.txt", "sorted_Gd4986.txt")
outgroup_file <- "sorted_Gd267.txt"

# Function to combine data for each lineage
#
# Reads every headerless, whitespace-separated table listed in `files` and stacks
# them into one data frame with columns Contig, Start, End, Class and Value.
# `files` is a character vector of file paths; an empty vector yields an empty
# data frame.
combine_data <- function(files) {
  column_names <- c("Contig", "Start", "End", "Class", "Value")

  # Read everything first and bind once: calling rbind() inside the loop copies
  # the accumulated data frame at every iteration
  tables <- lapply(unname(files), function(file) {
    read.table(file, header = FALSE, col.names = column_names)
  })

  if (length(tables) == 0) {
    return(data.frame())
  }

  do.call(rbind, tables)
}

# Combine data for each lineage
# (one data frame per group, stacking the tables of all its isolates)
lineage1_data <- combine_data(lineage1_files)
lineage2_data <- combine_data(lineage2_files)
outgroup_data <- combine_data(outgroup_file)


# Function to create plot for a combined dataset
#
# Draws a stacked histogram of the Kimura divergence (`Value`), coloured by TE
# class (`Class`), using 1-unit bins over the range 0-60.
#   data        data frame with at least the columns Value and Class
#   y_limit     upper limit of the y axis
#   title       plot title
#   show_legend whether the Class legend is drawn (default FALSE)
# Returns the ggplot object.
create_plot <- function(data, y_limit, title, show_legend = FALSE) {
  plot_data <- data %>%
    filter(!is.na(Value) & Value >= 0 & Value <= 60)

  p <- ggplot(plot_data, aes(x = Value, fill = Class)) +
    # boundary = 0 aligns the bins on [0, 1], [1, 2], ...: with the default
    # (bins centred on integers) the first bin starts at -0.5, falls outside
    # xlim(0, 60) and its bar is dropped from the plot
    geom_histogram(position = "stack", binwidth = 1, boundary = 0) +
    labs(x = "Kimura divergence (CpG adjusted)", y = "Nb of TEs", title = title) +
    ylim(0, y_limit) + # Set y-axis limit
    xlim(0, 60) # Set x-axis limit

  if (show_legend) {
    p <- p + guides(fill = guide_legend(title = "Class"))
  } else {
    # "none" replaces the deprecated `fill = FALSE`
    p <- p + guides(fill = "none") # Hide the legend
  }

  p
}

# Define y-axis limits (same upper bound for the three panels)
y_limits <- c(lineage1 = 20000, lineage2 = 20000, outgroup = 20000)

# Create plots for each lineage (legend hidden: show_legend keeps its default)
lineage1_plot <- create_plot(lineage1_data, y_limits["lineage1"], "Lineage 1")
lineage2_plot <- create_plot(lineage2_data, y_limits["lineage2"], "Lineage 2")
outgroup_plot <- create_plot(outgroup_data, y_limits["outgroup"], "Outgroup")

# Display the three plots side by side (three columns, no legend panel)
grid.arrange(lineage1_plot, lineage2_plot, outgroup_plot, ncol = 3)


# Genome wide TE coverage 

In [None]:
# TODO: add the code

# Contig wide TE coverage

In [None]:
# TODO: add the code (bedtools coverage)

In [None]:
# Working directory containing the coverage table (machine-specific path)
setwd("C:/Users/ocean/Downloads")

# Load necessary libraries
library(ggplot2)
library(dplyr)
library(gridExtra)

# Read the CSV file (comma-separated, first line is the header)
data <- read.delim("contig_TEs_final(2).csv", header = TRUE, sep=",")

# Rename columns for clarity (adjust these names as needed)
# Columns 2, 3 and 6 become Legend, name and value; the plot below uses them as the
# fill/facet group, the contig name and the TE coverage respectively
colnames(data)[c(2,3,6)] <- c("Legend", "name", "value")

# Keep one row per contig name and sort the result by decreasing value
#
# For every distinct `name` only the first row (in input order) is retained;
# the remaining rows are then ordered from the largest to the smallest `value`.
# NOTE(review): if contig names repeat across genomes, only the first genome's
# row survives - confirm that names are unique per isolate.
filter_unique_contigs_and_reorder <- function(df) {
  first_per_name <- filter(group_by(df, name), row_number() == 1)
  arrange(ungroup(first_per_name), desc(value))
}

# Define colors for each category in Legend
colors <- c("1" = "#FF5733", "2" = "#3377FF", "Outgroup" = "#9ACD32")

# Create the bar plot: one bar per unique contig name, ordered by decreasing value,
# with one facet (stacked in a single column) per Legend category
plot_h <- data %>%
  filter_unique_contigs_and_reorder() %>%
  ggplot(aes(x = reorder(name, -value), y = value, fill = Legend)) +  # Use reorder for ordering
  geom_bar(stat = "identity", position = position_dodge(width = 0.9), color = "black") +
  facet_wrap(~ Legend, ncol = 1, scales = "free_x") +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0), limits = NULL, name = "TE coverage (%)") +  # Y-axis label
  scale_fill_manual(values = colors) +  # Use manual scale for fill colors
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.spacing = unit(0.1, "lines")
  ) +
  labs(title = expression(paste("TE coverage (%) in ", italic("P. destructans"), " and outgroup contigs")), x = NULL, y = NULL)


# Display the plot (a single plot laid out on one row)
grid.arrange(plot_h, nrow = 1)

