# Purpose:

Make outputs for the Zachry Brenton Sorghum paper

# Preamble

In [None]:
# Work from the scratch directory; the relative paths used in later cells
# (e.g. "sorghum/") are resolved against it.
setwd("/scratch/")

Make a directory and get the data

In [None]:
# Create the local data directory. `showWarnings = FALSE` keeps the cell
# quiet when it is re-run and the directory already exists.
dir.create("sorghum/", showWarnings = FALSE)

# Mirror the raw sorghum inputs from S3 into the local directory.
# `system()` returns the command's exit status (0 on success); stop on a
# non-zero status so a failed download is not mistaken for a complete data set.
sync_status <- system("aws s3 sync s3://ddpsc-baxterlab-data/panvar/panvar_mwe/raw_inputs/sorghum/ sorghum/")
if (sync_status != 0) {
  stop("aws s3 sync failed with exit status ", sync_status, call. = FALSE)
}

In [None]:
# Copy the previously computed GWAS table into the working directory.
# `system2()` takes the program name and its arguments separately; passing the
# whole command line as `command` makes it look for a program literally named
# "rclone copy ...", which cannot be run.
rclone_status <- system2(
  "rclone",
  args = c(
    "copy",
    "dandrive:WIP/panvar/sorghum/BAP_WSC/sorghum_BAP_WSC_gwas.tsv",
    "."
  )
)

# Only warn on failure: the same table can be regenerated and written by the
# GWAS cells further down in this notebook.
if (rclone_status != 0) {
  warning("rclone copy failed with exit status ", rclone_status, call. = FALSE)
}

# Dependencies

In [None]:
library(devtools)

Load the development version of `panvar`

In [None]:
# Load the package from the local development checkout.
devtools::load_all("~/repos/devel_panvar/")

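Run the cells below only if you need to rebuild the library and its `Rmd` vignette.

The order is important here: set the working directory first, then document, build, and finally build the vignette.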

In [None]:
# Switch into the package checkout so the devtools calls in the following
# cells, which use the current directory or a relative path, act on it.
setwd("~/repos/devel_panvar/")

In [None]:
# Regenerate the package documentation for the package in the current directory.
devtools::document()

In [None]:
# Build the package in the current directory.
devtools::build()

In [None]:
# Render the panvaR vignette from its R Markdown source (path is relative to
# the package directory).
devtools::build_rmd("vignettes/panvaR.Rmd")

# Inputs

# Scratch

In [None]:
# Build the GWAS table with `panvar_gwas()` from the imputed, annotated VCF
# and the BAP WSC phenotype file.
# NOTE(review): the argument is spelled `phentotype_path` here — confirm this
# matches the parameter name in the definition of `panvar_gwas()`.
gwas_table <- panvar_gwas(
    genotype_data = "sorghum/BAP_376_Chrall_imputed_annotated.vcf.gz",
    phentotype_path = "sorghum/BAP_WSC_pheno.tsv"
)

In [None]:
# Save the GWAS table as a tab-separated file with a header row so later
# cells can reload it without recomputing.
fwrite(
  gwas_table,
  "sorghum_BAP_WSC_gwas.tsv",
  sep = "\t",
  col.names = TRUE
)

In [None]:
# Show the rows on chromosome 4 whose `Pvalues` entry exceeds 18.
gwas_table %>%
  filter(CHROM == 4, Pvalues > 18)

In [None]:
# First `panvar_func()` run: tag SNP at Chr04, position 66529675, with
# `all.impacts = TRUE` and an r2 threshold of 0.8.
test_run <- panvar_func(
    phenotype_data_path = "sorghum/BAP_WSC_pheno.tsv",
    vcf_file_path = "sorghum/BAP_376_Chrall_imputed_annotated.vcf.gz",
    chrom = "Chr04",
    bp = 66529675,
    all.impacts = TRUE,
    r2_threshold = 0.8
)

In [None]:
# Second `panvar_func()` run: same inputs as the first run but with the tag
# SNP at Chr04, position 17548900. The cells further down replay this run's
# parameters step by step.
test_run2 <- panvar_func(
    phenotype_data_path = "sorghum/BAP_WSC_pheno.tsv",
    vcf_file_path = "sorghum/BAP_376_Chrall_imputed_annotated.vcf.gz",
    chrom = "Chr04",
    bp = 17548900,
    all.impacts = TRUE,
    r2_threshold = 0.8
)

Error raised by the `test_run2` call above:
```txt
Error in `left_join()`:
! Can't join `x$CHROM` with `y$CHROM` due to incompatible types.
ℹ `x$CHROM` is a <character>.
ℹ `y$CHROM` is a <double>.
```

In [62]:
# Inputs for stepping through the pipeline by hand; the values mirror the
# arguments of the `test_run2` call.
phenotype_data_path <- "sorghum/BAP_WSC_pheno.tsv"
vcf_file_path <- "sorghum/BAP_376_Chrall_imputed_annotated.vcf.gz"
chrom <- "Chr04"
bp <- 17548900
all.impacts <- TRUE

In [63]:
# Window size and filtering thresholds used by the manual steps below.
window <- 500000
missing_rate <- 0.10
maf <- 0.05
r2_threshold <- 0.3

In [64]:
# Reload the GWAS table from the TSV file in the working directory.
gwas_table <- fread("sorghum_BAP_WSC_gwas.tsv")

In [65]:
# Pass the window size through `window_unit_func()`; the result is used as
# the window around the tag SNP when subsetting below.
window_bp <- window_unit_func(window)

In [66]:
# Convert the VCF with `vcf_to_plink2()`; the `$bed` and `$bim` elements of
# the result are used by later cells.
in_plink_format <- vcf_to_plink2(vcf_file_path)

PLINK v2.00a6LM AVX2 Intel (5 Feb 2024)        www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to panvar/BAP_376_Chrall_imputed_annotated.log.
Options in effect:
  --make-bed
  --out panvar/BAP_376_Chrall_imputed_annotated
  --set-all-var-ids Chr_@_BP_#
  --vcf sorghum/BAP_376_Chrall_imputed_annotated.vcf.gz

Start time: Sun Oct 20 15:08:33 2024
7883 MiB RAM detected, ~5751 available; reserving 3941 MiB for main workspace.
Using 1 compute thread.
--vcf: 5053806 variants scanned.
--vcf: panvar/BAP_376_Chrall_imputed_annotated-temporary.pgen +
panvar/BAP_376_Chrall_imputed_annotated-temporary.pvar.zst +
panvar/BAP_376_Chrall_imputed_annotated-temporary.psam written.
376 samples (0 females, 0 males, 376 ambiguous; 376 founders) loaded from
panvar/BAP_376_Chrall_imputed_annotated-temporary.psam.
5053806 variants loaded from
panvar/BAP_376_Chrall_imputed_annotated-temporary.pvar.zst.
Note: No phenotype data present.
Writ

In [67]:
# Apply the minor-allele-frequency and missing-rate thresholds to the BED file.
cleaned_up <- bed_file_clean_up(
  in_plink_format$bed,
  maf = maf,
  missing_rate = missing_rate
)

PLINK v2.00a6LM AVX2 Intel (5 Feb 2024)        www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to panvar/BAP_376_Chrall_imputed_annotated_cleaned.log.
Options in effect:
  --bfile panvar/BAP_376_Chrall_imputed_annotated
  --geno 0.1
  --maf 0.05
  --make-bed
  --out panvar/BAP_376_Chrall_imputed_annotated_cleaned

Start time: Sun Oct 20 15:11:11 2024
7883 MiB RAM detected, ~5739 available; reserving 3941 MiB for main workspace.
Using 1 compute thread.
376 samples (0 females, 0 males, 376 ambiguous; 376 founders) loaded from
panvar/BAP_376_Chrall_imputed_annotated.fam.
5053806 variants loaded from panvar/BAP_376_Chrall_imputed_annotated.bim.
Note: No phenotype data present.
Calculating allele frequencies... 10111214151618192022232425272829313233353637384041424445464749505153545557585960626364666768707172737576777980818284858688899092939495979899done.
--geno: 0 variants removed due to missing genotype data.
0 variants

In [68]:
# Restrict the cleaned genotype data to the window around the tag SNP.
subset_genotype_data <- subset_around_tag(
  cleaned_up,
  chrom = chrom,
  bp = bp,
  window = window_bp
)

PLINK v2.00a6LM AVX2 Intel (5 Feb 2024)        www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to panvar/BAP_376_Chrall_imputed_annotated_cleaned_windowd_17548900.log.
Options in effect:
  --bfile panvar/BAP_376_Chrall_imputed_annotated_cleaned
  --chr Chr04
  --from-bp 17048900
  --make-bed
  --out panvar/BAP_376_Chrall_imputed_annotated_cleaned_windowd_17548900
  --to-bp 18048900

Start time: Sun Oct 20 15:13:09 2024
7883 MiB RAM detected, ~5733 available; reserving 3941 MiB for main workspace.
Using 1 compute thread.
376 samples (0 females, 0 males, 376 ambiguous; 376 founders) loaded from
panvar/BAP_376_Chrall_imputed_annotated_cleaned.fam.
558502 out of 5053806 variants loaded from
panvar/BAP_376_Chrall_imputed_annotated_cleaned.bim.
Note: No phenotype data present.
4755 variants remaining after main filters.
Writing panvar/BAP_376_Chrall_imputed_annotated_cleaned_windowd_17548900.fam
... done.
Writing panvar/B

In [None]:
# Call `ld_filtered_snp_list()` on the windowed subset for the tag SNP with
# the chosen r2 threshold.
# NOTE(review): the variable name `table` masks base `table` as a variable in
# this session; calls of the form `table(...)` still resolve to the function.
table <- ld_filtered_snp_list(
  subset_genotype_data,
  chrom = chrom,
  bp = bp,
  r2_threshold = r2_threshold
)

In [None]:
# Print the function's source to inspect what it does.
ld_filtered_snp_list

In [None]:
# Print this object for inspection (presumably a helper function's source —
# TODO confirm).
return_snplist_for_bp

In [None]:
# Build the LD table from the filtered SNP list; it is joined onto the
# annotation table by CHROM and BP further down.
ld_table <- ld_table_maker(table)

In [None]:
# Derive the list of SNPs to keep from the same filtered SNP list.
keep_snp_list <- snps_to_keep(table)

In [None]:
# Build a dictionary from the VCF and the BIM file; the next cell treats a
# NULL result as "no dictionary needed".
# NOTE(review): presumably this maps chromosome names between plink2 and
# bcftools — confirm against the helper's definition.
plink2_bcf_dictionary <- plink2_bcftools_chroms_dictionary(vcf_file_path,in_plink_format$bim)

In [None]:
# When no dictionary was produced, pass the three tables through unchanged;
# otherwise run each of them through `apply_dict()` with the dictionary.
if (is.null(plink2_bcf_dictionary)) {
  ld_table_checked <- ld_table
  snp_keep_list_checked <- keep_snp_list
  gwas_table_dicted <- gwas_table
} else {
  ld_table_checked <- apply_dict(plink2_bcf_dictionary, ld_table)
  snp_keep_list_checked <- apply_dict(plink2_bcf_dictionary, keep_snp_list)
  gwas_table_dicted <- apply_dict(plink2_bcf_dictionary, gwas_table)
}

In [None]:
# Run the keep list through `keep_table_sanitizer()`; the result is passed to
# the VCF filter below (presumably a file path, going by the name — TODO confirm).
keep_table_path <- keep_table_sanitizer(snp_keep_list_checked)

In [None]:
# Print the value for inspection.
keep_table_path

In [None]:
# Filter the VCF using the sanitized keep table. The second argument is
# passed positionally.
filtered_vcf_table <- filter_vcf_file(vcf_file_path = vcf_file_path, keep_table_path)

In [None]:
# Run `split_vcf_eff()` on the filtered VCF; its result feeds the SnpSift step.
split_table_path <- split_vcf_eff(filtered_vcf_table)

In [None]:
# Run `execute_snpsift()` on the split output; the `$table` element of the
# result is extracted in the next cell.
snpeff_table <- execute_snpsift(split_table_path)

In [None]:
# Pull the annotation table out of the SnpSift result.
snpsift_table <- snpeff_table$table

In [None]:
# Keep every annotation when `all.impacts` is TRUE; otherwise keep only
# HIGH/MODERATE impact rows.
all.impacts <- TRUE
snpsift_table_impacts <- if (all.impacts) {
  snpsift_table
} else {
  # The OR condition retains the tag SNP, which might otherwise be dropped
  # when its IMPACT is neither HIGH nor MODERATE.
  snpsift_table %>%
    filter(IMPACT %in% c("HIGH", "MODERATE") | BP == bp)
}

In [None]:
# Attach the GWAS values and then the LD values to the annotation table,
# matching rows on chromosome and position. Left joins keep every annotation
# row even when no GWAS or LD entry matches.
# NOTE(review): the error recorded earlier in this notebook reports a
# `left_join()` failure on `CHROM` (<character> vs <double>); check that CHROM
# has the same type and naming in all three tables before relying on this join.
pvalues_impact_ld_table <- snpsift_table_impacts %>%
        left_join(gwas_table_dicted, by = c("CHROM","BP")) %>%
        left_join(ld_table_checked, by = c("CHROM","BP"))

In [None]:
    # Label each row by position: the row at `bp` is the tag SNP, every other
    # row is a candidate.
    pvalues_impact_ld_colors_table <- mutate(
        pvalues_impact_ld_table,
        Type = case_when(
            BP != bp ~ "Candidate",
            BP == bp ~ "tag_snp"
        )
    )

In [None]:
# Compute the overall weights for the labelled table, then show only the
# HIGH and MODERATE impact rows.
filter(
  overall_weight_func(pvalues_impact_ld_colors_table, bp = bp),
  IMPACT %in% c("HIGH", "MODERATE")
)

In [69]:
# Print the GWAS table to check its column types and chromosome labels.
gwas_table

CHROM,BP,Pvalues
<int>,<int>,<dbl>
1,17548900,21.38204
1,17454093,20.81491
1,17500022,20.81491
1,17574841,20.16318
1,17278109,19.76206
1,17542258,19.66837
1,18361956,19.58779
1,17385614,19.51070
1,17280005,19.15257
1,17280033,19.15257
