---
### **Data Bootcamp for Genomic Prediction in Plant Breeding** ###
### **University of Minnesota Plant Breeding Center** ###
#### **June 20 - 22, 2022** ####
---

### **Practical 1:  Filtering & Imputation** ###

<br />
<br />

#### **Source Scripts and Load Data**


In [None]:
## Record the current working directory and (re)set the session to it
WorkDir <- getwd()
setwd(WorkDir)

## Source in the helper scripts whose functions are used in the later cells
invisible(lapply(
  c("R_Functions/GS_Pipeline_Jan_2022_FnsApp.R",
    "R_Functions/bootcamp_functions.R"),
  source
))

## Trigger garbage collection and show the memory usage summary
gc()


#### **Read Genotype File using vcfR** ####

In [None]:

##Load in genotype data. Use package vcfR to read in and work with vcf file.
## Path to the genotype VCF, relative to the working directory.
infileVCF <- "Data/SoyNAM_Geno.vcf"
## Plain-table read of the same file.
## NOTE(review): genotypes_VCF is not referenced again in the visible cells --
## confirm it is still needed, since this reads the whole file a second time.
genotypes_VCF <- read.table(infileVCF)
## Parse the file into the object used by the conversion cell below.
## NOTE(review): read.vcfR is presumably made available by the sourced scripts
## (vcfR is not attached in the visible cells) -- TODO confirm.
vcf <- read.vcfR(infileVCF, verbose = FALSE)
## Print the object so the notebook displays it.
vcf


#### **Convert VCF file format to numerical matrix format.**
#### Final genotype matrix is geno_num

In [None]:
## Extract the genotype calls from the VCF as character strings
## (e.g. "0/0", "0|1", "1/1"); rows are markers, columns are samples.
gt <- extract.gt(vcf, element = "GT", as.numeric = FALSE)
fix_T <- as_tibble(getFIX(vcf))

## Recode the calls numerically:
##   0/0 -> 0,  0/1 or 1/0 -> 0.5,  1/1 -> 1
## Both unphased ("/") and phased ("|") separators are accepted for every
## genotype class, so a phased "1|1" is coded 1 rather than being picked up
## by the heterozygote pattern. Anything else (missing calls such as "./.",
## NA, or calls with other alleles) stays NA.
gt2_num <- matrix(NA_real_, nrow = nrow(gt), ncol = ncol(gt),
                  dimnames = dimnames(gt))
gt2_num[grepl("^0[/|]0$", gt)] <- 0
gt2_num[grepl("^1[/|]1$", gt)] <- 1
gt2_num[grepl("^(0[/|]1|1[/|]0)$", gt)] <- 0.5

## Final genotype matrix: transpose so rows are samples and columns are markers
geno_num <- t(gt2_num)
dim(geno_num)

## Clean up the intermediate objects (everything whose name contains "gt2")
rm(list = grep("gt2", ls(), value = TRUE))


#### **Filter Genotypic Data**

In [None]:
## Helper: number of missing (NA) entries in a vector
miss <- function(x) sum(is.na(x))

## Filter markers on % missing: drop markers (columns) with more than 20%
## missing calls
mrkNA <- apply(geno_num, MARGIN = 2, FUN = miss) / nrow(geno_num)
ndx <- which(mrkNA > 0.2)

## drop = FALSE keeps a matrix even if only one marker/individual remains
geno_num2 <- if (length(ndx) > 0) geno_num[, -ndx, drop = FALSE] else geno_num

## Filter individuals on % missing: drop individuals (rows) with more than 50%
## missing calls among the retained markers
indNA <- apply(geno_num2, MARGIN = 1, FUN = miss) / ncol(geno_num2)
ndx2 <- which(indNA > 0.5)

geno_num3 <- if (length(ndx2) > 0) geno_num2[-ndx2, , drop = FALSE] else geno_num2

## Filter markers based on MAF: the column mean of the numeric coding is used
## as the allele frequency; markers with a mean below 0.05 or above 0.95 are
## dropped
maf <- apply(geno_num3, MARGIN = 2, FUN = mean, na.rm = TRUE)
ndx3 <- which(maf < 0.05 | maf > 0.95)

## Subset geno_num3 (not geno_num2): the frequencies were computed on
## geno_num3, and subsetting geno_num2 would bring back the individuals that
## were just removed by the missingness filter.
geno_num4 <- if (length(ndx3) > 0) geno_num3[, -ndx3, drop = FALSE] else geno_num3

dim(geno_num4)

#### **Import Phenotypic Data and Merge Geno-Pheno Data**

In [None]:

pheno <- read.csv("Data/SoyNAM_Pheno.csv")

## Prepend the row names of the genotype matrix as a "strain" column.
## Note: cbind() with a character vector turns the whole matrix into character,
## so the marker columns are converted back to numeric before imputation.
geno_num4_x <- cbind(rownames(geno_num4), geno_num4)
colnames(geno_num4_x)[1] <- "strain"

### Make strain names comparable in pheno and geno by stripping "-" and "."
### (assumes the first column of pheno is the strain identifier -- TODO confirm)
pheno[, 1] <- gsub("[-.]", "", pheno[, 1])
geno_num4_x[, 1] <- gsub("[-.]", "", geno_num4_x[, 1])

## Merge Geno and Pheno Data (full outer join on strain)
Data <- merge(geno_num4_x, pheno, by = "strain", all = TRUE)

## Remove rows with missing yield values
YldNA_Indices <- which(is.na(Data$yield))
Data_Sub <- if (length(YldNA_Indices) > 0) Data[-YldNA_Indices, ] else Data

## Keep only the rows whose strain is present in the genotype data
genoStrain <- unique(as.character(geno_num4_x[, "strain"]))

genoStrainIndices <- which(Data_Sub[, "strain"] %in% genoStrain)
length(genoStrainIndices)

## Marker columns are located by the "ss" in their names; the columns after
## the last marker column are the phenotype columns. Positions found in
## geno_num4_x carry over to Data_Sub because merge() puts the key column
## first, followed by the remaining genotype columns, then the phenotype
## columns.
genoIndices <- grep("ss", colnames(geno_num4_x))
initGenoIndx <- genoIndices[1]
finalGenoIndx <- genoIndices[length(genoIndices)]
phenoIndices <- c(1, (finalGenoIndx + 1):ncol(Data_Sub))

pheno_sub <- Data_Sub[genoStrainIndices, phenoIndices]
geno_num4b <- Data_Sub[genoStrainIndices, c(1, genoIndices)]

## Keep the first record of every strain. The same row selection is applied to
## the phenotype table so that geno_num5 and pheno_sub stay row-aligned.
uniqueStrainIndices <- which(!duplicated(geno_num4b[, "strain"]))

geno_num5 <- geno_num4b[uniqueStrainIndices, ]
pheno_sub <- pheno_sub[uniqueStrainIndices, , drop = FALSE]

dim(geno_num5)

## Free the intermediate genotype tables
rm(geno_num4b, geno_num4, geno_num3, geno_num2)

### set 'yield' colname to 'Yield_blup'
yldCol <- which(colnames(pheno_sub) %in% "yield")
colnames(pheno_sub)[yldCol] <- "Yield_blup"



#### **Impute Genotype Table** ###

In [None]:

# Impute genotype data using either naive imputation or Markov chain implemented in the NAM package.
# impMethod must be set to "naive" or "markov" before running this cell.
# NOTE(review): replaceNAwithMean is presumably defined in the sourced helper
# scripts; it receives geno_num5 including the strain column -- confirm that is
# what it expects.
if (impMethod == "naive") {
  geno_imp <- replaceNAwithMean(geno_num5)
} else if (impMethod == "markov") {
  # Drop the strain column and convert the marker columns to numeric first
  geno_imp <- markov(apply(geno_num5[, -1], 2, as.numeric))
  # Row names go on the imputed matrix itself
  rownames(geno_imp) <- rownames(geno_num5)
} else {
  # Fail loudly instead of silently leaving geno_imp undefined
  stop("impMethod must be \"naive\" or \"markov\"", call. = FALSE)
}


In [None]:

# Reduce the number of RILs in the dataset simply for the sake of saving time in computation for demonstration (we don't want to spend all of our time watching our computer work!)
# NOTE(review): pheno2 is not created in the visible cells; it is assumed to be
# a phenotype table whose rows are in the same order as geno_imp -- TODO confirm.

# Draw up to 1000 rows without replacement; capping at the number of available
# rows avoids an error from sample.int() when fewer than 1000 are present.
ssNdx <- sample.int(n = nrow(pheno2), size = min(1000, nrow(pheno2)))
geno_imp_sub <- geno_imp[ssNdx, ]
pheno2_sub <- pheno2[ssNdx, ]


### **Fit some genomic prediction models to the data** 



In [None]:
# Fit an RR-BLUP model using the rrBLUP package: Seedsize is the response and
# the imputed marker matrix is the random-effect design matrix
rrModel <- mixed.solve(y = pheno2_sub$Seedsize, Z = geno_imp_sub)

# Pull the fitted intercept and the marker effects out of the model
int <- as.numeric(rrModel[["beta"]])
mrk_effs_RR <- rrModel[["u"]]

# Genomic estimated breeding values of the individuals in the training set:
# genotypes multiplied by the marker effects, with the intercept added back on
gebv_rr <- geno_imp_sub %*% mrk_effs_RR + int





#### **GBLUP** 

In [None]:

# Calculating a genomic relationship matrix using rrBLUP and fitting a G-BLUP model
# NOTE(review): the A.mat documentation describes a {-1, 0, 1} marker coding and
# kin.blup expects the row/column names of K to be the line names -- confirm
# both against how geno_imp_sub was built.
G <- A.mat(geno_imp_sub)

# Fit G-BLUP on Seedsize, the same trait used for the RR-BLUP model, so that
# the GEBVs from the two models can be compared directly
gblupModel <- kin.blup(data = pheno2_sub, geno = "RIL", pheno = "Seedsize", K = G)
gblupGebv <- gblupModel$g

# Correlation between the RR-BLUP GEBVs and the observed Seedsize values
cor(gebv_rr, pheno2_sub$Seedsize)


### **Compare GEBVs from ridge regression BLUP to G-BLUP**

In [None]:

# The RR-BLUP GEBVs are stored in gebv_rr and the G-BLUP GEBVs in gblupGebv.
# NOTE(review): assumes both are in the same line order -- confirm before
# interpreting the correlation.
cor(gebv_rr, gblupGebv)
plot(gebv_rr, gblupGebv)


#### **Cross-validation analysis**
#### Now extend this to perform a 10-fold cross-validation analysis
#### This works if my total sample size is divisible by 10. If not, need to subset so it is.


In [None]:
  
## Shuffle the individuals so that fold membership is random
ndxShuf <- sample.int(nrow(geno_imp_sub))

pheno_shuf <- pheno2_sub[ndxShuf, ]
geno_imp_sub_shuf <- geno_imp_sub[ndxShuf, ]

## Assign every individual to one of 10 contiguous folds. The last fold is
## simply smaller when the sample size is not a multiple of 10, so nobody is
## left without a prediction.
nFolds <- 10
foldSize <- ceiling(length(ndxShuf) / nFolds)
foldId <- rep(seq_len(nFolds), each = foldSize, length.out = length(ndxShuf))

## Numeric storage for the out-of-fold predictions (NA until filled)
pred_stor <- rep(NA_real_, length(ndxShuf))

for (i in seq_len(nFolds)) {
  ## Rows held out in this fold
  cnt <- which(foldId == i)
  if (length(cnt) == 0) next  # possible only with fewer individuals than folds

  ## Mask the phenotypes of the held-out individuals
  pheno_trn <- pheno_shuf
  pheno_trn$Seedsize[cnt] <- NA

  rrModel <- mixed.solve(y = pheno_trn$Seedsize, Z = geno_imp_sub_shuf)
  mrkEffsRR <- rrModel$u

  ## Use marker effects to calculate genomic estimated breeding values for all
  ## individuals: genotypes times marker effects, with the intercept added back
  int <- as.numeric(rrModel$beta)
  gebv_rr <- int + geno_imp_sub_shuf %*% mrkEffsRR

  ## Keep only the predictions for the held-out individuals
  pred_stor[cnt] <- gebv_rr[cnt]
}

## Predictive ability: correlation of out-of-fold predictions with observations
cor(pred_stor, pheno_shuf$Seedsize)

## Plot against the same trait that was modelled and correlated above
plot(pred_stor, pheno_shuf$Seedsize)


#### **Discuss other ways to model these scenarios and refine these models**