---
### **Data Bootcamp for Genomic Prediction in Plant Breeding** ###
### **University of Minnesota Plant Breeding Center** ###
#### **June 20 - 22, 2022** ####
---

### **Practical 5:  Modeling Genotype-Environment Interactions (GxE)** ###

#### **Source Scripts and Load Data**


In [None]:
# Record the directory the session was started in and keep it as the
# working directory; the relative paths below are resolved against it.
WorkDir <- getwd()
setwd(WorkDir)

# Load the helper functions used throughout this practical.
invisible(lapply(
  c("R_Functions/GS_Pipeline_Jan_2022_FnsApp.R",
    "R_Functions/bootcamp_functions.R"),
  source
))




#### **Read Genotype File using vcfR** ####

In [None]:

# Genotype input: the same VCF file is read twice, once as a plain table and
# once as a vcfR object. Only `vcf` is used by the cells that follow.
# NOTE(review): `genotypes_VCF` is not referenced again in this notebook;
# confirm whether the sourced helper scripts rely on it.
infileVCF <- "Data/SoyNAM_Geno.vcf"
genotypes_VCF <- read.table(file = infileVCF)
vcf <- read.vcfR(file = infileVCF, verbose = FALSE)

# Show the vcfR object.
vcf


#### **Convert VCF file format to numerical matrix format.**
#### Final genotype matrix is geno_num

In [None]:
# Extract the GT field (markers in rows, samples in columns) as character
# strings, and keep the fixed VCF columns as a tibble.
gt <- extract.gt(vcf, element = "GT", as.numeric = FALSE)
fix_T <- as_tibble(getFIX(vcf))

# Recode genotype calls on a 0 / 0.5 / 1 scale. Both homozygous classes are
# recoded first and accept either separator ("/" unphased, "|" phased), so a
# phased "1|1" is scored as 1 instead of being caught by the heterozygote
# pattern and scored as 0.5. gsub() keeps the dim/dimnames of the matrix.
gt2a <- gsub("1[/|]1", "1", gt)
gt2b <- gsub("0[/|]0", "0", gt2a)
gt2c <- gsub("[10][/|][10]", "0.5", gt2b)
gt2d <- gsub("\\.[/|]\\.", "NA", gt2c)

# Convert to numeric one column at a time. as.numeric() drops the marker
# names, so they are restored afterwards. Any call not matched above
# (e.g. a multi-allelic genotype) becomes NA here.
gt2d_num <- apply(gt2d, 2, as.numeric)
rownames(gt2d_num) <- rownames(gt2d)

# Final genotype matrix: samples in rows, markers in columns.
geno_num <- t(gt2d_num)
dim(geno_num)

# Drop the intermediate gt2* objects.
rm(list = grep("gt2", ls(), value = TRUE))


#### **Filter Genotypic Data**

In [None]:
## Count of missing values in a vector
miss <- function(x){length(which(is.na(x)))}

## Filter markers on % missing: drop markers with more than 20% missing calls
mrkNA <- (apply(geno_num, MARGIN = 2, FUN = miss)) / dim(geno_num)[1]
ndx <- which(mrkNA > 0.2)

## Only subset when something has to be removed: a negative subscript built
## from an empty index would drop every column.
if (length(ndx) > 0) {
  geno_num2 <- geno_num[, -ndx, drop = FALSE]
} else {
  geno_num2 <- geno_num
}

## Filter individuals on % missing: drop individuals with more than 50% missing
indNA <- (apply(geno_num2, MARGIN = 1, FUN = miss)) / dim(geno_num2)[2]
ndx2 <- which(indNA > 0.5)

if (length(ndx2) > 0) {
  geno_num3 <- geno_num2[-ndx2, , drop = FALSE]
} else {
  geno_num3 <- geno_num2
}

## Filter markers based on MAF. With the 0 / 0.5 / 1 coding the column mean
## is the frequency of the allele scored as 1.
maf <- apply(geno_num3, MARGIN = 2, FUN = mean, na.rm = TRUE)
ndx3 <- which(maf < 0.05 | maf > 0.95)

## Subset geno_num3 (not geno_num2), so the individuals removed by the
## missingness filter above stay removed.
if (length(ndx3) > 0) {
  geno_num4 <- geno_num3[, -ndx3, drop = FALSE]
} else {
  geno_num4 <- geno_num3
}

dim(geno_num4)

#### **Import Phenotypic Data**

In [None]:

## Read phenotypes and prepend the strain IDs (row names of the genotype
## matrix) as a "strain" column so both tables share a merge key.
pheno <- read.csv("Data/SoyNAM_Pheno.csv")
geno_num4_x <- cbind(rownames(geno_num4), geno_num4)

colnames(geno_num4_x)[1] <- "strain"

## Merge Geno and Pheno Data (full outer join on strain)
Data <- merge(geno_num4_x, pheno, by = "strain", all = TRUE)

## Remove records with missing yield values.
## A logical mask is used instead of Data[-YldNA_Indices, ]: when no yield is
## missing, the negative empty index would return zero rows.
YldNA_Indices <- which(is.na(Data$yield))
Data_Sub <- Data[!is.na(Data$yield), ]

## Keep only records whose strain has genotype data
genoStrain <- unique(geno_num4_x[, "strain"])
genoStrainIndices <- which(Data_Sub[, "strain"] %in% genoStrain)

## Marker columns are located by the "ss" pattern in their names. The
## positions are taken from geno_num4_x and reused on the merged table, where
## the key column comes first followed by the genotype columns.
genoIndices <- grep("ss", colnames(geno_num4_x))
initGenoIndx <- genoIndices[1]
finalGenoIndx <- genoIndices[length(genoIndices)]

## Phenotype columns: the strain key plus everything after the last marker
phenoIndices <- c(1, c((finalGenoIndx + 1):ncol(Data_Sub)))

pheno_sub <- Data_Sub[genoStrainIndices, phenoIndices]
geno_num4b <- Data_Sub[genoStrainIndices, c(1, genoIndices)]

## One genotype row per strain (the merge repeats it for every record)
uniqueStrainIndices <- which(!duplicated(geno_num4b[, "strain"]))
geno_num5 <- geno_num4b[uniqueStrainIndices, ]


### set 'yield' colname to 'Yield_blup'

yldCol <- which(colnames(pheno_sub) %in% "yield")
colnames(pheno_sub)[yldCol] <- "Yield_blup"

dim(geno_num5)

#### **Subset Environments** 

In [None]:
### Select up to 3 environs with more than 5100 evaluations (records).
### Candidates are taken in the order table() reports them, not ranked by size.
### head() replaces [1:3] so that no NA entry is produced when fewer than
### three environs pass the threshold.

env_counts <- table(pheno_sub[, "environ"])
env_sub <- head(names(which(env_counts > 5100)), 3)

env_sub_indices <- which(pheno_sub[, "environ"] %in% env_sub)

## Subset the phenotype table to the selected environs
DT <- pheno_sub[env_sub_indices, ]

DT$environ <- as.factor(DT$environ)

dim(DT)

#### **Impute Genotype Table** ###

In [None]:
#### Fill in missing marker calls with markov() from the 'NAM' package.
#### The strain column (first column) is left out and the remaining columns
#### are converted to numeric before imputation.
#### NOTE(review): markers were coded 0 / 0.5 / 1 upstream - confirm that this
#### is the coding markov() expects.

geno_imp <- markov(apply(geno_num5[, -1], MARGIN = 2, FUN = as.numeric))
rownames(geno_imp) <- geno_num5[["strain"]]
dim(geno_imp)

In [None]:
### 
# Keep the imputed genotypes of strains that occur in the environment subset DT.
env_geno_sub_indices <- which(is.element(rownames(geno_imp), unique(DT[["strain"]])))
geno_imp_sub <- geno_imp[env_geno_sub_indices, ]

dim(geno_imp_sub)

#### **Relationship Matrix Using A.mat** 

In [None]:
# Relationship matrix from the imputed markers, labelled on both
# dimensions with the strain names.
K_rr <- A.mat(geno_imp_sub)
dimnames(K_rr) <- list(rownames(geno_imp_sub), rownames(geno_imp_sub))

# `A` is the name used by the cells that follow.
A <- K_rr
dim(A)



#### **Subset Genotypes for Computation Demo** 

In [None]:
  
# Work with the first 500 strains of the relationship matrix (or all of them
# if fewer are available) to keep the model fits fast.
n_demo <- min(500, nrow(A))
A_Sub <- A[seq_len(n_demo), seq_len(n_demo)]
DT_Sub <- DT[which(DT[, "strain"] %in% rownames(A_Sub)), ]

# Identity matrix across environments, labelled with the environment names
E <- diag(length(unique(DT$environ)))
rownames(E) <- colnames(E) <- unique(DT$environ)
dim(E)

### Same set of strains in each of the environments:
### drop strains that have fewer than 3 records in the subset.

rmStrains <- names(which(table(DT_Sub[, "strain"]) < 3))

# Logical masks are used instead of -which(...): when there is nothing to
# remove, a negative empty index would drop every row / column.
DT_Sub1 <- DT_Sub[!(DT_Sub[, "strain"] %in% rmStrains), ]

keepStrain <- !(rownames(A_Sub) %in% rmStrains)
A_Sub1 <- A_Sub[keepStrain, keepStrain]
dim(A_Sub1)

### **Exercise - Compare a few of the var-covar structures in SOMMER package**


#### **Model with Main Effect** #### 
##### Model environment as fixed effect (estimate population mean for each of the environments) and estimate random effects for genotypes 


In [None]:

# Main-effect model fitted with mmer():
#   fixed  : one level per environment, no intercept (environ-1)
#   random : strain effect with covariance supplied through Gu = A_Sub1
#   rcov   : `units` residual term
# Fitted on the demo subset DT_Sub1 with verbose output switched off.
fitMain <- mmer(Yield_blup~environ-1,
                random=~vs(strain,Gu=A_Sub1),
                rcov=~units,
                data=DT_Sub1,verbose=FALSE)
# Print the model summary.
summary(fitMain)


In [None]:

# Fitted values for the main-effect model: the environment mean (fixed part)
# plus the strain BLUP, one value per row of DT_Sub1.
m <- model.matrix(~ environ - 1, data = DT_Sub1)
m_beta <- m %*% as.numeric(fitMain$Beta[, 3])

# The BLUP vector holds one value per strain, while DT_Sub1 holds one row per
# strain x environment record. Adding the vector directly recycles it down
# the rows and pairs BLUPs with the wrong strains, so each row's BLUP is
# looked up by strain name instead.
u_main <- fitMain$U$`u:strain`$Yield_blup
u_main_rows <- match(as.character(DT_Sub1[, "strain"]), names(u_main))
# Fail loudly if any record has no matching BLUP name.
stopifnot(!anyNA(u_main_rows))

PredMain <- m_beta + as.numeric(u_main[u_main_rows])
cor(PredMain, DT_Sub1[, "Yield_blup"])

#### **Model with Compound Symmetry var-covar structure** ####
##### Compound symmetry adds a genotype-by-environment (GxE) interaction term with a single, homogeneous variance, which implies a constant genetic correlation between every pair of environments

In [None]:

# Identity matrix with one row/column per distinct environment in DT_Sub1,
# labelled with the environment names.
E <- diag(length(unique(DT_Sub1$environ)))
rownames(E) <- colnames(E) <- unique(DT_Sub1$environ)

# Kronecker product of E and the relationship matrix; make.dimnames = TRUE
# labels its rows/columns with the combined environment and strain names.
EA <- kronecker(E,A_Sub1, make.dimnames = TRUE)
# Both grouping variables are stored as factors before fitting.
DT_Sub1$environ <- as.factor(DT_Sub1$environ)
DT_Sub1$strain <- as.factor(DT_Sub1$strain)

# Compound-symmetry model: fixed environment levels (no intercept), a random
# strain main effect (Gu = A_Sub1) and a random environ:strain interaction
# (Gu = EA), with a `units` residual term.
fitCS <- mmer(Yield_blup~environ-1,
              random= ~ vs(strain, Gu=A_Sub1) + vs(environ:strain, Gu=EA),
              rcov= ~ units,
              data=DT_Sub1, verbose = FALSE)
# Print the model summary.
summary(fitCS)

In [None]:

# Fitted values for the compound-symmetry model, one value per row of
# DT_Sub1: environment mean + strain main effect + environ:strain effect.
m <- model.matrix(~ environ - 1, data = DT_Sub1)
m_beta <- m %*% as.numeric(fitCS$Beta[, 3])

u_cs_strain <- fitCS$U$`u:strain`$Yield_blup
u_cs_gxe <- fitCS$U$`u:environ:strain`$Yield_blup

# BLUPs are looked up by name rather than added positionally, because the
# order of the BLUP vectors is not the row order of DT_Sub1. The interaction
# key follows the "environ:strain" naming used for the dimnames of EA.
cs_strain_rows <- match(as.character(DT_Sub1[, "strain"]), names(u_cs_strain))
cs_gxe_rows <- match(
  paste(DT_Sub1[, "environ"], DT_Sub1[, "strain"], sep = ":"),
  names(u_cs_gxe)
)
# Fail loudly if any record has no matching BLUP name.
stopifnot(!anyNA(cs_strain_rows), !anyNA(cs_gxe_rows))

# The strain main effect is part of the model and therefore of the prediction.
PredCS <- m_beta +
  as.numeric(u_cs_strain[cs_strain_rows]) +
  as.numeric(u_cs_gxe[cs_gxe_rows])
cor(PredCS, DT_Sub1[, "Yield_blup"])


#### **Model with Compound Symmetry + Diagonal Structure** ####
##### Heterogeneous GxE variance among environments and constant genetic covariance among environments

In [None]:
# Compound symmetry + diagonal model: fixed environment levels (no
# intercept), a random strain main effect (Gu = A_Sub1) and a strain effect
# under a ds(environ) structure (also Gu = A_Sub1), with a `units` residual.
fitCSDG <- mmer(Yield_blup~environ-1,
                random=~vs(strain,Gu=A_Sub1) +vs(ds(environ),strain,Gu=A_Sub1),
                rcov=~units,
                data=DT_Sub1,verbose=FALSE) 

# Print the model summary.
summary(fitCSDG)

In [None]:

# Fixed-effect design matrix for the CSDG predictions, with rows arranged in
# environment-sorted order (the ordering the later comparison against
# DT_Sub1[indES, ] uses). Taking it from model.matrix() gives each row a
# single 1 in the column of its own environment, which is what the
# `environ-1` parametrisation of the fit requires; a hand-built matrix with
# overlapping indicator columns would add several environment means together.
envOrder <- sort.int(as.numeric(DT_Sub1[, "environ"]),
                     decreasing = FALSE, index.return = TRUE)[[2]]
m2 <- model.matrix(~ environ - 1, data = DT_Sub1)[envOrder, , drop = FALSE]

m_beta <- m2 %*% as.numeric(fitCSDG$Beta[, 3])
length(m_beta)

# One column of BLUPs per random term of the fit.
m_env_strain <- do.call(cbind, lapply(fitCSDG$U, function(x) x$Yield_blup))
dim(m_env_strain)

# Columns 2:4 are stacked into one vector, environment after environment.
# NOTE(review): this assumes that, within each environment, the records of
# DT_Sub1 follow the same strain order as the BLUP vectors - confirm.
envStrain_blup <- c(m_env_strain[, 2:4])

# Strain main-effect BLUPs, repeated once per environment.
strain_blup <- rep(fitCSDG$U$`u:strain`$Yield_blup, 3)
length(strain_blup)

In [None]:
# CSDG prediction = fixed part + strain main effect + environment-specific
# strain effect.
PredCSDG <- m_beta + strain_blup + envStrain_blup

# Row permutation that sorts the records of DT_Sub1 by environment code.
indES <- sort.int(
  as.numeric(DT_Sub1[, "environ"]),
  decreasing = FALSE,
  index.return = TRUE
)$ix

# Correlate the predictions with the observed values taken in that order.
cor(x = PredCSDG, y = DT_Sub1[indES, "Yield_blup"])


#### **Model with US - Unstructured Variance-Covariance** ####

In [None]:
# Unstructured model: fixed environment levels (no intercept) and a strain
# effect under a us(environ) structure with Gu = A_Sub1; `units` residual.
fitUS <- mmer(Yield_blup~environ-1,
                random=~vs(us(environ),strain,Gu=A_Sub1),
                rcov=~units,
                data=DT_Sub1,verbose=FALSE) 
# Print the model summary.
summary(fitUS)

In [None]:
# Assemble the US-model predictions one environment at a time.
envNames <- levels(factor(DT_Sub1$environ))
# Positions in fitUS$U of the environment-specific (diagonal) terms.
env1Ind <- c(1, 3, 6)
U_envStrain <- vector("list", length(envNames))

for (i in seq_along(envNames)) {
  # Every random term whose name mentions this environment.
  envInd <- grep(envNames[i], names(fitUS$U), fixed = TRUE)
  diagInd <- env1Ind[i]

  # Start from the environment's own (diagonal) BLUPs ...
  U_envStrain[[i]] <- as.numeric(fitUS$U[[diagInd]]$Yield_blup)

  # ... and add every OTHER term involving this environment exactly once.
  # Skipping "the first match" instead is wrong whenever the first match is
  # not the diagonal term: the diagonal is then counted twice and a
  # covariance term is left out.
  for (indJ in setdiff(envInd, diagInd)) {
    b <- cbind(names(fitUS$U[[indJ]]$Yield_blup), fitUS$U[[indJ]]$Yield_blup)
    colnames(b) <- c("strain", "Yield_blup")
    b_group <- as_tibble(b) %>% group_by(strain)
    # Sum the BLUPs that share a strain name.
    # NOTE(review): summarise() returns the strains in sorted order; this
    # assumes the diagonal BLUPs are in that same order - confirm.
    YldBlup_group <- b_group %>% summarise(Yield_blup = sum(as.numeric(Yield_blup)))
    U_envStrain[[i]] <- U_envStrain[[i]] + YldBlup_group[[2]]
  }

  # The environment mean is added once, after the loop over terms.
  U_envStrain[[i]] <- U_envStrain[[i]] + fitUS$Beta[i, 3]
}

# Predictions stacked environment after environment.
PredUS <- c(unlist(U_envStrain))

# Row permutation that sorts the records of DT_Sub1 by environment code.
indES <- sort.int(as.numeric(DT_Sub1[, "environ"]), decreasing = FALSE, index.return = TRUE)[[2]]

# PredUS is already in environment order, so only the observed values are
# reordered before the comparison.
cor(PredUS, DT_Sub1[indES, "Yield_blup"])


### **Exercise - Predict performance of tested and untested genotypes in tested and untested environments** ###

#### **Tested Genotypes in Untested Environment**

In [None]:
### Remove lines from IA2013 and train the model using IA2012 and IL2013 only and predict 
### performance of lines for IA2013 (untested environ) and compare accuracy with model 
### incorporating data from IA2013 in the training model 

tstIndices1 <- which(DT_Sub1[, "environ"] %in% "IA_2013")

DT_Sub1A <- DT_Sub1
### Mask the response of the test environment. The column name has to be
### spelled exactly "Yield_blup": assigning to a misspelled name only creates
### a new all-NA column and leaves the real response unmasked.
### NOTE(review): with every IA_2013 record masked, that environment has no
### observations left - confirm the model and prediction cells cope with it.
DT_Sub1A[tstIndices1, "Yield_blup"] <- NA
#DT_Sub1A[tstIndices1 ,"environ"] <- NA

dim(DT_Sub1A)

In [None]:

# Identity matrix across the environments of DT_Sub1A, labelled with the
# environment names.
E <- diag(length(unique(DT_Sub1A$environ)))
rownames(E) <- colnames(E) <- unique(DT_Sub1A$environ)
dim(E)

# Kronecker product of E and the relationship matrix, with combined dimnames.
EA <- kronecker(E, A_Sub1, make.dimnames = TRUE)

# Store both grouping variables of DT_Sub1A as factors. The environment
# factor is written back to DT_Sub1A$environ itself, not to a new column of
# a different data frame.
DT_Sub1A$environ <- as.factor(DT_Sub1A$environ)
DT_Sub1A$strain <- as.factor(DT_Sub1A$strain)

dim(EA)

In [None]:
# Compound-symmetry model (strain main effect with Gu = A_Sub1 plus
# environ:strain interaction with Gu = EA) fitted on DT_Sub1A.
fitCS1A <- mmer(Yield_blup~environ-1,
              random= ~ vs(strain, Gu=A_Sub1) + vs(environ:strain, Gu=EA),
              rcov= ~ units,
              data=DT_Sub1A, verbose = FALSE)
# Print the model summary.
summary(fitCS1A)


In [None]:
# Predictions from fitCS1A for every row of DT_Sub1; the rows of the test
# environment (tstIndices1) are then compared with their observed values.
m <- model.matrix(~ environ - 1, data = DT_Sub1)
m_beta <- m %*% as.numeric(fitCS1A$Beta[, 3])

u_cs1a_strain <- fitCS1A$U$`u:strain`$Yield_blup
u_cs1a_gxe <- fitCS1A$U$`u:environ:strain`$Yield_blup

# BLUPs are looked up by name rather than added positionally, because the
# order of the BLUP vectors is not the row order of DT_Sub1. The interaction
# key follows the "environ:strain" naming used for the dimnames of EA.
cs1a_strain_rows <- match(as.character(DT_Sub1[, "strain"]), names(u_cs1a_strain))
cs1a_gxe_rows <- match(
  paste(DT_Sub1[, "environ"], DT_Sub1[, "strain"], sep = ":"),
  names(u_cs1a_gxe)
)
# Fail loudly if any record has no matching BLUP name.
stopifnot(!anyNA(cs1a_strain_rows), !anyNA(cs1a_gxe_rows))

# The strain main effect is included in the prediction.
tstPred1 <- (m_beta +
  as.numeric(u_cs1a_strain[cs1a_strain_rows]) +
  as.numeric(u_cs1a_gxe[cs1a_gxe_rows]))[tstIndices1]
length(tstPred1)
cor(tstPred1, DT_Sub1[tstIndices1, "Yield_blup"])


#### **Unstructured Var-Covar for Untested Environment**

In [None]:
# Unstructured model (strain effect under us(environ), Gu = A_Sub1) fitted
# on DT_Sub1A.
fitUS1A <- mmer(Yield_blup~environ-1,
                random=~vs(us(environ),strain,Gu=A_Sub1),
                rcov=~units,
                data=DT_Sub1A,verbose=FALSE) 
# Print the model summary.
summary(fitUS1A)

In [None]:
# Names of the random terms returned by the US fit.
names(fitUS1A$U)

# Assemble the US-model predictions one environment at a time.
envNames <- levels(factor(DT_Sub1A$environ))
# Positions in fitUS1A$U of the environment-specific (diagonal) terms.
env1Ind <- c(1, 3, 6)
U_envStrain <- vector("list", length(envNames))

for (i in seq_along(envNames)) {
  # Every random term whose name mentions this environment.
  envInd <- grep(envNames[i], names(fitUS1A$U), fixed = TRUE)
  diagInd <- env1Ind[i]

  # Start from the environment's own (diagonal) BLUPs ...
  U_envStrain[[i]] <- as.numeric(fitUS1A$U[[diagInd]]$Yield_blup)

  # ... and add every OTHER term involving this environment exactly once
  # (skipping "the first match" double-counts the diagonal whenever the
  # first match is a covariance term).
  for (indJ in setdiff(envInd, diagInd)) {
    b <- cbind(names(fitUS1A$U[[indJ]]$Yield_blup), fitUS1A$U[[indJ]]$Yield_blup)
    colnames(b) <- c("strain", "Yield_blup")
    b_group <- as_tibble(b) %>% group_by(strain)
    # Sum the BLUPs that share a strain name.
    # NOTE(review): summarise() returns the strains in sorted order; this
    # assumes the diagonal BLUPs are in that same order - confirm.
    YldBlup_group <- b_group %>% summarise(Yield_blup = sum(as.numeric(Yield_blup)))
    U_envStrain[[i]] <- U_envStrain[[i]] + YldBlup_group[[2]]
  }

  # The environment mean is added once, after the loop over terms.
  U_envStrain[[i]] <- U_envStrain[[i]] + fitUS1A$Beta[i, 3]
}

# Predictions stacked environment after environment.
PredUS <- c(unlist(U_envStrain))

# Row permutation that sorts the records by environment code.
indES <- sort.int(as.numeric(DT_Sub1A[, "environ"]), decreasing = FALSE, index.return = TRUE)[[2]]

# PredUS is in environment-sorted order, so the test records are located by
# their position in that order: entries of indES that point at a test row.
tstSorted1 <- which(indES %in% tstIndices1)
cor(PredUS[tstSorted1], DT_Sub1[indES[tstSorted1], "Yield_blup"])

#### **Untested Genotypes in Tested Environments** ####

In [None]:
### Subset Data to generate untested genotypes:
### 20% of the strains are sampled and all of their records are masked.

set.seed(125)
tstStrain <- sample(unique(DT_Sub1[, "strain"]), 0.2 * length(unique(DT_Sub1[, "strain"])))
length(tstStrain)
tstIndices2 <- which(DT_Sub1[, "strain"] %in% tstStrain)
DT_Sub1B <- DT_Sub1
### The column name has to be spelled exactly "Yield_blup": assigning to a
### misspelled name only creates a new all-NA column and leaves the real
### response of the test strains in the training data.
DT_Sub1B[tstIndices2, "Yield_blup"] <- NA
dim(DT_Sub1B)

In [None]:

# Identity matrix with one row/column per distinct environment of DT_Sub1B;
# both dimensions carry the environment names.
E <- diag(nrow = length(unique(DT_Sub1B$environ)))
dimnames(E) <- list(unique(DT_Sub1B$environ), unique(DT_Sub1B$environ))
dim(E)

### Kronecker product of E and the relationship matrix, with combined dimnames

EA <- kronecker(X = E, Y = A_Sub1, make.dimnames = TRUE)

# Grouping variables are stored as factors before the model is fitted.
DT_Sub1B[["environ"]] <- as.factor(DT_Sub1B[["environ"]])
DT_Sub1B[["strain"]] <- as.factor(DT_Sub1B[["strain"]])

dim(EA)

#### **Fit Compound Symmetry Var-Covar Structure for Tested Environments**

In [None]:
# Compound-symmetry model (strain main effect with Gu = A_Sub1 plus
# environ:strain interaction with Gu = EA) fitted on DT_Sub1B.
fitCS1B <- mmer(Yield_blup~environ-1,
              random= ~ vs(strain, Gu=A_Sub1) + vs(environ:strain, Gu=EA),
              rcov= ~ units,
              data=DT_Sub1B, verbose = FALSE)
# Print the model summary.
summary(fitCS1B)

In [None]:
# Predictions from fitCS1B for every row of DT_Sub1; the rows of the test
# strains (tstIndices2) are then compared with their observed values.
m <- model.matrix(~ environ - 1, data = DT_Sub1)
m_beta <- m %*% as.numeric(fitCS1B$Beta[, 3])

u_cs1b_strain <- fitCS1B$U$`u:strain`$Yield_blup
u_cs1b_gxe <- fitCS1B$U$`u:environ:strain`$Yield_blup

# BLUPs are looked up by name rather than added positionally, because the
# order of the BLUP vectors is not the row order of DT_Sub1. The interaction
# key follows the "environ:strain" naming used for the dimnames of EA.
cs1b_strain_rows <- match(as.character(DT_Sub1[, "strain"]), names(u_cs1b_strain))
cs1b_gxe_rows <- match(
  paste(DT_Sub1[, "environ"], DT_Sub1[, "strain"], sep = ":"),
  names(u_cs1b_gxe)
)
# Fail loudly if any record has no matching BLUP name.
stopifnot(!anyNA(cs1b_strain_rows), !anyNA(cs1b_gxe_rows))

# The strain main effect is included in the prediction.
tstPred2 <- (m_beta +
  as.numeric(u_cs1b_strain[cs1b_strain_rows]) +
  as.numeric(u_cs1b_gxe[cs1b_gxe_rows]))[tstIndices2]
length(tstPred2)
cor(tstPred2, DT_Sub1[tstIndices2, "Yield_blup"])


#### **Untested Genotypes in Untested Environment**

In [None]:
### Subset Data to generate untested genotypes in untested environments

set.seed(125)
tstStrain <- sample(unique(DT_Sub1[, "strain"]), 0.2 * length(unique(DT_Sub1[, "strain"])))
length(tstStrain)
tstIndices2 <- which(DT_Sub1[, "strain"] %in% tstStrain)
DT_Sub1B <- DT_Sub1
### The response column is "Yield_blup"; assigning to a misspelled name only
### creates a new all-NA column and masks nothing.
DT_Sub1B[tstIndices2, "Yield_blup"] <- NA
dim(DT_Sub1B)

tstIndices1 <- which(DT_Sub1[, "environ"] %in% "IA_2013")

### Records of the test strains inside IA_2013
tstIndices3 <- intersect(tstIndices2, tstIndices1)
DT_Sub1C <- DT_Sub1B
### NOTE(review): tstIndices3 is a subset of tstIndices2, which is already
### masked in DT_Sub1B, so this second assignment masks nothing further and
### the IA_2013 records of the training strains stay in the training data.
### Confirm whether all of tstIndices1 should be masked for this scenario.
DT_Sub1C[tstIndices3, "Yield_blup"] <- NA



#### **Fit Model using Unstructured Var-Covar Matrix for Untested Environments**

In [None]:
# Unstructured model (strain effect under us(environ), Gu = A_Sub1) fitted
# on DT_Sub1C.
fitUS1C <- mmer(Yield_blup ~ environ-1,
              random=~vs(us(environ),strain,Gu=A_Sub1),
              rcov=~units,
              data=DT_Sub1C,verbose=FALSE)

# Print the model summary.
summary(fitUS1C)

In [None]:
# Assemble the US-model predictions one environment at a time.
envNames <- levels(factor(DT_Sub1C$environ))
# Positions in fitUS1C$U of the environment-specific (diagonal) terms.
env1Ind <- c(1, 3, 6)
U_envStrain <- vector("list", length(envNames))

for (i in seq_along(envNames)) {
  # Every random term whose name mentions this environment.
  envInd <- grep(envNames[i], names(fitUS1C$U), fixed = TRUE)
  diagInd <- env1Ind[i]

  # Start from the environment's own (diagonal) BLUPs ...
  U_envStrain[[i]] <- as.numeric(fitUS1C$U[[diagInd]]$Yield_blup)

  # ... and add every OTHER term involving this environment exactly once
  # (skipping "the first match" double-counts the diagonal whenever the
  # first match is a covariance term).
  for (indJ in setdiff(envInd, diagInd)) {
    b <- cbind(names(fitUS1C$U[[indJ]]$Yield_blup), fitUS1C$U[[indJ]]$Yield_blup)
    colnames(b) <- c("strain", "Yield_blup")
    b_group <- as_tibble(b) %>% group_by(strain)
    # Sum the BLUPs that share a strain name.
    # NOTE(review): summarise() returns the strains in sorted order; this
    # assumes the diagonal BLUPs are in that same order - confirm.
    YldBlup_group <- b_group %>% summarise(Yield_blup = sum(as.numeric(Yield_blup)))
    U_envStrain[[i]] <- U_envStrain[[i]] + YldBlup_group[[2]]
  }

  # The environment mean is added once, after the loop over terms.
  U_envStrain[[i]] <- U_envStrain[[i]] + fitUS1C$Beta[i, 3]
}

# Predictions stacked environment after environment.
PredUS1C <- c(unlist(U_envStrain))

# Row permutation that sorts the records by environment code.
indES <- sort.int(as.numeric(DT_Sub1C[, "environ"]), decreasing = FALSE, index.return = TRUE)[[2]]

# PredUS1C is in environment-sorted order, so the test records are located by
# their position in that order: entries of indES that point at a test row.
tstSorted3 <- which(indES %in% tstIndices3)
cor(PredUS1C[tstSorted3], DT_Sub1[indES[tstSorted3], "Yield_blup"])

#### **Discuss other ways to model these scenarios and refine these models**