In [25]:
using Statistics, Random, DataFrames, JWAS,CSV,Plots,StatsBase,Distributions

In [26]:
data_path = "/Users/ziguiwang/Documents/Animal/Functional_Annoation/FAANG_simulation/largedata/GALLO_MSU_pig/"
result_path = "/Users/ziguiwang/Documents/Animal/Functional_Annoation/FAANG_simulation/reports/fixed_marker/"
raw_genotype = CSV.read(data_path*"pig_genotype.csv",DataFrame);

### Read the genotype, map file, and annotation file.

In [27]:
new_genotype      = raw_genotype[!,2:end]; # remove the ID column
ID                = raw_genotype[!,1];     # get the ID column

annotation_map    = CSV.read(data_path*"pig_FA.csv",DataFrame);   # read annoation information for each SNP
map_data          = CSV.read(data_path*"pig_map.csv",DataFrame);  # read map data for MSU pig data

number_snp        = size(map_data,1);      # get number of SNPs
number_individual = size(raw_genotype,1);  # get number of individuals

### Find the index for SNPs in each annotation class

In [28]:
Random.seed!(345)

# get the snp index that belong to each genome annotation
psudogene_index            = annotation_map[!,2] .== "pseudogene"
protein_coding_index       = annotation_map[!,2] .== "protein_coding"
processed_pseudogene_index = annotation_map[!,2] .== "processed_pseudogene"
snoRNA_index               = annotation_map[!,2] .== "snoRNA"
snRNA_index                = annotation_map[!,2] .== "snRNA"
rRNA_index                 = annotation_map[!,2] .== "rRNA"
miRNA_index                = annotation_map[!,2] .== "miRNA"
miscRNA_index              = annotation_map[!,2] .== "misc_RNA"

# get the snp name that belong to each genome annotation
psudogene_snp              = annotation_map[psudogene_index,1];
protein_coding_snp         = annotation_map[protein_coding_index,1];
processed_pseudogene_snp   = annotation_map[processed_pseudogene_index,1];
snoRNA_snp                 = annotation_map[snoRNA_index,1];
snRNA_snp                  = annotation_map[snRNA_index,1];
rRNA_snp                   = annotation_map[rRNA_index,1];
miRNA_snp                  = annotation_map[miRNA_index,1];
miscRNA_snp                = annotation_map[miscRNA_index,1];

# find the index of the SNPs of each annotation class from the map data
psudogene_snp_index        = findall(x->x in psudogene_snp , map_data[!,:snp]);
protein_coding_snp_index   = findall(x->x in protein_coding_snp , map_data[!,:snp]);
processed_pseudogene_snp_index   = findall(x->x in processed_pseudogene_snp , map_data[!,:snp]);
snoRNA_snp_index                 = findall(x->x in snoRNA_snp , map_data[!,:snp]);

snRNA_snp_index                  = findall(x->x in snRNA_snp , map_data[!,:snp]);
rRNA_snp_index                   = findall(x->x in rRNA_snp , map_data[!,:snp]);
miRNA_snp_index                  = findall(x->x in miRNA_snp , map_data[!,:snp]);
miscRNA_snp_index                = findall(x->x in miscRNA_snp , map_data[!,:snp]);

RNA_snp_index                    = vcat(snoRNA_snp_index,snRNA_snp_index,rRNA_snp_index,miRNA_snp_index,miscRNA_snp_index );
intergenic_snp_index             = findall(!(x->x in annotation_map[!,:snp]) , map_data[!,:snp])


# merge all non protein coding index together
non_protein_coding_snp_index         = vcat(RNA_snp_index,psudogene_snp_index,processed_pseudogene_snp_index,intergenic_snp_index   );

### Start the simulation

In [29]:
function simulate_phenotypes(new_genotype ,n,simu_qtl_index,id,group_number)
    number_qtl             = size(simu_qtl_index,1)

    simu_qtl_genotype      = Matrix(new_genotype[1:n,simu_qtl_index ]);
    
    sigma_g = 1.0    # assume the genetic variance equals to 1

    simu_qtl_effect        = randn(number_qtl )* sigma_g
    g                      = simu_qtl_genotype * simu_qtl_effect

    println("varaince for g: ",var(g))

    phenotypes = DataFrame(ID=id,g = g);

end

simulate_phenotypes (generic function with 1 method)

In [30]:
# simulate 2000 QTLs from protein coding SNPs
protein_coding_QTL_index         = protein_coding_snp_index[sample(1:size(protein_coding_snp_index,1  ),2000,replace = false)];
# simulate 100 QTLs from non protein coding SNPs
non_protein_coding_QTL_index     = non_protein_coding_snp_index[sample(1:size(non_protein_coding_snp_index,1  ),100,replace = false)];
# combine all QTLs index together
candidate_QTL_index              = unique(vcat(protein_coding_QTL_index, non_protein_coding_QTL_index  ));

group1_bv = simulate_phenotypes(new_genotype ,number_individual,non_protein_coding_QTL_index,ID,1)
group2_bv = simulate_phenotypes(new_genotype ,number_individual,protein_coding_QTL_index ,ID,2)


h2 = 0.5

overall_g = group1_bv[!,:g] + group2_bv[!,:g]
overall_g = overall_g./std(overall_g)
varg     = var(overall_g)
stde     = sqrt((1 - h2)/h2 *varg)


y                      = overall_g + randn(number_individual)*stde # I use the 0.5 h2 since the original 0.6 h2 is mimic real biological situation
println("varaince for g: ",var(overall_g))
println("varaince for y: ",var(y))


MSU_simu_phen = DataFrame(ID=ID,y=y,g = overall_g );
#CSV.write( data_path*"MSU_simu_pheno_protein_coding_QTL.csv" , MSU_simu_phen)

varaince for g: 36.30178217124499
varaince for g: 684.3507730626027
varaince for g: 0.9999999999999999
varaince for y: 1.9532642710805328


### Check the marker variance and genetic variance

In [31]:
# define some prior parameter

geno = new_genotype
pi = 0.99
h2 = 0.5
number_markers = size(geno,2) - 1 # compute the number of marker, -1 since the ID column

# compute the genetic variance from the phenotypic variance
phenotypic_var = var(MSU_simu_phen[!,:y])
genetic_var = phenotypic_var*h2
markerMeans = mean(Matrix(geno[!,2:end]), dims=1)
p = markerMeans/2.0
mean2pq = mean(2*transpose(p) .* (1 .- transpose(p) ))
var_effects = genetic_var/(number_markers *(1-pi)* mean2pq )

0.006324806976875542

In [79]:
genetic_var

1.0016514018977138