# Produce the VCF file with the annotation of the RH regions 

This script takes the tables from ```02. RH score analysis``` and from ```01. SNP windows``` and produces a new VCF file with the mutation calls, annotated with whether each call is EMS-induced and whether it falls in an RH region.

## Inputs

1. The table from the step ```01```
2. The ```regions.csv.gz``` file from step ```02```.
3. A file with the VCF header: 

```
##fileformat=VCFv4.1
##fileDate=20190708
##source=JIC;CropGenetics;ricardo.ramirez-gonzalez@jic.ac.uk;cristobal.uauy@jic.ac.uk
##reference=ftp://ftp.ensembl.org/pub/release-93/fasta/Triticum_aestivum/dna/
##INFO=<ID=EMS-induced mutation,Number=0,Type=Flag,Description="EMS-induced mutations from sequenced TILLING populations. Seeds can be ordered from UK SeedStor or US Dubcovsky lab. Line identifier is variant name up to dot (e.g Kronos3128).">
##INFO=<ID=non EMS-induced mutation,Number=0,Type=Flag,Description="SNP found in the TILLING populations that are not ems">
##INFO=<ID=TSA,Number=1,Type=String,Description="Type of sequence alteration. Child of term sequence_alteration as defined by the sequence ontology project.">
##INFO=<ID=EMS-GT,Number=1,Type=String,Description="Genotype of EMS-induced mutations">
##INFO=<ID=RH,Number=0,Type=Flag,Description="SNP is in a region with residual heterogeneity.">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth.">
##INFO=<ID=AO,Number=1,Type=Integer,Description="Approximate reference observations.">
##INFO=<ID=RO,Number=1,Type=Integer,Description="Approximate alternative allele observations.">
##INFO=<ID=MUT_QUAL,Number=1,Type=String,Description="Category of the mutation.">
##INFO=<ID=SNP_INDEX,Number=1,Type=Float,Description="SNP Index of the mutation.">
##INFO=<ID=EMS-line,Number=1,Type=String,Description="EMS line where the mutatation is located.">
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
```


In [1]:
require 'csv'
require 'set'
require 'zlib'

true

In [2]:
require 'pp'

true

In [3]:
# Reads the gzipped regions CSV and groups the LINE values by region.
#
# Only rows whose "window_size" column equals +window_size+ are kept.
#
# @param filename [String] path to a gzip-compressed CSV with a header row;
#   the columns read are "window_size", "region_id" and "LINE".
# @param window_size [String] value of the "window_size" column to select.
# @return [Hash{String => Array<String>}] region_id => LINE values, in file
#   order. The hash keeps its default block, so looking up an unknown region
#   with [] yields (and stores) an empty array.
def read_rh_regions(filename, window_size: "gap_60")
    rh_regions = Hash.new { |hash, region_id| hash[region_id] = [] }
    Zlib::GzipReader.open(filename) do |gzip|
        CSV.new(gzip, headers: true).each do |row|
            next unless row["window_size"] == window_size

            rh_regions[row["region_id"]] << row["LINE"]
        end
    end
    rh_regions
end
# Load the RH regions computed with the "gap_60" window and show how many
# regions have at least one line listed.
rh_regions = read_rh_regions("./out/RH_Kronos_regions.csv.gz", window_size: "gap_60")
rh_regions.length

80303

In [4]:
# Writes a gzip-compressed VCF with one annotated record per SNP table row.
#
# The header file is copied verbatim to the output. Each table row then
# becomes a VCF record whose INFO field carries the sequence-alteration type,
# genotype, line, depth values, SNP index and mutation quality. The ";RH"
# flag is appended when the row's LINE is listed in +rh_regions+ under the
# row's region_id.
#
# @param snp_tables [String] path to a gzipped, tab-separated table with a
#   header row; the columns read are CHROM, POS, ID, REF, ALT, LINE,
#   region_id, EMS_GT, DP, RO, AO, SNP_INDEX and MUT_QUAL.
# @param rh_regions [Hash{String => Array<String>}] region_id => lines that
#   are in an RH region. Missing regions are treated as having no RH lines;
#   the hash is not modified.
# @param out_vcf [String] path of the gzipped VCF to create.
# @param header_vcf [String] path of the text file holding the VCF header.
# @param filter_lines [Array<String>] LINE values whose rows are skipped.
# @return [Set<String>] the ID of every row written to the VCF.
#
# NOTE(review): every record is tagged "EMS-induced mutation"; nothing here
# selects the "non EMS-induced mutation" flag -- confirm this is intended.
def write_annotated_vcf(snp_tables, rh_regions, out_vcf, header_vcf="./in/all_vcf_headers.txt", filter_lines:[])
    kept_snps = Set.new
    qual   = "."
    filter = "."
    no_rh_lines = [].freeze

    # Block form guarantees the gzip stream is finished and closed even if a
    # row raises part-way through.
    Zlib::GzipWriter.open(out_vcf) do |out|
        File.foreach(header_vcf) { |header_line| out.puts header_line }

        # nil cannot collide with a real region id, so the first row always
        # triggers a lookup (a "0" sentinel would skip region "0").
        current_region = nil
        current_region_lines = no_rh_lines
        Zlib::GzipReader.open(snp_tables) do |gzip|
            CSV.new(gzip, headers: true, col_sep: "\t").each do |row|
                reg = row["region_id"]
                unless reg == current_region
                    # fetch avoids growing the caller's hash through its
                    # default block and also works with a plain Hash.
                    current_region_lines = rh_regions.fetch(reg, no_rh_lines)
                    current_region = reg
                end

                line = row["LINE"]
                next if filter_lines.include?(line)

                ref = row["REF"]
                alt = row["ALT"]
                # Classified purely by comparing allele lengths.
                tsa = case ref.size <=> alt.size
                      when 0 then "SNV"
                      when 1 then "DEL"
                      else "INS"
                      end
                rh = current_region_lines.include?(line) ? ";RH" : ""

                info   = "EMS-induced mutation;TSA=#{tsa};EMS-GT=#{row["EMS_GT"]};EMS-line=#{line}"
                info_h = "#{info};DP=#{row["DP"]};RO=#{row["RO"]};AO=#{row["AO"]};SNP_INDEX=#{row["SNP_INDEX"]};MUT_QUAL=#{row["MUT_QUAL"]}#{rh}"
                out.puts [row["CHROM"], row["POS"], row["ID"], ref, alt, qual, filter, info_h].join("\t")
                kept_snps << row["ID"]
            end
        end
    end

    kept_snps
end

# Annotate the Kronos SNP table with the RH regions loaded in `rh_regions`
# and show how many distinct SNP IDs were written.
kept_snps = write_annotated_vcf(
    "./out/20190709_Kronos_snps_with_density_and_het_gap_60.tsv.gz",
    rh_regions,
    "./out/20190723_Kronos_annotated.vcf.gz"
)
kept_snps.length

5042171