forked from grailbio/bio
-
Notifications
You must be signed in to change notification settings - Fork 0
/
common.go
225 lines (206 loc) · 7.3 KB
/
common.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
// Copyright 2020 Grail Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package pileup
import (
"context"
"fmt"
"strings"
"github.com/Schaudge/grailbase/compress"
"github.com/Schaudge/grailbase/file"
"github.com/Schaudge/grailbase/log"
"github.com/Schaudge/grailbio/encoding/fasta"
"github.com/Schaudge/grailbio/interval"
"github.com/Schaudge/hts/sam"
)
// Common pileup components.
// PosType is the integer type used to represent genomic positions.
type PosType = interval.PosType
// PosTypeMax is the maximum value that can be represented by a PosType.
const PosTypeMax = interval.PosTypeMax
// These constants have two relevant meanings:
// 1. In the .bam seq[] encoding (sam.BaseA, sam.BaseC, etc.), it's the
// position of A's set bit. This is relevant when using
// __builtin_ctzl([read seq word] & (~[ref seq word])) to quickly iterate
// over differences-from-reference.
// 2. It's the natural value for A/C/G/T in a packed 2-bit representation
// (useful anywhere we don't have to worry about Ns).
const (
// BaseA represents an A base.
BaseA byte = iota
// BaseC represents an C base.
BaseC
// BaseG represents an G base.
BaseG
// BaseT represents an T base.
BaseT
// BaseX is a catch-all.
BaseX
)
const (
// NBase is the number of regular base types.
NBase = 4
// NBaseEnum counts BaseX as well as the regular base types.
NBaseEnum = 5
)
// Seq8ToEnumTable is the .bam seq nibble -> A/C/G/T/X enum mapping.
var Seq8ToEnumTable = [...]byte{BaseX, BaseA, BaseC, BaseX, BaseG, BaseX, BaseX, BaseX, BaseT, BaseX, BaseX, BaseX, BaseX, BaseX, BaseX, BaseX}
// EnumToASCIITable is the A/C/G/T/X -> ASCII mapping, with X rendered as 'N'.
var EnumToASCIITable = [...]byte{'A', 'C', 'G', 'T', 'N'}
// Seq8ToASCIITable is the .bam seq nibble -> ASCII mapping.
var Seq8ToASCIITable = [...]byte{'=', 'A', 'C', 'M', 'G', 'R', 'S', 'V', 'T', 'W', 'Y', 'H', 'K', 'D', 'B', 'N'}
// StrandType describes which strand a read-pair is aligned to.
type StrandType int
const (
// StrandNone means either no strand restriction, or (when returned by
// GetStrand) undefined-strand (read ends on different chromosomes, or appear
// to be part of an inversion).
StrandNone StrandType = iota
// StrandFwd means that the read-pair's start is on the 5' side and the end
// is on the 3' side of the same chromosome.
StrandFwd
// StrandRev means that the read-pair's start is on the 3' side and the end
// is on the 5' side of the same chromosome.
StrandRev
)
// StrandTypeToASCIITable is the StrandType -> ASCII mapping.
var StrandTypeToASCIITable = [...]byte{'.', '+', '-'}
// GetStrand returns the strand the read-pair is aligned to.
func GetStrand(samr *sam.Record) StrandType {
if samr.Ref != samr.MateRef {
return StrandNone
}
flagStrand := samr.Flags & (sam.Reverse | sam.MateReverse | sam.Read1 | sam.Read2)
if (flagStrand == (sam.MateReverse | sam.Read1)) || (flagStrand == (sam.Reverse | sam.Read2)) {
return StrandFwd
} else if (flagStrand == (sam.Reverse | sam.Read1)) || (flagStrand == (sam.MateReverse | sam.Read2)) {
return StrandRev
}
if samr.Flags&sam.MateUnmapped == sam.MateUnmapped {
// Support an alternate encoding emitted by some 'collapser' programs.
flagStrand &= sam.Reverse | sam.MateReverse
if flagStrand == 0 {
return StrandFwd
} else if flagStrand == (sam.Reverse | sam.MateReverse) {
return StrandRev
}
}
return StrandNone
}
// ParseCols parses a column-set-descriptor string given on the command line
// (colsParam) into a 64-bit integer bitset for internal use.
func ParseCols(colsParam string, colNameMap map[string]int, defaultColBitset int) (colBitset int, err error) {
if colsParam == "" {
return defaultColBitset, nil
}
colsParamParts := strings.Split(colsParam, ",")
// Two cases:
// 1. Each part has a '+' or a '-' in front. Treat these as patches to the
// default column set.
// 2. No part has a '+' or a '-' in front. Ignore the default and treat this
// as the full set.
firstChar := colsParamParts[0][0]
if (firstChar == '+') || (firstChar == '-') {
colBitset = defaultColBitset
for _, part := range colsParamParts {
firstChar = part[0]
if (firstChar != '+') && (firstChar != '-') {
err = fmt.Errorf("parseCols: either all terms in column set descriptor must be preceded by +/-, or none can be")
return
}
v := colNameMap[part[1:]]
if v == 0 {
err = fmt.Errorf("parseCols: %v not found", part[1:])
return
}
// can also check for duplicates
if firstChar == '+' {
colBitset |= v
} else {
colBitset &= ^v
}
}
} else {
for _, part := range colsParamParts {
firstChar = part[0]
if (firstChar == '+') || (firstChar == '-') {
err = fmt.Errorf("parseCols: either all terms in column set descriptor must be preceded by +/-, or none can be")
return
}
v := colNameMap[part]
if v == 0 {
err = fmt.Errorf("parseCols: %v not found", part)
return
}
colBitset |= v
}
}
return colBitset, nil
}
// LoadFa is a thin wrapper around fasta.New().
func LoadFa(ctx context.Context, fapath string, enc fasta.Encoding) (fa fasta.Fasta, err error) {
var infile file.File
if infile, err = file.Open(ctx, fapath); err != nil {
return
}
defer func() {
if e := infile.Close(ctx); e != nil && err == nil {
err = e
}
}()
reader, _ := compress.NewReader(infile.Reader(ctx))
defer func() {
if e := reader.Close(); e != nil && err == nil {
err = e
}
}()
if fa, err = fasta.New(reader, fasta.OptEncoding(enc)); err != nil {
return
}
return
}
// FaToStringSlice returns the data in fa as a []string, using the reference
// order in headerRefs[]. It performs reference-length consistency checks
// between headerRefs and fa in the process.
func FaToStringSlice(fa fasta.Fasta, headerRefs []*sam.Reference) ([]string, error) {
nXamRef := len(headerRefs)
refSeqs := make([]string, nXamRef)
nMissingFromFa := 0
for i, curRef := range headerRefs {
refName := curRef.Name()
refLen, e := fa.Len(refName)
if e != nil {
// Temporarily tolerate this since a few tests harmlessly trigger it.
// TODO(cchang): upgrade this to a regular error at the first good
// opportunity.
nMissingFromFa++
continue
}
if refLen != uint64(curRef.Len()) {
return nil, fmt.Errorf("pileup.FaToStringSlice: inconsistent lengths for contig %s (%d in BAM/PAM header, %d in .fa)", refName, curRef.Len(), refLen)
}
refSeq, err := fa.Get(refName, 0, refLen)
if err != nil {
return nil, err
}
refSeqs[i] = refSeq
}
if nMissingFromFa != 0 {
log.Printf("pileup.FaToStringSlice: warning: %d reference(s) present in BAM/PAM header but missing from .fa", nMissingFromFa)
}
nMissingFromXam := len(fa.SeqNames()) + nMissingFromFa - nXamRef
if nMissingFromXam != 0 {
log.Printf("pileup.FaToStringSlice: warning: %d reference(s) present in .fa but missing from BAM/PAM header", nMissingFromXam)
}
return refSeqs, nil
}