Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add fasta2 package #337

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions io/fasta2/example_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package fasta2_test

import (
"fmt"

"github.com/TimothyStiles/poly/io/fasta2"
)

// ExampleReadFile shows basic usage of ReadFile: parse a FASTA file from disk
// and print the header of its first record.
func ExampleReadFile() {
	fastas, err := fasta2.ReadFile("testdata/base.fasta")
	if err != nil {
		// Report the failure instead of panicking on fastas[0] below.
		fmt.Println(err)
		return
	}

	fmt.Println(fastas[0].Header)
	// Output: gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]
}
86 changes: 86 additions & 0 deletions io/fasta2/fasta.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
package fasta2

import (
"bytes"
"fmt"
"io"
"os"
)

// Record is a single FASTA entry: the text of its header line (without the
// leading '>') and the sequence that follows it.
type Record struct {
	Header   string `json:"header"`
	Sequence string `json:"sequence"`
}

// buffer serializes the Record in FASTA format into b: a ">Header" line
// followed by the sequence wrapped at 80 characters per line, each line
// terminated by '\n'. An empty sequence produces only the header line.
// The buffer is reset before any writing happens, so b can be reused across
// records.
func (r Record) buffer(b *bytes.Buffer) {
	const width = 80 // sequence characters per output line

	b.Reset()
	// Reserve the space up front so the writes below do not regrow the buffer:
	// '>' + header, the sequence itself, one '\n' in front of every sequence
	// line (at most len/width + 1 of them) and the trailing '\n'.
	b.Grow(len(r.Header) + len(r.Sequence) + len(r.Sequence)/width + 3)

	b.WriteByte('>')
	b.WriteString(r.Header)
	// Count characters explicitly: the index produced by ranging over a string
	// is a byte offset, which would misplace line breaks as soon as the
	// sequence contains a multi-byte rune.
	col := 0
	for _, c := range r.Sequence {
		if col%width == 0 {
			b.WriteByte('\n')
		}
		b.WriteRune(c)
		col++
	}
	b.WriteByte('\n')
}

// String returns the FASTA text representation of the Record.
func (r Record) String() string {
	var b bytes.Buffer
	r.buffer(&b)
	return b.String()
}

// Bytes returns the FASTA representation of the Record as a []byte.
func (r Record) Bytes() []byte {
	var b bytes.Buffer
	r.buffer(&b)
	return b.Bytes()
}

// Write serializes every record in recs to w in FASTA format, in order, and
// returns the total number of bytes w reported as written. A single scratch
// buffer is reused for all records, which makes it cheaper than calling
// Record.Bytes in a loop. It stops at the first write error and returns it
// together with the byte count so far (including the failing write).
func Write(recs []Record, w io.Writer) (int, error) {
	var scratch bytes.Buffer
	total := 0
	for i := range recs {
		recs[i].buffer(&scratch)
		n, err := w.Write(scratch.Bytes())
		total += n
		if err != nil {
			return total, err
		}
	}
	return total, nil
}

// WriteFile writes all the passed records, in FASTA format, to the file at
// path. The file is created if it does not exist and truncated if it does.
//
// It returns a wrapped error if the file cannot be created, if writing a
// record fails, or if closing the file fails; a close failure can mean the
// data never reached the disk, so it is not ignored.
func WriteFile(recs []Record, path string) error {
	f, err := os.Create(path)
	if err != nil {
		return fmt.Errorf("error opening file %q: %w", path, err)
	}

	if _, err := Write(recs, f); err != nil {
		// The write error is the one worth reporting; closing here is only
		// to release the descriptor.
		_ = f.Close()
		return fmt.Errorf("error writing to file %q: %w", path, err)
	}

	if err := f.Close(); err != nil {
		return fmt.Errorf("error closing file %q: %w", path, err)
	}
	return nil
}
126 changes: 126 additions & 0 deletions io/fasta2/fasta_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
package fasta2_test

import (
"bytes"
"io"
"os"
"path"
"reflect"
"testing"

"github.com/TimothyStiles/poly/io/fasta2"
"github.com/stretchr/testify/require"
)

// TestFastaString checks that Record.String renders the header line followed
// by the sequence wrapped at 80 characters per line.
func TestFastaString(t *testing.T) {
	cases := []struct {
		name   string
		record fasta2.Record
		want   string
	}{
		{
			name: "success",
			record: fasta2.Record{
				Header:   "Cool Sequence",
				Sequence: "MDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQVSLRDLFDRAVMVSHYIHDLSSEMFNEFDKRYAQGKGFITMALNSCHTSSLPTPEDKEQAQQTHHEVLMSLILGLLRSWNDPLYHL",
			},
			want: ">Cool Sequence\nMDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQVSLRDLFDRAVMVSHYIHDLSSEMFNEFDKRYAQGKGF\nITMALNSCHTSSLPTPEDKEQAQQTHHEVLMSLILGLLRSWNDPLYHL\n",
		},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := tc.record.String()
			if got != tc.want {
				t.Errorf("Record.String() = %v, want %v", got, tc.want)
			}
		})
	}
}

// TestRecord_Bytes checks that Record.Bytes yields the same FASTA rendering
// as String, as a byte slice.
func TestRecord_Bytes(t *testing.T) {
	cases := []struct {
		name   string
		record fasta2.Record
		want   []byte
	}{
		{
			name: "success",
			record: fasta2.Record{
				Header:   "Cool Sequence",
				Sequence: "MDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQVSLRDLFDRAVMVSHYIHDLSSEMFNEFDKRYAQGKGFITMALNSCHTSSLPTPEDKEQAQQTHHEVLMSLILGLLRSWNDPLYHL",
			},
			want: []byte(">Cool Sequence\nMDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQVSLRDLFDRAVMVSHYIHDLSSEMFNEFDKRYAQGKGF\nITMALNSCHTSSLPTPEDKEQAQQTHHEVLMSLILGLLRSWNDPLYHL\n"),
		},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := tc.record.Bytes()
			if !reflect.DeepEqual(got, tc.want) {
				t.Errorf("Record.Bytes() = %v, want %v", got, tc.want)
			}
		})
	}
}

// TestWrite checks that Write emits every record back to back and reports the
// byte count, and that an error from the underlying writer is propagated.
func TestWrite(t *testing.T) {
	records := []fasta2.Record{
		{Header: "name1", Sequence: "seq1"},
		{Header: "name2", Sequence: "seq2"},
	}

	t.Run("success", func(t *testing.T) {
		var out bytes.Buffer
		written, err := fasta2.Write(records, &out)
		require.NoError(t, err)
		require.Equal(t, ">name1\nseq1\n>name2\nseq2\n", out.String())
		require.Equal(t, 24, written)
	})

	t.Run("fail EOF", func(t *testing.T) {
		_, err := fasta2.Write(records, errorWriter{})
		require.Error(t, err)
	})
}

// TestWriteFile checks that WriteFile succeeds when writing two records to a
// file in the OS temporary directory. The file is removed afterwards.
func TestWriteFile(t *testing.T) {
	records := []fasta2.Record{
		{Header: "name1", Sequence: "seq1"},
		{Header: "name2", Sequence: "seq2"},
	}

	target := path.Join(os.TempDir(), "fasta_test")
	defer os.Remove(target) // clean up

	require.NoError(t, fasta2.WriteFile(records, target))
}

// errorWriter is a test double for io.Writer whose Write always fails: it
// claims to have consumed the whole input and returns io.EOF.
type errorWriter struct{}

// Write reports len(p) bytes written together with io.EOF.
func (errorWriter) Write(p []byte) (n int, err error) {
	n, err = len(p), io.EOF
	return n, err
}
128 changes: 128 additions & 0 deletions io/fasta2/parser.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
package fasta2

import (
"bufio"
"bytes"
"fmt"
"io"
"os"
)

// Parser is a streaming FASTA parser. It must be created with NewParser: the
// zero value has a nil scanner and cannot be used.
type Parser struct {
// buff accumulates the sequence lines of the record currently being read.
buff bytes.Buffer
// header is the header text (without '>') of the record currently being read.
header string
// start is true until the first header line has been seen.
start bool
// scanner reads the input one line at a time.
scanner *bufio.Scanner
// line counts the lines scanned so far, including blank and comment lines.
line int
// more is true until a call to Next has run out of input.
more bool
}

// NewParser returns a Parser that reads FASTA records from r. The parser
// starts in its initial state: no header seen yet and HasNext reporting true.
// Input is read line by line through a bufio.Scanner with its default buffer.
func NewParser(r io.Reader) *Parser {
	p := &Parser{scanner: bufio.NewScanner(r)}
	p.start, p.more = true, true
	return p
}

// Lines returns the number of input lines scanned so far. Every scanned line
// is counted, including blank lines and ';' comment lines.
func (p *Parser) Lines() int {
return p.line
}

// HasNext reports whether Next may still be called for more data. It is true
// from construction until a call to Next runs out of input, so it is also
// true before the first call to Next on an empty input.
func (p *Parser) HasNext() bool {
return p.more
}

// newRecord builds a Record from the current header and the sequence
// accumulated so far, then empties the sequence buffer so the next record
// starts from scratch. The stored header is left untouched.
func (p *Parser) newRecord() Record {
	rec := Record{
		Header:   p.header,
		Sequence: p.buff.String(),
	}
	p.buff.Reset()
	return rec
}

// Next parses the next record from the input and returns it.
//
// Blank lines and lines starting with ';' are skipped. Sequence lines are
// appended to the current record. A record is returned either when the header
// line of the following record is read or when the input ends; in the latter
// case HasNext reports false afterwards and the scanner's error, if any, is
// returned together with whatever was accumulated.
//
// If sequence data appears before any header line, an error naming the line
// number is returned together with a Record built from the (empty) parser
// state.
func (p *Parser) Next() (Record, error) {
	for p.scanner.Scan() {
		p.line++
		line := p.scanner.Bytes()

		// Nothing to parse on blank lines and ';' comments.
		if len(line) == 0 || line[0] == ';' {
			continue
		}

		if line[0] != '>' {
			if p.start {
				// Sequence data before the first header.
				err := fmt.Errorf("invalid input: missing sequence header for sequence starting at line %d", p.line)
				return p.newRecord(), err
			}
			// Ordinary sequence line: add it to the current record.
			p.buff.Write(line)
			continue
		}

		// Header line from here on.
		if p.start {
			// Very first header: there is no previous record to emit yet.
			p.header = string(line[1:])
			p.start = false
			continue
		}

		// A new header closes the record accumulated so far; remember the new
		// header for the record that follows.
		finished := p.newRecord()
		p.header = string(line[1:])
		return finished, nil
	}

	// Input exhausted (or the scanner failed): emit the last record.
	p.more = false
	return p.newRecord(), p.scanner.Err()
}

// ParseAll parses all the records found in r and returns them in a slice, in
// input order.
//
// On a parse or read error it returns the records parsed so far together with
// the error. Input that contains no header line at all (empty input, or only
// blank and comment lines) yields a nil slice and no error.
func ParseAll(r io.Reader) ([]Record, error) {
	var records []Record

	p := NewParser(r)
	for p.HasNext() {
		rec, err := p.Next()
		if err != nil {
			return records, err
		}
		if p.start {
			// Next reached the end of the input without ever seeing a header,
			// so rec is just the parser's empty state, not a real record.
			break
		}
		records = append(records, rec)
	}

	return records, nil
}

// ReadFile parses all the records found in the file at path and returns them
// in a slice.
//
// It returns a wrapped error if the file cannot be opened. Parsing is
// delegated to ParseAll, so on a parse error the records read so far are
// returned together with the error.
func ReadFile(path string) ([]Record, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("error while reading file %q: %w", path, err)
	}
	defer f.Close()

	return ParseAll(f)
}
Loading
Loading