In [20]:
import yaml
import pandas as pd
from typing import Generator, Tuple, Dict

In [5]:
def read_fasta(filepath: str) -> Generator[Tuple[str, str], None, None]:
    """
    Parses a FASTA file and lazily yields one (name, sequence) pair per record.

    A record starts at a line beginning with ">"; the remainder of that
    (whitespace-stripped) line is the record name. Every following line up to
    the next header is stripped and concatenated to form the sequence. Blank
    lines are ignored and lines that appear before the first header are
    discarded. A header with an empty name (a bare ">") still produces a
    record, with "" as its name.

    Because this is a generator, the file is only opened (and errors are only
    raised) once iteration starts.

    Args:
        filepath (str): Path to the FASTA file.
    Yields:
        tuple: A tuple containing the record name and sequence.
    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        RuntimeError: If any other error occurs while reading the file.
    """
    try:
        with open(filepath, 'r') as file:
            # ``None`` means "no header seen yet". An empty string is a valid
            # (if unusual) record name, so compare against None explicitly
            # instead of relying on truthiness.
            name, sequence = None, []
            for line in file:
                line = line.strip()
                if not line:
                    # Blank lines contribute nothing to the joined sequence.
                    continue
                if line.startswith(">"):
                    if name is not None:
                        yield name, ''.join(sequence)
                    name, sequence = line[1:], []
                elif name is not None:
                    sequence.append(line)
                # else: data before the first header belongs to no record.
            if name is not None:
                # Flush the final record; no further header will trigger it.
                yield name, ''.join(sequence)
    except FileNotFoundError as e:
        # Keep the friendlier message but chain the original for errno/traceback.
        raise FileNotFoundError(f"File not found: {filepath}") from e
    except Exception as e:
        raise RuntimeError(f"An error occurred while reading the FASTA file: {e}") from e


In [12]:
# Relative path to the example FASTA file that the read_fasta demo below iterates over.
filepath = "tests/test_fasta.fa"

In [15]:
# Quick check of the parser: walk every record in `filepath` and print only
# its sequence; the record name (first tuple element) is discarded via `_`.
for _,seq in read_fasta(filepath):
    print(seq)

ATGCGTACGTAGCTAGCGTAGCTAGT
CGTAGCTAGTACGATCGTACGTAGCT


In [21]:

def load_feature_data(kmer_len: int, feature_file: str="data/lookup.yaml") -> Dict[str, float]:
    """
    Reads a YAML lookup file and returns the entry stored for one k-mer length.

    The file is parsed with ``yaml.safe_load`` and the top-level entry whose
    key equals ``str(kmer_len)`` is returned as-is.

    Args:
        kmer_len (int): Length of the k-mer; converted with ``str()`` before
            being used as the top-level lookup key.
        feature_file (str): Path to the YAML file containing feature data.

    Returns:
        dict: The feature data stored under ``str(kmer_len)``.

    Raises:
        FileNotFoundError: If ``feature_file`` does not exist.
        RuntimeError: If the YAML cannot be parsed, or if any other error
            occurs while loading. This includes a missing k-mer entry: the
            ValueError raised for it is caught by the generic handler below
            and re-raised as RuntimeError.
    """
    try:
        with open(feature_file, 'r') as handle:
            lookup = yaml.safe_load(handle)
            section = str(kmer_len)
            if section not in lookup:
                # Caught by the generic ``except Exception`` clause below, so
                # callers observe a RuntimeError carrying this message.
                raise ValueError(f"k-mer length {kmer_len} not found in feature data.")
            return lookup[section]
    except FileNotFoundError:
        raise FileNotFoundError(f"Feature file not found: {feature_file}")
    except yaml.YAMLError as err:
        raise RuntimeError(f"An error occurred while parsing the YAML file: {err}")
    except Exception as err:
        raise RuntimeError(f"An error occurred while loading feature data: {err}")


In [24]:
# Fetch the feature tables stored under the "trinucleotide" key of the lookup file.
# NOTE(review): kmer_len is annotated as int in load_feature_data, but a string
# key is passed here; it works only because the function applies str() to it.
load_feature_data(kmer_len= "trinucleotide", 
                  feature_file="DNAflexpy/data/lookup.yaml")

{'NPP': {'AAA': 36,
  'AAC': 6,
  'AAG': 6,
  'AAT': 30,
  'ACA': 6,
  'ACC': 8,
  'ACG': 8,
  'ACT': 11,
  'AGA': 9,
  'AGC': 25,
  'AGG': 8,
  'AGT': 11,
  'ATA': 13,
  'ATC': 7,
  'ATG': 18,
  'ATT': 30,
  'CAA': 9,
  'CAC': 17,
  'CAG': 2,
  'CAT': 18,
  'CCA': 8,
  'CCC': 13,
  'CCG': 2,
  'CCT': 8,
  'CGA': 31,
  'CGC': 25,
  'CGG': 2,
  'CGT': 8,
  'CTA': 18,
  'CTC': 8,
  'CTG': 2,
  'CTT': 6,
  'GAA': 12,
  'GAC': 8,
  'GAG': 8,
  'GAT': 7,
  'GCA': 13,
  'GCC': 45,
  'GCG': 25,
  'GCT': 25,
  'GGA': 5,
  'GGC': 45,
  'GGG': 13,
  'GGT': 8,
  'GTA': 6,
  'GTC': 8,
  'GTG': 17,
  'GTT': 6,
  'TAA': 20,
  'TAC': 6,
  'TAG': 18,
  'TAT': 13,
  'TCA': 8,
  'TCC': 5,
  'TCG': 31,
  'TCT': 9,
  'TGA': 8,
  'TGC': 13,
  'TGG': 8,
  'TGT': 6,
  'TTA': 20,
  'TTC': 12,
  'TTG': 9,
  'TTT': 36},
 'DNaseI': {'AAT': -0.28,
  'ATT': -0.28,
  'AAA': -0.274,
  'TTT': -0.274,
  'CCA': -0.246,
  'TGG': -0.246,
  'AAC': -0.205,
  'GTT': -0.205,
  'CCG': -0.136,
  'CGG': -0.136,
  'ATC': -0.11,
