In [1]:
import os
import sys
from Bio import SeqIO


def getnifHclusterID(seq: list, ref_seq: list = None) -> str:
    """
    Assign cluster to nifH sequence based on CART model in
    https://sfamjournals.onlinelibrary.wiley.com/doi/10.1111/1758-2229.12455

    Three 1-based positions are tested in a fixed order and the first
    match decides the cluster:

        position 109 in F/W/Y -> 'I'
        position  49 in A/D/I -> 'II'
        position  53 in L/M/W -> 'III'
        otherwise             -> 'IV'

    Args:
        seq: Residues as one-letter codes (list of characters or str).
        ref_seq: Optional row taken from the same alignment as ``seq``
            (same length, gaps written as '-' or '.'). When given, the
            three positions are counted over the non-gap residues of
            ``ref_seq`` and translated to alignment columns, so a gapped
            ``seq`` is read at the column aligned to the reference
            residue instead of at the raw index. When omitted, the
            positions index ``seq`` directly (original behaviour).

    Returns:
        'I', 'II', 'III' or 'IV'; '0' when ``seq`` is too short to hold
        every tested position, or when ``ref_seq`` has fewer non-gap
        residues than the highest tested position.

    Raises:
        ValueError: If ``ref_seq`` is given and its length differs from
            the length of ``seq``.
    """
    # Ordered rules: (1-based position, matching residues, cluster label).
    # A tuple keeps the evaluation order explicit.
    rules = (
        (109, ('F', 'W', 'Y'), 'I'),
        (49, ('A', 'D', 'I'), 'II'),
        (53, ('L', 'M', 'W'), 'III'),
    )
    positions = [pos for pos, _, _ in rules]

    if ref_seq is None:
        # No reference: a 1-based position maps straight to a 0-based index.
        column_of = {pos: pos - 1 for pos in positions}
    else:
        if len(ref_seq) != len(seq):
            raise ValueError(
                'ref_seq and seq must come from the same alignment '
                f'(lengths {len(ref_seq)} and {len(seq)} differ)'
            )
        # Walk the reference row, counting only real residues, and record
        # the alignment column at which each tested residue number occurs.
        column_of = {}
        residue_number = 0
        for column, residue in enumerate(ref_seq):
            if residue in ('-', '.'):
                continue
            residue_number += 1
            if residue_number in positions:
                column_of[residue_number] = column
        if len(column_of) < len(positions):
            # The reference does not reach every tested position.
            return '0'

    if len(seq) <= max(column_of.values()):
        return '0'

    for pos, residues, cluster in rules:
        if seq[column_of[pos]] in residues:
            return cluster
    return 'IV'

def getRecordAlignments(input_alignment: str) -> dict:
    """
    Read a FASTA file and map each record to its sequence as a list of
    single characters.

    Keys are the part of the record id that precedes the first underscore;
    if several records share that prefix, the last one read is kept.
    """
    residues_by_id = {}
    with open(input_alignment) as handle:
        for entry in SeqIO.parse(handle, 'fasta'):
            short_id = entry.id.split('_')[0]
            residues_by_id[short_id] = list(entry.seq)
    return residues_by_id

def addClusterToNifH(input_fasta: str, input_alignment: str,
                     output_fasta: str = None,
                     reference_id: str = None) -> None:
    """
    Add assigned cluster to nifH sequences in fasta file

    Every record of ``input_fasta`` is looked up in ``input_alignment`` by
    the part of its id before the first underscore, classified with
    ``getnifHclusterID`` and written to ``output_fasta`` with
    ``_cluster_<ID>`` appended to its id (name and description cleared).

    Args:
        input_fasta: FASTA file whose records are to be labelled.
        input_alignment: FASTA-formatted alignment of the same records.
        output_fasta: Destination file. Defaults to the input path with
            ``_clustered`` inserted before the extension.
        reference_id: Optional key (id prefix before the first underscore)
            of the alignment row to use as the numbering reference. When
            given, each aligned row is reduced to the columns where the
            reference has a residue (not '-' or '.') before classification,
            so the classifier's positions count reference residues rather
            than raw alignment columns. When omitted, the aligned row is
            passed unchanged (original behaviour).

    Raises:
        ValueError: If the output path equals the input path (opening it
            for writing would truncate the input while it is being read).
        KeyError: If ``reference_id`` or a record of ``input_fasta`` has no
            matching row in the alignment.
    """
    input_fasta = os.path.abspath(input_fasta)
    input_alignment = os.path.abspath(input_alignment)
    recordAligns = getRecordAlignments(input_alignment)

    if output_fasta is None:
        base, ext = os.path.splitext(input_fasta)
        output_fasta = f'{base}_clustered{ext}'
    else:
        output_fasta = os.path.abspath(output_fasta)

    if output_fasta == input_fasta:
        raise ValueError(
            f'output_fasta must differ from input_fasta: {input_fasta}'
        )

    # Alignment columns occupied by a reference residue, in order; index i
    # of this list is reference residue number i + 1.
    ref_columns = None
    if reference_id is not None:
        if reference_id not in recordAligns:
            raise KeyError(
                f'reference_id {reference_id!r} not found in {input_alignment}'
            )
        ref_columns = [
            column
            for column, residue in enumerate(recordAligns[reference_id])
            if residue not in ('-', '.')
        ]

    with open(input_fasta) as infasta, open(output_fasta, 'w') as outfasta:
        for record in SeqIO.parse(infasta, 'fasta'):
            align_key = record.id.split('_')[0]
            if align_key not in recordAligns:
                raise KeyError(
                    f'record {record.id!r} (key {align_key!r}) has no '
                    f'aligned sequence in {input_alignment}'
                )
            record_align_seq = recordAligns[align_key]
            if ref_columns is not None:
                # Project onto reference numbering; a row shorter than the
                # reference is treated as gapped at the missing columns.
                record_align_seq = [
                    record_align_seq[column]
                    if column < len(record_align_seq) else '-'
                    for column in ref_columns
                ]
            cluster_id = getnifHclusterID(record_align_seq)
            record.id = f'{record.id}_cluster_{cluster_id}'
            # Blank name/description so only the new id is written out.
            record.name = ''
            record.description = ''
            SeqIO.write(record, outfasta, 'fasta')

In [7]:
# Label every record of the long-label FASTA with its cluster. No
# output_fasta is given, so the result is written next to the input with
# '_clustered' inserted before the file extension.
addClusterToNifH(input_fasta='../data/sequencesLongLabels.fasta',
                 input_alignment='../data/sequences.fasta.aln')

In [2]:
"""
WP_039801084 should be of type I
"""
# Load the aligned sequences, keyed by the part of each record id before
# the first underscore, so single rows can be inspected in later cells.
alns = getRecordAlignments('../data/sequences.fasta.aln')

In [11]:
# Classify the aligned row stored under key '001'. The recorded output is
# 'IV', not the cluster I expected in the note above.
# NOTE(review): execution counts are out of order, so the recorded output
# may be stale -- re-run on a fresh kernel to confirm.
getnifHclusterID(alns['001'])

'IV'

In [9]:
# Parse the FASTA that was used as input_fasta with the same helper, to get
# per-record residue lists for comparison with the alignment rows.
# NOTE(review): presumably these sequences are unaligned (no gap characters
# appear in the visible output) -- confirm against the file.
azo = getRecordAlignments('../data/sequencesLongLabels.fasta')
print(azo['001'])

['M', 'A', 'L', 'R', 'Q', 'C', 'A', 'I', 'Y', 'G', 'K', 'G', 'G', 'I', 'G', 'K', 'S', 'T', 'T', 'T', 'Q', 'N', 'L', 'V', 'A', 'A', 'L', 'A', 'E', 'A', 'G', 'K', 'K', 'V', 'M', 'I', 'V', 'G', 'C', 'D', 'P', 'K', 'A', 'D', 'S', 'T', 'R', 'L', 'I', 'L', 'H', 'S', 'K', 'A', 'Q', 'N', 'T', 'V', 'M', 'E', 'M', 'A', 'A', 'S', 'A', 'G', 'S', 'V', 'E', 'D', 'L', 'E', 'L', 'E', 'D', 'V', 'L', 'Q', 'I', 'G', 'Y', 'G', 'G', 'V', 'K', 'C', 'V', 'E', 'S', 'G', 'G', 'P', 'E', 'P', 'G', 'V', 'G', 'C', 'A', 'G', 'R', 'G', 'V', 'I', 'T', 'A', 'I', 'N', 'F', 'L', 'E', 'E', 'E', 'G', 'A', 'Y', 'S', 'D', 'D', 'L', 'D', 'F', 'V', 'F', 'Y', 'D', 'V', 'L', 'G', 'D', 'V', 'V', 'C', 'G', 'G', 'F', 'A', 'M', 'P', 'I', 'R', 'E', 'N', 'K', 'A', 'Q', 'E', 'I', 'Y', 'I', 'V', 'C', 'S', 'G', 'E', 'M', 'M', 'A', 'M', 'Y', 'A', 'A', 'N', 'N', 'I', 'A', 'K', 'G', 'I', 'V', 'K', 'Y', 'A', 'H', 'S', 'G', 'S', 'V', 'R', 'L', 'G', 'G', 'L', 'I', 'C', 'N', 'S', 'R', 'K', 'T', 'D', 'R', 'E', 'D', 'E', 'L', 'I', 'M', 'A', 'L',

In [17]:
# 0-based index 108 is 1-based position 109, the first position the
# classifier tests (F/W/Y -> cluster I).
azo['001'][108]

'F'

In [13]:
# Residues at indices 100-109, for context around index 108.
azo['001'][100:110]

['R', 'G', 'V', 'I', 'T', 'A', 'I', 'N', 'F', 'L']

In [20]:
# Same residue looked up in the aligned row, 85 columns further along.
# NOTE(review): presumably 85 is the number of gap columns preceding it in
# the alignment -- confirm; it explains why raw indexing misses position 109.
print(alns['001'][108 + 85])

F


In [8]:
# Probe alignment column index 107 (1-based column 108). Note this is one
# column before index 108, which is what the classifier reads for
# position 109.
alns['001'][107]

'A'