In [1]:
import re
import typing as t

import numpy as np
import pandas as pd


In [2]:
# Component assignments: a headerless TSV, so the column names are supplied here.
assignments = pd.read_csv(
    'domains_pfam/assigned_components.tsv',
    sep='\t',
    header=None,
    names=['id', 'component'],
)
assignments.head()

Unnamed: 0,id,component
0,GCF_000006605.1,1
1,GCF_000006985.1,8
2,GCF_000007085.1,3
3,GCF_000007425.1,5
4,GCF_000007465.2,10


In [3]:
# Sample table; the first column of the TSV is used as the index.
samples = pd.read_csv(
    'samples.tsv',
    sep='\t',
    index_col=0,
)
samples.head()

Unnamed: 0_level_0,species
id,Unnamed: 1_level_1
GCF_002881935.1,Streptococcus agalactiae
GCF_000013405.1,Syntrophus aciditrophicus
GCF_000427275.1,Mannheimia haemolytica
GCF_003325015.1,Salmonella enterica
GCF_002313025.1,Vibrio cholerae


In [4]:
def extract_genus(species: str) -> t.Optional[str]:
    """
    Derive a genus label from a species name, edited for proper tree rendering.

    Rules applied, in order:
      * non-string input (e.g. NaN/None from a missing cell), an empty or
        whitespace-only string, or a name whose first character is lowercase
        yields ``None``;
      * a name containing ``[`` is flagged by prefixing the label with
        ``Invalid``;
      * a name containing ``Candidatus`` keeps its first two words, any other
        name keeps only its first word;
      * square brackets and single quotes are stripped from the label and
        spaces are replaced with underscores.

    :param species: species name
    :return: the genus label, or ``None`` when no label can be derived
    :raises AssertionError: if the name both contains ``[`` and ``Candidatus``
    """
    # Missing values would otherwise fail on indexing below.
    if not isinstance(species, str):
        return None
    tokens = species.split()
    # `not tokens` covers '' and whitespace-only names, which have no first word.
    if not tokens or species[0].islower():
        return None
    is_invalid = '[' in species
    is_candidatus = 'Candidatus' in species
    if is_invalid and is_candidatus:
        raise AssertionError(
            f'species name is both bracketed and Candidatus: {species!r}'
        )
    genus = ' '.join(tokens[:2]) if is_candidatus else tokens[0]
    # Raw string: '\[' and '\]' are invalid escapes in a normal string literal.
    genus_renamed = ('Invalid ' if is_invalid else '') + re.sub(r"[\[\]']", '', genus)
    return genus_renamed.replace(' ', '_')
    

# Map each index label of `samples` to the genus label derived from its species.
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the same pairs.
id_to_genus = dict(samples['species'].apply(extract_genus).items())

In [5]:
# Look up every id in id_to_genus; a missing key or a None value becomes NaN.
assignments['genus_renamed'] = (
    assignments['id']
    .apply(id_to_genus.get)
    .apply(lambda genus: np.nan if genus is None else genus)
)

In [6]:
# Keep only rows with no missing value in any column (dropna defaults spelled out).
assignments_complete = assignments.dropna(axis=0, how='any')
assignments_complete.head()

Unnamed: 0,id,component,genus_renamed
0,GCF_000006605.1,1,Corynebacterium
1,GCF_000006985.1,8,Chlorobium
2,GCF_000007085.1,3,Caldanaerobacter
3,GCF_000007425.1,5,Streptococcus
4,GCF_000007465.2,10,Streptococcus


In [7]:
# (rows, columns) before and after dropping rows with missing values.
(assignments.shape, assignments_complete.shape)

((1945, 3), (1939, 3))

In [8]:
# Write the filtered table as a TSV, leaving the DataFrame index out of the file.
assignments_complete.to_csv(
    'domains_pfam/assigned_components_annotated_filt.tsv',
    sep='\t',
    index=False,
)