In [2]:
import pandas as pd

In [3]:
# Load the tab-separated sample table, using its first column as the row index.
samples_path = 'samples.tsv'
samples = pd.read_csv(samples_path, index_col=0, sep='\t')
samples.head()

Unnamed: 0_level_0,species
id,Unnamed: 1_level_1
GCF_002881935.1,Streptococcus agalactiae
GCF_000013405.1,Syntrophus aciditrophicus
GCF_000427275.1,Mannheimia haemolytica
GCF_003325015.1,Salmonella enterica
GCF_002313025.1,Vibrio cholerae


In [4]:
def _genus_of(species_name):
    """Return the genus portion of a species name.

    Names that start with 'Candidatus' keep their first two
    whitespace-separated tokens; every other name keeps only the first token.
    """
    if species_name.startswith('Candidatus'):
        return ' '.join(species_name.split()[:2])
    return species_name.split()[0]


# Derive a 'genus' column from the 'species' column, one name at a time.
samples['genus'] = samples['species'].map(_genus_of)
samples.head()

Unnamed: 0_level_0,species,genus
id,Unnamed: 1_level_1,Unnamed: 2_level_1
GCF_002881935.1,Streptococcus agalactiae,Streptococcus
GCF_000013405.1,Syntrophus aciditrophicus,Syntrophus
GCF_000427275.1,Mannheimia haemolytica,Mannheimia
GCF_003325015.1,Salmonella enterica,Salmonella
GCF_002313025.1,Vibrio cholerae,Vibrio


In [5]:
# Make pandas read the domain counts as integers.
# The dtype mapping needs the column names up front, so read only the header
# row first (nrows=0) instead of parsing the entire table twice.
counts_path = 'domains_pfam/sample_domain_counts.tsv'
# Every column except the first one is treated as a domain-count column.
domains = pd.read_csv(counts_path, sep='\t', nrows=0).columns[1:]
structures = pd.read_csv(counts_path, sep='\t', index_col=0,
                         dtype={key: int for key in domains})

structures.head()

Unnamed: 0_level_0,PF13280.6,PF09344.10,PF08798.11,PF09485.10,PF09481.10,PF09707.10,PF09704.10,PF18395.1,PF05107.12,PF09709.10,...,PF17262.2,PF17955.1,PF18070.1,PF16813.5,PF18061.1,PF06023.12,PF17894.1,PF18516.1,PF18501.1,PF18510.1
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCF_000003925.1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCF_000005825.2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCF_000006605.1,2,1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
GCF_000006905.1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GCF_000006965.1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Look up the genus of every sample present in the domain-count table
# (label-based lookup on the shared sample index) and attach it as a column.
genus_by_sample = samples['genus'].loc[structures.index]
structures['genus'] = genus_by_sample
structures.head()

Unnamed: 0_level_0,PF13280.6,PF09344.10,PF08798.11,PF09485.10,PF09481.10,PF09707.10,PF09704.10,PF18395.1,PF05107.12,PF09709.10,...,PF17955.1,PF18070.1,PF16813.5,PF18061.1,PF06023.12,PF17894.1,PF18516.1,PF18501.1,PF18510.1,genus
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCF_000003925.1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Bacillus
GCF_000005825.2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Bacillus
GCF_000006605.1,2,1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,Corynebacterium
GCF_000006905.1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Caulobacter
GCF_000006965.1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Sinorhizobium


In [7]:
def _starts_uppercase(name):
    """Return True when the first character of `name` is an uppercase letter."""
    return name[0].isupper()


# Keep only the rows whose genus label begins with a capital letter.
has_capitalised_genus = structures['genus'].map(_starts_uppercase)
structures_filt = structures.loc[has_capitalised_genus]
structures_filt.shape

(4176, 57)

In [8]:
# Leave only the most common domain structure in each genus.
#
# A "domain structure" is the full vector of per-domain counts of a sample.
# Step 1 tallies, for each genus, how many samples share each distinct vector;
# step 2 keeps the vector with the highest tally per genus.
#
# A single groupby(...).size() replaces the nested groupby().apply() calls:
# it is faster, always yields one 'count' column regardless of how many
# genera or distinct structures the data contains, and does not depend on the
# grouping column being passed into an applied function.
domain_cols = list(domains)

structure_counts = (
    structures_filt
    .groupby(['genus'] + domain_cols)
    .size()
    .reset_index(name='count')
)

# idxmax returns the label of the first maximum within each genus, so ties are
# resolved deterministically in favour of the first structure in the sorted
# group order. The result is ordered by genus because groupby sorts its keys.
top_rows = structure_counts.groupby('genus')['count'].idxmax()

structures_subsampled = (
    structure_counts
    .loc[top_rows]
    .set_index('genus')
    [domain_cols]
)
structures_subsampled.head()

Unnamed: 0_level_0,PF13280.6,PF09344.10,PF08798.11,PF09485.10,PF09481.10,PF09707.10,PF09704.10,PF18395.1,PF05107.12,PF09709.10,...,PF17262.2,PF17955.1,PF18070.1,PF16813.5,PF18061.1,PF06023.12,PF17894.1,PF18516.1,PF18501.1,PF18510.1
genus,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Acetobacter,0,1,1,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Acetobacteraceae,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Acetobacterium,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Acetohalobium,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Acetomicrobium,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Keep only the domains that have a non-zero count in at least this many rows
# of the subsampled table.
MIN_ROWS_WITH_DOMAIN = 10

rows_with_domain = structures_subsampled.gt(0).sum(axis=0)
domains_filtered = structures_subsampled.columns[rows_with_domain >= MIN_ROWS_WITH_DOMAIN]
domains_filtered

Index(['PF13280.6', 'PF09344.10', 'PF08798.11', 'PF09485.10', 'PF09481.10',
       'PF09707.10', 'PF09704.10', 'PF18395.1', 'PF05107.12', 'PF09709.10',
       'PF01867.16', 'PF09827.9', 'PF01930.17', 'PF03787.15', 'PF01881.16',
       'PF09623.10', 'PF12469.8', 'PF09455.10', 'PF09700.10', 'PF09701.10',
       'PF09659.10', 'PF09711.10', 'PF16595.5', 'PF16592.5', 'PF01905.16',
       'PF09614.10', 'PF09611.10', 'PF09618.10', 'PF09615.10', 'PF09652.10',
       'PF03750.13', 'PF18211.1', 'PF09609.10', 'PF09617.10', 'PF09620.10',
       'PF09559.10', 'PF18320.1', 'PF10040.9', 'PF09484.10', 'PF09651.10',
       'PF17953.1', 'PF09670.10', 'PF17262.2', 'PF17955.1'],
      dtype='object')

In [24]:
# Write the subsampled table, restricted to the retained domains, as a TSV file.
subsampled_selection = structures_subsampled[domains_filtered]
subsampled_selection.to_csv('domains_pfam/sample_domain_counts_subsampled.tsv', sep='\t')