# Generate matrix by phylotypes

Given how sparse the ASV-level counts are for running correlation analyses between gene expression and ASVs, an alternative (and possibly better) solution is to collapse the ASVs generated with DADA2 into phylotypes. A promising way to do this is to use the methods implemented by [Minot et al. (2023)](https://doi.org/10.1016/j.crmeth.2023.100639), who described the Nextflow workflow MaLiAmPi and the Python package "phylotypes".

I (RACS) installed phylotypes and its dependencies in a conda environment with Python 3.10 (a version under which all of its dependencies can be installed).

Two files will be used to generate the matrix with counts by sample and phylotypes:

 * `dada2.sv.shared.txt`, which contains the SV counts in a TSV format similar to a mothur shared file (described [here](https://github.com/jgolob/maliampi)).
 * `phylotypes_maliampi_q20_fw_1_0.txt`, the output of phylotypes (described [here](https://github.com/jgolob/phylotypes)), which maps each SV to a phylotype.

In [359]:
import pandas as pd

In [360]:
# Load the sequence-variant (SV) count table produced by MaLiAmPi (which runs DADA2).
# The file is tab-separated, which is read_table's default delimiter.
dada2_sv_shared_df = pd.read_table('dada2.sv.shared.txt')

In [361]:
# Preview the raw table: one row per sample ('label'), one 'sv-N' column per sequence variant.
dada2_sv_shared_df.head()

Unnamed: 0,label,group,numsvs,sv-1,sv-2,sv-3,sv-4,sv-5,sv-6,sv-7,...,sv-7874,sv-7875,sv-7876,sv-7877,sv-7878,sv-7879,sv-7880,sv-7881,sv-7882,sv-7883
0,SRR6665481,dada2,7883,0,3096,2122,1988,7433,1560,9158,...,0,0,0,0,0,0,0,0,0,0
1,SRR6665480,dada2,7883,0,7424,5338,4579,6608,3668,19625,...,0,0,0,0,0,0,0,0,0,0
2,SRR6665490,dada2,7883,0,9923,6263,6633,4429,4266,1344,...,0,0,0,0,0,0,0,0,0,0
3,SRR6665479,dada2,7883,0,28314,20682,18702,282,14057,13932,...,0,0,0,0,0,0,0,0,0,0
4,SRR6665489,dada2,7883,43289,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [362]:
# Rename 'label' to 'sv' ahead of indexing and transposing the table.
# NOTE(review): 'label' holds sample accessions (SRR...), not SV ids, so after the
# transpose the *columns* axis (samples) ends up named 'sv' -- consider a clearer name.
dada2_sv_shared_df = dada2_sv_shared_df.rename(columns={'label': 'sv'})

In [363]:
# 'group' and 'numsvs' are bookkeeping columns, not counts; remove them.
dada2_sv_shared_df = dada2_sv_shared_df.drop(columns=['group', 'numsvs'])

In [364]:
# Use the (renamed) label column as the row index so that only count columns remain.
dada2_sv_shared_df = dada2_sv_shared_df.set_index('sv')

In [365]:
# Flip the table so that SVs become rows and samples become columns.
dada2_sv_shared_transposed_df = dada2_sv_shared_df.T

In [366]:
# Name the row index 'sv' so it can later be used as a merge key.
dada2_sv_shared_transposed_df = dada2_sv_shared_transposed_df.rename_axis(index='sv')

In [367]:
# Show 10 randomly chosen SV rows (no random_state is set, so the selection differs between runs).
dada2_sv_shared_transposed_df.sample(10)

sv,SRR6665481,SRR6665480,SRR6665490,SRR6665479,SRR6665489,SRR6665487,SRR6665478,SRR6665486,SRR6665483,SRR6665476,...,SRR6666061,SRR6666053,SRR6666059,SRR6666062,SRR6666064,SRR6666058,SRR6666066,SRR6666065,SRR6666067,SRR6666063
sv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sv-3220,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sv-1289,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sv-3124,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sv-6053,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sv-5178,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sv-3321,0,0,0,0,0,0,0,0,0,0,...,16,0,0,0,0,0,0,0,0,0
sv-1521,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sv-2184,0,0,0,0,0,0,0,0,0,0,...,0,8,0,0,0,0,0,0,0,0
sv-7272,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sv-7849,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [368]:
# Read the file mapping SVs to phylotypes (output of the "phylotypes" package).
# It is parsed with read_csv's default comma separator.
phylotypes_maliampi_q20_fw_df = pd.read_csv('phylotypes_maliampi_q20_fw_1_0.txt')

In [369]:
# Preview the mapping: each row pairs a phylotype with an 'sv' entry.
phylotypes_maliampi_q20_fw_df.head()

Unnamed: 0,phylotype,sv
0,pt__00001,sv-3378:SRR6665621
1,pt__00001,sv-6195:SRR6665886
2,pt__00001,sv-1406:SRR6665953
3,pt__00001,sv-4698:SRR6666014
4,pt__00001,sv-580:SRR6666009


In [370]:
# Each 'sv' entry has the form '<sv id>:<sample accession>', e.g. 'sv-3378:SRR6665621'.
# Split it on the literal colon into two new columns: the SV id and the sample accession.
phylotypes_maliampi_q20_fw_df[['sv_from_str', 'sample']] = (
    phylotypes_maliampi_q20_fw_df['sv'].str.split(':', regex=False, expand=True)
)

In [371]:
# Check the split: 'sv_from_str' and 'sample' should hold the two halves of 'sv'.
phylotypes_maliampi_q20_fw_df.head()

Unnamed: 0,phylotype,sv,sv_from_str,sample
0,pt__00001,sv-3378:SRR6665621,sv-3378,SRR6665621
1,pt__00001,sv-6195:SRR6665886,sv-6195,SRR6665886
2,pt__00001,sv-1406:SRR6665953,sv-1406,SRR6665953
3,pt__00001,sv-4698:SRR6666014,sv-4698,SRR6666014
4,pt__00001,sv-580:SRR6666009,sv-580,SRR6666009


In [372]:
# Keep only the bare SV id: discard the combined 'sv' string and the sample
# accession, then let 'sv_from_str' take over the 'sv' column name.
phylotypes_maliampi_q20_fw_df = (
    phylotypes_maliampi_q20_fw_df
    .drop(columns=['sv', 'sample'])
    .rename(columns={'sv_from_str': 'sv'})
)

In [373]:
# Confirm that only 'phylotype' and the bare SV id ('sv') remain.
phylotypes_maliampi_q20_fw_df.head()

Unnamed: 0,phylotype,sv
0,pt__00001,sv-3378
1,pt__00001,sv-6195
2,pt__00001,sv-1406
3,pt__00001,sv-4698
4,pt__00001,sv-580


In [374]:
# Attach each SV's per-sample counts to the phylotype it was assigned to.
# 'sv' is a column in the mapping table and the index name of the count table;
# pandas accepts an index level name in `on`.
# validate='one_to_one' makes the merge fail loudly if an SV occurs more than
# once on either side, which would otherwise duplicate its counts and inflate
# any per-phylotype sums computed from this table.
phylotypes_counts_df = pd.merge(
    phylotypes_maliampi_q20_fw_df,
    dada2_sv_shared_transposed_df,
    on='sv',
    how='inner',
    validate='one_to_one',
)

In [375]:
# The SV id is no longer needed once every row carries its phylotype label.
phylotypes_counts_df = phylotypes_counts_df.drop(columns='sv')

In [376]:
# (rows, columns) of the merged table; 'phylotype' is still a regular column here.
phylotypes_counts_df.shape

(7883, 593)

In [377]:
# Index the rows by phylotype; several rows share a label (one row per SV).
phylotypes_counts_df = phylotypes_counts_df.set_index('phylotype')
phylotypes_counts_df.head()

Unnamed: 0_level_0,SRR6665481,SRR6665480,SRR6665490,SRR6665479,SRR6665489,SRR6665487,SRR6665478,SRR6665486,SRR6665483,SRR6665476,...,SRR6666061,SRR6666053,SRR6666059,SRR6666062,SRR6666064,SRR6666058,SRR6666066,SRR6666065,SRR6666067,SRR6666063
phylotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
pt__00001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
pt__00001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
pt__00001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
pt__00001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
pt__00001,8,26,0,0,0,0,8,0,0,0,...,0,13,0,0,0,0,0,0,0,10


In [378]:
# Collapse the SV-level rows into one row per phylotype by summing, sample by
# sample, the counts of all rows that share a phylotype label in the index.
sum_by_group = phylotypes_counts_df.groupby(level='phylotype').sum()

In [379]:
# Spot check: total count of phylotype pt__00001 in sample SRR6665481, computed
# directly from the SV-level rows, for comparison with the grouped table.
phylotypes_counts_df['SRR6665481'].loc[['pt__00001']].sum()

8122

In [380]:
# (number of distinct phylotypes, number of sample columns)
sum_by_group.shape

(828, 592)

In [381]:
# Preview the per-phylotype count matrix.
sum_by_group.head()

Unnamed: 0_level_0,SRR6665481,SRR6665480,SRR6665490,SRR6665479,SRR6665489,SRR6665487,SRR6665478,SRR6665486,SRR6665483,SRR6665476,...,SRR6666061,SRR6666053,SRR6666059,SRR6666062,SRR6666064,SRR6666058,SRR6666066,SRR6666065,SRR6666067,SRR6666063
phylotype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
pt__00001,8122,5519,1854,48309,0,0,7870,673,3924,0,...,3091,18535,0,13633,7134,6358,0,0,3501,9797
pt__00002,171679,92936,56503,40244,0,0,75404,4138,6457,0,...,4068,7881,0,19807,4134,6247,0,0,5635,34287
pt__00003,36763,68495,40287,95309,0,0,41507,1589,24237,0,...,40750,57496,0,128947,39397,55763,0,0,27144,140189
pt__00004,60,28,55,14,0,0,0,18,53,0,...,293,526,0,152,22,241,0,0,36,99
pt__00005,3069,3830,271,28520,0,0,466,45,902,0,...,53804,8043,0,3010,1464,9920,0,0,721,1955


In [382]:
def count_zeros(df, threshold=0.5):
    """Keep only the rows of ``df`` whose number of zero entries is below a cut-off.

    Despite its name, this function filters rows; it does not return counts.
    The cut-off is ``int(df.shape[1] * threshold)`` (the product is truncated
    to an integer) and is printed for reference. A row is kept only when its
    number of zeros is strictly less than that cut-off, so rows with exactly
    ``cut-off`` zeros are removed as well.

    Args:
        df: DataFrame to filter row-wise.
        threshold: Fraction of the number of columns used to derive the
            integer cut-off. Defaults to 0.5.

    Returns:
        A DataFrame with the retained rows of ``df`` and all of its columns.
    """
    threshold_int = int(df.shape[1] * threshold)
    print(f'Threshold: {threshold_int} (threshold * number of columns)')
    # Vectorised per-row zero count: same result as a row-wise apply, without
    # calling a Python lambda once per row.
    zero_counts = (df == 0).sum(axis=1)
    return df[zero_counts < threshold_int]

In [383]:
# Phylotype-level matrix: how many phylotypes have zeros in fewer than ~50% of the samples?
print(sum_by_group.shape)
print(count_zeros(sum_by_group, 0.5).shape)

(828, 592)
Threshold: 296 (threshold * number of columns)
(16, 592)


In [384]:
# Phylotype-level matrix: how many phylotypes have zeros in fewer than ~60% of the samples?
print(sum_by_group.shape)
print(count_zeros(sum_by_group, 0.6).shape)

(828, 592)
Threshold: 355 (threshold * number of columns)
(23, 592)


In [385]:
# Phylotype-level matrix: how many phylotypes have zeros in fewer than ~70% of the samples?
print(sum_by_group.shape)
print(count_zeros(sum_by_group, 0.7).shape)

(828, 592)
Threshold: 414 (threshold * number of columns)
(29, 592)


In [386]:
# Phylotype-level matrix: how many phylotypes have zeros in fewer than ~80% of the samples?
print(sum_by_group.shape)
print(count_zeros(sum_by_group, 0.8).shape)

(828, 592)
Threshold: 473 (threshold * number of columns)
(42, 592)


In [387]:
# Phylotype-level matrix: how many phylotypes have zeros in fewer than ~90% of the samples?
print(sum_by_group.shape)
print(count_zeros(sum_by_group, 0.9).shape)

(828, 592)
Threshold: 532 (threshold * number of columns)
(63, 592)


In [393]:
# Phylotype-level matrix: how many phylotypes have zeros in fewer than ~95% of the samples?
print(sum_by_group.shape)
print(count_zeros(sum_by_group, 0.95).shape)

(828, 592)
Threshold: 562 (threshold * number of columns)
(82, 592)


In [388]:
# Same sweep on the ungrouped SV-level rows, for comparison: zeros in fewer than ~50% of the samples.
print(phylotypes_counts_df.shape)
print(count_zeros(phylotypes_counts_df, 0.5).shape)

(7883, 592)
Threshold: 296 (threshold * number of columns)
(38, 592)


In [389]:
# SV-level rows: how many have zeros in fewer than ~60% of the samples?
print(phylotypes_counts_df.shape)
print(count_zeros(phylotypes_counts_df, 0.6).shape)

(7883, 592)
Threshold: 355 (threshold * number of columns)
(85, 592)


In [390]:
# SV-level rows: how many have zeros in fewer than ~70% of the samples?
print(phylotypes_counts_df.shape)
print(count_zeros(phylotypes_counts_df, 0.7).shape)

(7883, 592)
Threshold: 414 (threshold * number of columns)
(149, 592)


In [391]:
# SV-level rows: how many have zeros in fewer than ~80% of the samples?
print(phylotypes_counts_df.shape)
print(count_zeros(phylotypes_counts_df, 0.8).shape)

(7883, 592)
Threshold: 473 (threshold * number of columns)
(242, 592)


In [392]:
# SV-level rows: how many have zeros in fewer than ~90% of the samples?
print(phylotypes_counts_df.shape)
print(count_zeros(phylotypes_counts_df, 0.9).shape)

(7883, 592)
Threshold: 532 (threshold * number of columns)
(471, 592)
