# Correlations after clustering ASVs into OTUs

Given the difficulty of running correlations on the very sparse ASV table generated with DADA2 (minimum quality 20, forward reads only), we decided to try a clustering approach that groups ASVs into OTUs. OTUs are a coarser (higher-level) grouping than ASVs, which should reduce the sparsity of the table and could potentially make the correlations work.

QIIME 2 has a nice tutorial on how to do this.

## Correlations with OTUs at 99% using SILVA (closed reference)


In [44]:
import os

# Root of the local checkout of the maize_microbiome_transcriptomics repository.
# Every input path below is derived from it, so moving to another machine means
# editing this single line (the other workstation used
# '/media/renato/SSD1TB/Repositories/maize_microbiome_transcriptomics').
REPO_DIR = '/home/rsantos/Repositories/maize_microbiome_transcriptomics'

# Tab-separated key file; its first two columns are read into
# kremling_expression_key_dict further down.
kremling_expression_key = os.path.join(REPO_DIR, 'correlations_rnaseq_metataxonomics', '0_kremling_expression_key.txt')
# Comma-separated SRA run tables for the 16S and the RNA-seq data sets.
sra_run_table_16s = os.path.join(REPO_DIR, '16S_wallace2018', 'SraRunInfo_Wallace_etal_2018.csv')
sra_run_table_rnaseq = os.path.join(REPO_DIR, 'rnaseq_kremling2018', 'run_info', 'SraRunInfo_Kremling_etal_2018.csv')

# Per-sample metadata, keyed by the sample name taken from the RNA-seq run table.
dict_wallace_kremling_2018 = {}
# Maps the first column of the key file to its second column.
kremling_expression_key_dict = {}

In [45]:
# Fill the identifier lookup from the first two tab-separated columns of the
# key file (first column -> second column).
with open(kremling_expression_key, 'r') as key_handle:
    next(key_handle, None)  # discard the header row

    for record in key_handle:
        columns = record.strip().split('\t')
        kremling_expression_key_dict[columns[0]] = columns[1]

In [46]:
import re

# Register every usable RNA-seq run in dict_wallace_kremling_2018, keyed by the
# sample name found in column 11 of the run table.
with open(sra_run_table_rnaseq, 'r') as run_table:
    next(run_table, None)  # discard the header row

    for row in run_table:
        columns = row.strip().split(',')
        sample_name = columns[11]
        name_parts = sample_name.split('_')
        sample_id, rnaseq_genotype = name_parts[1], name_parts[2]

        # The text before the first run of digits is the day period; the digits
        # themselves are the day (left as '' when the id has no digits).
        day_period = re.split(r'\d+', sample_id)[0]
        day_match = re.search(r'\d+', sample_id)
        day = int(day_match.group()) if day_match else ''

        # Only 'LMA*' samples with a known genotype are kept.
        if not sample_id.startswith('LMA') or rnaseq_genotype == '#N/A':
            continue

        dict_wallace_kremling_2018[sample_name] = {
            'run_accession_16s': '',
            'run_accession_rnaseq': columns[0],
            'day': day,
            'day_period': day_period,
            'genotype_16s': '',
            'genotype_rnaseq': rnaseq_genotype,
        }

In [47]:
# Samples whose day period is known to disagree between the two data sets.
# They are skipped (no 16S accession is attached) instead of aborting the run.
KNOWN_DAY_PERIOD_MISMATCHES = {'10343927_LMAN8_CML505_CAACAG'}

# Invert the expression key once (second column -> list of first-column ids) so
# each 16S row is matched by a dictionary lookup rather than a scan of the whole
# key. Insertion order is kept, so matches are visited in the original order.
wallace_to_kremling = {}
for kremling_id, wallace_id in kremling_expression_key_dict.items():
    wallace_to_kremling.setdefault(wallace_id, []).append(kremling_id)

rnaseq_samples_with_16s = 0

with open(sra_run_table_16s, 'r') as run_table:

    next(run_table, None)  # discard the header row

    for line in run_table:
        fields = line.strip().split(',')
        wallace_sample = fields[11]
        fields2 = wallace_sample.split('.')
        metataxonomics_run_id = fields[0]
        day = int(fields2[1])
        day_period = fields2[0]

        for key in wallace_to_kremling.get(wallace_sample, []):
            sample_info = dict_wallace_kremling_2018[key]

            # Raise instead of calling exit(): inside a notebook, exit() targets
            # the kernel session rather than reporting a failure for this cell.
            if sample_info['day'] != day:
                raise ValueError(
                    f'Day mismatch for {key} ({wallace_sample}): '
                    f'16S says {day}, RNA-seq says {sample_info["day"]}; {sample_info}'
                )

            if sample_info['day_period'] != day_period:
                if key in KNOWN_DAY_PERIOD_MISMATCHES:
                    # Known exception: report it calmly and leave the sample unpaired.
                    print(f'Skipping {key}: known day-period mismatch.')
                    continue
                raise ValueError(
                    f'Day-period mismatch for {key} ({wallace_sample}): '
                    f'16S says {day_period}, RNA-seq says {sample_info["day_period"]}; {sample_info}'
                )

            sample_info['run_accession_16s'] = metataxonomics_run_id
            rnaseq_samples_with_16s += 1

print(f'{rnaseq_samples_with_16s} sample pairs found.')

Big problem!
484 sample pairs found.


In [48]:
# Report every sample that still has an empty 16S run accession.
samples_missing_16s = {
    sample: info
    for sample, info in dict_wallace_kremling_2018.items()
    if info['run_accession_16s'] == ''
}
for sample, info in samples_missing_16s.items():
    print(sample, info)

no_16s = len(samples_missing_16s)
print(f'{no_16s} samples without 16S data.')

10343927_LMAN8_B73_CACACT {'run_accession_16s': '', 'run_accession_rnaseq': 'SRR5909633', 'day': 8, 'day_period': 'LMAN', 'genotype_16s': '', 'genotype_rnaseq': 'B73'}
10343927_LMAN8_CML505_CAACAG {'run_accession_16s': '', 'run_accession_rnaseq': 'SRR5911345', 'day': 8, 'day_period': 'LMAN', 'genotype_16s': '', 'genotype_rnaseq': 'CML505'}
2 samples without 16S data.


In [49]:
# Reverse lookup: SRA run accession (RNA-seq or 16S) -> sample name.
# Empty accessions are ignored.
run2my_sample_id = {}

for sample_name, sample_info in dict_wallace_kremling_2018.items():
    for accession_field in ('run_accession_rnaseq', 'run_accession_16s'):
        run_accession = sample_info[accession_field]
        if run_accession:
            run2my_sample_id[run_accession] = sample_name

In [50]:
import pandas as pd

# Expression data from Kremling et al. 2018 (TPM matrix on Maize v5 using Salmon
# after cleaning with cutadapt).
# Location on the other workstation:
#   /media/renato/SSD1TB/Projects/UGA_RACS/RNAseq/Salmon/Zma2_tpm_matrix.txt
kremling_tpm_path = '/media/rsantos/4TB_drive/Projects/UGA_RACS/RNAseq/Salmon/Zma2_tpm_matrix.txt'

# Read the tab-separated matrix and use the 'Name' column as the row index.
kremling_expression_v5 = pd.read_csv(kremling_tpm_path, sep='\t').set_index('Name')

# Preview the first rows.
kremling_expression_v5.head()

Unnamed: 0_level_0,SRR5909626,SRR5909627,SRR5909633,SRR5909635,SRR5909639,SRR5909642,SRR5909645,SRR5909653,SRR5909655,SRR5909665,...,SRR5912073,SRR5912081,SRR5912082,SRR5912083,SRR5912093,SRR5912094,SRR5912104,SRR5912105,SRR5912111,SRR5912116
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Zm00001eb371370_T002,1.04145,0.0,3.39106,0.0,0.0,1.82712,0.284514,2.23201,0.437147,0.468934,...,0.0,1.51042,0.0,0.0,0.0,2.82055,3.96967,0.0,2.96105,0.0
Zm00001eb371350_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371330_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371310_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371280_T001,1.2765,2.1092,0.692731,0.0,4.2798,1.47496,2.55732,0.0,1.06594,1.14953,...,3.02253,0.4114,1.17447,0.0,3.48749,9.47506,6.19189,3.80776,1.03695,1.14981


In [51]:
# Rename the columns using the dictionary
kremling_expression_v5 = kremling_expression_v5.rename(columns=run2my_sample_id)
kremling_expression_v5.columns = [str(x) for x in kremling_expression_v5.columns]

kremling_expression_v5.head()

Unnamed: 0_level_0,10343927_LMAD26_CI21E_AAGTGG,10343264_LMAN26_CI21E_ATGAAC,10343927_LMAN8_B73_CACACT,10343264_LMAN26_B64_ACCAGT,10343262_LMAN8_B109_TGCTAT,10343262_LMAN8_B14A_CTCTCG,10343262_LMAN8_B57_CCTAAG,10343927_LMAD26_B77_TAATCG,10343262_LMAN8_B79_GCAGCC,10343927_LMAN8_CI187-2_GACGAT,...,10344826_LMAN8_I29_ACGTCT,10344823_LMAD8_IA2132_ACACGC,10343264_LMAD26_CML91_AACGCC,10344827_LMAN26_CML91_AATCCG,10344827_LMAN26_Ki21_AAGACA,10343927_LMAD26_Ki21_ACGTCT,10344826_LMAD8_E2558W_CGCAAC,10343927_LMAN8_E2558W_GAACCT,10344826_LMAD8_IDS69_CAGGAC,10343927_LMAN8_IDS69_ACATTA
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Zm00001eb371370_T002,1.04145,0.0,3.39106,0.0,0.0,1.82712,0.284514,2.23201,0.437147,0.468934,...,0.0,1.51042,0.0,0.0,0.0,2.82055,3.96967,0.0,2.96105,0.0
Zm00001eb371350_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371330_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371310_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371280_T001,1.2765,2.1092,0.692731,0.0,4.2798,1.47496,2.55732,0.0,1.06594,1.14953,...,3.02253,0.4114,1.17447,0.0,3.48749,9.47506,6.19189,3.80776,1.03695,1.14981


### Importing feature table with ASV-generated OTU at 99% (SILVA)

ASVs generated from dada2 with reads quality 20, only forward reads, were clustered into OTUs at 99% using SILVA as reference (pre-processing steps are described on GitLab).

Here, I (RACS) import this table:

In [52]:
# Feature table of ASVs clustered into OTUs at 99% (closed reference).
# Location on the other workstation:
#   /media/renato/SSD1TB/Projects/UGA_RACS/16S/Qiime2/clustering/q20_forward/close_ref_99/cr-wallace2018_forward_q20-99_feature-table/cr99_q20_rv_feature-table.tsv
closedref99_table_path = '/media/rsantos/4TB_drive/Projects/UGA_RACS/16S/Qiime2/clustering/FeatureTable/cr99_q20_rv_feature-table.tsv'

# Read the tab-separated table (content after '#' is treated as a comment),
# then index the rows by the 'OTU ID' column under the name 'Name'.
wallace_asvs_q20_fw_closedref_silva = (
    pd.read_csv(closedref99_table_path, sep='\t', comment='#')
    .rename(columns={'OTU ID': 'Name'})
    .set_index('Name')
)

# Preview the first two rows.
wallace_asvs_q20_fw_closedref_silva.head(n=2)

Unnamed: 0_level_0,SRR6665476,SRR6665477,SRR6665478,SRR6665479,SRR6665480,SRR6665481,SRR6665482,SRR6665483,SRR6665484,SRR6665485,...,SRR6666058,SRR6666059,SRR6666060,SRR6666061,SRR6666062,SRR6666063,SRR6666064,SRR6666065,SRR6666066,SRR6666067
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HG328252.1.1392,47869.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,78028.0,0.0,0.0,0.0,0.0,0.0,58946.0,3868.0,0.0
AAAA02020714.1.1202,0.0,4494.0,7158.0,47516.0,12429.0,5135.0,4201.0,1389.0,3836.0,1433.0,...,30209.0,0.0,16345.0,23309.0,55388.0,2754.0,19631.0,0.0,0.0,5564.0


In [53]:
# Rename the columns using the dictionary
wallace_asvs_q20_fw_closedref_silva = wallace_asvs_q20_fw_closedref_silva.rename(columns=run2my_sample_id)
wallace_asvs_q20_fw_closedref_silva.columns = [str(x) for x in wallace_asvs_q20_fw_closedref_silva.columns]

In [54]:
# Preview the OTU table; columns without a match in run2my_sample_id still show
# their SRR accession.
wallace_asvs_q20_fw_closedref_silva.head()

Unnamed: 0_level_0,SRR6665476,10343264_LMAN26_B73_GTGTAG,10343264_LMAN26_NC262_ACAGAT,10343264_LMAN26_CML10_AGACCA,10343264_LMAN26_NC314_ACGTCT,10343264_LMAN26_B46_ACCGTG,10343264_LMAN26_B84_GTGCCA,10343264_LMAN26_B73_ACTCTT,10343264_LMAN26_B77_GTAGAA,10344826_LMAN8_F7_GGCTGC,...,SRR6666058,SRR6666059,SRR6666060,SRR6666061,10344827_LMAN26_I137TN_ACATTA,10343264_LMAN26_CI64_ATATCC,10343927_LMAD26_CML154Q_ACAGAT,10343927_LMAD26_T234_GTCAGG,10343927_LMAD26_NC344_AATGAA,10343927_LMAD26_K64_CCTGCT
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HG328252.1.1392,47869.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,78028.0,0.0,0.0,0.0,0.0,0.0,58946.0,3868.0,0.0
AAAA02020714.1.1202,0.0,4494.0,7158.0,47516.0,12429.0,5135.0,4201.0,1389.0,3836.0,1433.0,...,30209.0,0.0,16345.0,23309.0,55388.0,2754.0,19631.0,0.0,0.0,5564.0
KF101313.1.1306,0.0,1000.0,6839.0,478.0,11097.0,12190.0,11492.0,12074.0,19987.0,2239.0,...,1253.0,0.0,22029.0,367.0,14655.0,75048.0,2044.0,0.0,0.0,1542.0
FN421747.1.1368,0.0,2086.0,7930.0,13604.0,19280.0,8901.0,1051.0,667.0,752.0,14335.0,...,237.0,0.0,14597.0,4431.0,12959.0,478.0,9602.0,0.0,0.0,227.0
MH337967.1.1252,0.0,1351.0,5537.0,10010.0,13543.0,6101.0,766.0,407.0,446.0,9845.0,...,152.0,0.0,8222.0,2908.0,10874.0,293.0,6294.0,0.0,0.0,193.0


In [55]:
# Keep only the expression columns whose label also appears among the columns
# of the closed-reference OTU table.
kremling_expression_v5_closedref = kremling_expression_v5.filter(items=wallace_asvs_q20_fw_closedref_silva.columns)

In [56]:
# Conversely, restrict the OTU table to the columns retained in the expression
# table, so both tables cover the same samples.
wallace_asvs_q20_fw_closedref_silva = wallace_asvs_q20_fw_closedref_silva.filter(items=kremling_expression_v5_closedref.columns)

In [57]:
# Report the dimensions of the filtered expression table, then preview it.
print(kremling_expression_v5_closedref.shape)
kremling_expression_v5_closedref.head()

(39096, 482)


Unnamed: 0_level_0,10343264_LMAN26_B73_GTGTAG,10343264_LMAN26_NC262_ACAGAT,10343264_LMAN26_CML10_AGACCA,10343264_LMAN26_NC314_ACGTCT,10343264_LMAN26_B46_ACCGTG,10343264_LMAN26_B84_GTGCCA,10343264_LMAN26_B73_ACTCTT,10343264_LMAN26_B77_GTAGAA,10344826_LMAN8_F7_GGCTGC,10344826_LMAN8_ND246_CGTCGC,...,10344826_LMAD8_NC358_GCAGCC,10344826_LMAD8_NC294_CGATCT,10344826_LMAD8_K55_AAGACA,10344827_LMAN26_B73_GAACCT,10344827_LMAN26_I137TN_ACATTA,10343264_LMAN26_CI64_ATATCC,10343927_LMAD26_CML154Q_ACAGAT,10343927_LMAD26_T234_GTCAGG,10343927_LMAD26_NC344_AATGAA,10343927_LMAD26_K64_CCTGCT
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Zm00001eb371370_T002,1.34982,0.0,0.0,0.0,0.0,0.359664,0.541724,0.0,0.0,0.0,...,2.11575,1.24155,1.24528,0.0,0.0,0.0,0.808996,0.74369,3.4292,0.0
Zm00001eb371350_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371330_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371310_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371280_T001,0.441188,0.0,0.215403,0.0,0.817563,2.35113,0.885315,0.517418,1.56597,0.0,...,2.99848,2.70535,2.00021,2.23636,0.0,2.35318,2.97472,0.911501,1.52984,4.15375


In [58]:
# Report the dimensions of the filtered OTU table, then preview it.
print(wallace_asvs_q20_fw_closedref_silva.shape)
wallace_asvs_q20_fw_closedref_silva.head()

(1896, 482)


Unnamed: 0_level_0,10343264_LMAN26_B73_GTGTAG,10343264_LMAN26_NC262_ACAGAT,10343264_LMAN26_CML10_AGACCA,10343264_LMAN26_NC314_ACGTCT,10343264_LMAN26_B46_ACCGTG,10343264_LMAN26_B84_GTGCCA,10343264_LMAN26_B73_ACTCTT,10343264_LMAN26_B77_GTAGAA,10344826_LMAN8_F7_GGCTGC,10344826_LMAN8_ND246_CGTCGC,...,10344826_LMAD8_NC358_GCAGCC,10344826_LMAD8_NC294_CGATCT,10344826_LMAD8_K55_AAGACA,10344827_LMAN26_B73_GAACCT,10344827_LMAN26_I137TN_ACATTA,10343264_LMAN26_CI64_ATATCC,10343927_LMAD26_CML154Q_ACAGAT,10343927_LMAD26_T234_GTCAGG,10343927_LMAD26_NC344_AATGAA,10343927_LMAD26_K64_CCTGCT
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HG328252.1.1392,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,156104.0,91111.0,90631.0,0.0,0.0,0.0,0.0,58946.0,3868.0,0.0
AAAA02020714.1.1202,4494.0,7158.0,47516.0,12429.0,5135.0,4201.0,1389.0,3836.0,1433.0,229.0,...,0.0,0.0,0.0,731.0,55388.0,2754.0,19631.0,0.0,0.0,5564.0
KF101313.1.1306,1000.0,6839.0,478.0,11097.0,12190.0,11492.0,12074.0,19987.0,2239.0,519.0,...,0.0,0.0,0.0,2957.0,14655.0,75048.0,2044.0,0.0,0.0,1542.0
FN421747.1.1368,2086.0,7930.0,13604.0,19280.0,8901.0,1051.0,667.0,752.0,14335.0,285.0,...,0.0,0.0,0.0,2362.0,12959.0,478.0,9602.0,0.0,0.0,227.0
MH337967.1.1252,1351.0,5537.0,10010.0,13543.0,6101.0,766.0,407.0,446.0,9845.0,200.0,...,0.0,0.0,0.0,1775.0,10874.0,293.0,6294.0,0.0,0.0,193.0


In [59]:
# Verify that both tables list the same samples in the same order (presumably
# required by the downstream correlations -- confirm against the later cells).
# Index.equals compares the labels themselves; `.columns.all() == .columns.all()`
# only compares two truthiness flags and therefore passes for any pair of
# non-empty label sets.
if wallace_asvs_q20_fw_closedref_silva.columns.equals(kremling_expression_v5_closedref.columns):
    print('Columns are equal!')
else:
    raise ValueError('Column labels of the closed-reference OTU table and the expression table differ.')

Columns are equal!


In [60]:
def count_zeros(df, threshold=0.5):
    """Drop the rows of ``df`` that contain too many zeros.

    Despite its name, the function filters rather than counts: the name is
    kept so existing calls keep working.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose rows are to be filtered.
    threshold : float, default 0.5
        Fraction of the number of columns. It is converted to an absolute
        count with ``int(df.shape[1] * threshold)`` (truncated towards zero).

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose number of zero entries is strictly below the
        absolute threshold; rows with zeros at or above it are removed.
    """
    threshold_int = int(df.shape[1] * threshold)
    print(f'Threshold: {threshold_int} (threshold * number of columns)')
    # Vectorised count of zeros per row; avoids calling a Python lambda once
    # for every row of the table.
    zero_counts = (df == 0).sum(axis=1)
    return df[zero_counts < threshold_int]

In [61]:
# Apply the zero filter with its default threshold (0.5) to both tables: only
# rows with fewer zeros than half the number of columns are kept.
wallace_asvs_q20_fw_closedref_silva_zeros_filtered = count_zeros(wallace_asvs_q20_fw_closedref_silva)
kremling_expression_v5_zeros_filtered = count_zeros(kremling_expression_v5_closedref)

Threshold: 241 (threshold * number of columns)
Threshold: 241 (threshold * number of columns)


In [62]:
# Dimensions of both tables after the zero filter (rows, columns).
print(wallace_asvs_q20_fw_closedref_silva_zeros_filtered.shape)
print(kremling_expression_v5_zeros_filtered.shape)

(7, 482)
(19953, 482)


In [63]:
# Display the OTUs that passed the zero filter.
wallace_asvs_q20_fw_closedref_silva_zeros_filtered

Unnamed: 0_level_0,10343264_LMAN26_B73_GTGTAG,10343264_LMAN26_NC262_ACAGAT,10343264_LMAN26_CML10_AGACCA,10343264_LMAN26_NC314_ACGTCT,10343264_LMAN26_B46_ACCGTG,10343264_LMAN26_B84_GTGCCA,10343264_LMAN26_B73_ACTCTT,10343264_LMAN26_B77_GTAGAA,10344826_LMAN8_F7_GGCTGC,10344826_LMAN8_ND246_CGTCGC,...,10344826_LMAD8_NC358_GCAGCC,10344826_LMAD8_NC294_CGATCT,10344826_LMAD8_K55_AAGACA,10344827_LMAN26_B73_GAACCT,10344827_LMAN26_I137TN_ACATTA,10343264_LMAN26_CI64_ATATCC,10343927_LMAD26_CML154Q_ACAGAT,10343927_LMAD26_T234_GTCAGG,10343927_LMAD26_NC344_AATGAA,10343927_LMAD26_K64_CCTGCT
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAAA02020714.1.1202,4494.0,7158.0,47516.0,12429.0,5135.0,4201.0,1389.0,3836.0,1433.0,229.0,...,0.0,0.0,0.0,731.0,55388.0,2754.0,19631.0,0.0,0.0,5564.0
KF101313.1.1306,1000.0,6839.0,478.0,11097.0,12190.0,11492.0,12074.0,19987.0,2239.0,519.0,...,0.0,0.0,0.0,2957.0,14655.0,75048.0,2044.0,0.0,0.0,1542.0
FN421747.1.1368,2086.0,7930.0,13604.0,19280.0,8901.0,1051.0,667.0,752.0,14335.0,285.0,...,0.0,0.0,0.0,2362.0,12959.0,478.0,9602.0,0.0,0.0,227.0
MH337967.1.1252,1351.0,5537.0,10010.0,13543.0,6101.0,766.0,407.0,446.0,9845.0,200.0,...,0.0,0.0,0.0,1775.0,10874.0,293.0,6294.0,0.0,0.0,193.0
JF177515.1.1306,636.0,7046.0,652.0,7960.0,85873.0,901.0,652.0,5983.0,7041.0,227.0,...,0.0,0.0,0.0,197.0,3584.0,4680.0,190.0,0.0,0.0,303.0
JQ904048.1.1214,286.0,26752.0,288.0,31585.0,7985.0,1031.0,424.0,6752.0,12079.0,534.0,...,0.0,0.0,0.0,308.0,3406.0,961.0,118.0,0.0,0.0,619.0
HM487994.1.1230,184.0,10256.0,152.0,8088.0,6904.0,903.0,123.0,452.0,2735.0,0.0,...,0.0,0.0,0.0,526.0,1881.0,565.0,1773.0,0.0,0.0,185.0


Taxonomy:

| ID | Taxonomy info |
|----|---------------|
| AAAA02020714.1.1202 | Bacteria;Proteobacteria;Alphaproteobacteria;Sphingomonadales;Sphingomonadaceae;Sphingomonas;Oryza sativa Indica Group (long-grained rice) |
| KF101313.1.1306 | Bacteria;Proteobacteria;Alphaproteobacteria;Sphingomonadales;Sphingomonadaceae;Sphingomonas;uncultured bacterium |
| FN421747.1.1368 | Bacteria;Proteobacteria;Alphaproteobacteria;Sphingomonadales;Sphingomonadaceae;Sphingomonas;uncultured bacterium |
| MH337967.1.1252 | Bacteria;Proteobacteria;Alphaproteobacteria;Sphingomonadales;Sphingomonadaceae;Novosphingobium;Novosphingobium sp. |
| JF177515.1.1306 | Bacteria;Proteobacteria;Alphaproteobacteria;Rhizobiales;Beijerinckiaceae;Methylobacterium-Methylorubrum;uncultured bacterium |
| JQ904048.1.1214 | Bacteria;Proteobacteria;Alphaproteobacteria;Rhizobiales;Beijerinckiaceae;Methylobacterium-Methylorubrum;Methylorubrum rhodesianum |
| HM487994.1.1230 | Bacteria;Bacteroidota;Bacteroidia;Cytophagales;Hymenobacteraceae;Hymenobacter;uncultured bacterium |


## Correlations with OTUs at 97% (de novo clustering)

ASVs generated from dada2 with reads quality 20, only forward reads, were clustered into OTUs at 97% using a de novo method (pre-processing steps are described on GitLab).

Here, I (RACS) import this table:

In [64]:
# Feature table of ASVs clustered de novo into OTUs at 97%.
denovo97_table_path = '/media/rsantos/4TB_drive/Projects/UGA_RACS/16S/Qiime2/clustering/FeatureTable/denovo_97_q20_rv_feature-table.tsv'

# Read the tab-separated table (content after '#' is treated as a comment),
# then index the rows by the 'OTU ID' column under the name 'Name'.
wallace_asvs_q20_fw_denovo97_silva = (
    pd.read_csv(denovo97_table_path, sep='\t', comment='#')
    .rename(columns={'OTU ID': 'Name'})
    .set_index('Name')
)

# Report the dimensions and preview the first two rows.
print(wallace_asvs_q20_fw_denovo97_silva.shape)
wallace_asvs_q20_fw_denovo97_silva.head(n=2)

(1360, 592)


Unnamed: 0_level_0,SRR6665476,SRR6665477,SRR6665478,SRR6665479,SRR6665480,SRR6665481,SRR6665482,SRR6665483,SRR6665484,SRR6665485,...,SRR6666058,SRR6666059,SRR6666060,SRR6666061,SRR6666062,SRR6666063,SRR6666064,SRR6666065,SRR6666066,SRR6666067
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bc664ea528899e36452dd37c1f55a48a,47869.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,78028.0,0.0,0.0,0.0,0.0,0.0,58946.0,3868.0,0.0
232ad9e267688a5d573112b4855bac96,0.0,13538.0,55068.0,103357.0,94445.0,48809.0,26670.0,20909.0,41009.0,41676.0,...,45464.0,0.0,86537.0,42962.0,133335.0,114052.0,53169.0,0.0,0.0,18971.0


In [65]:
# Rename the columns using the dictionary
wallace_asvs_q20_fw_denovo97_silva = wallace_asvs_q20_fw_denovo97_silva.rename(columns=run2my_sample_id)
wallace_asvs_q20_fw_denovo97_silva.columns = [str(x) for x in wallace_asvs_q20_fw_denovo97_silva.columns]

In [66]:
# Preview the OTU table; columns without a match in run2my_sample_id still show
# their SRR accession.
wallace_asvs_q20_fw_denovo97_silva.head()

Unnamed: 0_level_0,SRR6665476,10343264_LMAN26_B73_GTGTAG,10343264_LMAN26_NC262_ACAGAT,10343264_LMAN26_CML10_AGACCA,10343264_LMAN26_NC314_ACGTCT,10343264_LMAN26_B46_ACCGTG,10343264_LMAN26_B84_GTGCCA,10343264_LMAN26_B73_ACTCTT,10343264_LMAN26_B77_GTAGAA,10344826_LMAN8_F7_GGCTGC,...,SRR6666058,SRR6666059,SRR6666060,SRR6666061,10344827_LMAN26_I137TN_ACATTA,10343264_LMAN26_CI64_ATATCC,10343927_LMAD26_CML154Q_ACAGAT,10343927_LMAD26_T234_GTCAGG,10343927_LMAD26_NC344_AATGAA,10343927_LMAD26_K64_CCTGCT
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bc664ea528899e36452dd37c1f55a48a,47869.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,78028.0,0.0,0.0,0.0,0.0,0.0,58946.0,3868.0,0.0
232ad9e267688a5d573112b4855bac96,0.0,13538.0,55068.0,103357.0,94445.0,48809.0,26670.0,20909.0,41009.0,41676.0,...,45464.0,0.0,86537.0,42962.0,133335.0,114052.0,53169.0,0.0,0.0,18971.0
d8fac1aa74436b8041e29a3237da7955,0.0,1333.0,11446.0,3295.0,22949.0,120770.0,2393.0,1295.0,9106.0,32376.0,...,205.0,0.0,1001.0,141.0,5172.0,8277.0,190.0,0.0,0.0,407.0
d7386797b5be1ee26245561ee11ce7d4,0.0,392.0,37685.0,399.0,44248.0,11092.0,1414.0,609.0,9471.0,17001.0,...,97.0,0.0,1635.0,49.0,4849.0,1349.0,118.0,0.0,0.0,875.0
225bc1cb152b786927ca748b98403d94,0.0,364.0,17363.0,298.0,15272.0,10443.0,1456.0,277.0,849.0,4929.0,...,66.0,0.0,147.0,98.0,3222.0,2003.0,2516.0,0.0,0.0,376.0


In [67]:
# Keep only the expression columns whose label also appears among the columns
# of the de novo 97% OTU table.
kremling_expression_v5_denovo97 = kremling_expression_v5.filter(items=wallace_asvs_q20_fw_denovo97_silva.columns)

In [68]:
# Conversely, restrict the OTU table to the columns retained in the expression
# table, so both tables cover the same samples.
wallace_asvs_q20_fw_denovo97_silva = wallace_asvs_q20_fw_denovo97_silva.filter(items=kremling_expression_v5_denovo97.columns)

In [69]:
# Report the dimensions of the filtered expression table, then preview it.
print(kremling_expression_v5_denovo97.shape)
kremling_expression_v5_denovo97.head()

(39096, 482)


Unnamed: 0_level_0,10343264_LMAN26_B73_GTGTAG,10343264_LMAN26_NC262_ACAGAT,10343264_LMAN26_CML10_AGACCA,10343264_LMAN26_NC314_ACGTCT,10343264_LMAN26_B46_ACCGTG,10343264_LMAN26_B84_GTGCCA,10343264_LMAN26_B73_ACTCTT,10343264_LMAN26_B77_GTAGAA,10344826_LMAN8_F7_GGCTGC,10344826_LMAN8_ND246_CGTCGC,...,10344826_LMAD8_NC358_GCAGCC,10344826_LMAD8_NC294_CGATCT,10344826_LMAD8_K55_AAGACA,10344827_LMAN26_B73_GAACCT,10344827_LMAN26_I137TN_ACATTA,10343264_LMAN26_CI64_ATATCC,10343927_LMAD26_CML154Q_ACAGAT,10343927_LMAD26_T234_GTCAGG,10343927_LMAD26_NC344_AATGAA,10343927_LMAD26_K64_CCTGCT
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Zm00001eb371370_T002,1.34982,0.0,0.0,0.0,0.0,0.359664,0.541724,0.0,0.0,0.0,...,2.11575,1.24155,1.24528,0.0,0.0,0.0,0.808996,0.74369,3.4292,0.0
Zm00001eb371350_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371330_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371310_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371280_T001,0.441188,0.0,0.215403,0.0,0.817563,2.35113,0.885315,0.517418,1.56597,0.0,...,2.99848,2.70535,2.00021,2.23636,0.0,2.35318,2.97472,0.911501,1.52984,4.15375


In [70]:
# Report the dimensions of the filtered OTU table, then preview it.
print(wallace_asvs_q20_fw_denovo97_silva.shape)
wallace_asvs_q20_fw_denovo97_silva.head()

(1360, 482)


Unnamed: 0_level_0,10343264_LMAN26_B73_GTGTAG,10343264_LMAN26_NC262_ACAGAT,10343264_LMAN26_CML10_AGACCA,10343264_LMAN26_NC314_ACGTCT,10343264_LMAN26_B46_ACCGTG,10343264_LMAN26_B84_GTGCCA,10343264_LMAN26_B73_ACTCTT,10343264_LMAN26_B77_GTAGAA,10344826_LMAN8_F7_GGCTGC,10344826_LMAN8_ND246_CGTCGC,...,10344826_LMAD8_NC358_GCAGCC,10344826_LMAD8_NC294_CGATCT,10344826_LMAD8_K55_AAGACA,10344827_LMAN26_B73_GAACCT,10344827_LMAN26_I137TN_ACATTA,10343264_LMAN26_CI64_ATATCC,10343927_LMAD26_CML154Q_ACAGAT,10343927_LMAD26_T234_GTCAGG,10343927_LMAD26_NC344_AATGAA,10343927_LMAD26_K64_CCTGCT
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bc664ea528899e36452dd37c1f55a48a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,156104.0,91111.0,90631.0,0.0,0.0,0.0,0.0,58946.0,3868.0,0.0
232ad9e267688a5d573112b4855bac96,13538.0,55068.0,103357.0,94445.0,48809.0,26670.0,20909.0,41009.0,41676.0,1858.0,...,0.0,0.0,0.0,12464.0,133335.0,114052.0,53169.0,0.0,0.0,18971.0
d8fac1aa74436b8041e29a3237da7955,1333.0,11446.0,3295.0,22949.0,120770.0,2393.0,1295.0,9106.0,32376.0,1754.0,...,0.0,0.0,0.0,1037.0,5172.0,8277.0,190.0,0.0,0.0,407.0
d7386797b5be1ee26245561ee11ce7d4,392.0,37685.0,399.0,44248.0,11092.0,1414.0,609.0,9471.0,17001.0,852.0,...,0.0,0.0,0.0,526.0,4849.0,1349.0,118.0,0.0,0.0,875.0
225bc1cb152b786927ca748b98403d94,364.0,17363.0,298.0,15272.0,10443.0,1456.0,277.0,849.0,4929.0,32.0,...,0.0,0.0,0.0,824.0,3222.0,2003.0,2516.0,0.0,0.0,376.0


In [71]:
# Verify that both tables list the same samples in the same order (presumably
# required by the downstream correlations -- confirm against the later cells).
# Index.equals compares the labels themselves; `.columns.all() == .columns.all()`
# only compares two truthiness flags and therefore passes for any pair of
# non-empty label sets.
if wallace_asvs_q20_fw_denovo97_silva.columns.equals(kremling_expression_v5_denovo97.columns):
    print('Columns are equal!')
else:
    raise ValueError('Column labels of the de novo 97% OTU table and the expression table differ.')

Columns are equal!


In [72]:
# Apply the zero filter with its default threshold (0.5) to both tables: only
# rows with fewer zeros than half the number of columns are kept.
wallace_asvs_q20_fw_denovo97_silva_zeros_filtered = count_zeros(wallace_asvs_q20_fw_denovo97_silva)
kremling_expression_v5_denovo97_zeros_filtered = count_zeros(kremling_expression_v5_denovo97)

Threshold: 241 (threshold * number of columns)
Threshold: 241 (threshold * number of columns)


In [73]:
# Dimensions of both tables after the zero filter (rows, columns).
print(wallace_asvs_q20_fw_denovo97_silva_zeros_filtered.shape)
print(kremling_expression_v5_denovo97_zeros_filtered.shape)

(12, 482)
(19953, 482)


In [88]:
# List the identifiers of the OTUs that passed the zero filter.
wallace_asvs_q20_fw_denovo97_silva_zeros_filtered.index

Index(['232ad9e267688a5d573112b4855bac96', 'd8fac1aa74436b8041e29a3237da7955',
       'd7386797b5be1ee26245561ee11ce7d4', '225bc1cb152b786927ca748b98403d94',
       '876139714249d7a0203594416545eda6', 'a6c946adedd20c184243c5a9580120c7',
       '9513fb99947860ebff7ca131f8cee160', 'dc8c3ff51b0e92f1f7a0c879437a6185',
       '015c66180ecdb90b731d72f74a541767', 'd256dc8e2d15b37c97e829c7f6fe5d23',
       'acca42a5ffae1505efc844b610370e80', 'b1bda8871b67d26c1b0c1ad6d5c57c56'],
      dtype='object', name='Name')

Taxonomy:

| ID | Taxonomy info |
|----|---------------|
| 232ad9e267688a5d573112b4855bac96 | d__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Sphingomonadales; f__Sphingomonadaceae; g__Sphingomonas |
| d8fac1aa74436b8041e29a3237da7955 | d__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhizobiales; f__Beijerinckiaceae; g__Methylobacterium-Methylorubrum |
| d7386797b5be1ee26245561ee11ce7d4 | d__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhizobiales; f__Beijerinckiaceae; g__Methylobacterium-Methylorubrum |
| 225bc1cb152b786927ca748b98403d94 | d__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Cytophagales; f__Hymenobacteraceae; g__Hymenobacter |
| 876139714249d7a0203594416545eda6 | d__Bacteria; p__Actinobacteriota; c__Actinobacteria; o__Micrococcales; f__Microbacteriaceae; g__Microbacterium |
| a6c946adedd20c184243c5a9580120c7 | d__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacterales |
| 9513fb99947860ebff7ca131f8cee160 | d__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Flavobacteriales; f__Weeksellaceae; g__Chryseobacterium |
| dc8c3ff51b0e92f1f7a0c879437a6185 | d__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rickettsiales; f__Mitochondria; g__Mitochondria |
| 015c66180ecdb90b731d72f74a541767 | d__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhizobiales; f__Rhizobiaceae; g__Allorhizobium-Neorhizobium-Pararhizobium-Rhizobium |
| d256dc8e2d15b37c97e829c7f6fe5d23 | d__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Pseudomonadales; f__Moraxellaceae; g__Acinetobacter |
| acca42a5ffae1505efc844b610370e80 | d__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Pseudomonadales; f__Pseudomonadaceae; g__Pseudomonas |
| b1bda8871b67d26c1b0c1ad6d5c57c56 | d__Bacteria; p__Actinobacteriota; c__Actinobacteria; o__Propionibacteriales; f__Nocardioidaceae; g__Nocardioides |

## Correlations with OTUs at 85% (de novo clustering)

In [75]:
# Feature table of ASVs clustered de novo into OTUs at 85%.
denovo85_table_path = '/media/rsantos/4TB_drive/Projects/UGA_RACS/16S/Qiime2/clustering/FeatureTable/denovo_85_q20_rv_feature-table.tsv'

# Read the tab-separated table (content after '#' is treated as a comment),
# then index the rows by the 'OTU ID' column under the name 'Name'.
wallace_asvs_q20_fw_denovo85_silva = (
    pd.read_csv(denovo85_table_path, sep='\t', comment='#')
    .rename(columns={'OTU ID': 'Name'})
    .set_index('Name')
)

# Report the dimensions and preview the first two rows.
print(wallace_asvs_q20_fw_denovo85_silva.shape)
wallace_asvs_q20_fw_denovo85_silva.head(n=2)

(278, 592)


Unnamed: 0_level_0,SRR6665476,SRR6665477,SRR6665478,SRR6665479,SRR6665480,SRR6665481,SRR6665482,SRR6665483,SRR6665484,SRR6665485,...,SRR6666058,SRR6666059,SRR6666060,SRR6666061,SRR6666062,SRR6666063,SRR6666064,SRR6666065,SRR6666066,SRR6666067
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bc664ea528899e36452dd37c1f55a48a,47869.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,78028.0,0.0,0.0,0.0,0.0,0.0,58946.0,3868.0,0.0
232ad9e267688a5d573112b4855bac96,0.0,17150.0,113657.0,181791.0,166400.0,185588.0,32209.0,35833.0,64406.0,93505.0,...,59955.0,0.0,102025.0,53111.0,152497.0,195690.0,59885.0,0.0,0.0,98880.0


In [76]:
# Translate the column labels through the run2my_sample_id mapping; labels with
# no entry in the mapping are left unchanged (some SRR accessions remain in the output).
wallace_asvs_q20_fw_denovo85_silva = wallace_asvs_q20_fw_denovo85_silva.rename(
    columns=run2my_sample_id
)
# Force every column label to a plain string
wallace_asvs_q20_fw_denovo85_silva.columns = list(
    map(str, wallace_asvs_q20_fw_denovo85_silva.columns)
)

In [77]:
# Preview the OTU table after its columns were relabelled
wallace_asvs_q20_fw_denovo85_silva.head()

Unnamed: 0_level_0,SRR6665476,10343264_LMAN26_B73_GTGTAG,10343264_LMAN26_NC262_ACAGAT,10343264_LMAN26_CML10_AGACCA,10343264_LMAN26_NC314_ACGTCT,10343264_LMAN26_B46_ACCGTG,10343264_LMAN26_B84_GTGCCA,10343264_LMAN26_B73_ACTCTT,10343264_LMAN26_B77_GTAGAA,10344826_LMAN8_F7_GGCTGC,...,SRR6666058,SRR6666059,SRR6666060,SRR6666061,10344827_LMAN26_I137TN_ACATTA,10343264_LMAN26_CI64_ATATCC,10343927_LMAD26_CML154Q_ACAGAT,10343927_LMAD26_T234_GTCAGG,10343927_LMAD26_NC344_AATGAA,10343927_LMAD26_K64_CCTGCT
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bc664ea528899e36452dd37c1f55a48a,47869.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,78028.0,0.0,0.0,0.0,0.0,0.0,58946.0,3868.0,0.0
232ad9e267688a5d573112b4855bac96,0.0,17150.0,113657.0,181791.0,166400.0,185588.0,32209.0,35833.0,64406.0,93505.0,...,59955.0,0.0,102025.0,53111.0,152497.0,195690.0,59885.0,0.0,0.0,98880.0
225bc1cb152b786927ca748b98403d94,0.0,445.0,17747.0,1673.0,15666.0,11841.0,1522.0,720.0,1104.0,5969.0,...,2458.0,0.0,1059.0,598.0,11127.0,8230.0,2751.0,0.0,0.0,608.0
78bf7c4729be138cb22ea5f917366900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1e79e797df27c1c15d67400dde4286e2,934.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,806.0,0.0,0.0,0.0,0.0,0.0,332.0,164.0,0.0


In [78]:
# Restrict the expression matrix to the columns whose labels also appear in the OTU table
kremling_expression_v5_denovo85 = kremling_expression_v5.filter(items=wallace_asvs_q20_fw_denovo85_silva.columns)

In [79]:
# Conversely, keep only the OTU-table columns that are present in the expression subset,
# so both tables end up with the same set of columns
wallace_asvs_q20_fw_denovo85_silva = wallace_asvs_q20_fw_denovo85_silva.filter(items=kremling_expression_v5_denovo85.columns)

In [80]:
# Dimensions and first rows of the expression matrix after column filtering
print(kremling_expression_v5_denovo85.shape)
kremling_expression_v5_denovo85.head()

(39096, 482)


Unnamed: 0_level_0,10343264_LMAN26_B73_GTGTAG,10343264_LMAN26_NC262_ACAGAT,10343264_LMAN26_CML10_AGACCA,10343264_LMAN26_NC314_ACGTCT,10343264_LMAN26_B46_ACCGTG,10343264_LMAN26_B84_GTGCCA,10343264_LMAN26_B73_ACTCTT,10343264_LMAN26_B77_GTAGAA,10344826_LMAN8_F7_GGCTGC,10344826_LMAN8_ND246_CGTCGC,...,10344826_LMAD8_NC358_GCAGCC,10344826_LMAD8_NC294_CGATCT,10344826_LMAD8_K55_AAGACA,10344827_LMAN26_B73_GAACCT,10344827_LMAN26_I137TN_ACATTA,10343264_LMAN26_CI64_ATATCC,10343927_LMAD26_CML154Q_ACAGAT,10343927_LMAD26_T234_GTCAGG,10343927_LMAD26_NC344_AATGAA,10343927_LMAD26_K64_CCTGCT
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Zm00001eb371370_T002,1.34982,0.0,0.0,0.0,0.0,0.359664,0.541724,0.0,0.0,0.0,...,2.11575,1.24155,1.24528,0.0,0.0,0.0,0.808996,0.74369,3.4292,0.0
Zm00001eb371350_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371330_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371310_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371280_T001,0.441188,0.0,0.215403,0.0,0.817563,2.35113,0.885315,0.517418,1.56597,0.0,...,2.99848,2.70535,2.00021,2.23636,0.0,2.35318,2.97472,0.911501,1.52984,4.15375


In [81]:
# Dimensions and first rows of the OTU table after column filtering
print(wallace_asvs_q20_fw_denovo85_silva.shape)
wallace_asvs_q20_fw_denovo85_silva.head()

(278, 482)


Unnamed: 0_level_0,10343264_LMAN26_B73_GTGTAG,10343264_LMAN26_NC262_ACAGAT,10343264_LMAN26_CML10_AGACCA,10343264_LMAN26_NC314_ACGTCT,10343264_LMAN26_B46_ACCGTG,10343264_LMAN26_B84_GTGCCA,10343264_LMAN26_B73_ACTCTT,10343264_LMAN26_B77_GTAGAA,10344826_LMAN8_F7_GGCTGC,10344826_LMAN8_ND246_CGTCGC,...,10344826_LMAD8_NC358_GCAGCC,10344826_LMAD8_NC294_CGATCT,10344826_LMAD8_K55_AAGACA,10344827_LMAN26_B73_GAACCT,10344827_LMAN26_I137TN_ACATTA,10343264_LMAN26_CI64_ATATCC,10343927_LMAD26_CML154Q_ACAGAT,10343927_LMAD26_T234_GTCAGG,10343927_LMAD26_NC344_AATGAA,10343927_LMAD26_K64_CCTGCT
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bc664ea528899e36452dd37c1f55a48a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,156104.0,91111.0,90631.0,0.0,0.0,0.0,0.0,58946.0,3868.0,0.0
232ad9e267688a5d573112b4855bac96,17150.0,113657.0,181791.0,166400.0,185588.0,32209.0,35833.0,64406.0,93505.0,5310.0,...,0.0,0.0,0.0,17633.0,152497.0,195690.0,59885.0,0.0,0.0,98880.0
225bc1cb152b786927ca748b98403d94,445.0,17747.0,1673.0,15666.0,11841.0,1522.0,720.0,1104.0,5969.0,213.0,...,0.0,0.0,0.0,941.0,11127.0,8230.0,2751.0,0.0,0.0,608.0
78bf7c4729be138cb22ea5f917366900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,64089.0,23578.0,4062.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1e79e797df27c1c15d67400dde4286e2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,874.0,0.0,0.0,0.0,0.0,0.0,332.0,164.0,0.0


In [82]:
# Sanity check: both tables must carry the same column labels in the same order
# before they are compared sample by sample.
# Index.equals compares the labels themselves. The earlier `.all() == .all()` form
# only compared two truthiness scalars, so it succeeded for any pair of tables.
if wallace_asvs_q20_fw_denovo85_silva.columns.equals(kremling_expression_v5_denovo85.columns):
    print('Columns are equal!')
else:
    # Make a mismatch visible instead of silently printing nothing
    print('Columns differ!')

Columns are equal!


In [83]:
# Run the count_zeros helper (defined elsewhere in this notebook) on both tables.
# NOTE(review): presumably this drops rows with too many zero entries — the printed
# threshold of 241 is half of the 482 columns; confirm against the helper's definition.
wallace_asvs_q20_fw_denovo85_silva_zeros_filtered = count_zeros(wallace_asvs_q20_fw_denovo85_silva)
kremling_expression_v5_denovo85_zeros_filtered = count_zeros(kremling_expression_v5_denovo85)

Threshold: 241 (threshold * number of columns)
Threshold: 241 (threshold * number of columns)


In [84]:
# Dimensions of the OTU table and the expression matrix after count_zeros, one per line
print(
    wallace_asvs_q20_fw_denovo85_silva_zeros_filtered.shape,
    kremling_expression_v5_denovo85_zeros_filtered.shape,
    sep='\n',
)

(6, 482)
(19953, 482)


In [87]:
# List the OTU identifiers that remain after count_zeros
wallace_asvs_q20_fw_denovo85_silva_zeros_filtered.index

Index(['232ad9e267688a5d573112b4855bac96', '225bc1cb152b786927ca748b98403d94',
       '9513fb99947860ebff7ca131f8cee160', 'dc8c3ff51b0e92f1f7a0c879437a6185',
       '3a3a197520db6ad8f425e4d0935e5f23', '343621515628bbb16a8aa3cfbf103abe'],
      dtype='object', name='Name')

Taxonomy of the six OTUs that remain after zero filtering:

| ID | Taxonomy info |
|----|---------------|
| 232ad9e267688a5d573112b4855bac96 | d__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Sphingomonadales; f__Sphingomonadaceae; g__Sphingomonas |
| 225bc1cb152b786927ca748b98403d94 | d__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Cytophagales; f__Hymenobacteraceae; g__Hymenobacter |
| 9513fb99947860ebff7ca131f8cee160 | d__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Flavobacteriales; f__Weeksellaceae; g__Chryseobacterium |
| dc8c3ff51b0e92f1f7a0c879437a6185 | d__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rickettsiales; f__Mitochondria; g__Mitochondria |
| 3a3a197520db6ad8f425e4d0935e5f23 | d__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Burkholderiales; f__Comamonadaceae |
| 343621515628bbb16a8aa3cfbf103abe | d__Bacteria; p__Actinobacteriota; c__Actinobacteria |