# Exporting RNAseq and Metataxonomic data to file

The aim here is to export paired data, after selection of samples present in both RNAseq and Microbiome:

 * RNAseq - TPM
 * RNAseq - numreads
 * Metataxonomic data - OTUs from Wallace et al (2018)
 * Metataxonomic data - ASVs
 * Metataxonomic data - ASVs clustered in different ways (de novo, open and closed reference)

Exported matrices will be available for further analyses using other languages (e.g., using Vegan in R or even for other analyses with Python)

In [1]:
# Lookup tables that the loading cells fill in:
#   kremling_expression_key_dict : first column of the key file -> second column
#   dict_wallace_kremling_2018   : sample label -> per-sample metadata dict
kremling_expression_key_dict = dict()
dict_wallace_kremling_2018 = dict()

# Input tables (absolute paths on the analysis machine).
# Tab-separated key that links Kremling identifiers to Wallace identifiers.
kremling_expression_key = '/home/rsantos/Repositories/maize_microbiome_transcriptomics/correlations_rnaseq_metataxonomics/0_kremling_expression_key.txt'
# Comma-separated run tables for the RNAseq and the 16S data sets.
sra_run_table_rnaseq = '/home/rsantos/Repositories/maize_microbiome_transcriptomics/rnaseq_kremling2018/run_info/SraRunInfo_Kremling_etal_2018.csv'
sra_run_table_16s = '/home/rsantos/Repositories/maize_microbiome_transcriptomics/16S_wallace2018/SraRunInfo_Wallace_etal_2018.csv'

In [2]:
# Fill kremling_expression_key_dict from the tab-separated key file:
# the first column (Kremling identifier) maps to the second (Wallace identifier).
with open(kremling_expression_key, 'r') as key_handle:
    key_handle.readline()  # discard the header row

    for record in key_handle:
        columns = record.strip().split('\t')
        kremling_expression_key_dict[columns[0]] = columns[1]

In [3]:
import re

# Register every usable RNAseq run in dict_wallace_kremling_2018, keyed by the
# sample label found in column 11 of the run table
# (e.g. '10343927_LMAN8_B73_CACACT').
with open(sra_run_table_rnaseq, 'r') as run_table:
    run_table.readline()  # discard the header row

    for row in run_table:
        cells = row.strip().split(',')
        sample_label = cells[11]
        label_parts = sample_label.split('_')
        tissue_code, genotype = label_parts[1], label_parts[2]

        # The tissue code is a text prefix followed by a number, e.g. 'LMAN8':
        # the first run of digits is the day, the text before it the day period.
        digits = re.search(r'\d+', tissue_code)
        day = int(digits.group()) if digits else ''
        day_period = re.split(r'\d+', tissue_code)[0]

        # Keep only 'LMA*' samples that carry a genotype.
        if not tissue_code.startswith('LMA') or genotype == '#N/A':
            continue

        # The 16S fields start empty and are filled in by a later cell.
        dict_wallace_kremling_2018[sample_label] = {
            'run_accession_16s': '',
            'run_accession_rnaseq': cells[0],
            'day': day,
            'day_period': day_period,
            'genotype_16s': '',
            'genotype_rnaseq': genotype,
        }

In [4]:
# Attach a 16S run accession to every RNAseq sample that the expression key
# pairs with a 16S sample, checking that both sides agree on day and day period.
rnaseq_samples_with_16s = 0

# The one pairing known to disagree on day period. It is skipped (left without
# a 16S accession) instead of aborting the run.
known_day_period_mismatches = {'10343927_LMAN8_CML505_CAACAG'}

with open(sra_run_table_16s, 'r') as file:

    _ = file.readline()  # discard the header row

    for line in file:
        fields = line.strip().split(',')
        # Column 11 is the 16S sample label, '<day_period>.<day>...'.
        fields2 = fields[11].split('.')
        metataxonomics_run_id = fields[0]
        day = int(fields2[1])
        day_period = fields2[0]
        for key, value in kremling_expression_key_dict.items():
            if value != fields[11]:
                continue
            paired_sample = dict_wallace_kremling_2018[key]

            # Abort with an exception, not exit(): under an IPython kernel
            # exit() requests a shutdown without raising here, so the loop
            # would carry on and record the mismatched pairing anyway.
            if paired_sample['day'] != day:
                raise ValueError(
                    f'Day mismatch for {key} (16S sample {fields[11]}): '
                    f'16S day {day} != RNAseq day {paired_sample["day"]}; '
                    f'RNAseq record: {paired_sample}'
                )
            if paired_sample['day_period'] != day_period:
                # Printed for the tolerated sample as well, exactly as before.
                print('Big problem!')
                if key in known_day_period_mismatches:
                    continue
                raise ValueError(
                    f'Day-period mismatch for {key} (16S sample {fields[11]}): '
                    f'16S period {day_period} != RNAseq period '
                    f'{paired_sample["day_period"]}; RNAseq record: {paired_sample}'
                )
            paired_sample['run_accession_16s'] = metataxonomics_run_id
            rnaseq_samples_with_16s += 1

print(f'{rnaseq_samples_with_16s} sample pairs found.')

Big problem!
484 sample pairs found.


In [5]:
# List every RNAseq sample whose 16S run accession is still empty, then report
# how many there are.
samples_missing_16s = [
    (sample_label, sample_info)
    for sample_label, sample_info in dict_wallace_kremling_2018.items()
    if sample_info['run_accession_16s'] == ''
]
for sample_label, sample_info in samples_missing_16s:
    print(sample_label, sample_info)

no_16s = len(samples_missing_16s)
print(f'{no_16s} samples without 16S data.')

10343927_LMAN8_B73_CACACT {'run_accession_16s': '', 'run_accession_rnaseq': 'SRR5909633', 'day': 8, 'day_period': 'LMAN', 'genotype_16s': '', 'genotype_rnaseq': 'B73'}
10343927_LMAN8_CML505_CAACAG {'run_accession_16s': '', 'run_accession_rnaseq': 'SRR5911345', 'day': 8, 'day_period': 'LMAN', 'genotype_16s': '', 'genotype_rnaseq': 'CML505'}
2 samples without 16S data.


In [6]:
# Map each run accession (RNAseq first, then 16S) to its sample label.
# Empty accessions are left out.
run2my_sample_id = {}

for sample_label, sample_info in dict_wallace_kremling_2018.items():
    for accession_field in ('run_accession_rnaseq', 'run_accession_16s'):
        accession = sample_info[accession_field]
        if accession:
            run2my_sample_id[accession] = sample_label

In [7]:
import pandas as pd

# Expression data from Kremling et al. 2018: TPM matrix on Maize v5, quantified
# with Salmon after cleaning with cutadapt. The 'Name' column becomes the row
# index, so the remaining columns are one per run accession.
kremling_expression_tpm_v5 = pd.read_csv(
    '/media/rsantos/4TB_drive/Projects/UGA_RACS/RNAseq/Salmon/Zma2_tpm_matrix.txt',
    sep='\t',
).set_index('Name')

# Preview the first rows.
kremling_expression_tpm_v5.head()

Unnamed: 0_level_0,SRR5909626,SRR5909627,SRR5909633,SRR5909635,SRR5909639,SRR5909642,SRR5909645,SRR5909653,SRR5909655,SRR5909665,...,SRR5912073,SRR5912081,SRR5912082,SRR5912083,SRR5912093,SRR5912094,SRR5912104,SRR5912105,SRR5912111,SRR5912116
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Zm00001eb371370_T002,1.04145,0.0,3.39106,0.0,0.0,1.82712,0.284514,2.23201,0.437147,0.468934,...,0.0,1.51042,0.0,0.0,0.0,2.82055,3.96967,0.0,2.96105,0.0
Zm00001eb371350_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371330_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371310_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371280_T001,1.2765,2.1092,0.692731,0.0,4.2798,1.47496,2.55732,0.0,1.06594,1.14953,...,3.02253,0.4114,1.17447,0.0,3.48749,9.47506,6.19189,3.80776,1.03695,1.14981


In [8]:
import pandas as pd

# Expression data from Kremling et al. 2018: numreads matrix on Maize v5,
# quantified with Salmon after cleaning with cutadapt. The 'Name' column becomes
# the row index, so the remaining columns are one per run accession.
kremling_expression_numreads_v5 = pd.read_csv(
    '/media/rsantos/4TB_drive/Projects/UGA_RACS/RNAseq/Salmon/Zma2_counts_matrix.txt',
    sep='\t',
).set_index('Name')

# Preview the first rows.
kremling_expression_numreads_v5.head()

Unnamed: 0_level_0,SRR5909626,SRR5909627,SRR5909633,SRR5909635,SRR5909639,SRR5909642,SRR5909645,SRR5909653,SRR5909655,SRR5909665,...,SRR5912073,SRR5912081,SRR5912082,SRR5912083,SRR5912093,SRR5912094,SRR5912104,SRR5912105,SRR5912111,SRR5912116
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Zm00001eb371370_T002,2.0,0.0,8.0,0.0,0.0,2.024,1.0,3.0,1.005,1.0,...,0.0,3.0,0.0,0.0,0.0,9.0,11.001,0.0,7.0,0.0
Zm00001eb371350_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371330_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371310_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371280_T001,3.0,6.0,2.0,0.0,11.0,2.0,11.0,0.0,3.0,3.0,...,4.0,1.0,1.0,0.0,11.0,37.0,21.0,9.0,3.0,4.0


In [9]:
# Rename the columns using the dictionary
kremling_expression_tpm_v5 = kremling_expression_tpm_v5.rename(columns=run2my_sample_id)
kremling_expression_tpm_v5.columns = [str(x) for x in kremling_expression_tpm_v5.columns]

kremling_expression_tpm_v5.head()

Unnamed: 0_level_0,10343927_LMAD26_CI21E_AAGTGG,10343264_LMAN26_CI21E_ATGAAC,10343927_LMAN8_B73_CACACT,10343264_LMAN26_B64_ACCAGT,10343262_LMAN8_B109_TGCTAT,10343262_LMAN8_B14A_CTCTCG,10343262_LMAN8_B57_CCTAAG,10343927_LMAD26_B77_TAATCG,10343262_LMAN8_B79_GCAGCC,10343927_LMAN8_CI187-2_GACGAT,...,10344826_LMAN8_I29_ACGTCT,10344823_LMAD8_IA2132_ACACGC,10343264_LMAD26_CML91_AACGCC,10344827_LMAN26_CML91_AATCCG,10344827_LMAN26_Ki21_AAGACA,10343927_LMAD26_Ki21_ACGTCT,10344826_LMAD8_E2558W_CGCAAC,10343927_LMAN8_E2558W_GAACCT,10344826_LMAD8_IDS69_CAGGAC,10343927_LMAN8_IDS69_ACATTA
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Zm00001eb371370_T002,1.04145,0.0,3.39106,0.0,0.0,1.82712,0.284514,2.23201,0.437147,0.468934,...,0.0,1.51042,0.0,0.0,0.0,2.82055,3.96967,0.0,2.96105,0.0
Zm00001eb371350_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371330_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371310_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371280_T001,1.2765,2.1092,0.692731,0.0,4.2798,1.47496,2.55732,0.0,1.06594,1.14953,...,3.02253,0.4114,1.17447,0.0,3.48749,9.47506,6.19189,3.80776,1.03695,1.14981


In [10]:
# Rename the columns using the dictionary
kremling_expression_numreads_v5 = kremling_expression_numreads_v5.rename(columns=run2my_sample_id)
kremling_expression_numreads_v5.columns = [str(x) for x in kremling_expression_numreads_v5.columns]

if kremling_expression_numreads_v5.columns.all() == kremling_expression_tpm_v5.columns.all():
    print('Columns are equal! (expected, because the TPM and numreads are from exactly the same salmon quant results)')

Columns are equal! (expected, because the TPM and numreads are from exactly the same salmon quant results)


### Exporting cr99_q20 table after matching Kremling

In [14]:
import pandas as pd

# Feature table, indexed by feature ID ('OTU ID' renamed to 'Name'), with run
# accessions in the header replaced by sample labels.
# NOTE(review): the variable name says 'fw' but the file name says 'rv' -
# confirm which read direction this table actually holds.
wallace_asvs_q20_fw_closedref_silva = (
    pd.read_csv('/media/rsantos/4TB_drive/Projects/UGA_RACS/16S/Qiime2/clustering/FeatureTable/cr99_q20_rv_feature-table.tsv',
                sep='\t', comment='#')
    .rename(columns={'OTU ID': 'Name'})
    .set_index('Name')
    .rename(columns=run2my_sample_id)
)
wallace_asvs_q20_fw_closedref_silva.columns = [str(x) for x in wallace_asvs_q20_fw_closedref_silva.columns]

# Keep only the samples present on both sides: first restrict the expression
# matrices to the feature table's columns, then restrict the feature table to
# the columns that survived.
kremling_expression_v5_numreads_closedref = kremling_expression_numreads_v5.filter(items=wallace_asvs_q20_fw_closedref_silva.columns)
kremling_expression_v5_tpm_closedref = kremling_expression_tpm_v5.filter(items=wallace_asvs_q20_fw_closedref_silva.columns)
wallace_asvs_q20_fw_closedref_silva = wallace_asvs_q20_fw_closedref_silva.filter(items=kremling_expression_v5_numreads_closedref.columns)

# All three exported tables must have the same columns in the same order.
# Index.equals compares the labels themselves. Comparing two Index.all()
# results only compares "are all labels truthy?" and so passes for any two
# headers made of non-empty strings.
paired_columns = wallace_asvs_q20_fw_closedref_silva.columns
if (paired_columns.equals(kremling_expression_v5_numreads_closedref.columns)
        and paired_columns.equals(kremling_expression_v5_tpm_closedref.columns)):
    print('Columns are equal!')
else:
    raise ValueError('Paired expression and feature tables do not share the same columns in the same order.')

Columns are equal!


In [19]:
# Print (rows, columns) for each paired table: numreads, TPM, then the feature table.
for paired_table in (
    kremling_expression_v5_numreads_closedref,
    kremling_expression_v5_tpm_closedref,
    wallace_asvs_q20_fw_closedref_silva,
):
    print(paired_table.shape)

(39096, 482)
(39096, 482)
(1896, 482)


In [20]:
import pandas as pd

# Write each paired table to a tab-separated file in the working directory.
paired_exports = {
    'kremling_expression_v5_numreads_closedref.tsv': kremling_expression_v5_numreads_closedref,
    'kremling_expression_v5_tpm_closedref.tsv': kremling_expression_v5_tpm_closedref,
    'wallace_asvs_q20_fw_closedref_silva.tsv': wallace_asvs_q20_fw_closedref_silva,
}
for output_name, paired_table in paired_exports.items():
    paired_table.to_csv(output_name, sep='\t')