# Correlations between transcriptome and microbiome



## Datasets

 * RNAseq matrix with TPM values
 * Feature table with ASV counts

### Associations between RNAseq and 16S 

Here is the list of genotypes:

In [90]:
genotypes = []

with open('genotypes.txt', 'r') as file:
    
    for line in file:
        genotypes.append(line.strip())

print(genotypes)

['33-16', '38-11', '4226', '4722', 'A188', 'A214N', 'A239', 'A272', 'A441-5', 'A554', 'A556', 'A6', 'A619', 'A632', 'A634', 'A635', 'A641', 'A654', 'A659', 'A661', 'A679', 'A680', 'A682', 'Ab28A', 'Ames26808', 'Ames27128', 'Ames28290', 'Ames28291', 'B10', 'B103', 'B104', 'B105', 'B109', 'B115', 'B14A', 'B164', 'B2', 'B37', 'B46', 'B52', 'B57', 'B64', 'B68', 'B73', 'B73(PI550473)', 'B73Htrhm', 'B75', 'B76', 'B77', 'B79', 'B84', 'B97', 'B97(PI564682)', 'C103', 'C123', 'C49A', 'CH701-30', 'CH9', 'CI 7 Goodman-Buckler', 'CI 187-2', 'CI 21E', 'CI28A Goodman-Buckler', 'CI31A', 'CI 3A', 'CI64', 'CI66', 'CI90C', 'CI 91B Goodman-Buckler', 'CM105', 'CM174', 'CM37', 'CM7', 'CML 10', 'CML 103', 'CML103(Ames27081)', 'CML 108', 'CML 11', 'CML 14', 'CML 154Q', 'CML 157Q', 'CML 158Q', 'CML206', 'CML206(Krakowsky)', 'CML 218', 'CML 220', 'CML228', 'CML228(Ames27088)', 'CML 238', 'CML 247', 'CML247(PI595541)', 'CML 254', 'CML 258', 'CML 261', 'CML 264', 'CML277', 'CML277(PI595550)', 'CML281', 'CML287', 

In [91]:
# Convert all strings in the list to upper case
genotypes_upper = [x.upper() for x in genotypes]

In [92]:
# Convert all strings in the list to upper case
print(f'List has {len(genotypes_upper)} genotypes from 0h_samples_sorted_filtered.hmp.txt')

List has 323 genotypes from 0h_samples_sorted_filtered.hmp.txt


For the correlations between Kremling and Wallace papers, information about the day and time of sampling and genotype must match in order to do any correlations.

 * Run information for [16S data]()
 * Run information for [RNAseq data]()

Such information can be easily extracted for the RNAseq data based on field `LibraryName`

For Dr. Wallace manuscript, a file is available on [FigShare](https://doi.org/10.6084/m9.figshare.5886769.v2) that can connect plots (therefore genotypes) and sequencing runs

 * 0_plate_key.txt has associations between genotypes and plots (NCBI run metadata have plots as part of the identifier)
 * 0_inbred_aliases_manual.txt has inbred aliases from Wallace et al (2018)

In [93]:
plate_key_file = '/home/rsantos/Repositories/maize_microbiome_transcriptomics/16S_wallace2018/0_plate_key.txt'
sra_run_table_16s = '/home/rsantos/Repositories/maize_microbiome_transcriptomics/16S_wallace2018/SraRunInfo_Wallace_etal_2018.csv'
sra_run_table_rnaseq = '/home/rsantos/Repositories/maize_microbiome_transcriptomics/rnaseq_kremling2018/run_info/SraRunInfo_Kremling_etal_2018.csv'

dict_wallace_kremling_2018 = {}

In [94]:
inbred_aliases_file_wallace2018 = '/home/rsantos/Repositories/maize_microbiome_transcriptomics/16S_wallace2018/0_inbred_aliases_manual.txt'

inbred_aliases_wallace2018_dict = {}

with open(inbred_aliases_file_wallace2018, 'r') as file:

    _ = file.readline()

    for line in file:
        
        fields = line.strip().split('\t')
        name = fields[0].upper()
        alias = fields[1].upper()

        inbred_aliases_wallace2018_dict[name] = alias
        inbred_aliases_wallace2018_dict[alias] = name


Checking consistency of maize genotype names between 0_plate_key.txt and 0h_samples_sorted_filtered.hmp.txt:

In [95]:
tmp_run_id = 1
genotypes_not_found = []
genotypes_found = []

with open(plate_key_file, 'r') as file:

    _ = file.readline()

    for line in file:
        fields = line.strip().split('\t')
        fields2 = fields[3].split('_')
        plot_id = fields[2]
        day = fields2[1]
        day_period = fields2[0]
        genotype = fields[14]

        #Check if genotype matches the list of genotypes
        if genotype == '#N/A':
            pass
            #print(f'Genotype is #N/A for plot {plot_id} (blank plot)')
        elif (genotype.upper() not in genotypes_upper) and\
                (genotype.upper() not in genotypes_not_found) and\
                (genotype.upper() not in inbred_aliases_wallace2018_dict.keys()):
                genotypes_not_found.append(genotype.upper())
            #print(f'Genotype {genotype} not in list of genotypes')
            #print(f'##{plot_id}\t{day}\t{day_period}\t{genotype}')
        else:
            #print(f'{plot_id}\t{day}\t{day_period}\t{genotype}')
            if genotype.upper() not in genotypes_found:
                genotypes_found.append(genotype.upper())

print(f'{len(genotypes_found)} Genotypes found in the list of genotypes: {genotypes_found}')
print(f'{len(genotypes_not_found)} Genotypes not found in the list of genotypes: {genotypes_not_found}')

270 Genotypes found in the list of genotypes: ['CO255', 'CM7', 'F7', 'ND246', 'A654', 'CO106', 'W182B', 'EP1', 'A188', 'A659', 'CMV3', 'CO125', 'P39', 'A556', 'MEF156-55-2', 'A554', 'C49A', 'CM105', 'W117HT', 'A661', 'I29', 'IA2132', 'SD44', 'MS1334', 'N192', 'B76', 'CM174', 'A619', 'H99', 'R168', 'W153R', 'MS71', 'OH43', 'IA5125', 'IL14H', 'C123', 'A641', 'A679', 'B103', 'A682', 'CH701-30', 'MS153', 'A272', 'A632', 'MO47', 'VA59', 'A635', 'B73', 'H105W', 'M14', 'VA102', 'W64A', 'N6', 'A239', 'YU796_NS', 'WF9', 'H49', 'B164', 'PA762', 'IL 101T', 'SD40', 'R177', 'A680', 'B79', 'C103', 'B75', 'H84', 'VA35', 'NC364', 'B14A', 'B109', 'IDS91', 'I 205', 'MO17', 'IDS28', 'W22', 'CH9', 'R109B', 'NC260', 'OH40B', 'VA99', 'VAW6', 'B57', 'NC310', 'MO44', 'NC362', 'OH43E', 'H95', 'MO46', 'CI 3A', 'MO1W', 'NC324', 'B97', 'NC328', '38-11', 'B73HTRHM', 'SA24', 'W22 R-RSTD', 'B37', 'VA22', 'B2', 'HP301', 'T8', 'VA17', 'NC368', 'B104', 'NC264', 'F2834T', 'OH603', 'R229', 'PA875', 'B10', 'B68', 'K55', '

Notice that there's a note in file `4_RunHeritabilityAndGwas.sh`:

`# # NOTE: 6 CML lines are not present in the genotype file (CML411, CML504, CML505, CML84, CML85, CML96) and do not appear in the public Panzea GBS dataset`

These genotypes match those not found in rules above.

Creating dictionary with genotypes names and aliases:

In [96]:
genotype2ids = {'KUI2007': 'AMES27128',
   'AMES27128': 'KUI2007',
   'CML511': 'KE_MAIZE634',
   'KE_MAIZE634': 'CML511',
   'ILL.HY': 'AMES26808',
   'AMES26808': 'ILL.HY',
   'B2': 'B2-GOOD',
   'B2-GOOD': 'B2',
   'CI7': 'CI.7',
   'CI.7': 'CI7',
   'DE2': 'DE_2',
   'DE_2': 'DE2',
   'IL101T': 'IL101',
   'IL101': 'IL101T',
   'MO.G': 'MOG',
   'MOG': 'MO.G',
   'MR19SANTODOMINGO': 'MR19_(SANTO_DOMINGO)',
   'MR19_(SANTO_DOMINGO)': 'MR19SANTODOMINGO',
   'MR20SHOEPEG': 'MR20_(SHOE_PEG)',
   'MR20_(SHOE_PEG)': 'MR20SHOEPEG',
   'W22R-RSTD': 'W22R-R-STD_CS-2909-1',
   'W22R-R-STD_CS-2909-1': 'W22R-RSTD',
   "CI21E": "CI 21E",
   "CI187-2": "CI 187-2",
   "CI28A": "CI28A GOODMAN-BUCKLER",
   "CML108": "CML 108",
   "CML10": "CML 10",
   "CML11": "CML 11",
   "SG1533": "SG 1533",
   "TZI11": "TZI 11",
   "SC357": "SC 357",
   "CML505": "CML505",
   "SG18": "SG 18",
   "W22R-RSTD": "W22 R-RSTD",
   "YU796NS": "YU796_NS",
   "MR20": "MR20(SHOEPEG)",
   "TZI16": "TZI 16",
   "CML103": "CML 103",
   "TZI18": "TZI 18",
   "MR19": "MR19(SANTO DOMINGO)",
   "TZI9": "TZI 9",
   "SC213R": "SC 213R",
   "CML322": "CML 322",
   "CML341": "CML 341",
   "CML238": "CML 238",
   "TZI25": "TZI 25",
   "CML77": "CML 77",
   "CML333": "CML 333",
   "CI91B": "CI 91B GOODMAN-BUCKLER",
   "CI3A": "CI 3A",
   "CI7": "CI 7 GOODMAN-BUCKLER",
   "CML220": "CML 220",
   "I205": "I 205",
   "CML91": "CML 91",
   "CML323": "CML 323",
   "CML314": "CML 314",
   "CML38": "CML 38",
   "CML218": "CML 218",
   "CML69": "CML 69",
   "CML92": "CML 92",
   "CML331": "CML 331",
   "CML158Q": "CML 158Q",
   "CML328": "CML 328",
   "CML157Q": "CML 157Q",
   "ILLHY": "ILL.HY",
   "CML261": "CML 261",
   "CML154Q": "CML 154Q",
   "IL101T": "IL 101T"
}

Filling dictionary with run metadata (except run accession)

In [97]:
with open(plate_key_file, 'r') as file:

    _ = file.readline()

    for line in file:
        fields = line.strip().split('\t')
        fields2 = fields[3].split('_')
        plot_id = fields[2]
        day = fields2[1]
        day_period = fields2[0]
        genotype = fields[14].upper()

        if genotype in genotype2ids.keys():
            genotype_list = [genotype, genotype2ids[genotype]]
        else:
            genotype_list = [genotype]

        if genotype != '#N/A':
            dict_wallace_kremling_2018[tmp_run_id] = {'run_accession_16s': '',
                                    'run_accession_rnaseq': '',
                                    'plot_id': plot_id,
                                    'day': day,
                                    'day_period': day_period,
                                    'genotype': genotype_list}
        tmp_run_id+=1

In [98]:
# Just checking dict strcuture
dict_wallace_kremling_2018[1]

{'run_accession_16s': '',
 'run_accession_rnaseq': '',
 'plot_id': '14A0005',
 'day': '8',
 'day_period': 'LMAD',
 'genotype': ['CO255']}

Adding run accession information:

In [99]:
runs_list = []
runs_in_dict = []

with open(sra_run_table_16s, 'r') as file:

    _ = file.readline()

    for line in file:
        fields = line.strip().split(',')
        fields2 = fields[11].split('.')
        run_accession = fields[0]
        day_period = fields2[0]
        day = fields2[1]
        plot_id = fields2[2]

        runs_list.append(run_accession)

        #print(f'{run_accession}\t{plot_id}\t{day}\t{day_period}')
        for key in dict_wallace_kremling_2018.keys():
            if dict_wallace_kremling_2018[key]['plot_id'] == plot_id and\
                 dict_wallace_kremling_2018[key]['day'] == day and\
                 dict_wallace_kremling_2018[key]['day_period'] == day_period:
                dict_wallace_kremling_2018[key]['run_accession_16s'] = run_accession
                runs_in_dict.append(run_accession)

In [100]:
# Just checking if any record has no run_accession
for key, value in dict_wallace_kremling_2018.items():
    if not dict_wallace_kremling_2018[key]['run_accession_16s']:
        print(f'{key}\t{dict_wallace_kremling_2018[key]}')

In [101]:
len(set(runs_list).difference(set(runs_in_dict)))

8

Eight runs that are not in dictionary correspond to "#N/A" (blanks)

In [102]:
print(set(runs_list).difference(set(runs_in_dict)))
print(set(runs_in_dict).difference(set(runs_list)))

{'SRR6666051', 'SRR6665849', 'SRR6665809', 'SRR6666006', 'SRR6665758', 'SRR6665757', 'SRR6665812', 'SRR6666050'}
set()


In [103]:
len(set(runs_in_dict).union(set(runs_list)))

592

Bringing RNAseq data!

In [104]:
import re

not_found = 0

with open(sra_run_table_rnaseq, 'r') as file:

    _ = file.readline()

    for line in file:
        fields = line.strip().split(',')
        fields2 = fields[11].split('_')
        sample_id = fields2[1]
        genotype = fields2[2]
        day = ''
        match = re.search(r'\d+', sample_id)
        unmatched_parts = re.split(r'\d+', sample_id)
        day_period = unmatched_parts[0]
        if match:
            day = int(match.group())
        if sample_id.startswith('LMA'):
            if genotype.upper() not in genotypes_upper:
                if genotype.upper() not in genotype2ids.keys():
                    print(f'PROBLEM: {fields[0]} {day} {unmatched_parts[0]} {genotype}')
                    not_found+=1

print(f'Not found: {not_found}')

PROBLEM: SRR5911290 26 LMAN CML504
PROBLEM: SRR5911291 26 LMAD CML504
PROBLEM: SRR5911464 26 LMAN CML96
PROBLEM: SRR5911472 26 LMAD CML84
PROBLEM: SRR5911475 26 LMAN CML84
PROBLEM: SRR5911597 26 LMAD CML96
PROBLEM: SRR5911605 26 LMAN CML85
PROBLEM: SRR5911606 26 LMAD CML85
PROBLEM: SRR5911607 26 LMAD CML411
PROBLEM: SRR5911608 26 LMAN CML411
Not found: 10


In [105]:
import re

not_found = 0

with open(sra_run_table_rnaseq, 'r') as file:

    _ = file.readline()

    for line in file:
        fields = line.strip().split(',')
        fields2 = fields[11].split('_')
        sample_id = fields2[1]
        genotype = fields2[2].upper()
        if genotype.upper() in genotype2ids.keys():
            genotype = genotype2ids[genotype.upper()]
        day = ''
        match = re.search(r'\d+', sample_id)
        unmatched_parts = re.split(r'\d+', sample_id)
        day_period = unmatched_parts[0]
        if match:
            day = str(match.group())
        if sample_id.startswith('LMA'):
            for key in dict_wallace_kremling_2018:
                if dict_wallace_kremling_2018[key]['day'] == day and\
                    genotype in dict_wallace_kremling_2018[key]['genotype'] and\
                    dict_wallace_kremling_2018[key]['day_period'] == day_period:
                    dict_wallace_kremling_2018[key]['run_accession_rnaseq'] = fields[0]

In [106]:
count_rnaseq = 0

# Just checking if any record has no run_accession
for key, value in dict_wallace_kremling_2018.items():
    if not dict_wallace_kremling_2018[key]['run_accession_rnaseq']:
        print(f'{key}\t{dict_wallace_kremling_2018[key]["genotype"]}\t\
              {dict_wallace_kremling_2018[key]["day"]}\t\
              {dict_wallace_kremling_2018[key]["day_period"]}')
        count_rnaseq+=1

print(count_rnaseq, "records without run_accession_rnaseq")

28	['CM174']	              8	              LMAD
49	['A635']	              8	              LMAD
50	['B73']	              8	              LMAD
53	['H105W']	              8	              LMAD
54	['M14']	              8	              LMAD
55	['VA102']	              8	              LMAD
56	['W64A']	              8	              LMAD
57	['N6']	              8	              LMAD
58	['A239']	              8	              LMAD
59	['YU796_NS']	              8	              LMAD
60	['WF9']	              8	              LMAD
61	['H49']	              8	              LMAD
62	['B164']	              8	              LMAD
63	['PA762']	              8	              LMAD
64	['IL 101T']	              8	              LMAD
65	['SD40']	              8	              LMAD
66	['R177']	              8	              LMAD
67	['A680']	              8	              LMAD
68	['B79']	              8	              LMAD
69	['C103']	              8	              LMAD
70	['B75']	              8	              LMAD
71	['H84']

I (RACS) manually checked the supp material of Kremling (2018). Genotype A635 does not have RNAseq of LMAD (only LMAN). B73 does not have LMAD8 (only 26). Genotype IDS91 does not have RNAseq of LMAD (only LMAN8). B164 does not have RNAseq of LMAD8 (only LMAN8).

In [107]:
len([dict_wallace_kremling_2018[key]['genotype'] for key in dict_wallace_kremling_2018.keys() if dict_wallace_kremling_2018[key]['run_accession_16s']])

584

In [108]:
len([dict_wallace_kremling_2018[key]['genotype'] for key in dict_wallace_kremling_2018.keys() if dict_wallace_kremling_2018[key]['run_accession_rnaseq']])

495

## Generating a matrix with both RNAseq and Metataxonomic data

Associations between 16S and RNAseq data are present in the 'dict_wallace_kremling_2018' dictionary.


In [109]:
run2my_sample_id = {}

for key in dict_wallace_kremling_2018:
    if dict_wallace_kremling_2018[key]['run_accession_rnaseq']:
        run2my_sample_id[dict_wallace_kremling_2018[key]['run_accession_rnaseq']] = key
    if dict_wallace_kremling_2018[key]['run_accession_16s']:
        run2my_sample_id[dict_wallace_kremling_2018[key]['run_accession_16s']] = key

In [110]:
len(run2my_sample_id)

1053

### Importing expression data (Kremling et al, 2018)

In [123]:
import pandas as pd

# Importing expression data from Kremling et al. 2018 (TPM matrix on Maize v5 using Salmon after cleaning with cutadapt)
kremling_expression_v5 = pd.read_csv('/media/rsantos/4TB_drive/Projects/UGA_RACS/RNAseq/Salmon/Zma2_tpm_matrix.txt', sep='\t')

# Rename column and reset the index
kremling_expression_v5.set_index('Name', inplace=True)

# Print the dataframe
kremling_expression_v5.head()

Unnamed: 0_level_0,SRR5909626,SRR5909627,SRR5909633,SRR5909635,SRR5909639,SRR5909642,SRR5909645,SRR5909653,SRR5909655,SRR5909665,...,SRR5912073,SRR5912081,SRR5912082,SRR5912083,SRR5912093,SRR5912094,SRR5912104,SRR5912105,SRR5912111,SRR5912116
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Zm00001eb371370_T002,1.04145,0.0,3.39106,0.0,0.0,1.82712,0.284514,2.23201,0.437147,0.468934,...,0.0,1.51042,0.0,0.0,0.0,2.82055,3.96967,0.0,2.96105,0.0
Zm00001eb371350_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371330_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371310_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371280_T001,1.2765,2.1092,0.692731,0.0,4.2798,1.47496,2.55732,0.0,1.06594,1.14953,...,3.02253,0.4114,1.17447,0.0,3.48749,9.47506,6.19189,3.80776,1.03695,1.14981


In [124]:
# Renaming columns based on associations in 'run2my_sample_id'

# Rename the columns using the dictionary
kremling_expression_v5 = kremling_expression_v5.rename(columns=run2my_sample_id)
kremling_expression_v5.columns = [str(x) for x in kremling_expression_v5.columns]


In [125]:
kremling_expression_v5.head()

Unnamed: 0_level_0,238,379,SRR5909633,354,420,419,434,225,413,204,...,172,22,306,543,505,268,137,194,136,193
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Zm00001eb371370_T002,1.04145,0.0,3.39106,0.0,0.0,1.82712,0.284514,2.23201,0.437147,0.468934,...,0.0,1.51042,0.0,0.0,0.0,2.82055,3.96967,0.0,2.96105,0.0
Zm00001eb371350_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371330_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371310_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371280_T001,1.2765,2.1092,0.692731,0.0,4.2798,1.47496,2.55732,0.0,1.06594,1.14953,...,3.02253,0.4114,1.17447,0.0,3.48749,9.47506,6.19189,3.80776,1.03695,1.14981


### Importing ASV data (Wallace et al, 2018)

Testing with ASVs generated after dada2 considering data as if they were single-end.
Quality trimming was made using cutadapt, with minimum quality 30 and minimum read length 50bp.


In [126]:
# Importing ASV data; generated from processing 16S data from Wallace et al. (2018)
wallace_asvs = pd.read_csv('/media/rsantos/4TB_drive/Projects/UGA_RACS/16S/Qiime2/dada2/as_single_q20/table-paired-end_wallace2018_assingle_forward_q20-dada2_feature-table/q20_fw_feature-table.tsv',
                           sep='\t')

# Rename column and reset the index
wallace_asvs.rename(columns={'ASV': 'Name'}, inplace=True)
wallace_asvs.set_index('Name', inplace=True)

# Print the dataframe
wallace_asvs.head()

Unnamed: 0_level_0,SRR6665476,SRR6665477,SRR6665478,SRR6665479,SRR6665480,SRR6665481,SRR6665482,SRR6665483,SRR6665484,SRR6665485,...,SRR6666058,SRR6666059,SRR6666060,SRR6666061,SRR6666062,SRR6666063,SRR6666064,SRR6666065,SRR6666066,SRR6666067
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bc664ea528899e36452dd37c1f55a48a,47869.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,78028.0,0.0,0.0,0.0,0.0,0.0,58946.0,3868.0,0.0
232ad9e267688a5d573112b4855bac96,0.0,2727.0,4065.0,27528.0,7244.0,3035.0,2433.0,847.0,2351.0,830.0,...,18215.0,0.0,9866.0,13921.0,29850.0,1713.0,11708.0,0.0,0.0,3469.0
6967c9a10eff11f751218e759df28ab7,0.0,610.0,4147.0,267.0,6479.0,7206.0,6862.0,7565.0,12271.0,1298.0,...,742.0,0.0,12448.0,225.0,7830.0,47503.0,1241.0,0.0,0.0,920.0
fa79d5937f424b58a27843dfff8bdcd4,0.0,1837.0,2993.0,18227.0,4525.0,1975.0,1701.0,545.0,1492.0,567.0,...,12277.0,0.0,6519.0,8965.0,22337.0,1113.0,7897.0,0.0,0.0,2223.0
e6b96dce8fbd261b8836b93b9a1d5e07,0.0,1767.0,3093.0,19988.0,5185.0,2100.0,1768.0,542.0,1485.0,603.0,...,11994.0,0.0,6479.0,9388.0,25538.0,1041.0,7923.0,0.0,0.0,2095.0


In [127]:
# Renaming columns based on associations in 'run2my_sample_id'

# Rename the columns using the dictionary
wallace_asvs = wallace_asvs.rename(columns=run2my_sample_id)
wallace_asvs.columns = [str(x) for x in wallace_asvs.columns]

In [128]:
kremling_expression_v5.head()

Unnamed: 0_level_0,238,379,SRR5909633,354,420,419,434,225,413,204,...,172,22,306,543,505,268,137,194,136,193
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Zm00001eb371370_T002,1.04145,0.0,3.39106,0.0,0.0,1.82712,0.284514,2.23201,0.437147,0.468934,...,0.0,1.51042,0.0,0.0,0.0,2.82055,3.96967,0.0,2.96105,0.0
Zm00001eb371350_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371330_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371310_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371280_T001,1.2765,2.1092,0.692731,0.0,4.2798,1.47496,2.55732,0.0,1.06594,1.14953,...,3.02253,0.4114,1.17447,0.0,3.48749,9.47506,6.19189,3.80776,1.03695,1.14981


### Ensuring Wallace df has the same columns as Kremling df

In [129]:
wallace_asvs = wallace_asvs.filter(items=kremling_expression_v5.columns)
print(wallace_asvs.shape)

(6241, 469)


### Ensuring Kremling df has the same columns as Wallace df

In [130]:
print(kremling_expression_v5.shape)
kremling_expression_v5 = kremling_expression_v5.filter(items=wallace_asvs.columns)
print(kremling_expression_v5.shape)

(39096, 486)
(39096, 469)


In [131]:
kremling_expression_v5.head()

Unnamed: 0_level_0,238,379,354,420,419,434,225,413,204,20,...,172,22,306,543,505,268,137,194,136,193
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Zm00001eb371370_T002,1.04145,0.0,0.0,0.0,1.82712,0.284514,2.23201,0.437147,0.468934,3.43127,...,0.0,1.51042,0.0,0.0,0.0,2.82055,3.96967,0.0,2.96105,0.0
Zm00001eb371350_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371330_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371310_T001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zm00001eb371280_T001,1.2765,2.1092,0.0,4.2798,1.47496,2.55732,0.0,1.06594,1.14953,2.40324,...,3.02253,0.4114,1.17447,0.0,3.48749,9.47506,6.19189,3.80776,1.03695,1.14981


In [132]:
wallace_asvs.head()

Unnamed: 0_level_0,238,379,354,420,419,434,225,413,204,20,...,172,22,306,543,505,268,137,194,136,193
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bc664ea528899e36452dd37c1f55a48a,0.0,126935.0,0.0,0.0,0.0,13831.0,15628.0,55406.0,12829.0,0.0,...,0.0,0.0,0.0,254310.0,9113.0,2218.0,6931.0,0.0,171537.0,0.0
232ad9e267688a5d573112b4855bac96,632.0,0.0,451.0,20624.0,36005.0,0.0,0.0,0.0,0.0,1160.0,...,3133.0,6729.0,9940.0,0.0,0.0,0.0,0.0,921.0,0.0,13028.0
6967c9a10eff11f751218e759df28ab7,1466.0,0.0,1580.0,638.0,2151.0,0.0,0.0,0.0,0.0,1796.0,...,9521.0,31945.0,2719.0,0.0,0.0,0.0,0.0,159.0,0.0,544.0
fa79d5937f424b58a27843dfff8bdcd4,403.0,0.0,298.0,15106.0,25648.0,0.0,0.0,0.0,0.0,782.0,...,2122.0,4547.0,6842.0,0.0,0.0,0.0,0.0,654.0,0.0,8963.0
e6b96dce8fbd261b8836b93b9a1d5e07,447.0,0.0,311.0,15538.0,25585.0,0.0,0.0,0.0,0.0,781.0,...,2024.0,4879.0,7471.0,0.0,0.0,0.0,0.0,637.0,0.0,9696.0


In [133]:
if wallace_asvs.columns.all() == kremling_expression_v5.columns.all():
    print('Columns are equal!')

Have a nice weekend! Just kidding, the columns are equal!


In [134]:
concatenated_df = pd.concat([wallace_asvs, kremling_expression_v5], axis=0)

In [135]:
print(wallace_asvs.shape)
print(kremling_expression_v5.shape)
print(concatenated_df.shape)

(6241, 469)
(39096, 469)
(45337, 469)


In [136]:
concatenated_transposed = concatenated_df.T

## Actual computation of correlations

At the time of writing, there are at least two interesting approaches:

 * Deep Graph
 * CorALS

In [137]:
from corals.threads import set_threads_for_external_libraries
set_threads_for_external_libraries(n_threads=1)



In [138]:
import numpy as np

In [139]:
# runtime: ~2 sec
from corals.correlation.full.default import cor_full
cor_values = cor_full(concatenated_transposed)

In [140]:
cor_values.shape

(45337, 45337)

In [141]:
cor_values

Name,bc664ea528899e36452dd37c1f55a48a,232ad9e267688a5d573112b4855bac96,6967c9a10eff11f751218e759df28ab7,fa79d5937f424b58a27843dfff8bdcd4,e6b96dce8fbd261b8836b93b9a1d5e07,1674323e4fe615dc003edd628305bc9f,6935437446b9c69c21f6ac4518b2eb04,8db37fcbc11f63d8a46690adeb7cad70,52c0751a4259810b7c12be45c6597335,e588c7fc94221bf561c85d536b986ef9,...,Zm00001eb155840_T001,Zm00001eb282610_T001,Zm00001eb282620_T001,Zm00001eb371420_T001,Zm00001eb111420_T001,Zm00001eb282630_T001,Zm00001eb363130_T001,Zm00001eb282640_T001,Zm00001eb046990_T001,Zm00001eb282660_T002
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bc664ea528899e36452dd37c1f55a48a,1.000000,-0.087876,-0.054919,-0.087139,-0.092061,-0.061232,-0.055461,-0.051788,-0.059567,-0.064147,...,0.040014,-0.016705,0.033277,,0.015267,0.003508,0.051901,0.048651,0.005404,0.012344
232ad9e267688a5d573112b4855bac96,-0.087876,1.000000,0.488727,0.999468,0.992517,0.300425,0.492030,0.480338,0.295914,0.311727,...,0.042335,-0.004365,0.053060,,-0.024741,-0.027665,0.097695,0.006515,0.082641,-0.038843
6967c9a10eff11f751218e759df28ab7,-0.054919,0.488727,1.000000,0.511375,0.577524,0.372563,0.999799,0.997708,0.372460,0.394253,...,0.017017,-0.000425,0.075315,,-0.005234,-0.017472,0.081307,0.043504,0.008661,-0.000548
fa79d5937f424b58a27843dfff8bdcd4,-0.087139,0.999468,0.511375,1.000000,0.994379,0.311739,0.514715,0.502919,0.307473,0.323460,...,0.042673,-0.004222,0.056025,,-0.025260,-0.027456,0.098626,0.006875,0.082579,-0.039203
e6b96dce8fbd261b8836b93b9a1d5e07,-0.092061,0.992517,0.577524,0.994379,1.000000,0.314711,0.580325,0.571483,0.310323,0.328559,...,0.044947,-0.004095,0.054318,,-0.026526,-0.028986,0.092879,0.007648,0.078789,-0.035427
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zm00001eb282630_T001,0.003508,-0.027665,-0.017472,-0.027456,-0.028986,-0.019320,-0.017667,-0.016502,-0.018799,-0.020236,...,-0.022385,-0.005412,0.023766,,-0.020607,1.000000,-0.060219,-0.008756,0.007112,-0.091930
Zm00001eb363130_T001,0.051901,0.097695,0.081307,0.098626,0.092879,0.084852,0.081877,0.077514,0.084624,0.085059,...,-0.041444,0.003934,0.009993,,-0.016494,-0.060219,1.000000,-0.087668,0.035119,-0.051543
Zm00001eb282640_T001,0.048651,0.006515,0.043504,0.006875,0.007648,0.003986,0.045122,0.041138,0.004064,0.005371,...,-0.014298,-0.003457,-0.044826,,0.026566,-0.008756,-0.087668,1.000000,0.023122,0.026810
Zm00001eb046990_T001,0.005404,0.082641,0.008661,0.082579,0.078789,-0.011881,0.008405,0.008347,-0.012188,-0.012139,...,0.098524,-0.040518,0.052542,,0.022377,0.007112,0.035119,0.023122,1.000000,-0.038604


In [144]:
highly_correlated_pairs_df = pd.DataFrame(columns=['feature1', 'feature2', 'correlation'])

In [145]:
correlated_pairs_file = open('correlated_pairs.txt', 'w')

# Find the highly correlated pairs
for i in range(len(cor_values.columns)):
    for j in range(i+1, len(cor_values.columns)):
        if (abs(cor_values.iloc[i, j]) > 0.8) or\
           (abs(cor_values.iloc[i, j]) < -0.8):
            pair = (cor_values.columns[i], cor_values.columns[j])
            highly_correlated_pairs_df.loc[len(highly_correlated_pairs_df.index)] = [cor_values.columns[i],
                                                                                     cor_values.columns[j],
                                                                                     cor_values.iloc[i, j]]\
            correlated_pairs_file.write(f'{cor_values.columns[i]}\t{cor_values.columns[j]}\t{cor_values.iloc[i, j]}\n')                                                                   

correlated_pairs_file.close()

KeyboardInterrupt: 