# Correlations between transcriptome and microbiome



## Datasets

 * RNAseq matrix with TPM values
 * Feature table with ASV counts

### Associations between RNAseq and 16S 

Here is the list of genotypes:

In [555]:
genotypes = ["33-16","38-11","4226","4722","A188","A214N","A239","A272",
 "A441-5","A554","A556","A6","A619","A632","A634","A635","A641",
 "A654","A659","A661","A679","A680","A682","Ab28A","Ames26808",
 "Ames27128","Ames28290","Ames28291","B10","B103","B104","B105",
 "B109","B115","B14A","B164","B2","B37","B46","B52","B57",
 "B64","B68","B73","B73(PI550473)","B73Htrhm","B75","B76","B77",
 "B79","B84","B97","B97(PI564682)","C103","C123","C49A","CH701-30",
 "CH9","CI 7 Goodman-Buckler","CI 187-2","CI 21E","CI28A Goodman-Buckler","CI31A","CI 3A","CI64","CI66",
 "CI90C","CI 91B Goodman-Buckler","CM105","CM174","CM37","CM7","CML 10","CML 103","CML103(Ames27081)",
 "CML 108","CML 11","CML 14","CML 154Q","CML 157Q","CML 158Q","CML206","CML206(Krakowsky)",
 "CML 218","CML 220","CML228","CML228(Ames27088)","CML 238","CML 247","CML247(PI595541)",
 "CML 254","CML 258","CML 261","CML 264","CML277","CML277(PI595550)","CML281","CML287",
 "CML 311","CML 314","CML 321","CML 322","CML322(Ames27096)","CML 323","CML 328","CML330",
 "CML 331","CML 332","CML 333","CML333(Ames27101)","CML 341","CML 38","CML418","CML45",
 "CML 5","CML 52","CML52(PI595561)","CML 61","CML 69","CML69(Ames28184)","CML 77","CML 91",
 "CML 92","CMV3","CO106","CO125","CO255","D940Y","DE1","DE811","DE2","DE_3","E2558W",
 "EMPTY","EP1","F2834T","F44","F6","F7","GA209","GT112","H105W","H49","H84","H91","H95",
 "H99","HP301","Hi27","Hp301(PI587131)","Hy","I137TN","I 205","I29","IA2132","IDS28","IDS69",
 "IDS91","IL14H","Ia5125","Il 101T","Il14H","Il14H(Ames27118)","Il677a","K148","K4","K55","K64",
 "KE_Maize634","KY226","KY228","Ki11","Ki11(Ames27124)","Ki14","Ki2021","Ki21","Ki3","Ki3(Ames27123)",
 "Ki43","Ki44","Ky21","Ky21(Ames27130)","L317","L578","M14","M162W","M162W(Ames27134)","M37W",
 "M37W(Ames27133)","MEF156-55-2","MO1W","MR19(Santo Domingo)","MR20(ShoePeg)","MR20_(Shoe_Peg)_S7",
 "MS1334","MS153","MS71","MS71(PI587137)","Mo17","Mo17(PI648430)","Mo18W","Mo18W(PI550441)","Mo24W",
 "Mo44","Mo45","Mo46","Mo47","Mo.G","Mp339","Mt42","N192","N28Ht","N6","N7A","NC222","NC230","NC232",
 "NC236","NC238","NC250","NC258","NC260","NC262","NC264","NC290A","NC294","NC296","NC296A","NC298",
 "NC300","NC302","NC304","NC306","NC310","NC314","NC318","NC320","NC324","NC326","NC328","NC33",
 "NC336","NC338","NC340","NC342","NC344","NC346","NC348","NC350","NC350(Ames27171)","NC352",
 "NC354","NC356","NC358","NC358(Ames27175)","NC360","NC362","NC364","NC366","NC368","ND246",
 "OH7B","Oh40B","Oh43","Oh43(Ames19288)","Oh43E","Oh603","Oh7B(Ames19323)","Os420","P39",
 "P39(Ames28186)","Pa762","Pa875","Pa880","Pa91","R109B","R168","R177","R229","R4","SA24",
 "SC 213R","SC 357","SC55","SD40","SD44","Sg 1533","Sg 18","T232","T234","T8","Tx303",
 "Tx303(Ames19327)","Tx601","Tzi 10","Tzi 11","Tzi 16","Tzi 18","Tzi 25","Tzi 8",
 "Tzi8(PI506246)","Tzi 9","U267Y","VA102","Va14","Va17","Va22","Va26",
 "Va35","Va59","Va85","Va99","VaW6","W117Ht","W153R","W182B","W22",
 "W22 R-rstd","W64A","WD","Wf9","Yu796_NS","tripsacum"]

In [556]:
# Convert all strings in the list to upper case
genotypes_upper = [x.upper() for x in genotypes]

In [557]:
# Convert all strings in the list to upper case
print(f'List has {len(genotypes_upper)} genotypes from 0h_samples_sorted_filtered.hmp.txt')

List has 323 genotypes from 0h_samples_sorted_filtered.hmp.txt


For the correlations between Kremling and Wallace papers, information about the day and time of sampling and genotype must match in order to do any correlations.

 * Run information for [16S data]()
 * Run information for [RNAseq data]()

Such information can be easily extracted for the RNAseq data based on field `LibraryName`

For Dr. Wallace manuscript, a file is available on [FigShare](https://doi.org/10.6084/m9.figshare.5886769.v2) that can connect plots (therefore genotypes) and sequencing runs

 * 0_plate_key.txt has associations between genotypes and plots (NCBI run metadata have plots as part of the identifier)

In [558]:
plate_key_file = '/home/rsantos/Repositories/maize_microbiome_transcriptomics/16S_wallace2018/0_plate_key.txt'
sra_run_table_16s = '/home/rsantos/Repositories/maize_microbiome_transcriptomics/16S_wallace2018/SraRunInfo_Wallace_etal_2018.csv'
sra_run_table_rnaseq = '/home/rsantos/Repositories/maize_microbiome_transcriptomics/rnaseq_kremling2018/run_info/SraRunInfo_Kremling_etal_2018.csv'

dict_wallace_kremling_2018 = {}

Checking consistency of maize genotype names between 0_plate_key.txt and 0h_samples_sorted_filtered.hmp.txt:

In [559]:
tmp_run_id = 1
genotypes_not_found = []
genotypes_found = []

with open(plate_key_file, 'r') as file:

    _ = file.readline()

    for line in file:
        fields = line.strip().split('\t')
        fields2 = fields[3].split('_')
        plot_id = fields[2]
        day = fields2[1]
        day_period = fields2[0]
        genotype = fields[14]

        #Check if genotype matches the list of genotypes
        if genotype == '#N/A':
            pass
            #print(f'Genotype is #N/A for plot {plot_id} (blank plot)')
        elif not genotype.upper() in genotypes_upper:
            if genotype.upper() not in genotypes_not_found:
                genotypes_not_found.append(genotype.upper())
            #print(f'Genotype {genotype} not in list of genotypes')
            #print(f'##{plot_id}\t{day}\t{day_period}\t{genotype}')
        else:
            #print(f'{plot_id}\t{day}\t{day_period}\t{genotype}')
            if genotype.upper() not in genotypes_found:
                genotypes_found.append(genotype.upper())

print(f'{len(genotypes_found)} Genotypes found in the list of genotypes: {genotypes_found}')
print(f'{len(genotypes_not_found)} Genotypes not found in the list of genotypes: {genotypes_not_found}')

261 Genotypes found in the list of genotypes: ['CO255', 'CM7', 'F7', 'ND246', 'A654', 'CO106', 'W182B', 'EP1', 'A188', 'A659', 'CMV3', 'CO125', 'P39', 'A556', 'MEF156-55-2', 'A554', 'C49A', 'CM105', 'W117HT', 'A661', 'I29', 'IA2132', 'SD44', 'MS1334', 'N192', 'B76', 'CM174', 'A619', 'H99', 'R168', 'W153R', 'MS71', 'OH43', 'IA5125', 'IL14H', 'C123', 'A641', 'A679', 'B103', 'A682', 'CH701-30', 'MS153', 'A272', 'A632', 'MO47', 'VA59', 'A635', 'B73', 'H105W', 'M14', 'VA102', 'W64A', 'N6', 'A239', 'YU796_NS', 'WF9', 'H49', 'B164', 'PA762', 'IL 101T', 'SD40', 'R177', 'A680', 'B79', 'C103', 'B75', 'H84', 'VA35', 'NC364', 'B14A', 'B109', 'IDS91', 'I 205', 'MO17', 'IDS28', 'W22', 'CH9', 'R109B', 'NC260', 'OH40B', 'VA99', 'VAW6', 'B57', 'NC310', 'MO44', 'NC362', 'OH43E', 'H95', 'MO46', 'CI 3A', 'MO1W', 'NC324', 'B97', 'NC328', '38-11', 'B73HTRHM', 'SA24', 'W22 R-RSTD', 'B37', 'VA22', 'B2', 'HP301', 'T8', 'VA17', 'NC368', 'B104', 'NC264', 'F2834T', 'OH603', 'R229', 'PA875', 'B10', 'B68', 'K55', '

Filling dictionary with run metadata (except run accession)

In [560]:
with open(plate_key_file, 'r') as file:

    _ = file.readline()

    for line in file:
        fields = line.strip().split('\t')
        fields2 = fields[3].split('_')
        plot_id = fields[2]
        day = fields2[1]
        day_period = fields2[0]
        genotype = fields[14]

        if genotype != '#N/A':
            dict_wallace_kremling_2018[tmp_run_id] = {'run_accession_16s': '',
                                    'run_accession_rnaseq': '',
                                    'plot_id': plot_id,
                                    'day': day,
                                    'day_period': day_period,
                                    'genotype': genotype}
        tmp_run_id+=1

In [561]:
# Just checking dict strcuture
dict_wallace_kremling_2018[1]

{'run_accession_16s': '',
 'run_accession_rnaseq': '',
 'plot_id': '14A0005',
 'day': '8',
 'day_period': 'LMAD',
 'genotype': 'CO255'}

Adding run accession information:

In [562]:
runs_list = []
runs_in_dict = []

with open(sra_run_table_16s, 'r') as file:

    _ = file.readline()

    for line in file:
        fields = line.strip().split(',')
        fields2 = fields[11].split('.')
        run_accession = fields[0]
        day_period = fields2[0]
        day = fields2[1]
        plot_id = fields2[2]

        runs_list.append(run_accession)

        #print(f'{run_accession}\t{plot_id}\t{day}\t{day_period}')
        for key in dict_wallace_kremling_2018.keys():
            if dict_wallace_kremling_2018[key]['plot_id'] == plot_id and\
                 dict_wallace_kremling_2018[key]['day'] == day and\
                 dict_wallace_kremling_2018[key]['day_period'] == day_period:
                dict_wallace_kremling_2018[key]['run_accession_16s'] = run_accession
                runs_in_dict.append(run_accession)

In [563]:
# Just checking if any record has no run_accession
for key, value in dict_wallace_kremling_2018.items():
    if not dict_wallace_kremling_2018[key]['run_accession_16s']:
        print(f'{key}\t{dict_wallace_kremling_2018[key]}')

In [564]:
len(set(runs_list).difference(set(runs_in_dict)))

8

In [565]:
dict_wallace_kremling_2018

{1: {'run_accession_16s': 'SRR6665630',
  'run_accession_rnaseq': '',
  'plot_id': '14A0005',
  'day': '8',
  'day_period': 'LMAD',
  'genotype': 'CO255'},
 2: {'run_accession_16s': 'SRR6665629',
  'run_accession_rnaseq': '',
  'plot_id': '14A0007',
  'day': '8',
  'day_period': 'LMAD',
  'genotype': 'CM7'},
 3: {'run_accession_16s': 'SRR6665624',
  'run_accession_rnaseq': '',
  'plot_id': '14A0009',
  'day': '8',
  'day_period': 'LMAD',
  'genotype': 'F7'},
 4: {'run_accession_16s': 'SRR6665623',
  'run_accession_rnaseq': '',
  'plot_id': '14A0011',
  'day': '8',
  'day_period': 'LMAD',
  'genotype': 'ND246'},
 5: {'run_accession_16s': 'SRR6665626',
  'run_accession_rnaseq': '',
  'plot_id': '14A0013',
  'day': '8',
  'day_period': 'LMAD',
  'genotype': 'A654'},
 6: {'run_accession_16s': 'SRR6665625',
  'run_accession_rnaseq': '',
  'plot_id': '14A0015',
  'day': '8',
  'day_period': 'LMAD',
  'genotype': 'CO106'},
 7: {'run_accession_16s': 'SRR6665621',
  'run_accession_rnaseq': '',


In [566]:
print(set(runs_list).difference(set(runs_in_dict)))
print(set(runs_in_dict).difference(set(runs_list)))

{'SRR6666051', 'SRR6666050', 'SRR6666006', 'SRR6665809', 'SRR6665758', 'SRR6665849', 'SRR6665812', 'SRR6665757'}
set()


In [567]:
len(set(runs_in_dict).union(set(runs_list)))

592

Bringing RNAseq data!

In [568]:
genotype2ids = {
   "CI21E": "CI 21E",
   "CI187-2": "CI 187-2",
   "CI28A": "CI28A Goodman-Buckler",
   "CML108": "CML 108",
   "CML10": "CML 10",
   "CML11": "CML 11",
   "SG1533": "SG 1533",
   "TZI11": "TZI 11",
   "SC357": "SC 357",
   "CML505": "CML505",
   "SG18": "SG 18",
   "W22R-RSTD": "W22 R-RSTD",
   "YU796NS": "YU796_NS",
   "MR20": "MR20(ShoePeg)",
   "TZI16": "TZI 16",
   "CML103": "CML 103",
   "TZI18": "TZI 18",
   "MR19": "MR19(Santo Domingo)",
   "TZI9": "TZI 9",
   "SC213R": "SC 213R",
   "CML322": "CML 322",
   "CML341": "CML 341",
   "CML238": "CML 238",
   "TZI25": "TZI 25",
   "CML77": "CML 77",
   "CML333": "CML 333",
   "CI91B": "CI 91B Goodman-Buckler",
   "CI3A": "CI 3A",
   "CI7": "CI 7 Goodman-Buckler",
   "CML220": "CML 220",
   "I205": "I 205",
   "CML91": "CML 91",
   "CML323": "CML 323",
   "CML314": "CML 314",
   "CML38": "CML 38",
   "CML218": "CML 218",
   "CML69": "CML 69",
   "CML92": "CML 92",
   "CML331": "CML 331",
   "CML158Q": "CML 158Q",
   "CML328": "CML 328",
   "CML157Q": "CML 157Q",
   "ILLHY": "ILL.HY",
   "CML261": "CML 261",
   "CML154Q": "CML 154Q",
   "IL101T": "IL 101T"
}

In [569]:
import re

not_found = 0

with open(sra_run_table_rnaseq, 'r') as file:

    _ = file.readline()

    for line in file:
        fields = line.strip().split(',')
        fields2 = fields[11].split('_')
        sample_id = fields2[1]
        genotype = fields2[2]
        day = ''
        match = re.search(r'\d+', sample_id)
        unmatched_parts = re.split(r'\d+', sample_id)
        day_period = unmatched_parts[0]
        if match:
            day = int(match.group())
        if sample_id.startswith('LMA'):
            if genotype.upper() not in genotypes_upper:
                if genotype.upper() not in genotype2ids.keys():
                    print(f'PROBLEM: {fields[0]} {day} {unmatched_parts[0]} {genotype}')
                    not_found+=1

print(f'Not found: {not_found}')

PROBLEM: SRR5911290 26 LMAN CML504
PROBLEM: SRR5911291 26 LMAD CML504
PROBLEM: SRR5911333 26 LMAD CML511
PROBLEM: SRR5911464 26 LMAN CML96
PROBLEM: SRR5911469 26 LMAN CML511
PROBLEM: SRR5911472 26 LMAD CML84
PROBLEM: SRR5911475 26 LMAN CML84
PROBLEM: SRR5911597 26 LMAD CML96
PROBLEM: SRR5911605 26 LMAN CML85
PROBLEM: SRR5911606 26 LMAD CML85
PROBLEM: SRR5911607 26 LMAD CML411
PROBLEM: SRR5911608 26 LMAN CML411
Not found: 12


In [570]:
import re

not_found = 0

with open(sra_run_table_rnaseq, 'r') as file:

    _ = file.readline()

    for line in file:
        fields = line.strip().split(',')
        fields2 = fields[11].split('_')
        sample_id = fields2[1]
        genotype = fields2[2]
        if genotype.upper() in genotype2ids.keys():
            genotype = genotype2ids[genotype.upper()]
        day = ''
        match = re.search(r'\d+', sample_id)
        unmatched_parts = re.split(r'\d+', sample_id)
        day_period = unmatched_parts[0]
        if match:
            day = str(match.group())
        if sample_id.startswith('LMA'):
            for key in dict_wallace_kremling_2018:
                if dict_wallace_kremling_2018[key]['day'] == day and\
                    dict_wallace_kremling_2018[key]['genotype'] == genotype and\
                    dict_wallace_kremling_2018[key]['day_period'] == day_period:
                    dict_wallace_kremling_2018[key]['run_accession_rnaseq'] = fields[0]

In [571]:
dict_wallace_kremling_2018[5]

{'run_accession_16s': 'SRR6665626',
 'run_accession_rnaseq': 'SRR5910043',
 'plot_id': '14A0013',
 'day': '8',
 'day_period': 'LMAD',
 'genotype': 'A654'}

In [572]:
count_rnaseq = 0

# Just checking if any record has no run_accession
for key, value in dict_wallace_kremling_2018.items():
    if not dict_wallace_kremling_2018[key]['run_accession_rnaseq']:
        print(f'{key}\t{dict_wallace_kremling_2018[key]}')
        count_rnaseq+=1

print(count_rnaseq, "records without run_accession_rnaseq")

28	{'run_accession_16s': 'SRR6665727', 'run_accession_rnaseq': '', 'plot_id': '14A0061', 'day': '8', 'day_period': 'LMAD', 'genotype': 'CM174'}
49	{'run_accession_16s': 'SRR6665828', 'run_accession_rnaseq': '', 'plot_id': '14A0111', 'day': '8', 'day_period': 'LMAD', 'genotype': 'A635'}
50	{'run_accession_16s': 'SRR6665825', 'run_accession_rnaseq': '', 'plot_id': '14A0113', 'day': '8', 'day_period': 'LMAD', 'genotype': 'B73'}
53	{'run_accession_16s': 'SRR6665831', 'run_accession_rnaseq': '', 'plot_id': '14A0115', 'day': '8', 'day_period': 'LMAD', 'genotype': 'H105W'}
54	{'run_accession_16s': 'SRR6665832', 'run_accession_rnaseq': '', 'plot_id': '14A0117', 'day': '8', 'day_period': 'LMAD', 'genotype': 'M14'}
55	{'run_accession_16s': 'SRR6665829', 'run_accession_rnaseq': '', 'plot_id': '14A0119', 'day': '8', 'day_period': 'LMAD', 'genotype': 'Va102'}
56	{'run_accession_16s': 'SRR6665830', 'run_accession_rnaseq': '', 'plot_id': '14A0121', 'day': '8', 'day_period': 'LMAD', 'genotype': 'W64A'

In [578]:
for key in dict_wallace_kremling_2018.keys():
    if dict_wallace_kremling_2018[key]['genotype'] == 'IL 101T':
        print(f'{key}\t{dict_wallace_kremling_2018[key]}')        

In [574]:
len([dict_wallace_kremling_2018[key]['genotype'] for key in dict_wallace_kremling_2018.keys() if dict_wallace_kremling_2018[key]['run_accession_16s']])

584

In [575]:
len([dict_wallace_kremling_2018[key]['genotype'] for key in dict_wallace_kremling_2018.keys() if dict_wallace_kremling_2018[key]['run_accession_rnaseq']])

473

In [576]:
set([dict_wallace_kremling_2018[key]['genotype'] for key in dict_wallace_kremling_2018.keys() if dict_wallace_kremling_2018[key]['run_accession_16s']]).difference(set([dict_wallace_kremling_2018[key]['genotype'] for key in dict_wallace_kremling_2018.keys() if dict_wallace_kremling_2018[key]['run_accession_rnaseq']]))

{'B115',
 'CM174',
 'DE2',
 'DE_3',
 'Il 101T',
 'Ill.Hy',
 'KUI2007',
 'MR20(Shoepeg)',
 'Mo.G',
 'R109B',
 'Sg 1533',
 'Tzi 11',
 'Tzi 16',
 'Tzi 18',
 'Tzi 25',
 'Tzi 9',
 'W22 R-rstd',
 'Yu796_NS'}

In [577]:
set([dict_wallace_kremling_2018[key]['genotype'] for key in dict_wallace_kremling_2018.keys() if dict_wallace_kremling_2018[key]['run_accession_rnaseq']]).difference(set([dict_wallace_kremling_2018[key]['genotype'] for key in dict_wallace_kremling_2018.keys() if dict_wallace_kremling_2018[key]['run_accession_16s']]))

set()

## Actual computation of correlations

At the time of writing, there are at least two interesting approaches:

 * Deep Graph
 * CorALS

In [4]:
from corals.threads import set_threads_for_external_libraries
set_threads_for_external_libraries(n_threads=1)

In [5]:
import numpy as np

In [6]:
n_features = 20000
n_samples = 50
X = np.random.random((n_samples, n_features))

In [10]:
X.shape

(50, 20000)

In [11]:
# runtime: ~2 sec
from corals.correlation.full.default import cor_full
cor_values = cor_full(X)

In [13]:
cor_values.shape

(20000, 20000)