/
TRANSCISTOR_GOLDLAB.py
267 lines (205 loc) · 10.2 KB
/
TRANSCISTOR_GOLDLAB.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
import os
import argparse
import math
import random
import matplotlib.pyplot as plt
import itertools
import csv
parser=argparse.ArgumentParser(description = "Calculating test statistic to classify cis interaction of UMLILO with nominal p-value as test of significance")
parser.add_argument("-g",nargs = 1,metavar = "GENCODE",type = str, help = "The location of GENCODE File containing positions of genes - which is used to calculate the distance of the TSS of each gene from the respective lncrna")
parser.add_argument("-meta",nargs = 1,type = str, help = "The location of the metadata file")
parser.add_argument("-fantom",nargs = 1,metavar="FANTOM",type = str, help = "The location of the FANTOM data files corresponding to each lncrna")
parser.add_argument("-o",nargs = 1,metavar = "csv file",type = str, help = "The location of the output file ")
args = parser.parse_args()
distance_dict_up={}
distance_dict_down={}
distance_dict_nontargs={}
''' load_meta() runs through the entire lncrna metadata file and stores different attributes as dictionaries with keys corresponding to the ENSEMBL gene id.
The following information is collected -
1) Experiment name
2) Ensembl id
3) Chromosome
4) lncrna gene name
5) TSS of the lncrna
OUTPUT OF FUNCTION: 1 list and 4 dictionaries.
'''
def load_meta():
global tss_dict
global chr_dict
global ensembl_dict
global lncrna_name
global exp_meta
tss_dict={}
chr_dict={}
ensembl_dict={}
lncrna_name={}
with open(args.meta[0],'r') as f:
exp_meta=[]
next(f)
f=f.readlines()
for line in f:
exp_name=line.split('.txt')[0]
ensembl=line.split('\t')[1]
chrom=line.split('\t')[2]
lnc_gene_name=line.split('\t')[4]
tss_start=line.split('\t')[5]
exp_meta.append(str(exp_name))
tss_dict.update({exp_name:tss_start})
chr_dict.update({exp_name:chrom})
ensembl_dict.update({exp_name:ensembl})
lncrna_name.update({exp_name:lnc_gene_name})
''' load_gencode_data() runs through the GENCODE file and stores different attributes as dictionaries with keys corresponding to the ENSEMBL gene id.
The following information is collected -
1) Chromosome of gene
2) TSS of gene
OUTPUT OF FUNCTION: Two dictionaries - both with format {ENSEMBL GENE ID:DATA}
'''
def load_gencode_data():
with open(args.g[0],'r') as g:
p=1
global chrom_gencode
global dist_gencode
chrom_gencode={}
dist_gencode={}
next(g)
for line in g:
line=line.split()
chrom_gencode.update({line[0]:line[1]})
dist_gencode.update({line[0]:line[2]})
'''
Read filenames from the FANTOM data folder. Filenames are the Experiment Ids
OUTPUT OF FUNCTION: A list with Experiment names
'''
def load_FANTOM_files():
arr = os.listdir(args.fantom[0])
exp_fantom=[name.split('.txt')[0] for name in arr]
'''
The simulation(mode) function generates gene lists consisting of non-target genes from the corresponding lncrna experiment file.
Since any lncrna experiment file will most likely contain different number of upregulated and downregulated genes, when simulating
with non-targets, the correct NUMBER of genes must be chosen to replace the dysregulated genes.
For example, if 150 genes were upregulated, then simulations must also contain 150 genes from the non-target set. This is necessary
since the test-statistic calculation involves the number of genes in that dataset.
There are three modes in which this function can be run - i)'UP' ii)'DOWN' iii)'BOTH'.
'UP' corresponds to the case when non-target genes are to replace the upregulated genes.
'DOWN' corresponds to the case when non-target genes are to replace the downregulated genes.
'BOTH' corresponds to the case when non-target genes are to replace the both upregulated and downregulated genes.
OUTPUT OF FUNCTION: A list consisting of appropriate number of non-target ENSEMBL gene ids
'''
def simulation(mode):
other=list(distance_dict_nontargs.keys())
random.shuffle(other)
if mode=='UP':
sim=other[:len(distance_dict_up.values())]
return sim#,number#len(sim)
elif mode=='DOWN':
sim=other[:len(distance_dict_down.values())]
return sim#,number#len(sim)
elif mode=='BOTH':
sim=other[:len(dist_ent.values())]
return sim#,number#len(sim)
'''
The test_stat() function calculates the average of the dataset. It reads off the list generated by simulation(mode),
and gets test-statistic values by referring to the dictionary 'distance_dict_nontargs' which contains 1/d values.
OUTPUT OF FUNCTION: The value of the test-statistic corresponding to the entire dataset
'''
iter=10000 #Number of simulations
def test_stat(inp):
statistic=[]
for gene in inp:
statistic.append(distance_dict_nontargs[gene])
return sum(statistic)/len(statistic)
'''
index_hist_val(og_stat,mode) calculates the fraction of simulations with test-statistic values greater than the orginal
lncrna experiment data.
og_stat is the test-statistic value corresponding to the orginal dataset of a particular experiment file
mode can assume three parameters - 'UP', 'DOWN' or 'BOTH'. The specified mode generates the respective p-value
OUTPUT OF FUNCTION: Directional or overall p-value depending on the mode.
'''
def index_hist_val(og_stat,mode):
counter=0
if mode=='DOWN':
for i in test_stat_val:
if i>og_stat:
counter=counter+1
print('p-val (Down Reg.)',counter/len(test_stat_val))
return counter/len(test_stat_val)
elif mode=='UP':
for i in test_stat_val:
if i>og_stat:
counter=counter+1
print('p-val (Up Reg.)',counter/len(test_stat_val))
return counter/len(test_stat_val)
elif mode=='BOTH':
for i in test_stat_val:
if i>og_stat:
counter=counter+1
print('p-val (Both)',counter/len(test_stat_val))
return counter/len(test_stat_val)
'''
The evaluator() function is the main fucntion that takes each experiment file - parses through it to separate upregulated,
downregulated, and non-target genes, then proceeds to calculate 1/d values. For each dictionary generated (distance_dict_up,
distance_dict_down, and distance_dict_nontargs) in the format {ENSEMBL Gene ID:1/d}, the test-statistic is calculated and
printed.
Next, a .csv file is created to contain the output. Each dataset is then sent over to generate 10,000 simulations, compare the
dataset's test-statistic value with those, and finally report p-values.
OUTPUT OF FUNCTION: .csv file at the location that the user specifies and contains the following columns:
Experiment_ID, lncrna gene name, p-value(UPREGULATED), p-value(DOWNREGULATED), p-value(BOTH)
'''
def evaluator():
with open(args.o[0]+'FANTOM_OUTPUT.csv','a') as w:
header = ['Experiment_id' ,'lncRNA_name', 'pvalue_up','pvalue_down','pvalue_both']
writer = csv.writer(w)
writer.writerow(header)
for i in exp_meta:
with open(args.fantom[0]+'/'+i+'.txt','r') as f:
global distance_dict_up
global distance_dict_down
global distance_dict_nontargs
global test_stat_val
global dist_ent
distance_dict_up={}
distance_dict_down={}
distance_dict_nontargs={}
p=1
next(f)
for line in f:
line=line.split()
#Each dictionary in this step will be setup as {GENCODE Gene ID : 1/d} where d - distance of gene from lncrna
#A gene is only included if it is on the same chromosome as the lncrna as read off from the GENCODE file. The lncrna
#knocked-out in the experiment file is not included in the list as it will introduce zero-error.
if line[1]=='1' and line[0]!=ensembl_dict[i] and chrom_gencode.get(line[0])==chr_dict[i]:
d=abs(int(dist_gencode.get(line[0])) - int(tss_dict[i]))
distance_dict_up.update({line[0]:1/int(d**p)})
elif line[1]=='-1' and line[0]!=ensembl_dict[i] and chrom_gencode.get(line[0])==chr_dict[i]:
d=abs(int(dist_gencode.get(line[0])) - int(tss_dict[i]))
distance_dict_down.update({line[0]:1/int(d**p)})
elif line[1]=='0' and line[0]!=ensembl_dict[i] and chrom_gencode.get(line[0])==chr_dict[i]:
d=abs(int(dist_gencode.get(line[0])) - int(tss_dict[i]))
distance_dict_nontargs.update({line[0]:1/int(d**p)})
dist_ent = {**distance_dict_up, **distance_dict_down}
avg_up=sum(list(distance_dict_up.values()))/len(distance_dict_up.values())
avg_down=sum(list(distance_dict_down.values()))/len(distance_dict_down.values())
avg_entire=(sum(list(distance_dict_up.values()))+sum(list(distance_dict_down.values())))/(len(list(distance_dict_up.values()))+len(list(distance_dict_down.values())))
print('Avg_up:',avg_up)
print('Avg_down:',avg_down)
print('Avg_entire:',avg_entire)
with open(args.o[0]+'FANTOM_OUTPUT.csv','a') as w:
writer = csv.writer(w)
test_stat_val=[]
for _ in range(iter):
test_stat_val.append(test_stat(simulation('DOWN')))
down=index_hist_val(avg_down,'DOWN')
test_stat_val=[]
for _ in range(iter):
test_stat_val.append(test_stat(simulation('UP')))
up=index_hist_val(avg_up,'UP')
test_stat_val=[]
for _ in range(iter):
test_stat_val.append(test_stat(simulation('BOTH')))
both=index_hist_val(avg_entire,'BOTH')
data=[i,lncrna_name[i],up,down,both]
writer.writerow(data)
load_meta()
load_gencode_data()
load_FANTOM_files()
evaluator()