-
Notifications
You must be signed in to change notification settings - Fork 1
/
advanced_python_day1.py
executable file
·385 lines (253 loc) · 11.4 KB
/
advanced_python_day1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
#https://dl.dropboxusercontent.com/u/2838022/python_courses/eg_ap_2015/index.html
#Session 1 - Data Structures
#Data Structures
#Dicts
#Dicts of sets are an interesting data structure - can rapidly look up the key and value (eg condition plus genes expressed), \
#and also exploit the set nature e.g. check if one value is a subset of another value
#Dicts of tuples - good for searching for specific records (list of tuples is fine for iteration but slow for the former), \
#as you can do something like this:
records = {
'ABC123' : ('actgctagt', 1),
'XYZ456' : ('ttaggttta', 1),
'HIJ789' : ('cgcgatcgt', 5)
}
#iteration example
for accession, record in records.items():
(sequence, code) = record
print("looking at record " + accession + " with genetic code " + str(code))
# do something with the record
#searching for something specific
my_record = records.get('XYZ456')
(this_sequence, this_code) = my_record
print("looking at record " + accession + " with genetic code " + str(code))
#^This works as the ascension number is being used as the key, rather than the record code - would have to \
#iterate if we were interested in that
#Collection data structure
import collections
dna = 'aattggaattggaattg'
base_counter = collections.Counter(dna)
print(base_counter)
collections.Counter is a special dict class
#We may want to create dicts which have a default value i.e. an empty list
#We use collections.defaultdict to do this - it avoids us having to do something like creating an empty - {list_of_positions = kmer2list.get(kmer, [])}
dna = 'aattggaattggaattg'
k = 4
kmer2list = {}
kmer2list = collections.defaultdict(list)
for start in range(len(dna) - k + 1):
kmer = dna[start:start + k]
kmer2list[kmer].append(start)
kmer2list
from __future__ import division
heavy_metals = {
'arsenic' : {1,2,3,4,5,6,8,12},
'cadmium' : {2,12,6,4},
'copper' : {7,6,10,4,8},
'mercury' : {3,2,4,5,1}
}
#input is a dict of sets
def main(gene_sets):
'''Gives the similarity score (pairwise similarity matrix) between gene expression conditions'''
#create a dictionary to store the output
#each key is a condition
#each value will be another dictionary
#the keys of this dictionary will be every other condition
#and the value for each is the similarity score for the two condition
psm = {}
#we have to iterate over all values in the dictionary,
#and iterate again but ignore when the conditions are
for condition1, gene_set1 in gene_sets.items():
#create a dictionary for condition1 in our final output
psm[condition1] = {}
#now iterate over every other conditions
for condition2, gene_set2 in gene_sets.items():
if condition1 != condition2:
#calculate the similarity score
#which is the intersect of the two sets divided by the union
union_set = gene_set1.union(gene_set2)
intersect_set = gene_set1.intersection(gene_set2)
score = len(intersect_set) / len(union_set)
#make a variable for our current condition, which is a
#dictionary
current_condition = psm[condition1]
current_condition[condition2] = score
return psm
main(heavy_metals)
##########################################################
#Session 2 - Recursions
#in writing
#Start with a list of all possible single-base sequences. Next, extend each sequence by adding each of the four
#possible bases onto the end. Repeat this extension process as many times as necessary until the sequences are the length
#you require
def generate_kmers(k):
# start with all the one-base strings
result = ['A', 'T', 'G', 'C']
# keep making the strings longer...
for i in range(k - 1):
print("start of loop, result is", result)
new_result = []
# by taking each one and adding each possible base
for kmer in result:
for base in ['A', 'T', 'G', 'C']:
new_result.append(kmer + base)
# the final list becomes the starting list for the next iteration
##note here - result is being altered as we build up the length of the mer, stopping
##when we reach k
result = new_result
print("end of loop, result is", result)
return result
generate_kmers(3)
#Alternative way - see in writing
#To get a list of all kmers of a given length, start by checking the length. If the length is one then the result
#is simply a list of the four bases. If the length is more than one, take the list of all possible sequences whose length
#is one less that the length you're looking for,
#and add each of the four possible bases to each of its elements to get the result.
def generate_kmers_rec(k):
# if k is one, then the result is all one-base strings
if k == 1:
return ['A', 'T', 'G', 'C']
# if k is bigger than one...
else:
result = []
# ...get a list of all kmers which are one base shorter...
#this is the big line - the function references itself
#it continually references itself until
#note that the function will keep on reaching this part before calling itself
#this means that the resuilts = [] will not clear what is returned
#effectively, generate_kmers_rec(k - 1) will become a list that is what returned
for seq in generate_kmers_rec(k - 1):
# ...and append each of the four possible bases
for base in ['A', 'T', 'G', 'C']:
result.append(seq + base)
return result
generate_kmers_rec(3)
#So the function iteratively calls itself UNTIL IT GETS TO THE CONDITION THAT IT DOES NOT CALL ITSELF
tax_dict = {
'Pan troglodytes' : 'Hominoidea', 'Pongo abelii' : 'Hominoidea',
'Hominoidea' : 'Simiiformes', 'Simiiformes' : 'Haplorrhini',
'Tarsius tarsier' : 'Tarsiiformes', 'Haplorrhini' : 'Primates',
'Tarsiiformes' : 'Haplorrhini', 'Loris tardigradus' : 'Lorisidae',
'Lorisidae' : 'Strepsirrhini', 'Strepsirrhini' : 'Primates',
'Allocebus trichotis' : 'Lemuriformes', 'Lemuriformes' : 'Strepsirrhini',
'Galago alleni' : 'Lorisiformes', 'Lorisiformes' : 'Strepsirrhini',
'Galago moholi' : ' Lorisiformes'
}
def get_ancestors(taxon):
if taxon == 'Primates':
return []
else:
parent = tax_dict.get(taxon)
parent_ancestors = get_ancestors(parent)
return [parent] + parent_ancestors
#so each iteration will return all previous ancestors plus the parent taxon of the current taxon
new_tax_dict = {
'Primates': ['Haplorrhini', 'Strepsirrhini'],
'Tarsiiformes': ['Tarsius tarsier'],
'Haplorrhini': ['Tarsiiformes', 'Simiiformes'],
'Simiiformes': ['Hominoidea'],
'Lorisidae': ['Loris tardigradus'],
'Lemuriformes': ['Allocebus trichotis'],
'Lorisiformes': ['Galago alleni','Galago moholi'],
'Hominoidea': ['Pongo abelii', 'Pan troglodytes'],
'Strepsirrhini': ['Lorisidae', 'Lemuriformes', 'Lorisiformes']
}
#two ways of writing a script to get all the descendents of a given taxon - iterative and recursive
def get_descendants(taxon):
result = []
# the stack is the list of taxa whose children we need to include
stack = [taxon]
while len(stack) != 0:
# remove the last element from the stack
current_taxon = stack.pop()
# look up the children
current_taxon_children = new_tax_dict.get(current_taxon, [])
# add the children onto the end of the stack
stack.extend(current_taxon_children)
# add the children to the result list
result.append(current_taxon)
# when the stack is empty we are done
return result
get_descendants("Simiiformes")
def get_descendants_rec(taxon):
# add the current taxon to the resul
result = [taxon]
# look up the list of children for this taxon
children = new_tax_dict.get(taxon, [])
#if the children list is empty, this step is skipped
#result ill still contain the taxon so it will return that
# for each child taxon...
for child in children:
# ... add its decendnts to the result
result.extend(get_descendants_rec(child))
return result
get_descendants_rec("Haplorrhini")
#LCA exercise using iterative answer:
tax_dict = {
'Pan troglodytes' : 'Hominoidea', 'Pongo abelii' : 'Hominoidea',
'Hominoidea' : 'Simiiformes', 'Simiiformes' : 'Haplorrhini',
'Tarsius tarsier' : 'Tarsiiformes', 'Haplorrhini' : 'Primates',
'Tarsiiformes' : 'Haplorrhini', 'Loris tardigradus' : 'Lorisidae',
'Lorisidae' : 'Strepsirrhini', 'Strepsirrhini' : 'Primates',
'Allocebus trichotis' : 'Lemuriformes', 'Lemuriformes' : 'Strepsirrhini',
'Galago alleni' : 'Lorisiformes', 'Lorisiformes' : 'Strepsirrhini',
'Galago moholi' : ' Lorisiformes'
}
#set up the function to get a list of the ancestors of a taxon
def get_ancestors(taxon):
result = [taxon]
while taxon != 'Primates' and taxon != None:
parent = tax_dict.get(taxon)
result.append(parent)
taxon = parent
return result
#input is a list of taxons
def get_LCA(taxon_list):
while (len(taxon_list) >= 2) :
ancestors1 = get_ancestors(taxon_list[0])
ancestors2 = get_ancestors(taxon_list[1])
taxon_list.pop(0)
taxon_list.pop(0)
counter = 0
while (counter != 1):
last_anc = ancestors1[0]
ancestors1 = ancestors1[1:]
if last_anc in ancestors2:
counter = 1
taxon_list.append(last_anc)
return last_anc
print get_LCA([ "Hominoidea", "Pan troglodytes","Tarsius tarsier"])
#recursive answer
tax_dict = {
'Pan troglodytes' : 'Hominoidea', 'Pongo abelii' : 'Hominoidea',
'Hominoidea' : 'Simiiformes', 'Simiiformes' : 'Haplorrhini',
'Tarsius tarsier' : 'Tarsiiformes', 'Haplorrhini' : 'Primates',
'Tarsiiformes' : 'Haplorrhini', 'Loris tardigradus' : 'Lorisidae',
'Lorisidae' : 'Strepsirrhini', 'Strepsirrhini' : 'Primates',
'Allocebus trichotis' : 'Lemuriformes', 'Lemuriformes' : 'Strepsirrhini',
'Galago alleni' : 'Lorisiformes', 'Lorisiformes' : 'Strepsirrhini',
'Galago moholi' : ' Lorisiformes'
}
#set up the function to get a list of the ancestors of a taxon
def get_ancestors(taxon):
result = [taxon]
while taxon != 'Primates' and taxon != None:
parent = tax_dict.get(taxon)
result.append(parent)
taxon = parent
return result
def get_LCA(taxa1, taxa2):
ancestors1 = get_ancestors(taxa1)
ancestors2 = get_ancestors(taxa2)
counter = 0
while (counter != 1):
last_anc = ancestors1[0]
ancestors1 = ancestors1[1:]
if last_anc in ancestors2:
counter = 1
return last_anc
#input is a list of taxons
def recursive(taxon_list):
if (len(taxon_list) == 2):
return get_LCA(taxon_list[0], taxon_list[1])
else:
return recursive(taxon_list[1:])