/
analysis_python.py
executable file
·118 lines (91 loc) · 4.29 KB
/
analysis_python.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python3
# Daniel Stribling | ORCID: 0000-0002-0649-9506
# Renne Lab, University of Florida
# Hybkit Project : http://www.github.com/RenneLab/hybkit
"""
Analysis for type/mirna analysis performed as a python workflow.
Provided as an example of direct use of the Hybkit API.
See: "README.rst" for this analysis for more information.
"""
import os
import sys
import hybkit
# Set mirna types as custom to include KSHV-miRNAs
hybkit.settings.HybRecord_settings['mirna_types'] = ['miRNA', 'KSHV-miRNA']
# Set script directories and input file names.
analysis_dir = os.path.abspath(os.path.dirname(__file__))
out_dir = os.path.join(analysis_dir, 'output_python')
input_files = sorted(f for f in os.listdir(analysis_dir) if f.endswith('.hyb'))
match_legend_file = os.path.join(analysis_dir, 'string_match_legend.csv')
# Begin Analysis
print('\nPerforming QC & Summary Analysis...')
if not os.path.isdir(out_dir):
print('Creating Output Directory:\n %s\n' % out_dir)
os.mkdir(out_dir)
print('Analyzing Files:')
print(' ' + '\n '.join(input_files) + '\n')
# Set the method of finding segment type
match_params = hybkit.HybRecord.TypeFinder.make_string_match_params(match_legend_file)
hybkit.HybRecord.TypeFinder.set_method('string_match', match_params)
# Set hybrid segment types to remove as part of quality control (QC)
remove_types = ['rRNA', 'mitoch-rRNA']
# Initialize Combined Analysis
combined_analysis = hybkit.analysis.Analysis(
analysis_types=['type', 'mirna'], name='Combined Analysis'
)
# Iterate over each input file, find the segment types, and save the output
# in the output directory.
for in_file_path in input_files:
in_file_name = os.path.basename(in_file_path)
in_file_label = in_file_name.replace('.hyb', '')
out_file_name = in_file_name.replace('.hyb', '_qc.hyb')
out_file_path = os.path.join(out_dir, out_file_name)
print('Analyzing:\n %s' % in_file_path)
print('Outputting to:\n %s\n' % out_file_path)
sys.stdout.flush()
# Initialize file-specific analysis
file_analysis = hybkit.analysis.Analysis(
analysis_types=['type', 'mirna'], name=in_file_label
)
# Open one HybFile entry for reading, and one for writing
with hybkit.HybFile(in_file_path, 'r', hybformat_id=True) as in_file, \
hybkit.HybFile(out_file_path, 'w') as out_file:
# Track last record identifier, to use only one record per read
last_record_id = None
# Iterate over each record of the input file
for hyb_record in in_file:
hyb_record.eval_types() # Find segment types
hyb_record.eval_mirna() # Find miRNA details
# Determine if record has type that is excluded
use_record = True
for remove_type in remove_types:
if hyb_record.has_prop('any_seg_type_is', remove_type):
use_record = False
break
# If record has an excluded type, continue to next record without analyzing.
if not use_record: # noqa: SIM114
continue
# If record is a duplicate, skip it.
elif hyb_record.id == last_record_id:
continue
last_record_id = hyb_record.id
# Set the dataset label for the record
hyb_record.set_flag('dataset', in_file_label)
# Add record details to analyses
file_analysis.add_hyb_record(hyb_record)
combined_analysis.add_hyb_record(hyb_record)
# Write the modified record to the output file
out_file.write_record(hyb_record)
# Write analysis for input file to outputs.
analysis_file_basename = out_file_path.replace('.hyb', '')
print('Outputting Analyses to:\n %s\n' % analysis_file_basename)
sys.stdout.flush()
file_analysis.write_analysis_results_special(analysis_file_basename)
file_analysis.plot_analysis_results(analysis_file_basename)
# Write combined summary analysis to files.
combined_analysis_file_basename = os.path.join(out_dir, 'combined_analysis')
print('Outputting Combined Analysis to:\n %s\n' % combined_analysis_file_basename)
sys.stdout.flush()
combined_analysis.write_analysis_results_special(combined_analysis_file_basename)
combined_analysis.plot_analysis_results(combined_analysis_file_basename)
print('Done!')