/
retrace.py
247 lines (211 loc) · 8.9 KB
/
retrace.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
# emacs: -*- mode: python; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil; coding: utf-8 -*-
# ex: set sts=4 ts=4 sw=4 noet:
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
# See COPYING file distributed along with the reproman package for the
# copyright and license terms.
#
# ## ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Analyze existing spec or session file system to gather more detailed information
"""
from __future__ import unicode_literals
from os.path import normpath
import sys
import time
from reproman.resource.session import get_local_session
from reproman.resource.session import Session
from .common_opts import resref_opt
from .common_opts import resref_type_opt
from .base import Interface
from ..support.constraints import EnsureNone
from ..support.constraints import EnsureStr
from ..support.exceptions import InsufficientArgumentsError
from ..support.param import Parameter
from ..utils import assure_list
from ..utils import to_unicode
from ..resource import get_manager
# Markup format used by the docstrings in this module.
__docformat__ = 'restructuredtext'
from logging import getLogger
# Module-level logger used by the Retrace interface and the helper functions below.
lgr = getLogger('reproman.api.retrace')
class Retrace(Interface):
    """Gather detailed package information from paths or a ReproZip trace file.

    Examples
    --------

      $ reproman retrace --spec reprozip_run.yml > reproman_config.yml

    """

    # Declarative description of the arguments accepted by `__call__`.
    _params_ = dict(
        spec=Parameter(
            args=("--spec",),
            doc="ReproZip YML file to be analyzed",
            metavar='SPEC',
            # nargs="+",
            constraints=EnsureStr() | EnsureNone(),
        ),
        path=Parameter(
            args=("path",),
            metavar="PATH",
            doc=("path(s) to be traced. If spec is provided, would trace them\n"
                 "after tracing the spec"),
            nargs="*",
            constraints=EnsureStr() | EnsureNone()),
        output_file=Parameter(
            args=("-o", "--output-file",),
            doc="Output file. If not specified - printed to stdout",
            metavar='output_file',
            constraints=EnsureStr() | EnsureNone(),
        ),
        resref=Parameter(
            args=("-r", "--resource",),
            dest="resref",
            metavar="RESOURCE",
            doc=("Name or ID of the resource to operate on. To see available\n"
                 "resources, run 'reproman ls'.[PY: Note: As a special case, a session\n"
                 "instance can be passed as the value for `resref`. PY]"),
            constraints=EnsureStr() | EnsureNone()),
        resref_type=resref_type_opt,
    )

    # TODO: add a session/resource so we could trace within
    # arbitrary sessions
    #
    # Trace the given paths (and/or the files listed in a ReproZip spec) and
    # write the resulting environment spec to `output_file`, or to stdout if
    # no output file is given.
    #
    # Raises InsufficientArgumentsError if neither `spec` nor `path` is given.
    @staticmethod
    def __call__(path=None, spec=None, output_file=None,
                 resref=None, resref_type="auto"):
        if not (spec or path):
            raise InsufficientArgumentsError(
                "Need at least a single --spec or a file"
            )

        # Work on a copy: the list is extended below, and assure_list may
        # hand back the very list object the caller passed in (TODO confirm).
        paths = list(assure_list(path))
        if spec:
            lgr.info("reading spec file %s", spec)
            # TODO: generic loader to auto-detect formats etc
            # heavy import -- delayed until actually used
            from reproman.formats.reprozip import ReprozipProvenance
            spec = ReprozipProvenance(spec)
            paths += spec.get_files() or []

        # Convert paths to unicode and normalize them: the tracers assume
        # normalized paths.
        paths = [normpath(to_unicode(p)) for p in paths]

        if isinstance(resref, Session):
            # TODO: Special case for Python callers. Is this something we want
            # to handle more generally at the interface level?
            session = resref
        elif resref:
            resource = get_manager().get_resource(resref, resref_type)
            session = resource.get_session()
        else:
            session = get_local_session()

        # TODO: at the moment assumes just a single distribution etc.
        #       Generalize
        # TODO: RF so that only the above portion is reprozip specific.
        # If we are to reuse their layout largely -- the rest should stay as is
        (distributions, files) = identify_distributions(
            paths,
            session=session
        )
        from reproman.distributions.base import EnvironmentSpec
        spec = EnvironmentSpec(
            distributions=distributions,
        )
        if files:
            # Files no tracer claimed are recorded in sorted order
            spec.files = sorted(files)

        # TODO: generic writer!
        from reproman.formats.reproman import RepromanProvenance
        stream = open(output_file, "w") if output_file else sys.stdout
        try:
            RepromanProvenance.write(stream, spec)
        finally:
            # Close the file we opened even if writing fails; never close
            # stdout, which we do not own.
            if stream is not sys.stdout:
                stream.close()
# TODO: the session should carry state. The idea is that we may want
# to trace while inheriting all custom PATHs which that run might have
# had
def identify_distributions(files, session=None, tracer_classes=None):
    """Identify packages files belong to

    Each tracer class is instantiated and asked, in order, to claim files.
    The full sequence of tracers is repeated until a pass makes no progress,
    the last tracer is left with nothing to trace, or the iteration limit is
    exceeded.

    Parameters
    ----------
    files : iterable
      Files to consider
    session : optional
      Session whose `isdir` is used to detect directories and which is passed
      to every tracer.  If not provided, the local session is used.
    tracer_classes : iterable, optional
      Tracer classes to try, in order.  Each must provide a `HANDLES_DIRS`
      attribute, be constructible as ``Tracer(session=...)`` and have an
      `identify_distributions(files)` method yielding
      ``(distribution, remaining_files)`` pairs.  If None, the classes
      returned by `get_tracer_classes()` are used.

    Returns
    -------
    distributions : list
      Distributions yielded by the tracers, in the order they were found
    unknown_files : set of str
      Files which were not determined to belong to any specific distribution
    """
    if tracer_classes is None:
        tracer_classes = get_tracer_classes()

    session = session or get_local_session()
    # TODO create list of appropriate for the `environment` OS tracers
    #      in case of no environment -- get current one
    # TODO: should operate in the session, might be given additional information
    #       not just files

    # Files nobody has claimed yet.  It is re-bound (not mutated in place)
    # after every tracer to whatever that tracer left unclaimed.
    files_to_consider = set(files)

    distributions = []
    files_processed = set()
    files_to_trace = files_to_consider

    niter = 0
    max_niter = 10
    while True:
        niter += 1
        # Snapshot the counters to detect a pass without any progress
        nfiles_processed = len(files_processed)
        nfiles_to_trace = len(files_to_trace)
        lgr.info("Entering iteration #%d over Tracers", niter)
        if niter > max_niter:
            lgr.error(
                "We did %s iterations already, something is not right",
                max_niter)
            break

        for Tracer in tracer_classes:
            lgr.debug("Tracing using %s", Tracer.__name__)

            # TODO: memoize across all loops
            # Identify directories among *all* files still to consider.
            # Looking only at files_to_trace would miss the directories set
            # aside for a previous tracer and leak them into the next tracer
            # even if it cannot handle directories either.
            dirs = set(filter(session.isdir, files_to_consider))

            # Pull out directories if the tracer can't handle them
            if Tracer.HANDLES_DIRS:
                files_to_trace = files_to_consider
                files_skipped = set()
            else:
                files_to_trace = files_to_consider - dirs
                files_skipped = files_to_consider - files_to_trace

            tracer = Tracer(session=session)
            begin = time.time()
            # yoh things the idea was that tracer might trace even without
            #     files, so we should not just 'continue' the loop if there is no
            #     files_to_trace
            if files_to_trace:
                # If the tracer yields nothing, everything remains untraced
                remaining_files_to_trace = files_to_trace
                nenvs = 0
                for env, remaining_files_to_trace in \
                        tracer.identify_distributions(files_to_trace):
                    distributions.append(env)
                    nenvs += 1
                files_processed |= files_to_trace - remaining_files_to_trace
                files_to_trace = remaining_files_to_trace
                lgr.info("%s: %d envs with %d other files remaining",
                         Tracer.__name__,
                         nenvs,
                         len(files_to_trace))

            # Re-combine any files that were skipped
            files_to_consider = files_to_trace | files_skipped

            lgr.debug("Assigning files to packages by %s took %f seconds",
                      tracer, time.time() - begin)
        if len(files_to_trace) == 0 or (
                nfiles_processed == len(files_processed) and
                nfiles_to_trace == len(files_to_trace)):
            lgr.info("No more changes or files to track. Exiting the loop")
            break

    return distributions, files_to_consider
def get_tracer_classes():
    """Return the list of all Tracer classes known to this module

    Ideally the order would be irrelevant, but ATM it does matter and is
    simply fixed by hand below.
    """
    # TODO: automate discovery of available tracers
    # Imports happen at call time, in the same order the tracers are listed.
    from reproman.distributions.debian import DebTracer
    from reproman.distributions.redhat import RPMTracer
    from reproman.distributions.conda import CondaTracer
    from reproman.distributions.venv import VenvTracer
    from reproman.distributions.vcs import VCSTracer
    from reproman.distributions.docker import DockerTracer
    from reproman.distributions.singularity import SingularityTracer

    return [
        DebTracer,
        RPMTracer,
        CondaTracer,
        VenvTracer,
        VCSTracer,
        DockerTracer,
        SingularityTracer,
    ]