diff --git a/tools/probes/common/boost_long_fts_jobs b/tools/probes/common/boost_long_fts_jobs deleted file mode 100755 index 09921a40e0..0000000000 --- a/tools/probes/common/boost_long_fts_jobs +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Wen Guan, , 2016 - -import datetime -import os -import sys - -from rucio.common.config import config_get -from rucio.core import request as request_core - -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - - -if __name__ == "__main__": - - try: - if len(sys.argv) == 3: - priority = int(sys.argv[1]) - older_than = int(sys.argv[2]) - activities = None - else: - priority = int(sys.argv[1]) - older_than = int(sys.argv[2]) - activities = sys.argv[3] - except IndexError, error: - activities = None - priority = 4 - older_than = 3600 * 72 - - WORST_RETVALUE = OK - - try: - proxy = config_get('nagios', 'proxy') - os.environ["X509_USER_PROXY"] = proxy - except Exception as error: - print "Failed to get proxy from rucio.cfg" - WORST_RETVALUE = WARNING - - try: - filter = {'older_than': datetime.datetime.utcnow() - datetime.timedelta(seconds=older_than)} - if activities: - filter['activities'] = activities.split(",") - request_core.update_requests_priority(priority, filter=filter) - except: - print "Failed to boost priority" - WORST_RETVALUE = CRITICAL - sys.exit(WORST_RETVALUE) diff --git a/tools/probes/common/check_WebDAV_ping b/tools/probes/common/check_WebDAV_ping deleted file mode 100755 index ab4f27ba2d..0000000000 --- a/tools/probes/common/check_WebDAV_ping +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Cedric Serfon, , 2014-2015 - -''' -Probe to check the queues of the transfer service -''' - -import os -import sys -from rucio.common.config import config_get -from rucio.common.exception import ServiceUnavailable, RSEProtocolNotSupported -from rucio.rse import rsemanager - -scheme = 'https' -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 -site = sys.argv[1] - - -try: - proxy = config_get('nagios', 'proxy') - os.environ["X509_USER_PROXY"] = proxy -except Exception as e: - print "Failed to get proxy from rucio.cfg" - sys.exit(CRITICAL) - -try: - rse_settings = rsemanager.get_rse_info(site) - dict = rsemanager.select_protocol(rse_settings, operation='write', scheme=scheme) - basepath = '%s://%s:%s%s' % (dict['scheme'], dict['hostname'], dict['port'], dict['prefix']) - print 'Testing existence of %s' % basepath - p = rsemanager.create_protocol(rse_settings, operation='write', scheme='https') - try: - p.connect() - except ServiceUnavailable, e: - print e - sys.exit(CRITICAL) - if not p.exists(basepath): - sys.exit(CRITICAL) - p.close() - print '%s exists' % basepath -except RSEProtocolNotSupported, e: - print e - sys.exit(WARNING) -except: - sys.exit(CRITICAL) -sys.exit(OK) diff --git a/tools/probes/common/check_activemq_queue b/tools/probes/common/check_activemq_queue deleted file mode 100755 index ddcb0fa548..0000000000 --- a/tools/probes/common/check_activemq_queue +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Thomas Beermann, , 2014 - -''' -Probe to check if an ActiveMQ queue is above a certain threshold. -''' - -from sys import argv, exit - -from requests import get - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -if __name__ == "__main__": - broker = argv[1] - destination = argv[2] - threshold = int(argv[3]) - url = """http://%s:61004/j4p/read/org.apache.activemq:type=Broker,brokerName=atlas_%s,destinationType=Queue,destinationName=%s/QueueSize""" % (broker, broker, destination) - - r = get(url) - - if r.status_code != 200: - exit(UNKNOWN) - - current = r.json()['value'] - if current > (2 * threshold): - exit(CRITICAL) - elif current > threshold: - exit(WARNING) - - exit(OK) diff --git a/tools/probes/common/check_always_ok b/tools/probes/common/check_always_ok deleted file mode 100755 index d05649baf4..0000000000 --- a/tools/probes/common/check_always_ok +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Cedric Serfon, , 2014 - -import sys -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -sys.exit(OK) diff --git a/tools/probes/common/check_ami b/tools/probes/common/check_ami deleted file mode 100755 index 214031672f..0000000000 --- a/tools/probes/common/check_ami +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python -''' - Copyright European Organization for Nuclear Research (CERN) 2013 - - Licensed under the Apache License, Version 2.0 (the "License"); - You may not use this file except in compliance with the License. - You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - - Authors: - - Vincent Garonne, , 2015 - - Cedric Serfon, , 2018 - - Probe to check datatype, project and convention from AMI -''' - -import sys -import traceback - -from rucio.db.sqla.session import get_session -from rucio.core.scope import add_scope - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - - -if __name__ == "__main__": - try: - session = get_session() - - # Import datatypes - query = '''MERGE INTO atlas_rucio.did_key_map D -USING (select datatype from ATLAS_AMI_PRODUCTION_01_R.data_type@AMI_ATLR.CERN.CH where writestatus='valid') S -ON (D.key='datatype' and S.datatype = D.value) -WHEN NOT MATCHED THEN INSERT (key,value,created_at, updated_at) -VALUES ('datatype', S.datatype, sys_extract_utc(systimestamp), sys_extract_utc(systimestamp))''' - session.execute(query) - session.commit() - - # Import projects - query = '''MERGE INTO atlas_rucio.did_key_map D -USING (select distinct(projecttag) as value from ATLAS_AMI_PRODUCTION_01_R.projects@AMI_ATLR.CERN.CH where writestatus='valid') S -ON (D.key='project' and S.value = D.value) -WHEN NOT MATCHED THEN INSERT (key, value,created_at, updated_at) -VALUES ('project', S.value, sys_extract_utc(systimestamp), sys_extract_utc(systimestamp))''' - session.execute(query) - session.commit() - - # Import scopes - query = '''select projecttag -from ATLAS_AMI_PRODUCTION_01_R.projects@AMI_ATLR.CERN.CH -where writestatus='valid' -and not exists (select 1 from atlas_rucio.scopes where scope = projecttag) -and projecttag not in ('user', 'group', 'group10', 'user10', 'user09', 'group', 'data', 'condR2')''' - for project, in session.execute(query): - add_scope(scope=project, account='root') - - # import naming convention - query = '''select projecttag, nomenclaturetemplate -from ATLAS_AMI_PRODUCTION_01_R.projects@AMI_ATLR.CERN.CH A, ATLAS_AMI_PRODUCTION_01_R.nomenclature@AMI_ATLR.CERN.CH B -where A.writestatus='valid' and B.identifier = A.NOMENCLATUREFK and B.nomenclaturestatus = 'VALID' -and nomenclaturetemplate in ('project.runNumber.streamName.prodStep.dataType.Version', 'project.datasetNumber.physicsShort.prodStep.dataType.Version')''' - for project, naming_convention in session.execute(query): - try: - regexp = '^(?P%(project)s)\.(?P\d+)\.(?P[a-zA-Z0-9\_\-]+)\.(?P[a-zA-Z0-9\_\-]+)\.(?P\w+)(?:\.(?P[a-zA-Z0-9\_\-]{1,70}))?$' % locals() - query = '''MERGE INTO atlas_rucio.naming_conventions D -USING (SELECT '%(project)s' as scope, '%(regexp)s' as regexp FROM DUAL) S -ON (D.scope = S.scope) -WHEN MATCHED THEN UPDATE SET regexp='%(regexp)s', updated_at=sys_extract_utc(systimestamp) -WHEN NOT MATCHED THEN INSERT(scope, regexp, convention_type, updated_at, created_at) -VALUES('%(project)s', '%(regexp)s', 'DATASET', sys_extract_utc(systimestamp), sys_extract_utc(systimestamp))''' % locals() - session.execute(query) - session.commit() - 
except Exception as error: - print traceback.format_exc(error) - except: - print traceback.format_exc(error) - - except Exception as error: - print traceback.format_exc(error) - sys.exit(UNKNOWN) - sys.exit(OK) diff --git a/tools/probes/common/check_clean_staging_areas b/tools/probes/common/check_clean_staging_areas deleted file mode 100755 index 749801858c..0000000000 --- a/tools/probes/common/check_clean_staging_areas +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Cedric Serfon, , 2015 - -''' -Probe to clean expired replicas on STAGING areas -''' - -import sys -import traceback - -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -if __name__ == "__main__": - query = """ - declare - type staging_rses is table of number(6) index by varchar(64); - l_staging_rses staging_rses; - begin - for loc in (select id from atlas_rucio.rses where rse like '%STAGING%' and staging_area=1) - loop - l_staging_rses(atlas_rucio.id2rse(loc.id)) := 0; - end loop; - for idx in 1..40 - loop - for rep in (select /*+ INDEX(replicas REPLICAS_TOMBSTONE_IDX) */ scope, name, bytes, rse_id from atlas_rucio.replicas where - (case when tombstone is not null then rse_id END) in (select id from atlas_rucio.rses where rse like '%STAGING%' and staging_area=1) and lock_cnt=0 and rownum<10000) - loop - delete from atlas_rucio.replicas where scope=rep.scope and name=rep.name and rse_id=rep.rse_id; - insert into atlas_rucio.updated_rse_counters (id, rse_id, files, bytes, updated_at, created_at) values (sys_guid(), rep.rse_id, -1, -rep.bytes, sysdate, sysdate); - l_staging_rses(atlas_rucio.id2rse(rep.rse_id)) := l_staging_rses(atlas_rucio.id2rse(rep.rse_id)) + 1; - end loop; - commit; - end loop; - for loc in (select id from atlas_rucio.rses where rse like '%STAGING%') - loop - dbms_output.put_line(atlas_rucio.id2rse(loc.id) || ' ' || l_staging_rses(atlas_rucio.id2rse(loc.id))); - end loop; - end; - """ - try: - session = get_session() - session.execute(query) - except: - print traceback.format_exc() - sys.exit(CRITICAL) - sys.exit(OK) diff --git a/tools/probes/common/check_cloud_srm_space b/tools/probes/common/check_cloud_srm_space deleted file mode 100755 index af4cec79f6..0000000000 --- a/tools/probes/common/check_cloud_srm_space +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Wen Guan, , 2014 -# - Cedric Serfon, , 2015 -# -import os -import sys - -from rucio.client import Client -from rucio.common.config import config_get -from rucio.rse import rsemanager as rsemgr - -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - - -if __name__ == "__main__": - - cloud = sys.argv[1] - - retvalue = OK - cloudRetValue = OK - usedsize = 0 - freesize = 0 - - try: - proxy = config_get('nagios', 'proxy') - os.environ["X509_USER_PROXY"] = proxy - except Exception as e: - print "Failed to get proxy from rucio.cfg" - retvalue = WARNING - - c = Client() - - rses = c.list_rses('cloud=%s' % cloud) - for rse in sorted(rses): - rsename = rse['rse'] - rse_settings = rsemgr.get_rse_info(rsename) - schemes = [p['scheme'] for p in rse_settings['protocols']] - attrs = c.list_rse_attributes(rsename) - - if 'srm' not in schemes: - print "The rse(%s) has no SRM protocol defined in AGIS" % (rsename) - continue - if not rse_settings['availability_read']: - print "The rse(%s) blacklisted (not available for read)" % (rsename) - continue - - for protocol in rse_settings['protocols']: - if protocol['scheme'] == "srm": - rse_settings['protocols'].remove(protocol) - protocol['impl'] = "rucio.rse.protocols.gfal.Default" - rse_settings['protocols'].append(protocol) - try: - gs, ret = rsemgr.get_space_usage(rse_settings, "srm") - if gs: - totalsize = long(ret["totalsize"]) - freesize = long(ret["unusedsize"]) - usedsize = totalsize - freesize - retvalue = OK - else: - print "Failed to get rse(%s) space information: %s" % (rsename, str(ret)) - retvalue = WARNING - if 'tier' in attrs and attrs['tier'] != '3' and 'type' in attrs and attrs['type'] != 'TEST': - cloudRetValue = WARNING - else: - print "The rse(%s) is T3 or TEST => not setting overall failure" % (rsename) - except Exception as e: - print "Failed to get rse(%s) space information: %s" % (rsename, str(e)) - retvalue = WARNING - if 'tier' in attrs and attrs['tier'] != '3' and 'type' in attrs and attrs['type'] != 'TEST': - cloudRetValue = WARNING - else: - print "The rse(%s) is T3 or TEST => not setting overall failure" % (rsename) - - if retvalue == OK: - print "Update RSE %s space usage (usedsize: %s, freesize: %s)" % (rsename, usedsize, freesize) - c.set_rse_usage(rsename, "srm", usedsize, freesize) - - sys.exit(cloudRetValue) diff --git a/tools/probes/common/check_davs b/tools/probes/common/check_davs deleted file mode 100755 index 29f6cb2015..0000000000 --- a/tools/probes/common/check_davs +++ /dev/null @@ -1,140 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Stefan Prenner, , 2017 -# - -''' -Probe that checks for each RSE if it is possible to download data using the webdav door and an X509 certificate without any extensions. -''' - -import re -import requests -from rucio.client.client import Client -import sys -import zlib - - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -index = 1 - - -def set_browser_enabled(rse, browser_enabled): - ''' - Sets a new rse attribute 'browser_enabled' to True/False. - - :param rse: The RSE name. - :param browser_enabled: The new boolean value for the browser_enabled attribute. 
- ''' - - if 'browser_enabled' in c.list_rse_attributes(rse): - c.delete_rse_attribute(rse, 'browser_enabled') - c.add_rse_attribute(rse, 'browser_enabled', browser_enabled) - - -def verifyDownload(rse, response, checksum, success_list, error_list, wrong_checksum_list): - ''' - Verifies the response byte sequence by comparing its adler32 hash with the stored checksum. - - :param rse: The RSE name. - :param response: The Response object returned by the requests GET call. - :param checksum: The correct checksum stored in the database. - :param success_list: A list to keep track of all successful file downloads. - :param error_list: A list to keep track of all errors except http errors during the GET request. - :param wrong_checksum_list: A list to keep track of all files that produce an adler32 hash different to the one stored in the database. - ''' - - try: - cont = response.content - adler = zlib.adler32(cont, 1L) - # backflip on 32bit - if adler < 0: - adler = adler + 2 ** 32 - print('Adler checksum: ' + str('%08x' % adler)) - if str('%08x' % adler) != checksum: - wrong_checksum_list.append(rse + ' : checksum ' + str('%08x' % adler)) - else: - success_list.append(str(rse)) - set_browser_enabled(rse, True) - print('Checksum corrent!') - except: - e_type = sys.exc_info()[0] - e_value = sys.exc_info()[1] - e_traceback = sys.exc_info()[2] - error_list.append(str(rse) + ': Error while verifying! ' + str(e_type) + ' ' + str(e_value) + ' ' + str(e_traceback)) - set_browser_enabled(rse, False) - print('An error occurred while verifying download, see error list for details.') - - -if __name__ == "__main__": - ''' - Iterates through all replicas storing the specified file and keeps track of occurring errors. - Adds current rse to one of 6 lists depending on the result (success, http error, other error, skipped due to blacklisting, missing download link, wrong checksum/corrupted file). - ''' - - c = Client() - file_scope = sys.argv[1] - file_name = sys.argv[2] - cert_location = sys.argv[3] - r = c.list_replicas([{'scope': file_scope, 'name': file_name}], schemes=['davs']) - error_list = [] - empty_list = [] - wrong_checksum_list = [] - success_list = [] - http_error_list = [] - skipped_list = [] - - for replica in r: - checksum = replica['adler32'] - rses = replica['rses'] - for rse in rses: - p = c.get_protocols(str(rse), scheme='davs') # skip rse if not available - if p['availability_read'] is False: - skipped_list.append(rse) - continue - tmp = rses[rse] - try: - link = tmp.pop() - print(str(index) + ': ' + str(rse) + ' ...') - link = link.replace('davs', 'https', 1) - response = requests.get(link, cert=cert_location, verify=False) - try: - if 'text/html' in response.headers.get('content-type'): - response_text = str(response.text) - number_length = 3 - pattern = r"\D(\d{%d})\D" % number_length # \D to avoid matching 4 digit (or more) numbers - http_error_list.append(str(rse) + ': ' + str(list(set(re.findall(pattern, response_text))))) # conversion to set to delete duplicates, back to list to get rid of 'set' when printing - else: - verifyDownload(rse, response, checksum, success_list, error_list, wrong_checksum_list) - except TypeError as te: - print('HTTP Header did not have content-type attribute. 
Attempting download...') - verifyDownload(rse, response, checksum, success_list, error_list, wrong_checksum_list) - except IndexError as e: - print(str(index) + ': ' + str(rse) + ': Link is empty.') - empty_list.append(rse) - set_browser_enabled(rse, False) - except: - e_type = sys.exc_info()[0] - e_value = sys.exc_info()[1] - e_traceback = sys.exc_info()[2] - error_list.append(str(rse) + ': ' + str(e_type) + ' ' + str(e_value) + ' ' + str(e_traceback)) - set_browser_enabled(rse, False) - print('An error occurred, see error list for details.') - link = None - index += 1 - print('Browser enabled for ' + str(rse) + ': ' + str(c.list_rse_attributes(rse).get('browser_enabled'))) - - print('Empty links (' + str(len(empty_list)) + '): ' + str(empty_list)) - print('Http Error list (' + str(len(http_error_list)) + '): ' + str(http_error_list)) - print('Links of other errors (' + str(len(error_list)) + '): ' + str(error_list)) - print('List of wrong checksums (' + str(len(wrong_checksum_list)) + '): ' + str(wrong_checksum_list)) - print('Success (' + str(len(success_list)) + '): ' + str(success_list)) - print('Skipped RSEs (' + str(len(skipped_list)) + '): ' + str(skipped_list)) - sys.exit(OK) diff --git a/tools/probes/common/check_deletable_replicas b/tools/probes/common/check_deletable_replicas deleted file mode 100755 index 9b69be1bc1..0000000000 --- a/tools/probes/common/check_deletable_replicas +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Mario Lassnig, , 2013-2014 -# - Cedric Serfon, , 2018 - -''' -Probe to check the queues of messages to submit by Hermes to the broker -''' - -from __future__ import print_function -import sys - -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - - -if __name__ == "__main__": - try: - SESSION = get_session() - QUERY = '''BEGIN - FOR u in (SELECT - a.rse_id AS rse_id, - NVL(b.files, 0) AS files, - NVL(b.bytes, 0) AS bytes, - sys_extract_utc(localtimestamp) AS updated_at - FROM - ( - SELECT - id AS rse_id - FROM - atlas_rucio.rses - WHERE - deleted=0) a - LEFT OUTER JOIN - ( - SELECT - rse_id, - COUNT(1) AS files, - SUM(bytes) AS bytes - FROM - ATLAS_RUCIO.ADG_ONLY_SCAN_REPLICAS - WHERE - tombstone IS NOT NULL - AND tombstone < sys_extract_utc(localtimestamp) GROUP BY rse_id) b - ON - a.rse_id=b.rse_id) - LOOP - MERGE INTO atlas_rucio.RSE_USAGE - USING DUAL - ON (atlas_rucio.RSE_USAGE.rse_id = u.rse_id and source = 'expired') - WHEN NOT MATCHED THEN INSERT(rse_id, source, used, files, updated_at, created_at) - VALUES (u.rse_id, 'expired', u.bytes, u.files, u.updated_at, u.updated_at) - WHEN MATCHED THEN UPDATE SET used=u.bytes, files=u.files, updated_at=u.updated_at; - - MERGE INTO ATLAS_RUCIO.RSE_USAGE_HISTORY H - USING DUAL - ON (h.rse_id = u.rse_id and h.source = 'expired' and h.updated_at = u.updated_at) - WHEN NOT MATCHED THEN INSERT(rse_id, source, used, files, updated_at, created_at) - VALUES (u.rse_id, 'expired', u.bytes, u.files, u.updated_at, u.updated_at); - - COMMIT; - END LOOP; -END; -''' - SESSION.execute(QUERY) - except Exception as error: - print(error) - sys.exit(UNKNOWN) - sys.exit(OK) diff --git a/tools/probes/common/check_disabled_rses 
b/tools/probes/common/check_disabled_rses deleted file mode 100755 index 58c6161082..0000000000 --- a/tools/probes/common/check_disabled_rses +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Cedric Serfon, , 2015-2016 - -import json -import requests -import sys -import traceback - -from rucio.client import Client -from rucio.common.exception import RSENotFound - - -UNKNOWN = 3 -CRITICAL = 2 -WARNING = 1 -OK = 0 - -if __name__ == '__main__': - - url = 'http://atlas-agis-api.cern.ch/request/ddmendpoint/query/list/?json' - resp = requests.get(url=url) - data = json.loads(resp.content) - retvalue = OK - agis_rses = [] - - client = Client() - for rse in data: - if rse['name'] not in agis_rses: - agis_rses.append(rse['name']) - notify = False - if rse['state'] == 'DISABLED' or rse['site_state'] == 'DISABLED': - notify = True - if notify: - try: - client.get_rse(rse['name']) - print '%s needs to be deleted' % (rse['name']) - retvalue = CRITICAL - except RSENotFound: - # Site is already deleted. Skip - pass - except: - trcbck = traceback.format_exc() - errno, errstr = sys.exc_info()[:2] - print 'Interrupted processing with %s %s %s.' % (errno, errstr, trcbck) - for rse in client.list_rses(): - if rse['rse'] not in agis_rses: - rse_attr = client.list_rse_attributes(rse['rse']) - if not ('is_stagingarea' in rse_attr and rse_attr['is_stagingarea']): - print '%s is not defined in AGIS !!!' % (rse['rse']) - retvalue = CRITICAL - - sys.exit(retvalue) diff --git a/tools/probes/common/check_expired_dids b/tools/probes/common/check_expired_dids deleted file mode 100755 index 70223113d5..0000000000 --- a/tools/probes/common/check_expired_dids +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Vincent Garonne, , 2013 - -''' -Probe to check the backlog of expired dids. -''' -import sys -import traceback - -from rucio.core import monitor -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -if __name__ == "__main__": - try: - session = get_session() - result = session.execute('select count(*) from atlas_rucio.dids where expired_at is not null and expired_at < sys_extract_utc(localtimestamp)').fetchone()[0] - # Possible check against a threshold. If result > max_value then sys.exit(CRITICAL) - monitor.record_gauge(stat='undertaker.expired_dids', value=result) - print result - except: - print traceback.format_exc() - sys.exit(UNKNOWN) - sys.exit(OK) diff --git a/tools/probes/common/check_expired_es_indices b/tools/probes/common/check_expired_es_indices deleted file mode 100755 index a3fdc04bae..0000000000 --- a/tools/probes/common/check_expired_es_indices +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Ralph Vigne, 2015 -# -# This script enforces the retention of the Rucio indices stored in Elasticsearch (ES). -# It also maintaines the aliases used by Rucio to be up to date. -# -# This script is supposed to run every day shortly after midnight (e.g. 00:02) (UTC). - - -import re -import traceback -import socket -import sys - -from datetime import datetime, timedelta -from elasticsearch import Elasticsearch - -# Uses the HTTP proxy running on any rucio logger nodes -ES_URL = 'rucio-logger-prod-01.cern.ch:80/elastic' - -# Retention for non-debug entries in days -RETENTION = 30 - -# Retention for debug entries in days -DEBUG_RETENTION = 5 - -if __name__ == '__main__': - es = Elasticsearch(ES_URL) - today = datetime.today() - general_cutoff = datetime.now() - timedelta(days=RETENTION) - debug_cutoff = datetime.now() - timedelta(days=DEBUG_RETENTION) - tbd = list() # List with to be deleted indices - log_index = 'logs-rucio-residue-%04d.%02d.%02d' % (today.year, today.month, today.day) - - try: - for line in es.cat.indices(index='logs-rucio-*').split('\n'): - # Lines look like this: green open logs-rucio-residue-2015.09.30 5 2 9883699 0 3.8gb 1.2gb - general = re.match(r"(\w+)\s+(\w+)\s+(?P\S+\-(?P(?P\d{4}).(?P\d{2}).(?P\d{2})))\s.*", line) - debug = re.match(r"(\w+)\s+(\w+)\s+(?P\S+-debug-(?P(?P\d{4}).(?P\d{2}).(?P\d{2})))\s.*", line) - if debug and ("%s-%s-%s" % (debug.group('year'), debug.group('month'), debug.group('day')) < str(debug_cutoff.date())): - tbd.append(debug.group('index')) - elif general and ("%s-%s-%s" % (general.group('year'), general.group('month'), general.group('day')) < str(general_cutoff.date())): - tbd.append(general.group('index')) - except Exception as e: - print 'Failed rerquesting index list from ES: %s' % str(e) - traceback.print_exc() - sys.exit(2) - - if not len(tbd): - try: - es.index(index=log_index, - doc_type='rucio-residue', - body={'message': 'No indices qualify for deletion with given retention policy (%s days / %s days (debug))' % (RETENTION, DEBUG_RETENTION), - 'application': 'check_indices', - 'severity': 6, - 'severity_label': 'info', - 'facility': 1, - 'host': socket.getfqdn(), - '@timestamp': datetime.utcnow()}) - except Exception: - traceback.print_exc() - sys.exit(1) # In Nagios 1 is WARNING - else: - for index in tbd: - try: - es.indices.delete(index) - es.index(index=log_index, - doc_type='rucio-residue', - body={'message': 'Successfully deleted index: %s' % index, - 'application': 'check_indices', - 'severity': 6, - 'severity_label': 'info', - 'facility': 1, - 'host': socket.getfqdn(), - '@timestamp': datetime.utcnow()}) - except Exception as e: - try: - es.index(index=log_index, - doc_type='rucio-residue', - body={'message': 'Failed deleting index: %s' % str(e), - 'application': 'check_indices', - 'severity': 3, - 'severity_label': 'error', - 'facility': 1, - 'host': socket.getfqdn(), - '@timestamp': datetime.utcnow()}) - except Exception: - traceback.print_exc() - sys.exit(2) # In Nagi os 2 is CRITICAL - - # Updating alias to include latest indices - - try: - es.indices.put_alias(index='logs-rucio-daemons-*', name='rucio-daemon-logs') - es.index(index=log_index, - doc_type='rucio-residue', - body={'message': 'Successfully update alias \"rucio-daemons-logs\"', - 'application': 'check_indices', - 'severity': 6, - 'severity_label': 'info', - 'facility': 1, - 'host': socket.getfqdn(), - '@timestamp': datetime.utcnow()}) - except Exception 
as e: - traceback.print_exc() - try: - es.index(index=log_index, - doc_type='rucio-residue', - body={'message': 'Failed updating alias: %s' % str(e), - 'application': 'check_indices', - 'severity': 2, - 'severity_label': 'critical', - 'facility': 1, - '@timestamp': datetime.utcnow()}) - except Exception as arg: - traceback.print_exc() - sys.exit(2) # In Nagios 2 is CRITICAL - - sys.exit(0) # Nagios for 'All Good' diff --git a/tools/probes/common/check_expired_locked_rules b/tools/probes/common/check_expired_locked_rules deleted file mode 100755 index 55e139a1f2..0000000000 --- a/tools/probes/common/check_expired_locked_rules +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Cedric Serfon, , 2015 - -''' -Probe to check the locked expired rules or datasets with locked rules -''' - -import sys -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - - -def main(): - ''' - Probe to check the locked expired rules or datasets with locked rules - ''' - status = OK - session = get_session() - try: - query = "select rawtohex(id), scope, name, rse_expression from atlas_rucio.rules where locked=1 and expires_at, 2013 - -''' -Probe to check the backlog of expired rules. -''' -import sys -import traceback - -from rucio.core import monitor -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -if __name__ == "__main__": - try: - session = get_session() - result = session.execute('select count(1) from atlas_rucio.rules where expires_at < sys_extract_utc(localtimestamp)').fetchone()[0] - monitor.record_gauge(stat='judge.expired_rules', value=result) - result = session.execute('select count(1) from atlas_rucio.rules where expires_at > sys_extract_utc(localtimestamp)').fetchone()[0] - monitor.record_gauge(stat='judge.lifetimed_rules', value=result) - print result - except: - print traceback.format_exc() - sys.exit(UNKNOWN) - sys.exit(OK) diff --git a/tools/probes/common/check_free_space b/tools/probes/common/check_free_space deleted file mode 100755 index 91e6dd35ba..0000000000 --- a/tools/probes/common/check_free_space +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Vincent Garonne, , 2015 - -''' -Probe to check the min free space. 
-''' - -import sys -import traceback - -from rucio.core.rse import list_rses, get_rse_limits, set_rse_usage - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - - -if __name__ == "__main__": - try: - - for rse in list_rses(): - limits = get_rse_limits(rse=rse['rse'], rse_id=rse['id']) - min_free_space = limits.get('MinFreeSpace') - if min_free_space is not None: - # print rse['rse'], min_free_space - set_rse_usage(rse=rse['rse'], source='min_free_space', used=min_free_space, free=None) - except: - print (traceback.format_exc()) - sys.exit(UNKNOWN) - sys.exit(OK) diff --git a/tools/probes/common/check_fts_backlog b/tools/probes/common/check_fts_backlog deleted file mode 100755 index bbf58c314e..0000000000 --- a/tools/probes/common/check_fts_backlog +++ /dev/null @@ -1,145 +0,0 @@ -#!/usr/bin/env python -""" - Copyright European Organization for Nuclear Research (CERN) 2013 - - Licensed under the Apache License, Version 2.0 (the "License"); - You may not use this file except in compliance with the License. - You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - - Authors: - - Cedric Serfon, , 2014-2018 - - Mario Lassnig, , 2015 -""" - -import os -import sys -from urlparse import urlparse -import urllib3 - -import requests - -from rucio.common.config import config_get -from rucio.core import monitor - -from rucio.core.distance import update_distances - -from rucio.db.sqla.session import get_session - -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -urllib3.disable_warnings() - -if __name__ == "__main__": - - se_matrix = {} - try: - VO = sys.argv[1] - except IndexError as error: - VO = 'atlas' - try: - CHECK_BUSY = sys.argv[2] - except IndexError as error: - CHECK_BUSY = 0 - - WORST_RETVALUE = OK - - try: - PROXY = config_get('nagios', 'proxy') - os.environ["X509_USER_PROXY"] = PROXY - except Exception as error: - print "Failed to get proxy from rucio.cfg" - PROXY = '/opt/rucio/etc/ddmusr01.rfc.proxy' - WORST_RETVALUE = WARNING - - try: - FTSHOSTS = config_get('conveyor', 'ftsmonhosts') - except Exception as error: - print "Failed to get ftsmonhosts" - WORST_RETVALUE = WARNING - for ftshost in FTSHOSTS.split(','): - print "=== %s ===" % (ftshost) - parsed_url = urlparse(ftshost) - scheme, hostname, port = parsed_url.scheme, parsed_url.hostname, parsed_url.port - retvalue = CRITICAL - url = '%s/fts3/ftsmon/overview?dest_se=&source_se=&time_window=1&vo=%s' % (ftshost, VO) - busy_channels = [] - busylimit = 5000 - for attempt in xrange(0, 5): - result = None - try: - result = requests.get(url, verify=False, cert=(PROXY, PROXY)) - res = result.json() - for channel in res['overview']['items']: - src = channel['source_se'] - dst = channel['dest_se'] - if (src, dst) not in se_matrix: - se_matrix[(src, dst)] = {'active': 0, 'submitted': 0, 'finished': 0, 'failed': 0, 'transfer_speed': 0, 'mbps_link': 0} - for state in ['submitted', 'active', 'finished', 'failed']: - try: - se_matrix[(src, dst)][state] += channel[state] - except Exception: - pass - try: - se_matrix[(src, dst)]['transfer_speed'] += channel['current'] - se_matrix[(src, dst)]['mbps_link'] += channel['current'] - except Exception: - pass - if CHECK_BUSY and 'submitted' in channel and channel['submitted'] >= busylimit: - url_activities = '%s/fts3/ftsmon/config/activities/%s?source_se=%s&dest_se=%s' % (ftshost, VO, src, dst) - activities = {} - try: - s = requests.get(url_activities, verify=False, cert=(PROXY, PROXY)) - for key, val in s.json().items(): - activities[key] = val['SUBMITTED'] - except 
Exception as error: - pass - busy_channels.append({'src': src, 'dst': dst, 'submitted': channel['submitted'], 'activities': activities}) - summary = res['summary'] - hostname = hostname.replace('.', '_') - print '%s : Submitted : %s' % (hostname, summary['submitted']) - print '%s : Active : %s' % (hostname, summary['active']) - print '%s : Staging : %s' % (hostname, summary['staging']) - print '%s : Started : %s' % (hostname, summary['started']) - if busy_channels != []: - print 'Busy channels (>%s submitted):' % (busylimit) - for bc in busy_channels: - activities_str = ", ".join([("%s: %s" % (key, val)) for key, val in bc['activities'].items()]) - print ' %s to %s : %s submitted jobs (%s)' % (bc['src'], bc['dst'], bc['submitted'], str(activities_str)) - monitor.record_gauge(stat='fts3.%s.submitted' % (hostname), value=(summary['submitted'] + summary['active'] + summary['staging'] + summary['started'])) - retvalue = OK - break - except Exception as error: - retvalue = CRITICAL - if result and result.status_code: - errmsg = 'Error when trying to get info from %s : HTTP status code %s. [%s]' % (ftshost, str(result.status_code), str(error)) - else: - errmsg = 'Error when trying to get info from %s. %s' % (ftshost, str(error)) - if retvalue == CRITICAL: - print "All attempts failed. %s" % (errmsg) - WORST_RETVALUE = max(retvalue, WORST_RETVALUE) - se_map = {} - try: - session = get_session() - for sename, rse_id in session.execute("select scheme||'://'||hostname, rawtohex(rse_id) from atlas_rucio.rse_protocols").fetchall(): - if sename not in se_map: - se_map[sename] = [] - se_map[sename].append(rse_id) - except: - sys.exit(WORST_RETVALUE) - QUERY = """ - update atlas_rucio.distances set active=null, submitted=null, finished=null, failed=null, transfer_speed=null - where not (active is null and submitted is null and finished is null and failed is null and transfer_speed is null) - """ - try: - session = get_session() - session.execute(QUERY) - session.commit() - except: - sys.exit(WORST_RETVALUE) - - for source_rse, dest_rse in se_matrix: - for source_rse_id in se_map[source_rse]: - for dest_rse_id in se_map[dest_rse]: - # print source_rse_id, dest_rse_id, se_matrix[(source_rse, dest_rse)] - update_distances(src_rse_id=source_rse_id, dest_rse_id=dest_rse_id, parameters=se_matrix[(source_rse, dest_rse)], session=None) - sys.exit(WORST_RETVALUE) diff --git a/tools/probes/common/check_fts_proxy_lifetime.py b/tools/probes/common/check_fts_proxy_lifetime.py deleted file mode 100755 index 6319f92c90..0000000000 --- a/tools/probes/common/check_fts_proxy_lifetime.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Mario Lassnig, , 2017 - -import datetime -import requests -import sys - -from dateutil import parser - -from rucio.common.config import config_get - -requests.packages.urllib3.disable_warnings() # pylint: disable=no-member - - -UNKNOWN = 3 -CRITICAL = 2 -WARNING = 1 -OK = 0 - -PROXY = config_get('nagios', 'rfcproxy') -FTS_SERVERS = config_get('nagios', 'fts_servers').split(',') - -status = OK -for FTS_SERVER in FTS_SERVERS: - FTS_SERVER = FTS_SERVER.strip() - whoami = requests.get('%s/whoami' % FTS_SERVER, verify=False, cert=PROXY).json() - delegation = requests.get('%s/delegation/%s' % (FTS_SERVER, whoami['delegation_id']), verify=False, cert=PROXY).json() - expiration = parser.parse(delegation['termination_time']) - if expiration < datetime.datetime.now() + datetime.timedelta(days=30): - print FTS_SERVER, expiration.strftime("%Y-%m-%d"), 'CRITICAL' - status = CRITICAL - elif expiration < datetime.datetime.now() + datetime.timedelta(days=100): - print FTS_SERVER, expiration.strftime("%Y-%m-%d"), 'WARNING' - status = WARNING - else: - print FTS_SERVER, expiration.strftime("%Y-%m-%d"), 'OK' - -sys.exit(status) diff --git a/tools/probes/common/check_geoip_distances b/tools/probes/common/check_geoip_distances deleted file mode 100755 index 9b3e091bea..0000000000 --- a/tools/probes/common/check_geoip_distances +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python -""" - Copyright European Organization for Nuclear Research (CERN) - - Licensed under the Apache License, Version 2.0 (the "License"); - You may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - - Authors: - - Cedric Serfon, , 2017 -""" - -import json -from math import asin, cos, radians, sin, sqrt -import sys - -import requests - -from rucio.core.rse import update_rse, get_rse_id -from rucio.core.distance import update_distances -from rucio.common.exception import RSENotFound - - -UNKNOWN = 3 -CRITICAL = 2 -WARNING = 1 -OK = 0 - - -if __name__ == '__main__': - - URL = 'http://atlas-agis-api.cern.ch/request/site/query/list/?json' - try: - resp = requests.get(url=URL) - data = json.loads(resp.content) - retvalue = OK - agis_rses = [] - rse_matrix = {} - rse_ids = {} - for site in data: - if site['latitude'] and site['longitude']: - for rse in site['ddmendpoints']: - try: - update_rse(rse=rse, parameters={'latitude': site['latitude'], 'longitude': site['longitude']}, session=None) - rse_matrix[rse] = (float(site['latitude']), float(site['longitude'])) - if rse not in rse_ids: - rse_ids[rse] = get_rse_id(rse, session=None) - except RSENotFound: - pass - - for rse1 in rse_matrix: - for rse2 in rse_matrix: - long1, lat1, long2, lat2 = rse_matrix[rse1][1], rse_matrix[rse1][0], rse_matrix[rse2][1], rse_matrix[rse2][0] - long1, lat1, long2, lat2 = map(radians, [long1, lat1, long2, lat2]) - dlon = long2 - long1 - dlat = lat2 - lat1 - dist = 6378 * 2 * asin(sqrt(sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2)) - update_distances(src_rse_id=rse_ids[rse1], dest_rse_id=rse_ids[rse2], parameters={'geoip_distance': dist}, session=None) - except Exception: - retvalue = CRITICAL - sys.exit(retvalue) diff --git a/tools/probes/common/check_gridftp_space b/tools/probes/common/check_gridftp_space deleted file mode 100755 index bfc46213e2..0000000000 --- a/tools/probes/common/check_gridftp_space +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env 
python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Wen Guan, , 2014 -# - Tomas Javurek, , 2016 -# - Cedric Serfon, , 2016 - -import os -import sys - -from rucio.client import Client -from rucio.common.config import config_get -from rucio.rse import rsemanager as rsemgr -from rucio.api.rse import list_rse_attributes - -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - - -if __name__ == "__main__": - - retvalue = OK - cloudRetValue = OK - usedsize = 0 - freesize = 0 - - try: - proxy = config_get('nagios', 'proxy') - os.environ["X509_USER_PROXY"] = proxy - except Exception as e: - print "Failed to get proxy from rucio.cfg" - retvalue = WARNING - - c = Client() - - rses = c.list_rses() - for rse in rses: - rsename = rse['rse'] - rse_settings = rsemgr.get_rse_info(rsename) - - # verifing, that json method is perfered - attr = list_rse_attributes(rsename) - if 'space_usage_method' in attr.keys(): - if not attr['space_usage_method'] == 'json': - continue - else: - continue - - if rse_settings['protocols']: - rse_settings['protocols'][0]['impl'] = 'rucio.rse.protocols.gsiftp.Default' - else: - print '%s has no protocol' % (rsename) - - try: - gs, ret = rsemgr.get_space_usage(rse_settings, "gsiftp") - if gs: - totalsize = long(ret["totalsize"]) - freesize = long(ret["unusedsize"]) - usedsize = totalsize - freesize - retvalue = OK - else: - print "Failed to get rse(%s) space information: %s" % (rsename, str(ret)) - retvalue = WARNING - cloudRetValue = WARNING - except Exception as e: - print "Failed to get rse(%s) space information: %s" % (rsename, str(e)) - retvalue = WARNING - cloudRetValue = WARNING - - if retvalue == OK: - print "Update RSE %s space usage (usedsize: %s, freesize: %s)" % (rsename, usedsize, freesize) - c.set_rse_usage(rsename, "gsiftp", usedsize, freesize) - - sys.exit(cloudRetValue) diff --git a/tools/probes/common/check_injecting_rules b/tools/probes/common/check_injecting_rules deleted file mode 100755 index 1cde7b0b58..0000000000 --- a/tools/probes/common/check_injecting_rules +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Martin Barisits, , 2016 - -''' -Probe to check the backlog of injecting rules. 
-''' -import sys -import traceback - -from rucio.core import monitor -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -if __name__ == "__main__": - try: - session = get_session() - result = session.execute('select count(1) from atlas_rucio.rules where state=\'I\'').fetchone()[0] - monitor.record_gauge(stat='judge.injecting_rules', value=result) - print result - except: - print traceback.format_exc() - sys.exit(UNKNOWN) - sys.exit(OK) diff --git a/tools/probes/common/check_log b/tools/probes/common/check_log deleted file mode 100755 index 97e7fab5ce..0000000000 --- a/tools/probes/common/check_log +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Cedric Serfon, , 2014 -# - -import commands -import os -import socket -import sys - -from datetime import datetime, timedelta - -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -if __name__ == "__main__": - if len(sys.argv) < 2: - sys.exit(CRITICAL) - collector = sys.argv[1] - hostname = socket.gethostname() - now = datetime.now() - directory = '/var/log/rucio/' - for filename in os.listdir(directory): - if filename.startswith('rucio') and filename.endswith('log'): - f = open('%s%s' % (directory, filename), 'r') - count = 0 - results = {'DEBUG': 0, 'INFO': 0, 'WARNING': 0, 'ERROR': 0, 'CRITICAL': 0} - for line in f: - sline = line.split('\t') - try: - if abs(now - datetime.strptime(sline[0].split(',')[0], '%Y-%m-%d %H:%M:%S')) < timedelta(hours=1): - count += 1 - results[sline[2]] += 1 - except ValueError, e: - pass - print results - agent = filename.split('.')[0] - f.close() - g = open('/tmp/passive_probes.txt', 'w') - if results['CRITICAL'] == 0: - g.write('%s\t%s : Errors in the last 60 minutes\t%i\tNo critical errors\n' % (hostname, agent, OK)) - else: - g.write('%s\t%s : Errors in the last 60 minutes\t%i\t%i critical errors during the last hour\n' % (hostname, agent, CRITICAL, results['CRITICAL'])) - g.close() - s, o = commands.getstatusoutput('/usr/sbin/send_nsca %s -c /etc/nagios/send_nsca.cfg < /tmp/passive_probes.txt' % (collector)) - sys.exit(OK) diff --git a/tools/probes/common/check_lost_files b/tools/probes/common/check_lost_files deleted file mode 100755 index 348f7ae1b8..0000000000 --- a/tools/probes/common/check_lost_files +++ /dev/null @@ -1,500 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2015 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Tomas Javurek, Cedric Serfon, 2015 - -import os -import sys -import smtplib -import time -import requests -import calendar -from email.mime.text import MIMEText -from email.MIMEMultipart import MIMEMultipart -from email.MIMEBase import MIMEBase -from email import Encoders -# from rucio.common.config import config_get -from datetime import datetime -from datetime import date - -from rucio.db.sqla.session import get_session -# from rucio.core import monitor - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -users = True -groups = True -gdp = True -testmode = False -working_days = ['Wednesday'] - -timestamp = datetime.today().strftime('%Y-%m-%d') -log_dir = '/var/log/rucio/lost_files/logs/' -log_path = log_dir + timestamp + '.log' -tmpdata_dir = '/var/log/rucio/lost_files/tmp/' -tmpdata_path = tmpdata_dir + 'rse-lost-files.txt' -reports_dir = '/var/log/rucio/lost_files/reports/' - - -# protection against running this script every day -def run_judger(working_days): - - flog = open(log_path, 'a') - today_date = date.today() - today_day = calendar.day_name[today_date.weekday()] - if today_day in working_days: - flog.write('Today is %s.\n' % today_day) - flog.write('I might try to work today.\n') - return True - else: - flog.write('Today is %s. This is NOT my working day! I am working only on:\n' % today_day) - flog.write(str(working_days) + '\n') - return False - - -def merge_dicts(d1, d2): - - dm = d1.copy() - for a in d2.keys(): - if a not in dm.keys(): - dm[a] = d2[a] - else: - dm[a] = list(set(dm[a] + d2[a])) - return dm - - -# extracting mails of users from Rucio DB -def find_mails_users(account, session): - - mails = [] - try: - query = ''' select distinct a.email from atlas_rucio.identities a, atlas_rucio.account_map b where -a.identity=b.identity and b.account='%s' ''' % account - result = session.execute(query) - for row in result: - for col in row: - mails.append(str(col)) - except Exception, e: - flog = open(log_path, 'a') - flog.wirte('find_mails_users\n') - flog.write(str(e) + '\n') - sys.exit(CRITICAL) - if account == 'ddmadmin' or account == 'root': - mails = ['atlas-adc-ddm-support@cern.ch'] - if 'tomas.javurek@cern.ch' not in mails: - mails.append('tomas.javurek@cern.ch') - - return mails - - -# hardcoded, TODO -def find_mails_gdp(): - - mails = ['atlas-adc-dpa@cern.ch', 'tomas.javurek@cern.ch'] - return mails - - -# extracting mails of physgroups from Rucio DB -def find_mails_groups(rse, session): - - mails = [] - try: - query = ''' select distinct email from atlas_rucio.identities where identity in - (select identity from atlas_rucio.account_map where account in - (select value from atlas_rucio.rse_attr_map where key = 'physgroup' and rse_id = atlas_rucio.rse2id('%s'))) ''' % rse - result = session.execute(query) - for row in result: - for col in row: - mails.append(str(col)) - except Exception, e: - flog = open(log_path, 'a') - flog.write('find_mails_groups\n') - flog.write(str(e) + '\n') - sys.exit(CRITICAL) - - if 'tomas.javurek@cern.ch' not in mails: - mails.append('tomas.javurek@cern.ch') - - return mails - - -# find account for rule on given did -def get_rule_owners(scope, name, session): - - rule_owners = [] - try: - query = ''' select distinct(account) from atlas_rucio.rules where scope='%s' and name='%s' ''' % (scope, name) - result = session.execute(query) - for row in result: - for col in row: - rule_owners.append(str(col)) - except Exception, e: 
- flog = open(log_path, 'a') - flog.write('get_rule_owners:') - flog.write(str(e) + '\n') - - if testmode: - print 'DEBUG: ', scope, name - print 'DEBUG: rule owners', rule_owners - - return rule_owners - - -# collects reports for given email -def report_collector(rse, account, session): - - mails_reports = {} - mail_list = [] - report_path = '' - if groups and rse != '': - mail_list = find_mails_groups(rse, session) - report_path = reports_dir + 'report_' + rse - if users and account != '' and account != 'gdp': - mail_list = find_mails_users(account, session) - report_path = reports_dir + 'report_' + account - if gdp and account == 'gdp': - mail_list = find_mails_gdp() - report_path = reports_dir + 'report_' + account - if mail_list == [] or report_path == '' or report_path == 'report_': - return - - for mail in mail_list: - if mail not in mails_reports: - mails_reports[mail] = [report_path] - else: - mails_reports[mail].append(report_path) - return mails_reports - - -# mailing agent -def send_report(mail, report_paths): - - if testmode: - print "DEBUG: mailing agent is accessed." - print "DEBUG: ", mail, report_paths - - # defining mailing list - me = 'atlas-adc-ddm-support@cern.ch' - recepients = [] - - if testmode: - print 'DEBUG: notification would be send to:', mail - recepients = ['tomas.javurek@cern.ch'] - else: - recepients = [mail] - - msg = MIMEMultipart() - msg['Subject'] = 'DDMops: completely lost files that may affect you - last 7 days' - msg['From'] = me - msg['To'] = ", ".join(recepients) - - # email body - msg.attach(MIMEText('Please check the attached list of files that have been lost and can not be recovered. These files may affect you. In case of questions contact DDMops.' + "\n\n")) - - # if report is short,lost files are reported in email body as well - lines = [] - for report_path in report_paths: - fr = open(report_path, 'r') - for lost_file in fr.readlines(): - lines.append(lost_file) - fr.close() - if len(lines) > 20: - break - if len(lines) < 21: - for l in lines: - msg.attach(MIMEText(str(l))) - - # attachments - for report_path in report_paths: - fr = open(report_path, 'rb') - part = MIMEBase('application', "octet-stream") - part.set_payload(fr.read()) - Encoders.encode_base64(part) - part.add_header('Content-Disposition', 'attachment; filename="%s"' % report_path) - msg.attach(part) - - # sending email, s=server - flog = open(log_path, 'a') - flog.write('Reports were sent to:\n') - flog.write(mail) - flog.write(str(report_paths)) - flog.write('\n\n') - s = smtplib.SMTP('localhost') - s.sendmail(me, recepients, msg.as_string()) - s.quit() - - -# create report for gdp -# call mailing agent -def report_gdp(): - - # INIT - if testmode: - print "DEBUG: making report for GDP" - print "||||||||||||||||||||||||||||" - if not os.path.isfile(tmpdata_path): - print "ERROR: lost files not downloaded" - sys.exit(CRITICAL) - - cmd = 'cp %s %s' % (tmpdata_path, reports_dir + '/report_gdp') - os.system(cmd) - - return ['gdp'] - - -# make report by user -# call the mailing agent -def report_by_account(session): - - # INIT - if testmode: - print "DEBUG: making report by account" - print "|||||||||||||||||||||||||||||||" - if not os.path.isfile(tmpdata_path): - print "ERROR: lost files not downloaded" - sys.exit(CRITICAL) - fi = open(tmpdata_path, 'r') - data_per_account = {} - accs = [] - - # loop over lost files from get_bad_files() - for line in fi.readlines(): - scope = line.split(' ')[0] - data_name = line.split(' ')[1] - dataset = line.split(' ')[3] - rse_name = 
line.split(' ')[4] - account = line.split(' ')[5] - updated_at = line.split(' ')[6] - accounts = [] - - # find owners of rule, they are contacted as well - if testmode: - print "DEBUG: get rule owners" - rule_owners = get_rule_owners(scope, dataset, session) - # did_woners = get_did_owner TO BE DEVELOPED - for own in rule_owners: - if own not in accounts: - accounts.append(own) - if testmode: - print 'INFO:', rse_name, account, dataset, data_name - if accounts == []: - print "DEBUG: there is no account to be notified." - else: - print "DEBUG: rule owners found:", accounts - print '=======================' - - for acc in accounts: - if acc not in data_per_account.keys(): - data_per_account[acc] = [{'scope': scope, 'name': data_name, 'dataset': dataset, 'rse': rse_name, 'time': updated_at}] - else: - data_per_account[acc].append({'scope': scope, 'name': data_name, 'dataset': dataset, 'rse': rse_name, 'time': updated_at}) - - if testmode: - print "DEBUG: creating reports and sending." - - # create report per account - for account in data_per_account.keys(): - fo = open(reports_dir + 'report_' + account, 'w') - for bad_file in data_per_account[account]: - fo.write("%s %s %s %s\n" % (bad_file['scope'], bad_file['dataset'], bad_file['name'], bad_file['time'])) - - # send report by mail - for account in data_per_account.keys(): - if testmode: - print "DEBUG: going to send the report." - accs.append(account) - - if testmode: - if data_per_account == {}: - print "DEBUG: nothing to send." - - if testmode: - print "DEBUG: report by accnounts done." - fi.close() - return accs - - -# make report for each rse -# call the mailing agent -def report_by_rses(session): - - rses = [] - # INIT - if not os.path.isfile(tmpdata_path): - print "ERROR: lost files not downloaded" - sys.exit(CRITICAL) - fi = open(tmpdata_path, 'r') - data_per_rse = {} - - # loop over lost files from get_bad_files() - for line in fi.readlines(): - scope = line.split(' ')[0] - data_name = line.split(' ')[1] - dataset = line.split(' ')[3] - rse_name = line.split(' ')[4] - account = line.split(' ')[5] - updated_at = line.split(' ')[6] - - if rse_name not in data_per_rse.keys(): - data_per_rse[rse_name] = [{'scope': scope, 'name': data_name, 'dataset': dataset, 'account': account, 'time': updated_at, 'rse': rse_name}] - else: - data_per_rse[rse_name].append({'scope': scope, 'name': data_name, 'dataset': dataset, 'account': account, 'time': updated_at, 'rse': rse_name}) - - # create report per rse - for rse in data_per_rse.keys(): - fo = open(reports_dir + 'report_' + rse, 'w') - for bad_file in data_per_rse[rse]: - fo.write("%s %s %s %s\n" % (bad_file['scope'], bad_file['dataset'], bad_file['name'], bad_file['time'])) - - # send report by mail - for rse in data_per_rse.keys(): - rses.append(rse) - - fi.close() - return rses - - -# the input -def get_bad_files(session): - - f = open(tmpdata_path, 'w') - try: - query = ''' select a.scope, a.name, b.scope, b.name, atlas_rucio.id2rse(a.rse_id), a.account, a.updated_at from atlas_rucio.bad_replicas a, atlas_rucio.contents_history b - where a.state='L' and a.updated_at>sysdate-7 and b.did_type='D'and a.scope=b.child_scope and a.name=b.child_name ''' - - result = session.execute(query) - processed_files = [] - for row in result: - if row[3].startswith('panda.'): - continue - if '_sub' in row[3]: - continue - f_did = row[0] + ':' + row[1] - if f_did in processed_files: - print 'double counted:', f_did - continue - else: - processed_files.append(f_did) - for col in row: - f.write('%s ' % col) - 
f.write('\n') - - except Exception, e: - flog = open(log_path, 'a') - flog.write('get_bad_files\n') - flog.write(str(e) + "\n") - return False - - return True - - -def get_bad_files_from_dump(session): - - flog = open(log_path, 'a') - url = 'https://rucio-hadoop.cern.ch/lost_files' - dump7 = requests.get(url, verify=False) - if dump7.status_code == 404: - flog.write('ERROR: dump of bad files not reachable on hadoop') - return False - - f = open(tmpdata_path, 'w') - line_counter = 0 - processed_files = [] - for l in dump7.text.split('\n'): - line_counter += 1 - data = l.split('\t') - if len(data) < 7: - flog.write('WARNING: line %i in dump does not contain full info \n' % line_counter) - continue - if data[3].startswith('panda.'): - continue - if '_sub' in data[3]: - continue - f_did = data[0] + ':' + data[1] - if f_did in processed_files: - print 'double counted:', f_did - continue - else: - processed_files.append(f_did) - updated_at = time.strftime('%Y-%m-%d', time.localtime(float(data[6]))) - f.write('%s %s %s %s %s %s %s\n' % (data[0], data[1], data[2], data[3], data[4], data[5], updated_at)) - flog.write('INFO: dump contains %i lines\n' % line_counter) - - return True - - -def main(): - - run_flag = run_judger(working_days) - if not run_flag: - sys.exit(OK) - - session = get_session() - - # check folder hierarchy - if not os.path.exists(log_dir): - sys.exit(CRITICAL) - if not os.path.exists(tmpdata_dir): - sys.exit(CRITICAL) - if not os.path.exists(reports_dir): - sys.exit(CRITICAL) - - mails = {} - # get input - get_input = get_bad_files_from_dump(session) - if not get_input: - print 'WARNING: the dump is not accessible' - get_input = get_bad_files(session) - if not get_input: - sys.exit(CRITICAL) - # make and sent report to groups - if groups: - l_rses = report_by_rses(session) - for rse in l_rses: - reps = report_collector(rse, '', session) - mails = merge_dicts(mails, reps) - # make and sent report to users - if users: - l_acc = report_by_account(session) - for acc in l_acc: - reps = report_collector('', acc, session) - mails = merge_dicts(mails, reps) - if gdp: - if testmode: - print "DEBUG: summary report to gdp" - l_acc = report_gdp() - for acc in l_acc: - reps = report_collector('', acc, session) - mails = merge_dicts(mails, reps) - - if len(list(set(mails.keys()))) != len(mails.keys()): - print "ERROR: list of emails is not distinct" - sys.exit('ERROR: list of emails is not distinct') - - if testmode: - send_report('tomas.javurek@cern.ch', mails['tomas.javurek@cern.ch']) - for m in mails.keys(): - flog = open(log_path, 'a') - flog.write(m) - flog.write(str(mails[m])) - flog.write('\n') - else: - for m in mails.keys(): - send_report(m, mails[m]) - - # clean tmp - cmd = 'rm ' + tmpdata_path - os.system(cmd) - - sys.exit(OK) - - -if __name__ == '__main__': - - main() diff --git a/tools/probes/common/check_map_voms_roles b/tools/probes/common/check_map_voms_roles deleted file mode 100755 index 68b8db5b6e..0000000000 --- a/tools/probes/common/check_map_voms_roles +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python -""" - Copyright European Organization for Nuclear Research (CERN) - - Licensed under the Apache License, Version 2.0 (the "License"); - You may not use this file except in compliance with the License. 
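Note: the send_report helper in the lost-files reporting probe deleted above assembles a multipart message with the report files attached and hands it to a local SMTP server, using the old Python 2 email/Encoders API. A minimal Python 3 sketch of the same attach-and-send pattern (sender address and subject are taken from the probe; the recipient and report paths are whatever the caller supplies):

import smtplib
from email.message import EmailMessage

def send_report_sketch(recipient, report_paths,
                       sender='atlas-adc-ddm-support@cern.ch'):
    # Build the notification and attach every report file.
    msg = EmailMessage()
    msg['Subject'] = 'DDMops: completely lost files that may affect you - last 7 days'
    msg['From'] = sender
    msg['To'] = recipient
    msg.set_content('Please check the attached list of files that have been lost '
                    'and can not be recovered. In case of questions contact DDMops.')
    for path in report_paths:
        with open(path, 'rb') as report:
            msg.add_attachment(report.read(), maintype='application',
                               subtype='octet-stream',
                               filename=path.rsplit('/', 1)[-1])
    # Hand the message to the local MTA, as the probe did.
    with smtplib.SMTP('localhost') as server:
        server.send_message(msg)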
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - - Authors: - - Cedric Serfon, , 2014, 2017 - - Joaquin Bogado, , 2014 - - Mario Lassnig, , 2015 - - Dimitrios Christidis, , 2019 -""" - -import os -import sys - -from rucio.client import Client -from rucio.common.config import config_get -from rucio.common.exception import Duplicate - -from VOMSAdmin.VOMSCommands import VOMSAdminProxy - -UNKNOWN = 3 -CRITICAL = 2 -WARNING = 1 -OK = 0 - - -if __name__ == '__main__': - try: - PROXY = config_get('nagios', 'proxy') - os.environ["X509_USER_PROXY"] = PROXY - CERT, KEY = os.environ['X509_USER_PROXY'], os.environ['X509_USER_PROXY'] - except Exception: - print("Failed to get proxy from rucio.cfg") - sys.exit(CRITICAL) - ACCOUNT_MAP = {'Role=pilot': 'pilot', 'Role=production': 'pilot'} - STATUS = OK - NBUSERS = 0 - CLIENT = Client() - ADMIN = VOMSAdminProxy(vo='atlas', host='voms2.cern.ch', port=8443, - user_cert=CERT, user_key=KEY) - for account in ACCOUNT_MAP: - NBUSERS = 0 - attempts = 0 - totattemps = 3 - for attempts in range(0, totattemps): - res = ADMIN.call_method('list-users-with-role', '/atlas', account) - if isinstance(res, list) and (attempts < totattemps - 1): - for user in res: - NBUSERS += 1 - try: - dn = user._DN - ca = user._CA - email = user._mail - print(ACCOUNT_MAP[account], dn, ca, email) - try: - CLIENT.add_identity(account=ACCOUNT_MAP[account], identity=dn, authtype='X509', email=email, default=True) - print('Identity %(dn)s added' % locals()) - except Duplicate: - pass - except Exception as error: - print(error) - except: - print('ERROR getting info for %s' % (user._DN)) - STATUS = WARNING - break - else: - sys.exit(CRITICAL) - print('%i users extracted from VOMS with %s' % (NBUSERS, account)) - - ACCOUNT_LIST = [ - 'calib-muon', 'dataprep', 'det-alfa', 'det-ibl', 'det-indet', - 'det-larg', 'det-muon', 'det-slhc', 'det-tile', 'perf-egamma', - 'perf-flavtag', 'perf-idtracking', 'perf-jets', 'perf-muons', - 'perf-tau', 'phys-beauty', 'phys-exotics', 'phys-gener', 'phys-hdbs', - 'phys-hi', 'phys-higgs', 'phys-sm', 'phys-susy', 'phys-top', - 'phys-valid', 'proj-sit', 'trig-daq', 'trig-hlt', 'trig-l1calo' - ] - for account in ACCOUNT_LIST: - NBUSERS = 0 - attempts = 0 - totattemps = 3 - for attempts in range(0, totattemps): - res = ADMIN.call_method('list-members', '/atlas/{0}'.format(account)) - if isinstance(res, list) and (attempts < totattemps - 1): - for user in res: - NBUSERS += 1 - try: - dn = user._DN - ca = user._CA - email = user._mail - print(account, dn, ca, email) - try: - CLIENT.add_identity(account=account, identity=dn, authtype='X509', email=email, default=True) - print('Identity {0} added to {1}'.format(dn, account)) - except Duplicate: - pass - except Exception as error: - print(error) - except: - print('ERROR getting info for %s' % (user._DN)) - STATUS = WARNING - break - else: - sys.exit(CRITICAL) - print('%i users extracted from VOMS with %s' % (NBUSERS, account)) - - sys.exit(STATUS) diff --git a/tools/probes/common/check_messages_to_submit b/tools/probes/common/check_messages_to_submit deleted file mode 100755 index 211f3ffb5a..0000000000 --- a/tools/probes/common/check_messages_to_submit +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. 
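Note: in the deleted check_messages_to_submit probe below, the backlog thresholds are tested in ascending order, so the CRITICAL branch (count > 1000000) is unreachable: any such count already satisfies the WARNING test (count > 100000) and exits first. A small sketch of the intended classification, checking the stricter limit first (threshold values are the ones hard-coded in the probe):

# Exit statuses, as used by the probes.
OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3

def classify_message_backlog(count, warn_limit=100000, crit_limit=1000000):
    # Check the stricter limit first so that CRITICAL is actually reachable.
    if count > crit_limit:
        return CRITICAL
    if count > warn_limit:
        return WARNING
    return OK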
-# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Mario Lassnig, , 2013-2014 - -''' -Probe to check the queues of messages to submit by Hermes to the broker -''' - -import sys - -from rucio.core import monitor -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -queue_sql = """SELECT COUNT(*) FROM atlas_rucio.messages""" - -if __name__ == "__main__": - try: - session = get_session() - result = session.execute(queue_sql).fetchall() - print 'queues.messages %s' % result[0][0] - monitor.record_gauge(stat='queues.messages', value=result[0][0]) - - if result[0][0] > 100000: - sys.exit(WARNING) - elif result[0][0] > 1000000: - sys.exit(CRITICAL) - - except Exception, e: - sys.exit(UNKNOWN) - sys.exit(OK) diff --git a/tools/probes/common/check_metrix_closeness_agis b/tools/probes/common/check_metrix_closeness_agis deleted file mode 100755 index 4015575e6d..0000000000 --- a/tools/probes/common/check_metrix_closeness_agis +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/python -# Copyright European Organization for Nuclear Research (CERN) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Joaquin Bogado, , 2016 -# - Mario Lassnig, , 2016-2017 - -""" -Build the closeness for each link from AGIS. -""" - -import datetime -import json -import requests - -if __name__ == '__main__': - - RES = requests.get('http://atlas-agis-api.cern.ch/request/site/query/list_links/?json') - - if RES.ok: - RESJ = json.loads(RES.text) - DATA = {} - for site in RESJ: - if 'closeness' in site.keys(): - link = '%s:%s' % (site['src'], site['dst']) - DATA[link] = {'closeness': {'latest': site['closeness'], - 'timestamp': datetime.datetime.utcnow().isoformat()[:-7]}} - - with open('/data/metrix/data/closeness-agis/closeness-agis-{0}.json'.format(datetime.datetime.utcnow().isoformat()[:-7]), 'w') as f: - json.dump(DATA, f, indent=1, sort_keys=True) - - with open('/data/metrix/data/closeness-agis/latest.json', 'w') as f: - json.dump(DATA, f, indent=1, sort_keys=True) - - else: - print 'could not load json' diff --git a/tools/probes/common/check_metrix_files_done_dashb b/tools/probes/common/check_metrix_files_done_dashb deleted file mode 100755 index c873d1e051..0000000000 --- a/tools/probes/common/check_metrix_files_done_dashb +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/python -# Copyright European Organization for Nuclear Research (CERN) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Mario Lassnig, , 2016-2017 - -""" -Build the files done per link over the last 1 and 6 hours. -""" - -import datetime -import json -import requests - -ACTIVITIES = [ - 'Data Brokering', - 'Data Consolidation', - 'Data Rebalancing', - 'Deletion', - 'Express', - 'Functional Test', - 'Production Input', - 'Production Output', - 'Recovery', - 'SFO to EOS export', - 'Staging', - 'T0 Export', - 'T0 Tape', - 'User Subscriptions', - 'default'] - - -def get_link_done(hour): - - """ Retrieve the #transfers done from the dashboard. 
""" - - res = [] - - for tmp_activity in ACTIVITIES: - url1 = 'http://dashb-atlas-ddm.cern.ch/dashboard/request.py/matrix.json?activity=' - url2 = '&src_grouping=site&src_grouping=token&dst_grouping=site&dst_grouping=token&interval=' - url = url1 + tmp_activity.replace(' ', '%20') + url2 + str(int(hour * 60)) - tmp = requests.get(url).json() - for tmp_row in tmp['transfers']['rows']: - res.append((tmp_row[0], tmp_row[1], tmp_row[2], tmp_row[3], tmp_row[5], tmp_activity)) - - return res - - -if __name__ == '__main__': - - DATA = {} - - DONE_1 = get_link_done(1) - - for row in DONE_1: - src = '%s_%s' % (row[0], row[1]) - dst = '%s_%s' % (row[2], row[3]) - link = '%s:%s' % (src, dst) - if link.startswith(':') or link.endswith(':') or link.endswith('_None') or '_None:' in link: - continue - if link in DATA.keys(): - DATA[link]['files']['done'][row[5]] = {'1h': row[4], - '6h': row[4], - 'timestamp': datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S')} - else: - DATA[link] = {'files': {'done': {row[5]: {'1h': row[4], - '6h': row[4], - 'timestamp': datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S')}}}} - - DONE_6 = get_link_done(6) - - for row in DONE_6: - src = '%s_%s' % (row[0], row[1]) - dst = '%s_%s' % (row[2], row[3]) - link = '%s:%s' % (src, dst) - if link.startswith(':') or link.endswith(':') or link.endswith('_None') or '_None:' in link: - continue - if link in DATA.keys(): - if row[5] in DATA[link]['files']['done'].keys(): - DATA[link]['files']['done'][row[5]]['6h'] = row[4] - else: - DATA[link]['files']['done'][row[5]] = {'1h': row[4], - '6h': row[4], - 'timestamp': datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S')} - else: - DATA[link] = {'files': {'done': {row[5]: {'1h': 0, - '6h': row[4], - 'timestamp': datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S')}}}} - - MAPPING = None - with open('/data/metrix/data/mapping-rse-site/latest.json', 'r') as f: - MAPPING = json.load(f) - - for link in DATA: - src, dst = link.split(':') - DATA[link]['src_site'] = MAPPING[src] - DATA[link]['dst_site'] = MAPPING[dst] - - SITE_DATA = {} - for link in DATA: - site_link = '%s:%s' % (DATA[link]['src_site'], DATA[link]['dst_site']) - if site_link in SITE_DATA.keys(): - for activity in DATA[link]['files']['done']: - if activity in SITE_DATA[site_link]['files']['done'].keys(): - SITE_DATA[site_link]['files']['done'][activity]['1h'] += DATA[link]['files']['done'][activity]['1h'] - SITE_DATA[site_link]['files']['done'][activity]['6h'] += DATA[link]['files']['done'][activity]['6h'] - else: - SITE_DATA[site_link]['files']['done'][activity] = DATA[link]['files']['done'][activity] - else: - SITE_DATA[site_link] = {'files': DATA[link]['files']} - - with open('/data/metrix/data/files-done-dashb/files-done-dashb-{0}.json'.format(datetime.datetime.utcnow().isoformat()[:-7]), 'w') as f: - json.dump(SITE_DATA, f, indent=1, sort_keys=True) - - with open('/data/metrix/data/files-done-dashb/latest.json', 'w') as f: - json.dump(SITE_DATA, f, indent=1, sort_keys=True) diff --git a/tools/probes/common/check_metrix_files_queued_rucio_total b/tools/probes/common/check_metrix_files_queued_rucio_total deleted file mode 100755 index bf6181170e..0000000000 --- a/tools/probes/common/check_metrix_files_queued_rucio_total +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/python -# Copyright European Organization for Nuclear Research (CERN) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Mario Lassnig, , 2016-2017 -# - Joaquin Bogado, , 2016 - -""" -Build the files queued for transfer. -""" - -import datetime -import json - -import requests - -if __name__ == '__main__': - - URL = 'http://rucio-nagios-prod.cern.ch/files-queued-rucio-total/latest.json' - - R = requests.get(URL) - - DATA = {} - if R.status_code == 200: - DATA = R.json() - - with open('/data/metrix/data/files-queued-rucio-total/files-queued-rucio-total-{0}.json'.format(datetime.datetime.utcnow().isoformat()[:-7]), 'w') as f: - json.dump(DATA, f, indent=1, sort_keys=True) - - with open('/data/metrix/data/files-queued-rucio-total/latest.json', 'w') as f: - json.dump(DATA, f, indent=1, sort_keys=True) - else: - print 'cannot get', URL, R.status_code - - -CONTENT_OF_PROBE_FOR_REFERENCE = """ -#!/usr/bin/env python - -import datetime -import pprint -import json - -from rucio.db.sqla.session import get_session - -if __name__ == "__main__": - - session = get_session() - result = session.execute(''' -select src_site, dst_site, activity, count(*) -from ( -select - (select value from atlas_rucio.rse_attr_map where rse_id=source_rse_id and key='site') as src_site, - (select value from atlas_rucio.rse_attr_map where rse_id=dest_rse_id and key='site') as dst_site, - activity -from atlas_rucio.requests -where source_rse_id is not null -) -group by src_site, dst_site, activity -order by src_site, dst_site, activity''') - - data = {} - for row in result.fetchall(): - if '%s:%s' % (row[0], row[1]) in data.keys(): - data['%s:%s' % (row[0], row[1])]['files']['queued'][row[2]] = {'latest': row[3], 'timestamp': datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S')} - else: - data['%s:%s' % (row[0], row[1])] = {'files': {'queued': {row[2]: {'latest': row[3], 'timestamp': datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S')}}}} - -with open('/var/www/html/files-queued-rucio-total/latest.json', 'w') as f: - json.dump(data, f, indent=1, sort_keys=True) -""" diff --git a/tools/probes/common/check_metrix_fill_redis b/tools/probes/common/check_metrix_fill_redis deleted file mode 100755 index 4c7e33dec4..0000000000 --- a/tools/probes/common/check_metrix_fill_redis +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/python -# Copyright European Organization for Nuclear Research (CERN) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Mario Lassnig, , 2016-2017 - -""" -Fill the merged metrix into the Redis cache. -""" - -import json -import redis - -if __name__ == '__main__': - - R = redis.Redis() - - with open('/data/metrix/data/metrix/latest.json', 'rb') as f: - DATA = json.load(f) - - for link in DATA: - R.set('metrics#%s' % str(link), json.dumps(DATA[link])) diff --git a/tools/probes/common/check_metrix_latency_perfsonar b/tools/probes/common/check_metrix_latency_perfsonar deleted file mode 100755 index bd8d85f141..0000000000 --- a/tools/probes/common/check_metrix_latency_perfsonar +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/python -# Copyright European Organization for Nuclear Research (CERN) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. 
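Note: the deleted check_metrix_fill_redis probe above writes one Redis key per link ('metrics#<link>'), issuing a separate round trip for each SET. A pipeline batches the same writes; a minimal sketch assuming, as the probe does, a default Redis instance on localhost:

import json
import redis

def fill_metrics_cache(data, prefix='metrics#'):
    # Queue one SET per link and send them to Redis in a single batch.
    client = redis.Redis()
    pipe = client.pipeline()
    for link, metrics in data.items():
        pipe.set('%s%s' % (prefix, link), json.dumps(metrics))
    pipe.execute()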
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Mario Lassnig, , 2016-2017 - -""" -Build the latency from perfsonar per link. -""" - -import datetime -import json -import time - -from elasticsearch import Elasticsearch - -if __name__ == '__main__': - - ES = Elasticsearch([{'host': 'atlas-kibana.mwt2.org', 'port': 9200}]) - - BEGIN_TIME = int(time.mktime((datetime.datetime.now() - datetime.timedelta(hours=1)).timetuple()) * 1000) - END_TIME = int(time.mktime((datetime.datetime.now()).timetuple()) * 1000) - - BODY = """ - { - "size": 0, - "query": { - "bool": { - "must": [ - { - "query_string": { - "query": "srcVO:ATLAS AND destVO:ATLAS AND _type:latency AND delay_mean:>0", - "analyze_wildcard": true, - "lowercase_expanded_terms": false - } - }, - { - "range": { - "timestamp": { - "gte": %s, - "lte": %s, - "format": "epoch_millis" - } - } - } - ], - "must_not": [] - } - }, - "_source": { - "excludes": [] - }, - "aggs": { - "2": { - "terms": { - "field": "srcSite", - "size": 999, - "order": { - "_term": "desc" - } - }, - "aggs": { - "3": { - "terms": { - "field": "destSite", - "size": 999, - "order": { - "_term": "desc" - } - }, - "aggs": { - "1": { - "avg": { - "field": "delay_mean" - } - } - } - } - } - } - } - } - """ % (BEGIN_TIME, END_TIME) - - RES = ES.search(index='network_weather-2017*', body=BODY) - - DATA = {} - for src_site in RES['aggregations']['2']['buckets']: - for dst_site in src_site['3']['buckets']: - link = '%s:%s' % (src_site['key'], dst_site['key']) - if link.startswith(':') or link.endswith(':'): - continue - latency = dst_site['1']['value'] - DATA[link] = {'latency': {'latest': int(latency), - 'timestamp': datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S')}} - - with open('/data/metrix/data/latency-perfsonar/latency-perfsonar-{0}.json'.format(datetime.datetime.utcnow().isoformat()[:-7]), 'w') as f: - json.dump(DATA, f, indent=1, sort_keys=True) - - with open('/data/metrix/data/latency-perfsonar/latest.json', 'w') as f: - json.dump(DATA, f, indent=1, sort_keys=True) diff --git a/tools/probes/common/check_metrix_mapping_rse_site b/tools/probes/common/check_metrix_mapping_rse_site deleted file mode 100755 index 2fc11b2248..0000000000 --- a/tools/probes/common/check_metrix_mapping_rse_site +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/python -# Copyright European Organization for Nuclear Research (CERN) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Joaquin Bogado, , 2016 -# - Mario Lassnig, , 2016-2017 - -""" -Dictionary to map the RSE to a site. 
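Note: the check_metrix_latency_perfsonar probe above (and the packet-loss probe later in this diff) walk the same two-level srcSite/destSite aggregation returned by Elasticsearch and turn it into a flat {'SRC:DST': value} map, skipping links with an empty endpoint. A sketch of that flattening step, assuming the response layout shown in the probes (buckets under aggregation names '2' and '3', metric under '1'):

def flatten_link_buckets(aggregations, extract=lambda bucket: bucket['1']['value']):
    # Walk the srcSite ('2') and destSite ('3') buckets into a per-link dictionary.
    # `extract` pulls the metric out of the innermost aggregation; the default
    # matches the latency probe's avg aggregation.
    links = {}
    for src in aggregations['2']['buckets']:
        for dst in src['3']['buckets']:
            link = '%s:%s' % (src['key'], dst['key'])
            if link.startswith(':') or link.endswith(':'):
                continue
            links[link] = extract(dst)
    return links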
-""" - -import datetime -import json -import requests - -if __name__ == '__main__': - - RES = requests.get('http://atlas-agis-api.cern.ch/request/ddmendpoint/query/list/?json') - - if RES.ok: - RESJ = json.loads(RES.text) - - DATA = {} - for site in RESJ: - DATA[site['name']] = site['site'] - - # MANUAL OVERRIDE (not defined in AGIS) - DATA['CERN-P1_SFO'] = 'CERN-P1' - - with open('/data/metrix/data/mapping-rse-site/mapping-rse-site-{0}.json'.format(datetime.datetime.utcnow().isoformat()[:-7]), 'w') as f: - json.dump(DATA, f, indent=1, sort_keys=True) - - with open('/data/metrix/data/mapping-rse-site/latest.json', 'w') as f: - json.dump(DATA, f, indent=1, sort_keys=True) - - else: - print 'could not load json' diff --git a/tools/probes/common/check_metrix_mapping_se_site b/tools/probes/common/check_metrix_mapping_se_site deleted file mode 100755 index b27dea775c..0000000000 --- a/tools/probes/common/check_metrix_mapping_se_site +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/python -# Copyright European Organization for Nuclear Research (CERN) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Joaquin Bogado, , 2016 -# - Mario Lassnig, , 2016-2017 - -""" -Dictionary to map the SE to a site. -""" - -import datetime -import json -import requests - -if __name__ == '__main__': - - RES = requests.get('http://atlas-agis-api.cern.ch/request/ddmendpoint/query/list/?json') - - if RES.ok: - RESJ = json.loads(RES.text) - - DATA = {} - for site in RESJ: - for p in site['protocols']: - se = '%s:%s' % (p.split(':')[0], p.split(':')[1]) - DATA[se] = site['site'] - # if se in DATA: - # if site['site'] not in DATA[se]: - # DATA[se].append(site['site']) - # else: - # DATA[se] = [site['site']] - - # there are a few duplicates: - # for x in DATA: - # if len(DATA[x])>1: - # print x, DATA[x], len(DATA[x]) - # manually fix them for now: - DATA['davs://dav.ndgf.org'] = 'NDGF-T1' - DATA['gsiftp://gridftp.pic.es'] = 'pic' - DATA['srm://srm.ndgf.org'] = 'NDGF-T1' - DATA['davs://grid05.lal.in2p3.fr'] = 'GRIF-LAL' - - with open('/data/metrix/data/mapping-se-site/mapping-se-site-{0}.json'.format(datetime.datetime.utcnow().isoformat()[:-7]), 'w') as f: - json.dump(DATA, f, indent=1, sort_keys=True) - - with open('/data/metrix/data/mapping-se-site/latest.json', 'w') as f: - json.dump(DATA, f, indent=1, sort_keys=True) - - else: - print 'could not load json' diff --git a/tools/probes/common/check_metrix_mbps_dashb b/tools/probes/common/check_metrix_mbps_dashb deleted file mode 100755 index 78ecbb5295..0000000000 --- a/tools/probes/common/check_metrix_mbps_dashb +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/python -# Copyright European Organization for Nuclear Research (CERN) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Mario Lassnig, , 2016-2017 - -""" -Get the average megabytes per second per file on a link from the dashboard. -""" - -import datetime -import json -import requests - - -def get_link_throughput(minutes): - - """ Retrieve the data from the dashboard. 
""" - - url = 'http://dashb-atlas-ddm.cern.ch/dashboard/request.py/matrix.json'\ - '?activity=Analysis+Input&activity=Data+Brokering&activity=Data+Consolidation'\ - '&activity=Data+Rebalancing&activity=Deletion&activity=Express&activity=Functional+Test'\ - '&activity=Production+Input&activity=Production+Output&activity=Recovery'\ - '&activity=SFO+to+EOS+export&activity=Staging&activity=T0+Export&activity=T0+Tape'\ - '&activity=User+Subscriptions&activity=default'\ - '&src_grouping=site&src_grouping=token&dst_grouping=site&dst_grouping=token&interval=' + str(minutes) - - res = requests.get(url).json()['transfers']['rows'] - return res - - -if __name__ == '__main__': - - DATA = {} - - THROUGHPUTS = [('1h', get_link_throughput(60)), - ('1d', get_link_throughput(1440)), - ('1w', get_link_throughput(10080))] - - for metric, throughput in THROUGHPUTS: - - for row in throughput: - src = '%s_%s' % (row[0], row[1]) - dst = '%s_%s' % (row[2], row[3]) - link = '%s:%s' % (src, dst) - if link.startswith(':') or link.endswith(':') or link.endswith('_None') or '_None:' in link: - continue - - if link not in DATA.keys(): - DATA[link] = {'mbps': {'dashb': {'1h': [], - '1d': [], - '1w': [], - 'timestamp': datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S')}}} - - if metric == '1h': - if row[4] != 0: - DATA[link]['mbps']['dashb']['1h'] = row[4] - DATA[link]['mbps']['dashb']['1d'] = row[4] - DATA[link]['mbps']['dashb']['1w'] = row[4] - - if metric == '1d': - if row[4] != 0: - DATA[link]['mbps']['dashb']['1d'] = row[4] - DATA[link]['mbps']['dashb']['1w'] = row[4] - - if metric == '1w': - if row[4] != 0: - DATA[link]['mbps']['dashb']['1w'] = row[4] - - MAPPING = None - with open('/data/metrix/data/mapping-rse-site/latest.json', 'r') as f: - MAPPING = json.load(f) - - for link in DATA: - src, dst = link.split(':') - DATA[link]['src_site'] = MAPPING[src] - DATA[link]['dst_site'] = MAPPING[dst] - - SITE_DATA = {} - for link in DATA: - site_link = '%s:%s' % (DATA[link]['src_site'], DATA[link]['dst_site']) - if site_link in SITE_DATA.keys(): - if DATA[link]['mbps']['dashb']['1h'] != []: - if DATA[link]['mbps']['dashb']['1h'] != 0: - SITE_DATA[site_link]['mbps']['dashb']['1h'].append(DATA[link]['mbps']['dashb']['1h']) - if DATA[link]['mbps']['dashb']['1d'] != []: - if DATA[link]['mbps']['dashb']['1d'] != 0: - SITE_DATA[site_link]['mbps']['dashb']['1d'].append(DATA[link]['mbps']['dashb']['1d']) - if DATA[link]['mbps']['dashb']['1w'] != []: - if DATA[link]['mbps']['dashb']['1w'] != 0: - SITE_DATA[site_link]['mbps']['dashb']['1w'].append(DATA[link]['mbps']['dashb']['1w']) - else: - SITE_DATA[site_link] = {'mbps': DATA[link]['mbps']} - SITE_DATA[site_link]['mbps']['dashb']['1h'] = [SITE_DATA[site_link]['mbps']['dashb']['1h']] - SITE_DATA[site_link]['mbps']['dashb']['1d'] = [SITE_DATA[site_link]['mbps']['dashb']['1d']] - SITE_DATA[site_link]['mbps']['dashb']['1w'] = [SITE_DATA[site_link]['mbps']['dashb']['1w']] - - for link in SITE_DATA: - if SITE_DATA[link]['mbps']['dashb']['1h'] == [[]]: - SITE_DATA[link]['mbps']['dashb']['1h'] = [0.0] - if SITE_DATA[link]['mbps']['dashb']['1d'] == [[]]: - SITE_DATA[link]['mbps']['dashb']['1d'] = [0.0] - if SITE_DATA[link]['mbps']['dashb']['1w'] == [[]]: - SITE_DATA[link]['mbps']['dashb']['1w'] = [0.0] - - for link in SITE_DATA: - if [] in SITE_DATA[link]['mbps']['dashb']['1h']: - SITE_DATA[link]['mbps']['dashb']['1h'].remove([]) - if [] in SITE_DATA[link]['mbps']['dashb']['1d']: - SITE_DATA[link]['mbps']['dashb']['1d'].remove([]) - if [] in SITE_DATA[link]['mbps']['dashb']['1w']: 
- SITE_DATA[link]['mbps']['dashb']['1w'].remove([]) - - for link in SITE_DATA: - SITE_DATA[link]['mbps']['dashb']['1h'] = round(sum(SITE_DATA[link]['mbps']['dashb']['1h']) * 0.0000000002778, 2) - SITE_DATA[link]['mbps']['dashb']['1d'] = round(sum(SITE_DATA[link]['mbps']['dashb']['1d']) * 0.00000000001157, 2) - SITE_DATA[link]['mbps']['dashb']['1w'] = round(sum(SITE_DATA[link]['mbps']['dashb']['1w']) * 0.000000000001653, 2) - - for link in SITE_DATA: - if SITE_DATA[link]['mbps']['dashb']['1h'] == 0: - del SITE_DATA[link]['mbps']['dashb']['1h'] - if SITE_DATA[link]['mbps']['dashb']['1d'] == 0: - del SITE_DATA[link]['mbps']['dashb']['1d'] - if SITE_DATA[link]['mbps']['dashb']['1w'] == 0: - del SITE_DATA[link]['mbps']['dashb']['1w'] - - with open('/data/metrix/data/mbps-dashb/mbps-dashb-{0}.json'.format(datetime.datetime.utcnow().isoformat()[:-7]), 'w') as f: - json.dump(SITE_DATA, f, indent=1, sort_keys=True) - - with open('/data/metrix/data/mbps-dashb/latest.json', 'w') as f: - json.dump(SITE_DATA, f, indent=1, sort_keys=True) diff --git a/tools/probes/common/check_metrix_mbps_fts b/tools/probes/common/check_metrix_mbps_fts deleted file mode 100755 index 2399765f18..0000000000 --- a/tools/probes/common/check_metrix_mbps_fts +++ /dev/null @@ -1,172 +0,0 @@ -#!/usr/bin/python -# Copyright European Organization for Nuclear Research (CERN) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Mario Lassnig, , 2016-2017 -# - Hannes Hansen, , 2019 - -""" -Get the average megabytes per second per file on a link from ElasticSearch/FTS. -""" - -import datetime -import json -import time - -from elasticsearch import Elasticsearch -from six import iteritems - - -def dict_merge(dct, merge_dct): - - """ Merge two dictionaries. """ - - for k, v in iteritems(merge_dct): # pylint: disable=invalid-name,unused-variable - if k in dct and isinstance(dct[k], dict and isinstance(merge_dct[k], dict)): - dict_merge(dct[k], merge_dct[k]) - else: - dct[k] = merge_dct[k] - - -def get_mbps(label, hours): - - """ Retrieve the data from ElasticSearch. 
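Note: the dict_merge helper defined just above in check_metrix_mbps_fts (and duplicated in check_metrix_merge_metrix later in this diff) misplaces a closing parenthesis: in `isinstance(dct[k], dict and isinstance(merge_dct[k], dict))` the second argument of the outer isinstance() ends up being a boolean rather than a type, which raises a TypeError whenever a key is present in both dictionaries. The intended recursion condition is presumably the one below; separately, get_mbps in this probe appears to populate the module-level DATA name rather than its local `data`, so its return value stays empty. A corrected sketch of the merge helper:

def dict_merge(dct, merge_dct):
    """ Recursively merge merge_dct into dct (values from merge_dct win). """
    for key, value in merge_dct.items():
        # Recurse only when both sides hold a dictionary for this key.
        if key in dct and isinstance(dct[key], dict) and isinstance(value, dict):
            dict_merge(dct[key], value)
        else:
            dct[key] = value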
""" - - begin_time = int(time.mktime((datetime.datetime.now() - datetime.timedelta(hours=hours)).timetuple()) * 1000) - end_time = int(time.mktime((datetime.datetime.now()).timetuple()) * 1000) - - body = """ -{ - "size": 0, - "query": { - "bool": { - "must": [ - { - "query_string": { - "query": "_type:transfer-done", - "analyze_wildcard": true - } - }, - { - "range": { - "@timestamp": { - "gte": %s, - "lte": %s, - "format": "epoch_millis" - } - } - } - ], - "must_not": [] - } - }, - "_source": { - "excludes": [] - }, - "aggs": { - "2": { - "terms": { - "field": "payload.src-rse", - "size": 999, - "order": { - "_term": "desc" - } - }, - "aggs": { - "3": { - "terms": { - "field": "payload.dst-rse", - "size": 999, - "order": { - "_term": "desc" - } - }, - "aggs": { - "1": { - "percentiles": { - "field": "payload.duration", - "percents": [ - 95 - ], - "keyed": false - } - } - } - } - } - } - } -} -""" % (begin_time, end_time) - - esearch = Elasticsearch([{'host': 'atlas-kibana.mwt2.org', 'port': 9200}]) - res = esearch.search(index='rucio-events-2017*', body=body) - - data = {} - for src_site in res['aggregations']['2']['buckets']: - for dst_site in src_site['3']['buckets']: - tmp_link = '%s:%s' % (src_site['key'], dst_site['key']) - if tmp_link.startswith(':') or tmp_link.endswith(':'): - continue - throughput = dst_site['1']['values'][0]['value'] - if throughput == 'Infinity': - pass - else: - DATA[tmp_link] = {'mbps': {'fts': {label: round(throughput, 2)}}} - - return data - - -if __name__ == '__main__': - - DATA = get_mbps('1h', 1) - DAYS = get_mbps('1d', 24) - WEEKS = get_mbps('1w', 168) - - dict_merge(DATA, DAYS) - dict_merge(DATA, WEEKS) - - for link in DATA: - if len(DATA[link]['mbps']['fts'].keys()) != 3: - if '1d' not in DATA[link]['mbps']['fts'].keys(): - if '1h' in DATA[link]['mbps']['fts'].keys(): - DATA[link]['mbps']['fts']['1d'] = DATA[link]['mbps']['fts']['1h'] - if '1w' not in DATA[link]['mbps']['fts'].keys(): - DATA[link]['mbps']['fts']['1w'] = DATA[link]['mbps']['fts']['1d'] - DATA[link]['mbps']['fts']['timestamp'] = datetime.datetime.utcnow().isoformat()[:-7] - - MAPPING = None - with open('/data/metrix/data/MAPPING-rse-site/latest.json', 'r') as f: - MAPPING = json.load(f) - - for link in DATA: - src, dst = link.split(':') - DATA[link]['src_site'] = MAPPING[src] - DATA[link]['dst_site'] = MAPPING[dst] - - SITE_DATA = {} - for link in DATA: - site_link = '%s:%s' % (DATA[link]['src_site'], DATA[link]['dst_site']) - if site_link in SITE_DATA.keys(): - if '1h' in DATA[link]['mbps']['fts'].keys() and '1h' in SITE_DATA[site_link]['mbps']['fts'].keys(): - if DATA[link]['mbps']['fts']['1h'] > SITE_DATA[site_link]['mbps']['fts']['1h']: - SITE_DATA[site_link]['mbps']['fts']['1h'] = DATA[link]['mbps']['fts']['1h'] - if '1d' in DATA[link]['mbps']['fts'].keys() and '1d' in SITE_DATA[site_link]['mbps']['fts'].keys(): - if DATA[link]['mbps']['fts']['1d'] > SITE_DATA[site_link]['mbps']['fts']['1d']: - SITE_DATA[site_link]['mbps']['fts']['1d'] = DATA[link]['mbps']['fts']['1d'] - if '1w' in DATA[link]['mbps']['fts'].keys() and '1w' in SITE_DATA[site_link]['mbps']['fts'].keys(): - if DATA[link]['mbps']['fts']['1w'] > SITE_DATA[site_link]['mbps']['fts']['1w']: - SITE_DATA[site_link]['mbps']['fts']['1w'] = DATA[link]['mbps']['fts']['1w'] - else: - SITE_DATA[site_link] = {'mbps': DATA[link]['mbps']} - - with open('/data/metrix/data/mbps-fts/mbps-fts-{0}.json'.format(datetime.datetime.utcnow().isoformat()[:-7]), 'w') as f: - json.dump(SITE_DATA, f, indent=1, sort_keys=True) - - with 
open('/data/metrix/data/mbps-fts/latest.json', 'w') as f: - json.dump(SITE_DATA, f, indent=1, sort_keys=True) diff --git a/tools/probes/common/check_metrix_merge_metrix b/tools/probes/common/check_metrix_merge_metrix deleted file mode 100755 index 67736b5686..0000000000 --- a/tools/probes/common/check_metrix_merge_metrix +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/python -# Copyright European Organization for Nuclear Research (CERN) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Mario Lassnig, , 2016-2017 -# - Hannes Hansen, , 2019 - -""" -Merge all sub results into the final metrix. -""" - -import copy -import datetime -import json - -from six import iteritems - -INPUTS = ['closeness-agis', - 'latency-perfsonar', - 'mbps-fts', - 'mbps-dashb', - 'packetloss-perfsonar', - 'files-done-dashb', - 'files-queued-rucio-total'] - - -def dict_merge(dct, merge_dct): - - """ Merge two dictionaries. """ - - for k, v in iteritems(merge_dct): # pylint: disable=invalid-name,unused-variable - if k in dct and isinstance(dct[k], dict and isinstance(merge_dct[k], dict)): - dict_merge(dct[k], merge_dct[k]) - else: - dct[k] = merge_dct[k] - - -if __name__ == '__main__': - - DATA = {} - - for input_file in INPUTS: - with open('/data/metrix/data/%s/latest.json' % input_file, 'r') as open_file: - tmp = json.load(open_file) - dict_merge(DATA, tmp) - - # add separate src & dst - for link in DATA: - src, dst = link.split(':') - DATA[link]['src'] = src - DATA[link]['dst'] = dst - - # add convenience total - for link in DATA: - if 'files' in DATA[link].keys(): - if 'done' in DATA[link]['files'].keys(): - DATA[link]['files']['done-total-1h'] = 0 - for activity in DATA[link]['files']['done'].keys(): - if activity in DATA[link]['files']['done'].keys(): - DATA[link]['files']['done-total-1h'] += DATA[link]['files']['done'][activity]['1h'] - else: - DATA[link]['files']['done-total-1h'] = DATA[link]['files']['done'][activity]['1h'] - - DATA[link]['files']['done-total-6h'] = 0 - for activity in DATA[link]['files']['done'].keys(): - if activity in DATA[link]['files']['done'].keys(): - DATA[link]['files']['done-total-6h'] += DATA[link]['files']['done'][activity]['6h'] - else: - DATA[link]['files']['done-total-6h'] = DATA[link]['files']['done'][activity]['6h'] - - if 'queued' in DATA[link]['files'].keys(): - DATA[link]['files']['queued-total'] = 0 - for activity in DATA[link]['files']['queued'].keys(): - if activity in DATA[link]['files']['queued'].keys(): - DATA[link]['files']['queued-total'] += DATA[link]['files']['queued'][activity]['latest'] - else: - DATA[link]['files']['queued-total'] = DATA[link]['files']['queued'][activity]['latest'] - - # ndgf to signet fix - NDGF_SRC_ALL = [tmp_link for tmp_link in DATA if tmp_link.startswith('NDGF-T1')] - NDGF_DST_ALL = [tmp_link for tmp_link in DATA if tmp_link.endswith('NDGF-T1')] - - for ndgf_link in NDGF_SRC_ALL: - n_src, n_dst = ndgf_link.split(':') - if 'SiGNET:%s' % n_dst in DATA.keys(): - tmp = dict(DATA[ndgf_link].items() + DATA['SiGNET:%s' % n_dst].items()) - DATA['SiGNET:%s' % n_dst] = tmp - else: - DATA['SiGNET:%s' % n_dst] = copy.deepcopy(DATA[ndgf_link]) - DATA['SiGNET:%s' % n_dst]['src'] = 'SiGNET' - - for ndgf_link in NDGF_DST_ALL: - n_src, n_dst = ndgf_link.split(':') - if '%s:SiGNET' % n_src in DATA.keys(): - tmp = dict(DATA[ndgf_link].items() + DATA['%s:SiGNET' % 
n_src].items()) - DATA['%s:SiGNET' % n_src] = tmp - else: - DATA['%s:SiGNET' % n_src] = copy.deepcopy(DATA[ndgf_link]) - DATA['%s:SiGNET' % n_src]['dst'] = 'SiGNET' - - with open('/data/metrix/data/metrix/metrix-{0}.json'.format(datetime.datetime.utcnow().isoformat()[:-7]), 'w') as f: - json.dump(DATA, f, indent=8, sort_keys=True) - - with open('/data/metrix/data/metrix/latest.json', 'w') as f: - json.dump(DATA, f, indent=8, sort_keys=True) diff --git a/tools/probes/common/check_metrix_packetloss_perfsonar b/tools/probes/common/check_metrix_packetloss_perfsonar deleted file mode 100755 index 69b0432d35..0000000000 --- a/tools/probes/common/check_metrix_packetloss_perfsonar +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/python -# Copyright European Organization for Nuclear Research (CERN) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Mario Lassnig, , 2016-2017 - -""" -Build the packetloss from perfsonar per link. -""" - -import datetime -import json -import time - -from elasticsearch import Elasticsearch - -if __name__ == '__main__': - - ES = Elasticsearch([{'host': 'atlas-kibana.mwt2.org', 'port': 9200}]) - - BEGIN_TIME = int(time.mktime((datetime.datetime.now() - datetime.timedelta(hours=1)).timetuple()) * 1000) - END_TIME = int(time.mktime((datetime.datetime.now()).timetuple()) * 1000) - - BODY = """ - { - "size": 0, - "query": { - "bool": { - "must": [ - { - "query_string": { - "query": "srcVO:ATLAS AND destVO:ATLAS AND _type:packet_loss_rate", - "analyze_wildcard": true, - "lowercase_expanded_terms": false - } - }, - { - "range": { - "timestamp": { - "gte": %s, - "lte": %s, - "format": "epoch_millis" - } - } - } - ], - "must_not": [] - } - }, - "_source": { - "excludes": [] - }, - "aggs": { - "2": { - "terms": { - "field": "srcSite", - "size": 999, - "order": { - "_term": "desc" - } - }, - "aggs": { - "3": { - "terms": { - "field": "destSite", - "size": 999, - "order": { - "_term": "desc" - } - }, - "aggs": { - "1": { - "percentiles": { - "field": "packet_loss", - "percents": [ - 95 - ], - "keyed": false - } - } - } - } - } - } - } - } - """ % (BEGIN_TIME, END_TIME) - - RES = ES.search(index='network_weather-2017*', body=BODY) - - DATA = {} - for src_site in RES['aggregations']['2']['buckets']: - for dst_site in src_site['3']['buckets']: - link = '%s:%s' % (src_site['key'], dst_site['key']) - if link.startswith(':') or link.endswith(':'): - continue - packetloss = dst_site['1']['values'][0]['value'] - DATA[link] = {'packetloss': {'latest': round(packetloss, 3), # 3 decimal places in source data - 'timestamp': datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S')}} - - with open('/data/metrix/data/packetloss-perfsonar/packetloss-perfsonar-{0}.json'.format(datetime.datetime.utcnow().isoformat()[:-7]), 'w') as f: - json.dump(DATA, f, indent=1, sort_keys=True) - - with open('/data/metrix/data/packetloss-perfsonar/latest.json', 'w') as f: - json.dump(DATA, f, indent=1, sort_keys=True) diff --git a/tools/probes/common/check_new_dids b/tools/probes/common/check_new_dids deleted file mode 100755 index 0a6d08dd0f..0000000000 --- a/tools/probes/common/check_new_dids +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in 
compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Vincent Garonne, , 2013 - -''' -Probe to check the backlog of new dids. -''' -import sys -import traceback - -from rucio.core import monitor -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -if __name__ == "__main__": - try: - session = get_session() - result = session.execute('select /*+ index_FFS(dids DIDS_IS_NEW_IDX) */ count(1) from atlas_rucio.dids where is_new is not null').fetchone()[0] - monitor.record_gauge(stat='transmogrifier.new_dids', value=result) - print result - except: - print traceback.format_exc() - sys.exit(UNKNOWN) - sys.exit(OK) diff --git a/tools/probes/common/check_obsolete_replicas b/tools/probes/common/check_obsolete_replicas deleted file mode 100755 index efd839438e..0000000000 --- a/tools/probes/common/check_obsolete_replicas +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Vincent Garonne, , 2015 -# - Cedric Serfon, , 2018 - -''' -Probe to check the backlog of obsolete replicas. -''' - -import sys - -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - - -if __name__ == "__main__": - try: - SESSION = get_session() - QUERY = '''BEGIN - FOR u in (SELECT - a.rse_id AS rse_id, - NVL(b.files, 0) AS files, - NVL(b.bytes, 0) AS bytes, - SYS_EXTRACT_UTC(localtimestamp) AS updated_at - FROM - ( - SELECT - id AS rse_id - FROM - atlas_rucio.rses - WHERE - deleted=0) a - LEFT OUTER JOIN - ( - SELECT - /*+ INDEX_FFS(replicas REPLICAS_TOMBSTONE_IDX) */ - rse_id, - COUNT(1) AS files, - SUM(bytes) AS bytes - FROM - atlas_rucio.replicas - WHERE - ( - CASE - WHEN tombstone IS NOT NULL - THEN rse_id - END) IS NOT NULL - AND tombstone=to_date('1-1-1970 00:00:00','MM-DD-YYYY HH24:Mi:SS') - GROUP BY - rse_id) b - ON - a.rse_id=b.rse_id) - - LOOP - MERGE INTO atlas_rucio.RSE_USAGE - USING DUAL - ON (atlas_rucio.RSE_USAGE.rse_id = u.rse_id and source = 'obsolete') - WHEN NOT MATCHED THEN INSERT(rse_id, source, used, files, updated_at, created_at) - VALUES (u.rse_id, 'obsolete', u.bytes, u.files, u.updated_at, u.updated_at) - WHEN MATCHED THEN UPDATE SET used=u.bytes, files=u.files, updated_at=u.updated_at; - - MERGE INTO ATLAS_RUCIO.RSE_USAGE_HISTORY H - USING DUAL - ON (h.rse_id = u.rse_id and h.source = 'obsolete' and h.updated_at = u.updated_at) - WHEN NOT MATCHED THEN INSERT(rse_id, source, used, files, updated_at, created_at) - VALUES (u.rse_id, 'obsolete', u.bytes, u.files, u.updated_at, u.updated_at); - - COMMIT; - END LOOP; -END; -''' - SESSION.execute(QUERY) - except Exception as error: - print error - sys.exit(UNKNOWN) - sys.exit(OK) diff --git a/tools/probes/common/check_oracle b/tools/probes/common/check_oracle deleted file mode 100755 index a780932a14..0000000000 --- a/tools/probes/common/check_oracle +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Vincent Garonne, , 2013 -# - Cedric Serfon, , 2014 - -''' -Probe to check Oracle. -''' - -import sys - -import cx_Oracle - -from rucio.common.config import config_get -from rucio.core import monitor - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -if __name__ == "__main__": - - try: - sql_connection = config_get('database', 'default') - user = config_get('database', 'powuseraccount') - password = config_get('database', 'powuserpassword') - except: - print 'Cannot get DB parameters' - sys.exit(CRITICAL) - dsn = sql_connection.split('@')[1] - connection = cx_Oracle.connect(user, password, dsn) - cursor = connection.cursor() - try: - query = '''SELECT mname.metric_name, round(VALUE) -FROM -gv$sysmetric sysm, -v$metricname mname -WHERE -mname.metric_id IN (2004, 2006, 2030, 2145, 2058, 2147, 2000, 2016, 2143, 4000, 2024, 2025) -AND mname.group_id = 2 -AND inst_id = 2 -AND mname.metric_id = sysm.metric_id -AND mname.group_id = sysm.group_id -ORDER BY inst_id, mname.metric_name, mname.metric_unit''' - cursor.execute(query) - for (metric, value) in cursor: - m = 'oracle.%s' % metric.replace(' ', '_') - print m, value, ' ', - monitor.record_gauge(stat=m, value=value) - except: - sys.exit(UNKNOWN) - finally: - cursor.close() - connection.close() - # monitor.record_gauge(stat='judge.waiting_dids', value=result) - # print result - sys.exit(OK) diff --git a/tools/probes/common/check_oracle_partitions b/tools/probes/common/check_oracle_partitions deleted file mode 100755 index c784cf0234..0000000000 --- a/tools/probes/common/check_oracle_partitions +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Cedric Serfon, , 2016 - -''' -Probe to check if the partitions for the scopes are created -''' -import sys -import traceback - -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -if __name__ == "__main__": - exit_code = OK - try: - session = get_session() - result = session.execute('select scope from atlas_rucio.scopes where created_at>sysdate-10') - for scope in result.fetchall(): - partition = "_".join(scope[0].split('.')).upper() - result = session.execute("select table_name from ALL_TAB_PARTITIONS where partition_name='%s'" % (partition)) - parts = [item[0] for item in result.fetchall()] - for part in ['DELETED_DIDS', 'LOCKS', 'REPLICAS', 'CONTENTS', 'DIDS']: - if part not in parts: - print '%s partition on %s does not exist' % (partition, part) - exit_code = CRITICAL - except: - print traceback.format_exc() - sys.exit(UNKNOWN) - sys.exit(exit_code) diff --git a/tools/probes/common/check_ping_rucio_servers b/tools/probes/common/check_ping_rucio_servers deleted file mode 100755 index fbd4e40a1c..0000000000 --- a/tools/probes/common/check_ping_rucio_servers +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. 
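Note: the deleted check_oracle_partitions probe above derives the expected Oracle partition name from a Rucio scope by replacing dots with underscores and upper-casing, then verifies that a partition with that name exists for each of the DELETED_DIDS, LOCKS, REPLICAS, CONTENTS and DIDS tables. The naming step in isolation:

def scope_to_partition(scope):
    # e.g. 'user.jdoe' -> 'USER_JDOE', the partition name the probe looks for.
    return '_'.join(scope.split('.')).upper()

assert scope_to_partition('user.jdoe') == 'USER_JDOE'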
-# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Cedric Serfon, , 2014 -# -import sys -from rucio.client import Client - -if __name__ == "__main__": - host = sys.argv[1] - # print 'Ping %s' % host - c = Client(rucio_host='https://%s' % host) - print 'Rucio version installed : %s ' % c.ping()['version'] diff --git a/tools/probes/common/check_quotas b/tools/probes/common/check_quotas deleted file mode 100755 index 1bf17ca9c5..0000000000 --- a/tools/probes/common/check_quotas +++ /dev/null @@ -1,299 +0,0 @@ -#!/usr/bin/env python -# -# Copyright European Organization for Nuclear Research (CERN) - -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - David Cameron, , 2015 -# - Vincent Garonne, , 2015 -# - Cedric Serfon, , 2016-2019 -# -# -# Set quotas for physgroups -# Set quotas for users on their country's localgroupdisk -# Set quotas for all users on scratchdisk -# -# PY3K COMPATIBLE - -from __future__ import print_function -import json -import sys - -import requests - -from rucio.api.rse import get_rse, parse_rse_expression, get_rse_usage, list_rse_attributes -from rucio.api.account import list_accounts -from rucio.api.account_limit import set_account_limit -from rucio.api.config import get -from rucio.db.sqla.session import get_session -from rucio.common.exception import RucioException, RSENotFound, AccountNotFound - - -UNKNOWN = 3 -CRITICAL = 2 -WARNING = 1 -OK = 0 - -RESULT = OK - - -def get_storage_total(name, ddm_type): - global RESULT - try: - capacity = get_rse_usage(name, 'root', source='storage') - tier = list_rse_attributes(name)['tier'] - except RucioException as error: - if ddm_type == 'TEST': - print("Ignoring failed 'get capacity' for TEST endpoint: %s: %s" % (name, str(error))) - elif tier in ['0', '1', '2']: - print("WARNING: Could not get capacity of %s: %s" % (name, str(error))) - RESULT = WARNING - return None - try: - total = capacity[0]['total'] - return total - except IndexError as error: - print('No storage info for %s' % error) - return None - - -def set_group_quotas(): - global RESULT - quotas = {} - session = get_session() - query = '''select account, atlas_rucio.id2rse(c.rse_id), bytes from atlas_rucio.account_limits c, - (select rse_id, rse, value from atlas_rucio.rses a, atlas_rucio.rse_attr_map b where a.id=b.rse_id and b.key='physgroup' and b.value!='None' and a.deleted!=1) d - where c.rse_id=d.rse_id and c.account=d.value''' - try: - result = session.execute(query) - for account, rse, bytes_size in result: - quotas[account, rse] = bytes_size - except Exception as error: - print(error) - - url = 'http://atlas-agis-api.cern.ch/request/ddmendpoint/query/list/?json&state=ACTIVE&site_state=ACTIVE' - try: - resp = requests.get(url=url) - data = json.loads(resp.content) - except Exception as error: - print("Failed to load info from AGIS: %s" % str(error)) - RESULT = CRITICAL - - for ddmendpoint in data: - # Check if RSE exists - name = ddmendpoint['name'] - try: - get_rse(name) - except RSENotFound: - print("WARNING: RSE %s missing in Rucio" % name) - RESULT = WARNING - continue - - if ddmendpoint['phys_groups']: - for vomsgroup in ddmendpoint['quotas']: - group = vomsgroup.split('/')[2] - if group == ddmendpoint['phys_groups'][0]: - if (group, name) not in quotas: - print('Will add quota on RSE %s for group %s' % (name, 
group)) - try: - set_account_limit(group, name, ddmendpoint['quotas'][vomsgroup] * 1000 ** 4, 'root') - except AccountNotFound as error: - print(error) - elif ddmendpoint['quotas'][vomsgroup] == 999999: - if quotas[(group, name)] != -1: - print('Infinite quota defined in AGIS for group %s on %s' % (group, name)) - try: - set_account_limit(group, name, -1, 'root') - except AccountNotFound as error: - print(error) - elif quotas[(group, name)] != ddmendpoint['quotas'][vomsgroup] * 1000 ** 4: - print('On %s quota for %s differs Rucio : %s vs AGIS : %s' % (name, group, quotas[(group, name)], ddmendpoint['quotas'][vomsgroup] * 1000 ** 4)) - try: - set_account_limit(group, name, ddmendpoint['quotas'][vomsgroup] * 1000 ** 4, 'root') - except AccountNotFound as error: - print(error) - - -def set_user_quotas(ddm_type, fraction, absolute=0, account_type='USER'): - global RESULT - quotas = {} - accounts = {} - session = get_session() - user_accounts = [acc['account'] for acc in list_accounts({'account_type': account_type})] - - try: - for rse in parse_rse_expression('type=%s' % (ddm_type)): - total_space = get_storage_total(rse, ddm_type) - val = list_rse_attributes(rse) - if total_space: - def_quota = min(absolute, float(total_space) * fraction) if absolute else float(total_space) * fraction - quotas[rse] = float(val.get('default_account_limit_bytes', def_quota)) - elif 'default_account_limit_bytes' in val: - quotas[rse] = float(val['default_account_limit_bytes']) - - query = '''select account, atlas_rucio.id2rse(rse_id), bytes from atlas_rucio.account_limits where -rse_id in (select id from atlas_rucio.rses a, atlas_rucio.rse_attr_map b where a.id=b.rse_id and b.key='type' and b.value='%s' and a.deleted!=1) ''' % (ddm_type) - result = session.execute(query) - for account, rse, bytes_size in result: - if account not in accounts: - accounts[account] = {} - accounts[account][rse] = bytes_size - - for account in user_accounts: - if account not in accounts: - print('%s is missing' % (account)) - for rse in quotas: - print('Will add quota for account %s on %s' % (account, rse)) - try: - set_account_limit(account, rse, quotas[rse], 'root') - except AccountNotFound as error: - print(error) - - else: - for rse in accounts[account]: - if rse in quotas: - if abs(accounts[account][rse] - quotas[rse]) > 1000: - print('%s %s : Defined quota %s different from expected quota %s' % (rse, account, accounts[account][rse], quotas[rse])) - try: - set_account_limit(account, rse, quotas[rse], 'root') - except AccountNotFound as error: - print(error) - else: - print('%s cannot be found in quotas dictionary' % (rse)) - - for rse in quotas: - if rse not in accounts[account]: - print('Will add quota for account %s on %s' % (account, rse)) - try: - set_account_limit(account, rse, quotas[rse], 'root') - except AccountNotFound as error: - print(error) - - except Exception as error: - print(error) - RESULT = CRITICAL - - -def set_localgroupdisk_quotas(fraction, absolute): - global RESULT - countries = {} - dict_sites = {} - total_space = {} - session = get_session() - default_values = {} - - try: - query = '''select atlas_rucio.id2rse(rse_id), value from atlas_rucio.rses a, atlas_rucio.rse_attr_map b - where a.id=b.rse_id and b.key='country' and a.rse like '%LOCALGROUPDISK' and a.deleted!=1''' - result = session.execute(query) - for rse, country in result: - if country not in dict_sites: - dict_sites[country] = [] - dict_sites[country].append(rse) - if rse not in total_space: - total_space[rse] = get_storage_total(rse, 
'LOCALGROUPDISK') - val = list_rse_attributes(rse) - if 'default_account_limit_bytes' in val: - default_values[rse] = float(val['default_account_limit_bytes']) - except Exception as error: - print(error) - RESULT = CRITICAL - - try: - query = '''select a.account, a.key, a.value, a.updated_at, a.created_at from atlas_rucio.account_attr_map a, atlas_rucio.accounts b - where a.account=b.account and key like 'country-%' and b.status='ACTIVE' ''' - result = session.execute(query) - for account, key, _, _, _ in result: - if key not in countries: - countries[key] = [] - countries[key].append(account) - except Exception as error: - print(error) - RESULT = CRITICAL - - for country in countries: - accounts = {} - try: - country_short = country.split('-')[1] - query = '''select account, atlas_rucio.id2rse(rse_id), bytes from atlas_rucio.account_limits where - account in (select c.account from atlas_rucio.account_attr_map c, atlas_rucio.accounts d where d.status='ACTIVE' - and c.account=d.account and key='%s') - and rse_id in (select id from atlas_rucio.rses a, atlas_rucio.rse_attr_map b - where a.id=b.rse_id and b.key='country' and b.value='%s' and a.rse like '%%LOCALGROUPDISK') ''' % (country, country_short) - result = session.execute(query) - for account, rse, bytes_size in result: - if account not in accounts: - accounts[account] = {} - accounts[account][rse] = bytes_size - for account in countries[country]: - sites_with_no_quota = [] - if account not in accounts or accounts[account] == {}: - sites_with_no_quota = dict_sites[country_short] - print('%s : %s account is missing quota on sites : %s' % (country, account, sites_with_no_quota)) - elif len(accounts[account]) < len(dict_sites[country_short]): - sites_with_no_quota = list(set(dict_sites[country_short]) - set(accounts[account])) - print('%s : %s account is missing quota on some sites %s vs %s : %s' % (country, account, len(accounts[account]), len(dict_sites[country_short]), sites_with_no_quota)) - for rse in sites_with_no_quota: - quota = None - if rse in default_values: - quota = default_values[rse] - elif rse in total_space and total_space[rse]: - quota = min(absolute, float(total_space[rse]) * fraction) if absolute else float(total_space[rse]) * fraction - quota = total_space[rse] * fraction - if quota: - print("Set quota of %dTB on %s for %s" % (quota / 1000 ** 4, rse, account)) - try: - set_account_limit(account, rse, quota, 'root') - except AccountNotFound as error: - print(error) - except Exception as error: - print(error) - RESULT = CRITICAL - - -if __name__ == '__main__': - try: - REL_QUOTA_SCRATCHDISK = get('quota', 'rel_SCRATCHDISK', 'root') - REL_QUOTA_SCRATCHDISK = float(REL_QUOTA_SCRATCHDISK) / 100 - except Exception: - REL_QUOTA_SCRATCHDISK = 0.3 - try: - ABS_QUOTA_SCRATCHDISK = get('quota', 'abs_SCRATCHDISK', 'root') - ABS_QUOTA_SCRATCHDISK = ABS_QUOTA_SCRATCHDISK - except Exception: - ABS_QUOTA_SCRATCHDISK = 20000000000000 - try: - REL_QUOTA_SCRATCHDISK_GROUP = get('quota', 'rel_SCRATCHDISK_group', 'root') - REL_QUOTA_SCRATCHDISK_GROUP = float(REL_QUOTA_SCRATCHDISK_GROUP) / 100 - except Exception: - REL_QUOTA_SCRATCHDISK_GROUP = 0.1 - try: - ABS_QUOTA_SCRATCHDISK_GROUP = get('quota', 'abs_SCRATCHDISK_GROUP', 'root') - ABS_QUOTA_SCRATCHDISK_GROUP = ABS_QUOTA_SCRATCHDISK_GROUP - except Exception: - ABS_QUOTA_SCRATCHDISK_GROUP = 10000000000000 - try: - REL_QUOTA_LOCALGROUPDISK = get('quota', 'rel_LOCALGROUPDISK', 'root') - REL_QUOTA_LOCALGROUPDISK = float(REL_QUOTA_LOCALGROUPDISK) / 100 - except Exception: - 
REL_QUOTA_LOCALGROUPDISK = 0.95 - try: - ABS_QUOTA_LOCALGROUPDISK = get('quota', 'abs_LOCALGROUPDISK', 'root') - ABS_QUOTA_LOCALGROUPDISK = float(ABS_QUOTA_LOCALGROUPDISK) / 100 - except Exception: - ABS_QUOTA_LOCALGROUPDISK = 0 - - # For groups the quota is set and updated using the AGIS information - set_group_quotas() - # For the SCRATCH area, the quota is created and updated each time the size of the space token changes - # If the RSEs have the RSE attribute default_account_limit_bytes set, the value will override the default value - set_user_quotas(ddm_type='SCRATCHDISK', fraction=REL_QUOTA_SCRATCHDISK, absolute=ABS_QUOTA_SCRATCHDISK, account_type='USER') - set_user_quotas(ddm_type='SCRATCHDISK', fraction=REL_QUOTA_SCRATCHDISK_GROUP, absolute=ABS_QUOTA_SCRATCHDISK_GROUP, account_type='GROUP') - # For the LOCALGROUPDISK area, the quota is created once but never updated - # The value set is 95% of the space or default_account_limit_bytes if it is set as RSE attribute - set_localgroupdisk_quotas(fraction=REL_QUOTA_LOCALGROUPDISK, absolute=ABS_QUOTA_LOCALGROUPDISK) - sys.exit(RESULT) diff --git a/tools/probes/common/check_recoverable_suspicious b/tools/probes/common/check_recoverable_suspicious deleted file mode 100755 index cc8db424d5..0000000000 --- a/tools/probes/common/check_recoverable_suspicious +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/env python -# Copyright 2012-2018 CERN for the benefit of the ATLAS collaboration. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
-# -# Authors: -# - Cedric Serfon, , 2018 -# -# PY3K COMPATIBLE - -from __future__ import print_function -import sys - -from rucio.db.sqla.constants import BadFilesStatus -from rucio.db.sqla.session import get_session -from rucio.core.replica import list_replicas, declare_bad_file_replicas - - -UNKNOWN = 3 -CRITICAL = 2 -WARNING = 1 -OK = 0 - -if __name__ == '__main__': - - try: - CNT_THRESHOLD = sys.argv[1] - except IndexError: - print('No threshold value defined for CNT_THRESHOLD, will use the default one : 10') - CNT_THRESHOLD = 10 - try: - NB_DAYS = sys.argv[2] - except IndexError: - print('No threshold value defined for NB_DAYS, will use the default one : 3') - NB_DAYS = 3 - - EXITVALUE = OK - - SESSION = get_session() - - QUERY = ''' -SELECT - COUNT(*), - scope, - name, - atlas_rucio.id2rse(rse_id) -FROM - atlas_rucio.bad_replicas a -WHERE - state='S' -AND created_at>SYSDATE-%s -AND atlas_rucio.id2rse(rse_id) like '%%DATADISK' -AND EXISTS - ( - SELECT - 1 - FROM - atlas_rucio.replicas - WHERE - state='A' - AND scope=a.scope - AND name=a.name - AND rse_id!=a.rse_id) -AND NOT EXISTS - ( - SELECT - 1 - FROM - atlas_rucio.bad_replicas - WHERE - scope=a.scope - AND name=a.name - AND rse_id=a.rse_id - AND created_at>SYSDATE-%s - AND state IN ('B', - 'D', - 'R', - 'L')) -GROUP BY - scope, - name, - rse_id -HAVING - COUNT(*)>%s -''' % (NB_DAYS, NB_DAYS, CNT_THRESHOLD) - - BAD_SURLS = {} - try: - RESULT = SESSION.execute(QUERY) - for cnt, scope, name, rse in RESULT: - if rse not in BAD_SURLS: - BAD_SURLS[rse] = [] - for rep in list_replicas([{'scope': scope, 'name': name}]): - for site in rep['rses']: - if site == rse: - BAD_SURLS[rse].append(rep['rses'][site][0]) - except Exception as error: - print(error) - EXITVALUE = CRITICAL - sys.exit(EXITVALUE) - - for rse in BAD_SURLS: - print('Declaring %s bad replicas on %s : %s' % (len(BAD_SURLS[rse]), rse, BAD_SURLS[rse])) - if len(BAD_SURLS[rse]) > 100: - print('Too many (%s) suspicious files on %s. There might be a problem. Please investigate.' % (len(BAD_SURLS[rse]), rse)) - EXITVALUE = max(WARNING, EXITVALUE) - else: - try: - declare_bad_file_replicas(pfns=BAD_SURLS[rse], reason='Lost or bad. Automatic recovery', issuer='root', status=BadFilesStatus.BAD, session=None) - except Exception as error: - print(error) - EXITVALUE = CRITICAL - - sys.exit(EXITVALUE) diff --git a/tools/probes/common/check_redis b/tools/probes/common/check_redis deleted file mode 100755 index 9757c067b8..0000000000 --- a/tools/probes/common/check_redis +++ /dev/null @@ -1,102 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Ralph Vigne, 2015 -# - Cedric Serfon, 2017 -# -# If the connection to Elasticsearch (ES) is broken, Redis starts persisting data on disk till ES is back. -# If the downtime of ES is longer, the disk might run full and take the node down. Currently the disk space -# lasts for about 30 hours (2Gb/h). In order to protect the node from running out of disk space, this node -# check the free disk space and if below a certain threshold, drops entries from the rucio.debug queue to -# free up disk space again. -# -# If after dropping data, the disk space is low, a critical error is raised and an email alert sent. 
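A minimal sketch of the disk-protection loop described above, assuming a local Redis instance; the queue name 'rucio.debug' follows the description, while the host and the thresholds (in MB) are illustrative placeholders, not the probe's exact values:

    import os
    import redis

    WARNING_MB = 8 * 1024       # start trimming below roughly 8 GB of free space (illustrative)
    CRITICAL_MB = 4 * 1024      # raise a critical alert below roughly 4 GB of free space (illustrative)

    def free_disk_mb(path='/'):
        # free disk space of the filesystem holding the Redis persistence files, in MB
        stats = os.statvfs(path)
        return stats.f_bavail * stats.f_frsize / 1024 / 1024

    server = redis.Redis('localhost')   # placeholder host
    if free_disk_mb() < WARNING_MB:
        # drop half of the debug queue to win back disk space
        server.ltrim('rucio.debug', server.llen('rucio.debug') // 2, -1)
        if free_disk_mb() < CRITICAL_MB:
            print('still below the critical free-space threshold, alert needed')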
-# -# This probe should be executed every 15 minutes on the host running Redis. - -import os -import socket -import sys -import time - -import redis - -from pystatsd import Client - -UNKNOWN = 3 -CRITICAL = 2 -WARNING = 1 -OK = 0 - -REDIS_URL = "rucio-logger-prod-01" -REDIS_QUEUES = ["rucio.haproxy", "rucio.apache", "rucio.daemons", "rucio.debug"] - -# Queue size at which it starts to be reported (but not alerted!) -THRESHOLD_QUEUE_SIZE = 50000 - -# Threshold where it starts to be reported (but not alerted!) -THRESHOLD_GB = 8 -# Threshold where alerts are sent via email -THRESHOLD_CRITICAL_GB = 4 - -# Number of events removed when critical, e.g. 2.5M events per hour as a rule of thumb -TRIM_CRITICAL_EVENTS = 5000000 - -HOST = socket.getfqdn() - - -def get_free_disk_space(): - state = os.statvfs('/') - return (state.f_bavail * state.f_frsize) / 1024 / 1024 - - -def report_to_graphite(name, value): - serv = 'rucio-graphite-prod-02.cern.ch' - port = 8125 - try: - Client(host=serv, port=port, prefix='%s.%s' % ('rucio.monitoring.redis', HOST)).gauge(name, value) - except Exception: - pass - - -if __name__ == '__main__': - server = redis.Redis(REDIS_URL) - STATUS = OK - - for queue in REDIS_QUEUES: - try: - queue_len = server.llen(queue) - # Report to Graphite - report_to_graphite('queue.%s' % queue, queue_len) - if queue_len > THRESHOLD_QUEUE_SIZE: - STATUS = WARNING - except redis.exceptions.ConnectionError as error: - print error - sys.exit(CRITICAL) - - free_disk_space = get_free_disk_space() - report_to_graphite('avail_diskspace', free_disk_space) - - if free_disk_space < THRESHOLD_GB * (1024): # GB to MB - debug_cutoff = server.llen('rucio.debug') / 2 - server.ltrim('rucio.debug', debug_cutoff, -1) - time.sleep(120) # Sleeping two minutes to wait for the effects of queue trimming on disk - free_disk_space_after = get_free_disk_space() - report_to_graphite('avail_diskspace', free_disk_space_after) - STATUS = WARNING if STATUS == OK else STATUS - if free_disk_space_after < THRESHOLD_CRITICAL_GB * (1024): - print '[%s] Cleaning debug queue wasn\'t sufficient to free enough disk space. Last gain: %d MB (%d entries)' % (HOST, (free_disk_space_after - free_disk_space), debug_cutoff) - server.ltrim('rucio.daemons', TRIM_CRITICAL_EVENTS, -1) - time.sleep(120) # Sleeping two minutes to wait for the effects of queue trimming on disk - crit_free_disk_space = get_free_disk_space() - report_to_graphite('avail_diskspace', crit_free_disk_space) - print '[%s] Removing %s events from daemon queue to gain %d MB of disk space.' % (HOST, TRIM_CRITICAL_EVENTS, (crit_free_disk_space - free_disk_space_after)) - STATUS = CRITICAL - - sys.exit(STATUS) diff --git a/tools/probes/common/check_requests_to_rses b/tools/probes/common/check_requests_to_rses deleted file mode 100755 index ba7523450d..0000000000 --- a/tools/probes/common/check_requests_to_rses +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Wen Guan, , 2014 - -''' -Probe to check requests.
-''' - -import datetime -import sys -import time - -from rucio.core import monitor -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - - -if __name__ == "__main__": - - scope = "stresstest" - - try: - session = get_session() - utctime = datetime.datetime.utcnow() - utctimeInt = int(time.mktime(utctime.timetuple())) - timeEnd = utctimeInt - timezoneoffset = int((datetime.datetime.now() - datetime.datetime.utcnow()).seconds) - - # check requests to different sites - sql = "select /*+ index_ffs(REQUESTS REQUESTS_TYP_STA_CRE_IDX) */ atlas_rucio.id2rse(dest_rse_id) rse, state, count(*) \ - from atlas_rucio.requests where request_type='T' group by dest_rse_id, state" - result = session.execute(sql).fetchall() - for rse, state, num in result: - # print "requests.%s.%s %s" % (state, rse, num) - monitor.record_gauge(stat='%s.requests.%s.%s' % (scope, state, rse), value=num) - - # check left requests which are 12 hours old to different sites - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=12) - sql = "select /*+ index_ffs(REQUESTS REQUESTS_TYP_STA_CRE_IDX) */ atlas_rucio.id2rse(dest_rse_id) rse, state, count(*) \ - from atlas_rucio.requests where request_type='T' and created_at<=to_timestamp('" + str(timeLimit) + "','YYYY-MM-dd HH24:MI:SS.FF') group by dest_rse_id, state" - result = session.execute(sql).fetchall() - for rse, state, num in result: - # print 'requests.12Hours_old.%s.%s %s' % (state, rse, num) - monitor.record_gauge(stat='%s.requests.12Hours_old.%s.%s' % (scope, state, rse), value=num) - - # check left requests which are 24 hours old to different sites - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=24) - sql = "select /*+ index_ffs(REQUESTS REQUESTS_TYP_STA_CRE_IDX) */ atlas_rucio.id2rse(dest_rse_id) rse, state, count(*) \ - from atlas_rucio.requests where request_type='T' and created_at<=to_timestamp('" + str(timeLimit) + "','YYYY-MM-dd HH24:MI:SS.FF') group by dest_rse_id, state" - result = session.execute(sql).fetchall() - for rse, state, num in result: - # print 'requests.24Hours_old.%s.%s %s' % (state, rse, num) - monitor.record_gauge(stat='%s.requests.24Hours_old.%s.%s' % (scope, state, rse), value=num) - - # check left requests which are 48 hours old to different sites - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=48) - sql = "select /*+ index_ffs(REQUESTS REQUESTS_TYP_STA_CRE_IDX) */ atlas_rucio.id2rse(dest_rse_id) rse, state, count(*) from atlas_rucio.requests \ - where request_type='T' and created_at<=to_timestamp('" + str(timeLimit) + "','YYYY-MM-dd HH24:MI:SS.FF') group by dest_rse_id, state" - result = session.execute(sql).fetchall() - for rse, state, num in result: - # print 'requests.48Hours_old.%s.%s %s' % (state, rse, num) - monitor.record_gauge(stat='%s.requests.48Hours_old.%s.%s' % (scope, state, rse), value=num) - - except: - sys.exit(UNKNOWN) - finally: - session.remove() - sys.exit(OK) diff --git a/tools/probes/common/check_rse_attributes b/tools/probes/common/check_rse_attributes deleted file mode 100755 index c3d670a032..0000000000 --- a/tools/probes/common/check_rse_attributes +++ /dev/null @@ -1,176 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Vincent Garonne, , 2013-2014 -# - David Cameron, , 2014-2015 -# - Tomas Kouba, , 2014 -# - Cedric Serfon, , 2016-2019 -# - Dimitrios Christidis, , 2019 -# -# PY3K COMPATIBLE - -from __future__ import print_function - -import json -import sys -import traceback - - -import requests - -from rucio.db.sqla.session import get_session -from rucio.common.exception import RucioException, RSENotFound -from rucio.core.rse import add_rse_attribute, get_rses_with_attribute, del_rse_attribute, get_rse_usage, get_rse -from rucio.core.rse_expression_parser import parse_expression - - -UNKNOWN = 3 -CRITICAL = 2 -WARNING = 1 -OK = 0 - -RESULT = OK - -# Map of countries to 2-letter code -# Eventually should be available in AGIS -COUNTRYCODES = {'Argentina': 'ar', 'Armenia': 'am', 'Australia': 'au', 'Austria': 'at', - 'Canada': 'ca', 'Switzerland': 'ch', 'Chile': 'cl', 'China': 'cn', - 'Czech Republic': 'cz', 'Germany': 'de', 'Denmark': 'dk', 'Spain': 'es', - 'France': 'fr', 'Greece': 'gr', 'Israel': 'il', 'Italy': 'it', - 'Japan': 'jp', 'Netherlands': 'nl', 'Nordic': 'dk', 'Norway': 'no', - 'Poland': 'pl', 'Portugal': 'pt', 'Romania': 'ro', 'Russian Federation': 'ru', - 'Sweden': 'se', 'Slovakia': 'sk', 'Slovenia': 'si', 'Turkey': 'tr', 'Taiwan': 'tw', - 'UK': 'uk', 'USA': 'us', 'South Africa': 'za'} - - -# Takes DDM endpoint information from AGIS and adds selected attributes to RSEs -if __name__ == '__main__': - - URL = 'http://atlas-agis-api.cern.ch/request/ddmendpoint/query/list/?json&state=ACTIVE&site_state=ACTIVE' - try: - RESP = requests.get(url=URL) - DATA = json.loads(RESP.content) - except Exception as error: - print("Failed to load info from AGIS: %s" % str(error)) - sys.exit(WARNING) - - INTERNAL_PROXY_MAPPING = {} - URL = 'http://atlas-agis-api.cern.ch/request/serviceobject/query/list/?json&state=ACTIVE' - try: - RESP = requests.get(url=URL) - DIC = json.loads(RESP.content) - for rse in DIC: - for service in DIC[rse]: - if 'protocols' in DIC[rse][service]: - for protocol in DIC[rse][service]['protocols']: - if DIC[rse][service]['protocols'][protocol]['doortype'] == 'proxyinternal': - INTERNAL_PROXY_MAPPING[rse] = DIC[rse][service]['protocols'][protocol]['endpoint'] - except Exception as error: - print("Failed to load info from AGIS: %s" % str(error)) - sys.exit(WARNING) - - SPACE_USAGE = [rse['rse'] for rse in get_rses_with_attribute('space_usage_method')] - SOURCE_USED_SPACE = [rse['rse'] for rse in get_rses_with_attribute('source_for_used_space')] - HAS_PROXY = [rse['rse'] for rse in get_rses_with_attribute('root-proxy-internal')] - NOT_FOR_EXTRA_COPY = [rse['rse'] for rse in get_rses_with_attribute('notforextracopy')] - SCRATCHDISK_WITHOUT_SPACE = [rse['rse'] for rse in parse_expression(expression='type=SCRATCHDISK&freespace<50')] - for rse in DATA: - - name = rse['name'] - - session = get_session() - # Check if RSE exists - try: - get_rse(name, session=session) - except RSENotFound: - continue - - print('Working on %s' % name) - - try: - if rse['site'] in INTERNAL_PROXY_MAPPING: - add_rse_attribute(name, 'root-proxy-internal', str(INTERNAL_PROXY_MAPPING[rse['site']]), session=session) - elif name in HAS_PROXY: - print('Will remove root-proxy-internal for site %s' % name) - del_rse_attribute(name, 'root-proxy-internal', session=session) - - if name in SCRATCHDISK_WITHOUT_SPACE: - add_rse_attribute(name, 'notforextracopy', 1, session=session) - elif name in NOT_FOR_EXTRA_COPY: - 
print('Will remove notforextracopy for site %s' % rse['name']) - del_rse_attribute(name, 'notforextracopy', session=session) - - add_rse_attribute(name, 'ALL', '1', session=session) - add_rse_attribute(name, 'tier', str(rse['tier_level']), session=session) - add_rse_attribute(name, 'istape', str(rse['is_tape']), session=session) - add_rse_attribute(name, 'cloud', str(rse['cloud']), session=session) - add_rse_attribute(name, 'spacetoken', str(rse['token']), session=session) - add_rse_attribute(name, 'site', str(rse['site']), session=session) - add_rse_attribute(name, 'type', str(rse['type']), session=session) - if rse['space_usage_url']: - add_rse_attribute(name, 'space_usage_method', 'json', session=session) - elif name in SPACE_USAGE: - del_rse_attribute(name, 'space_usage_method', session=session) - if rse['space_method'] == 'rucio': - add_rse_attribute(name, 'source_for_used_space', 'rucio', session=session) - elif name in SOURCE_USED_SPACE: - del_rse_attribute(name, 'source_for_used_space', session=session) - if rse['type'] == 'LOCALGROUPDISK' or rse['type'] == 'LOCALGROUPTAPE': - country = COUNTRYCODES[str(rse['country'])] - if name.startswith('CERN'): - country = 'cern' - add_rse_attribute(name, 'country', country, session=session) - if rse['phys_groups']: - add_rse_attribute(name, 'physgroup', str(rse['phys_groups'][0]), session=session) - if rse['type'] not in ['OS_LOGS', 'OS_ES']: - if isinstance(rse['servedrestfts']['MASTER'], list): - add_rse_attribute(name, 'fts', ','.join(rse['servedrestfts']['MASTER']), session=session) - else: - add_rse_attribute(name, 'fts', str(rse['servedrestfts']['MASTER']), session=session) - if rse['type'] not in ['OS_LOGS', 'OS_ES']: - if isinstance(rse['servedrestfts']['TESTING'], list): - add_rse_attribute(name, 'fts_testing', ','.join(rse['servedrestfts']['TESTING']), session=session) - else: - add_rse_attribute(name, 'fts_testing', str(rse['servedrestfts']['TESTING']), session=session) - if 'datapolicies' in rse: - add_rse_attribute(name, 'datapolicyt0disk', 'T0Disk' in rse['datapolicies'], session=session) - add_rse_attribute(name, 'datapolicyt0tape', 'T0Tape' in rse['datapolicies'], session=session) - add_rse_attribute(name, 'datapolicyt0taskoutput', 'T0TaskOutput' in rse['datapolicies'], session=session) - add_rse_attribute(name, 'datapolicynucleus', 'Nucleus' in rse['datapolicies'], session=session) - - space_used = get_rse_usage(rse=name, source='storage', session=session) - unavailable_space = get_rse_usage(rse=name, source='unavailable', session=session) - expired = get_rse_usage(rse=name, source='expired', session=session) - min_free_space = get_rse_usage(rse=name, source='min_free_space', session=session) - expired = expired[0]['total'] if expired != [] else 0 - if unavailable_space and unavailable_space[0]['total']: - unavailable_space = unavailable_space[0]['total'] - else: - unavailable_space = 0 - if space_used: - min_free_space = min_free_space[0]['total'] if min_free_space else 0 - if space_used[0]['used'] == -1: - total_space = space_used[0]['total'] - space_used = get_rse_usage(rse=name, source='rucio', session=session) - freespace = total_space - space_used[0]['used'] - else: - freespace = space_used[0]['free'] - freespace = float(freespace - unavailable_space + expired - min_free_space) / 1000 / 1000 / 1000 / 1000 - freespace = freespace if freespace > 0 else 0 - add_rse_attribute(name, 'freespace', int(freespace), session=session) - - except RucioException as error: - print(error) - sys.exit(CRITICAL) - except Exception: - 
print(traceback.format_exc()) - RESULT = WARNING - finally: - session.commit() - sys.exit(RESULT) diff --git a/tools/probes/common/check_rses_distance b/tools/probes/common/check_rses_distance deleted file mode 100755 index 8e4ba25c64..0000000000 --- a/tools/probes/common/check_rses_distance +++ /dev/null @@ -1,338 +0,0 @@ -#!/usr/bin/env python -""" - Copyright European Organization for Nuclear Research (CERN) - - Licensed under the Apache License, Version 2.0 (the "License"); - You may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - - Authors: - - Wen Guan, , 2015-2016 - - Cedric Serfon, , 2017-2018 -""" - -import json -import sys -import traceback -import urllib2 -import urlparse - -import requests - -from rucio.db.sqla.constants import RSEType -from rucio.common.config import config_get -from rucio.core import request as request_core -from rucio.core.distance import add_distance_short, get_distances, update_distances -from rucio.core.rse import list_rses, get_rse_attribute - -UNKNOWN = 3 -CRITICAL = 2 -WARNING = 1 -OK = 0 - -__USERCERT = config_get('conveyor', 'usercert') - - -def get_agis_sitenames(): - url = 'http://atlas-agis-api.cern.ch/request/ddmendpoint/query/list/?json' - try: - result = {} - resp = urllib2.urlopen(url) - content = resp.read() - rses = json.loads(content) - for item in rses: - rse = item['name'] - sitename = item['site'].upper() - result[rse] = {'sitename': sitename, - 'protocols': [protocol for protocol in item['protocols'].keys()], - 'main_fts': item['servedrestfts']['MASTER'][0] if 'MASTER' in item['servedrestfts'] else None} - return OK, result - except: - return WARNING, "Failed to load rse-sitename data from url=%s, error: %s" % (url, traceback.format_exc()) - - -def get_agis_distances(): - url = 'http://atlas-agis-api.cern.ch/request/site/query/list_links/?json' - try: - top_distance = 0 - result = {} - resp = urllib2.urlopen(url) - content = resp.read() - site_list = json.loads(content) - for item in site_list: - if 'src' in item and 'dst' in item and 'closeness' in item: - dst = item['dst'].upper() - src = item['src'].upper() - if src not in result: - result[src] = {} - result[src][dst] = item['closeness'] - - # fix transfer inside the same site - result[src][src] = 0 - if dst not in result: - result[dst] = {} - result[dst][dst] = 0 - - if item['closeness'] > top_distance: - top_distance = item['closeness'] - return OK, top_distance, result - except: - return WARNING, None, "Failed to load distance data from url=%s, error: %s" % (url, traceback.format_exc()) - - -def get_active_ftses(): - ftses = [] - for fts_hosts in get_rse_attribute(key='fts'): - for fts_host in fts_hosts.split(","): - if fts_host not in ftses: - ftses.append(fts_host) - return ftses - - -def get_fts_transfer_summary(fts_host): - try: - result = {} - url = fts_host.replace('8446', '8449') - url = url + '/fts3/ftsmon/overview?vo=atlas&time_window=1&page=all' - r = requests.get('%s' % url, - verify=False, - cert=(__USERCERT, __USERCERT), - headers={'Content-Type': 'application/json'}) - if r and r.status_code == 200: - resp = r.json() - for item in resp['overview']['items']: - if item['vo_name'] == 'atlas': - key = item['source_se'] + "#" + item['dest_se'] - if key not in result: - result[key] = {"submitted": item["submitted"] if "submitted" in item else 0, - "active": item["active"] if "active" in item else 0, - "finished": item["finished"] if "finished" in item else 0, - "failed": 
item["failed"] if "failed" in item else 0, - "transfer_speed": item["current"] if "current" in item and item["current"] != [0, 0] else 0} - else: - print "Duplicated key %s: %s" % (key, result[key]) - return result - else: - print "Failed to get fts %s transfer summary, error: %s" % (fts_host, r.text if r is not None else r) - except: - print "Failed to get fts %s transfer summary, error: %s" % (fts_host, traceback.format_exc()) - return None - - -def get_ftses_transfer_summary(): - try: - result = {} - fts_hosts = get_active_ftses() - for fts_host in fts_hosts: - fts_summary = get_fts_transfer_summary(fts_host) - if fts_summary: - result[fts_host] = fts_summary - else: - result[fts_host] = [] - return OK, result - except: - return WARNING, "Failed to get fts %s summary, error: %s" % (fts_host, traceback.format_exc()) - - -def get_fts_info(fts_summary, src_protocols, dest_protocols): - try: - for src_protocol in src_protocols: - parsed = urlparse.urlparse(src_protocol) - src_name = parsed.scheme + "://" + parsed.netloc.partition(':')[0] - for dest_protocol in dest_protocols: - parsed = urlparse.urlparse(dest_protocol) - dest_name = parsed.scheme + "://" + parsed.netloc.partition(':')[0] - key = src_name + "#" + dest_name - if key in fts_summary: - return fts_summary[key] - except: - print "Failed to get fts info: %s" % traceback.format_exc() - return None - - -def get_downtime_list(): - try: - unavailable_read_rses = list_rses(filters={'availability_read': False}) - unavailable_read_rse_ids = [r['id'] for r in unavailable_read_rses] - return OK, unavailable_read_rse_ids - except: - return WARNING, "Failed to get downtime list: %s" % traceback.format_exc() - - -def get_rse_distances(): - try: - rows = get_distances() - distances = {} - for row in rows: - src_rse_id = row['src_rse_id'] - dest_rse_id = row['dest_rse_id'] - if src_rse_id not in distances: - distances[src_rse_id] = {} - row['distance'] = row['agis_distance'] - distances[src_rse_id][dest_rse_id] = row - return OK, distances - except: - return WARNING, "Failed to get rse distances: %s" % traceback.format_exc() - - -def get_rses(sitenames): - try: - rses = list_rses() - result = [] - for rse in rses: - if rse['deleted'] or rse['staging_area']: - continue - if rse['rse'] not in sitenames: - print "Cannot find site name for rse %s" % rse['rse'] - continue - result.append(rse) - return OK, result - except: - return WARNING, "Failed to get all active rses: %s" % traceback.format_exc() - - -def get_heavy_load_rses(threshold=5000): - try: - loads = request_core.get_heavy_load_rses(threshold=threshold) - result = {} - for load in loads: - result[load['rse_id']] = load['load'] - return OK, result - except: - return WARNING, "Failed to get heavy load rses: %s" % traceback.format_exc() - - -def distance_changed(old_distance, new_distance): - # keys = ['ranking', 'agis_distance', 'geoip_distance', 'active', 'submitted', 'finished', 'failed', 'transfer_speed'] - keys = ['ranking', 'agis_distance'] - # print old_distance - # print new_distance - for key in keys: - old_value = old_distance.get(key, None) - new_value = new_distance.get(key, None) - if old_value != new_value: - return True - return False - - -def get_ranking(ranking, fts_info, threshold, speed_rank, heavy_load_rse=False): - if fts_info['submitted'] < threshold and not heavy_load_rse: - # not too many queued transfers, high rank for fast link - if fts_info['transfer_speed']: - ranking += fts_info['transfer_speed'] / speed_rank - else: - if fts_info['submitted']: - if 
fts_info['finished']: - ranking -= fts_info['submitted'] / fts_info['finished'] - else: - ranking -= fts_info['submitted'] - return int(ranking) - - -if __name__ == '__main__': - - threshold = 1000 - speed_rank = 10 # MB/s, every 10 MB/s is one rank - - retVal, result = get_agis_sitenames() - if retVal != OK: - print result - sys.exit(retVal) - sitenames = result - - retVal, top_distance, result = get_agis_distances() - if retVal != OK: - print result - sys.exit(retVal) - agis_distances = result - - retVal, result = get_downtime_list() - if retVal != OK: - print result - sys.exit(retVal) - downtime_list = result - - retVal, result = get_rse_distances() - if retVal != OK: - print result - sys.exit(retVal) - old_distances = result - - retVal, result = get_rses(sitenames) - if retVal != OK: - print result - sys.exit(retVal) - rses = result - - retVal, result = get_heavy_load_rses(threshold) - if retVal != OK: - print result - sys.exit(retVal) - heavy_load_rses = result - - retVal, result = get_ftses_transfer_summary() - if retVal != OK: - print result - sys.exit(retVal) - fts_summary = result - - top_rank = top_distance + 2 - - for src_rse in rses: - src_sitename = sitenames[src_rse['rse']]['sitename'] - src_rse_id = src_rse['id'] - - for dest_rse in rses: - dest_sitename = sitenames[dest_rse['rse']]['sitename'] - dest_rse_id = dest_rse['id'] - main_fts = sitenames[dest_rse['rse']]['main_fts'] - - fts_info = None - if main_fts and main_fts in fts_summary: - fts_info = get_fts_info(fts_summary[main_fts], sitenames[src_rse['rse']]['protocols'], sitenames[dest_rse['rse']]['protocols']) - if fts_info is None: - fts_info = {'active': None, 'failed': None, 'finished': None, 'transfer_speed': None, 'submitted': None} - - if src_sitename in agis_distances and dest_sitename in agis_distances[src_sitename]: - if agis_distances[src_sitename][dest_sitename] > -1: - distance = agis_distances[src_sitename][dest_sitename] - else: - distance = None - else: - # for site which is not in agis distance - distance = top_distance / 2 - - if src_sitename in downtime_list: - ranking = 0 - else: - if distance is None: - ranking = None - else: - ranking = top_rank - distance - - if src_rse['rse_type'] == RSEType.TAPE: - # lower down TAPE rank - ranking = 1 - - is_heavy_load_rse = True if src_rse_id in heavy_load_rses else False - new_distance = {'ranking': None if ranking is None else get_ranking(ranking, fts_info, threshold, speed_rank, heavy_load_rse=is_heavy_load_rse), - 'agis_distance': distance} - - if src_rse_id in old_distances and dest_rse_id in old_distances[src_rse_id]: - if not distance_changed(old_distances[src_rse_id][dest_rse_id], new_distance): - continue - else: - """ - print 'update src: %s, dest: %s, old_distance: %s, new_distance:%s, old_rank: %s, new_rank:%s' % (src_rse_id, - dest_rse_id, - old_distances[src_rse_id][dest_rse_id]['distance'], - new_distance['agis_distance'], - old_distances[src_rse_id][dest_rse_id]['ranking'], - new_distance['ranking']) - """ - update_distances(src_rse_id=src_rse_id, dest_rse_id=dest_rse_id, parameters=new_distance, session=None) - else: - # print 'add' - add_distance_short(src_rse_id=src_rse_id, dest_rse_id=dest_rse_id, distance=new_distance) - sys.exit(OK) diff --git a/tools/probes/common/check_ruciomover b/tools/probes/common/check_ruciomover deleted file mode 100755 index bf299b5fb8..0000000000 --- a/tools/probes/common/check_ruciomover +++ /dev/null @@ -1,300 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2017 -# 
-# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Nicolo Magini, , 2017 - -''' -Probe to check read/write with all protocols on target RSE with rucio client -''' - -import argparse -import filecmp -import os -import re -import sys -import tempfile -import time -import uuid -from commands import getstatusoutput - -from rucio.client.replicaclient import ReplicaClient -from rucio.client.rseclient import RSEClient -from rucio.common.config import config_get -from rucio.common.exception import RSENotFound, RSEProtocolNotSupported -from rucio.rse import rsemanager - - -def upload(rse, protocol, scope, lfn, no_register=False): - """ - Use the rucio upload command to stage out the file. - :param rse - destination RSE - :param protocol - transfer protocol - :param scope - file scope - :param lfn - file lfn - :param no_register - skip file registration and disable auto-deletion at lifetime expiration - """ - - if no_register: - no_reg_option = '--no-register' - else: - no_reg_option = '--lifetime 3600' - - cmd = 'rucio upload %s --rse %s --protocol %s --scope %s %s' % (no_reg_option, rse, - protocol, scope, lfn) - print 'Will use the following command for stageOut:\n%s\n' % cmd - status, out = getstatusoutput(cmd) - print 'stageOut output:\n%s\n' % out - - if status: - out = out.replace('\n', '') - raise Exception('stageOut failed -- rucio upload did not succeed: %s' % out) - - -def download(rse, protocol, scope, lfn, dst, pfn=None): - """ - Use the rucio download command to stage in the file. - :param rse: source RSE - :param protocol: transfer protocol - :param scope: file scope - :param lfn: file lfn - :param dst: local destination directory - :param pfn: file pfn (required when the downloaded file is not registered in Rucio - """ - - pfn_option = '' - if pfn: - pfn_option = '--pfn %s' % pfn - - cmd = 'rucio download --dir %s --rse %s --protocol %s %s %s:%s' % (dst, - rse, - protocol, - pfn_option, - scope, - lfn) - - print 'Will use the following command for stageIn:\n%s\n' % cmd - status, out = getstatusoutput(cmd) - - print 'stageIn output:\n%s\n' % out - - if status: - out = out.replace('\n', '') - raise Exception('stageIn failed -- rucio download did not succeed: %s' % out) - - -def main(): - ''' - Run the upload and download tests and report the result - ''' - OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - exitstatus = UNKNOWN - - parser = argparse.ArgumentParser() - parser.add_argument('RSE', help='Target RSE to test') - parser.add_argument('--no-register', '-n', action='store_true', - help='Do not register the uploaded file and disable auto-deletion') - parser.add_argument('--color', '-c', action='store_true', - help='Enable color in rucio client output') - group = parser.add_mutually_exclusive_group() - group.add_argument('--download', '-d', metavar='DID', action='store', - default='user.mlassnig:user.mlassnig.pilot.test.single.hits', - help='Specify a fixed container DID for the download test') - group.add_argument('--download-uploaded', '-du', action='store_true', - help='Use the same file in the upload and download tests') - parser.add_argument('--testmode', '-t', action='store_true', - help='Run probe in standalone test mode outside nagios env') - - try: - args = parser.parse_args() - except SystemExit: - sys.exit(UNKNOWN) - - if len(args.download.split(':')) != 2: - print "Wrong format for DID %s 
- required format is scope:name" % args.download - sys.exit(UNKNOWN) - - if not args.color: - # Disable rucio client colors - os.environ['RUCIO_LOGGING_FORMAT'] = '%(asctime)s %(levelname)s [%(message)s]' - - rse = args.RSE - if not re.match('^[A-Za-z0-9_-]+$', rse): - print 'Invalid RSE name %s' % rse - sys.exit(UNKNOWN) - - print 'Testing RSE:\n%s\n' % rse - - scope = 'tests' - - timestamp = '%d' % time.time() - fileuuid = str(uuid.uuid1()) - - reportfile = '%s/nagios-check_ruciomover.%s.%s.csv' % (os.getcwd(), rse, timestamp) - - workdir = tempfile.mkdtemp(prefix='%s.%s.' % (rse, timestamp)) - os.chdir(workdir) - - print 'Running in working directory:\n%s\n' % os.getcwd() - - if not args.testmode: - try: - proxy = config_get('nagios', 'proxy') - os.environ["X509_USER_PROXY"] = proxy - except Exception as e: - print "Failed to get proxy from rucio.cfg" - sys.exit(UNKNOWN) - - try: - rse_settings = rsemanager.get_rse_info(rse) - - cli = RSEClient() - rse_attribs = cli.list_rse_attributes(rse) - - if rse_attribs['istape'] in (True, 'True'): - print 'Skipping sitemover tests on tape RSE' - sys.exit(UNKNOWN) - - rse_protocols = rse_settings['protocols'] - - # Check if the DID selected for the download test is present on the target RSE - skip_download = False - if not args.download_uploaded: - repcli = ReplicaClient() - reps = [r for r in repcli.list_replicas([{'scope': args.download.split(':')[0], - 'name': args.download.split(':')[1]}])] - if not reps or rse not in reps[0]['rses'].keys(): - skip_download = True - - report = {} - - for prot in rse_protocols: - - # Run tests only if the RSE supports read/write for this protocol. - # Need to check for wan since the nagios probe is running remotely. - if not (prot['domains']['wan']['read'] and prot['domains']['wan']['write']): - continue - - print 'Testing protocol:\n%s\n' % prot['scheme'] - - report[prot['scheme']] = {'upload': 'UNKNOWN', 'download': 'UNKNOWN'} - - filename = '%s.nagios-check_ruciomover.%s.%s.%s.%s' % (scope, rse, prot['scheme'], - timestamp, fileuuid) - print 'Will use the following test file for upload:\n%s\n' % filename - - with open(filename, 'w') as fil: - fil.write('test file for %s with %s at time %s\n' % (rse, - prot['scheme'], timestamp)) - fil.close() - - if rse_settings['deterministic']: - pfn = rsemanager.lfns2pfns(rse_settings, {'scope': scope, 'name': filename}, - scheme=prot['scheme'])['%s:%s' % (scope, filename)] - print 'Rucio will use the following deterministic PFN for upload:\n%s\n' % pfn - else: - print 'Cannot predetermine pfn on non-deterministic storage\n' - if args.no_register: - print 'Skipping test' - os.unlink(filename) - continue - - try: - upload(rse, prot['scheme'], scope, filename, no_register=args.no_register) - report[prot['scheme']]['upload'] = 'OK' - except Exception as e: - print '\n%s\n' % e - report[prot['scheme']]['upload'] = 'CRITICAL' - if args.download_uploaded: - os.unlink(filename) - continue - - dst = 'download/%s.%s.%s' % (rse, prot['scheme'], timestamp) - - pfn_to_download = None - - if args.download_uploaded: - download_scope = scope - did_to_download = filename - destination = '%s/%s/%s' % (dst, download_scope, did_to_download) - # The PFN needs to be specified explicitly when downloading an unregistered file - if args.no_register: - pfn_to_download = pfn - else: - if skip_download: - print "Skipping download test because selected DID %s was not found on RSE %s\n" % (args.download, rse) - os.unlink(filename) - continue - download_scope = args.download.split(':')[0] - 
did_to_download = args.download.split(':')[1] - destination = '%s/%s' % (dst, did_to_download) - - try: - download(rse, prot['scheme'], download_scope, - did_to_download, dst, pfn=pfn_to_download) - report[prot['scheme']]['download'] = 'OK' - except Exception as e: - print '\n%s\n' % e - report[prot['scheme']]['download'] = 'CRITICAL' - - if args.download_uploaded: - if not filecmp.cmp(filename, destination): - print 'WARNING - Uploaded %s and downloaded %s files differ\n' % (filename, destination) - report[prot['scheme']]['download'] = 'WARNING' - else: - print 'Test successful: uploaded %s and downloaded %s files match\n' % (filename, destination) - - os.unlink(filename) - if os.path.isfile(destination): - os.unlink(destination) - os.removedirs(os.path.dirname(destination)) - elif os.path.isdir(destination): - for fil in os.listdir(destination): - os.unlink('%s/%s' % (destination, fil)) - os.removedirs(destination) - - os.rmdir(workdir) - - print 'Writing report file to:\n%s\n' % reportfile - with open(reportfile, 'w') as fil: - for proto in report: - for act in report[proto]: - fil.write('%s,%s,%s,%s,%s\n' % (rse, act, proto, - report[proto][act], timestamp)) - fil.close() - - # Calculate overall exit status per-RSE as following: - # 1) Ignore UNKNOWN test results - # 2) If all test results are CRITICAL, the RSE status is CRITICAL - # 3) If at least one test result is WARNING or CRITICAL, the RSE status is WARNING - # 4) If all test results are OK, the RSE status is OK - - statuses = [status for res in [d.values() for d in report.values()] - for status in res if status != 'UNKNOWN'] - - if not statuses: - exitstatus = UNKNOWN - elif all([status == 'CRITICAL' for status in statuses]): - exitstatus = CRITICAL - elif 'WARNING' in statuses or 'CRITICAL' in statuses: - exitstatus = WARNING - elif all([status == 'OK' for status in statuses]): - exitstatus = OK - - except RSEProtocolNotSupported as e: - print e - sys.exit(CRITICAL) - except RSENotFound as e: - print e - sys.exit(UNKNOWN) - - print 'Probe ending for RSE %s with status %s' % (rse, exitstatus) - sys.exit(exitstatus) - - -if __name__ == "__main__": - main() diff --git a/tools/probes/common/check_rule_health b/tools/probes/common/check_rule_health deleted file mode 100755 index 8d1f23bbd5..0000000000 --- a/tools/probes/common/check_rule_health +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Martin Barisits, , 2015-2017 - -''' -Probe to repair rule heatlh -''' - -import sys -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - - -def main(): - ''' - Probe to repair rule health - ''' - try: - session = get_session() - query = '''DECLARE - type array_scope is table of VARCHAR2(30) index by binary_integer; - type array_name is table of VARCHAR2(255) index by binary_integer; - scopes array_scope; - names array_name; - CURSOR get_content IS SELECT scope, name FROM ATLAS_RUCIO.CONTENTS WHERE (CASE when RULE_EVALUATION = 1 then RULE_EVALUATION ELSE NULL END)=1 and created_at < sys_extract_utc(localtimestamp)-2/24 GROUP BY scope, name; -BEGIN - OPEN get_content; - LOOP - FETCH get_content BULK COLLECT INTO scopes, names LIMIT 5000; - FOR i IN 1 .. 
scopes.count - LOOP - INSERT INTO ATLAS_RUCIO.updated_dids (id, scope, name, rule_evaluation_action, created_at, updated_at) VALUES (sys_guid(), scopes(i), names(i), 'A', sys_extract_utc(localtimestamp), sys_extract_utc(localtimestamp)); - COMMIT; - END LOOP; - EXIT WHEN get_content%NOTFOUND; - END LOOP; - CLOSE get_content; -END;''' # NOQA - session.execute(query) - except Exception: - sys.exit(UNKNOWN) - sys.exit(OK) - - -if __name__ == "__main__": - main() diff --git a/tools/probes/common/check_rules b/tools/probes/common/check_rules deleted file mode 100755 index a38e4372cd..0000000000 --- a/tools/probes/common/check_rules +++ /dev/null @@ -1,103 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Wen Guan, , 2014 - -''' -Probe to check rules. -''' - -import datetime -import sys -import time - -from rucio.core import monitor -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - - -if __name__ == "__main__": - - scope = "stresstest" - - try: - session = get_session() - utctime = datetime.datetime.utcnow() - utctimeInt = int(time.mktime(utctime.timetuple())) - timeEnd = utctimeInt - timezoneoffset = int((datetime.datetime.now() - datetime.datetime.utcnow()).seconds) - - # check rules - sql = """SELECT - CASE - WHEN state = 'R' THEN 'rules_replicating' - WHEN state = 'O' THEN 'rules_ok' - WHEN state = 'S' THEN 'rules_stuck' - WHEN state = 'U' THEN 'rules_suspend' - - ELSE state - END state_desc, num_rows FROM (select /*+ index_ffs((rules RULES_PK) */ state, count(*) num_rows - from atlas_rucio.rules where scope='tests' and account='ddmadmin' and rse_expression!='CERN-PROD-RUCIOTEST_DATADISK' group by state) - """ - - result = session.execute(sql).fetchall() - for state, num in result: - print 'rules.%s %s' % (state, num) - monitor.record_gauge(stat='%s.rules.%s' % (scope, state), value=num) - - # check stuck rules - sql = "select sum(locks_stuck_cnt) from atlas_rucio.rules where scope='tests' and account='ddmadmin' and rse_expression!='CERN-PROD-RUCIOTEST_DATADISK' and state='S' " - result = session.execute(sql).fetchone()[0] or 0 - print 'rules.sum_locks_stuck_cnt %s' % (result) - monitor.record_gauge(stat='%s.rules.sum_locks_stuck_cnt' % scope, value=result) - - # check stuck rules older than 24 hours - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=24) - sql = "select sum(locks_stuck_cnt) from atlas_rucio.rules where scope='tests' and account='ddmadmin' \ - and rse_expression!='CERN-PROD-RUCIOTEST_DATADISK' and state='S' and created_at <= to_timestamp('" + str(timeLimit) + "','YYYY-MM-dd HH24:MI:SS.FF')" - result = session.execute(sql).fetchone()[0] or 0 - print 'rules.created_24hours_ago.sum_locks_stuck_cnt %s' % (result) - monitor.record_gauge(stat='%s.rules.created_24hours_ago.sum_locks_stuck_cnt' % scope, value=result) - - # check stuck rules older than 1 week - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=168) - sql = "select sum(locks_stuck_cnt) from atlas_rucio.rules where scope='tests' and account='ddmadmin' \ - and rse_expression!='CERN-PROD-RUCIOTEST_DATADISK' and state='S' and created_at <= to_timestamp('" + str(timeLimit) + "','YYYY-MM-dd HH24:MI:SS.FF')" - result = 
session.execute(sql).fetchone()[0] or 0 - print 'rules.created_1week_ago.sum_locks_stuck_cnt %s' % (result) - monitor.record_gauge(stat='%s.rules.created_1week_ago.sum_locks_stuck_cnt' % scope, value=result) - - # check replicating rules - sql = "select sum(locks_replicating_cnt) from atlas_rucio.rules where scope='tests' and account='ddmadmin' and rse_expression!='CERN-PROD-RUCIOTEST_DATADISK' and state in ('S','R') " - result = session.execute(sql).fetchone()[0] or 0 - print 'rules.sum_locks_replicating_cnt %s' % (result) - monitor.record_gauge(stat='%s.rules.sum_locks_replicating_cnt' % scope, value=result) - - # check replicating rules older than 24 hours - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=24) - sql = "select sum(locks_replicating_cnt) from atlas_rucio.rules where scope='tests' and account='ddmadmin' \ - and rse_expression!='CERN-PROD-RUCIOTEST_DATADISK' and state in ('S','R') and created_at <= to_timestamp('" + str(timeLimit) + "','YYYY-MM-dd HH24:MI:SS.FF')" - result = session.execute(sql).fetchone()[0] or 0 - print 'rules.created_24hours_ago.sum_locks_replicating_cnt %s' % (result) - monitor.record_gauge(stat='%s.rules.created_24hours_ago.sum_locks_replicating_cnt' % scope, value=result) - - # check replicating rules older than 1 week - timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=168) - sql = "select sum(locks_replicating_cnt) from atlas_rucio.rules where scope='tests' and account='ddmadmin' \ - and rse_expression!='CERN-PROD-RUCIOTEST_DATADISK' and state in ('S','R') and created_at <= to_timestamp('" + str(timeLimit) + "','YYYY-MM-dd HH24:MI:SS.FF')" - result = session.execute(sql).fetchone()[0] or 0 - print 'rules.created_1week_ago.sum_locks_replicating_cnt %s' % (result) - monitor.record_gauge(stat='%s.rules.created_1week_ago.sum_locks_replicating_cnt' % scope, value=result) - - except: - sys.exit(UNKNOWN) - finally: - session.remove() - sys.exit(OK) diff --git a/tools/probes/common/check_set_rse_space_limits b/tools/probes/common/check_set_rse_space_limits deleted file mode 100755 index b52a241e9d..0000000000 --- a/tools/probes/common/check_set_rse_space_limits +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env python -""" - Copyright European Organization for Nuclear Research (CERN) 2014 - - Licensed under the Apache License, Version 2.0 (the "License"); - You may not use this file except in compliance with the License. - You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - - Authors: - - David Cameron, , 2014-2016 - - Mario Lassnig, , 2015 - - Cedric Serfon, , 2017 - - Sets the minimum free space on RSEs according to the policy, which is set in - the configuration table of Rucio server. A relative and absolute limit are - set for the relevant endpoints, for example: - Spacetoken Free ratio Free absolute - PRODDISK 25% 10.0 TB - - The smaller of ratio and absolute is the threshold below which to clean. - Some tokens (tape, groupdisk, localgroupdisk) are not cleaned automatically. - - The capacity of each RSE is Storage used - Rucio used of other RSEs sharing the - token. This allows RSEs to use space pledged but not used by other RSEs. The - minimum free space is evaluated based on this capacity. In the reaper Rucio - calculates the space to clean as MinFreeSpace limit - Storage free, where Storage - free is the total Storage capacity - Rucio used for this RSE. Therefore the - MinFreeSpace limit set here must include all the used space for all the other - RSEs in the token. 
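A quick worked example of the limit computation described above, using the PRODDISK numbers from the table (25% relative, 10.0 TB absolute) and a purely illustrative 400 TB of capacity remaining after subtracting the Rucio usage of the other RSEs in the token:

    # illustrative numbers only, not values read from the Rucio configuration
    capacity = 400 * 1000**4                      # 400 TB left for this RSE
    rel_limit, abs_limit_tb = 25, 10.0            # PRODDISK policy from the table above
    min_free = min(capacity * rel_limit / 100.0, abs_limit_tb * 1000**4)
    print(min_free / 1000.0**4)                   # -> 10.0 TB: the absolute limit is the smaller one here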
-""" - -import json -import sys -from urlparse import urlparse -import requests - -# Try to use server environment (direct database access). If that fails use -# client. -server = False -try: - from rucio.api import config - from rucio.api import rse as client - server = True -except: - from rucio.client import Client - from rucio.client.configclient import ConfigClient - client = Client() - config = ConfigClient() - -UNKNOWN, OK, WARNING, CRITICAL = -1, 0, 1, 2 - -# This is the limit of files to delete in each RSE in the reaper loop. To be -# decided what is the ideal value and if it should be per RSE. -max_files_to_delete = 100 - - -def toTB(size): - return size / 1000.0**4 - - -# Get endpoint info from AGIS to know the RSEs in each space token -try: - URL = 'http://atlas-agis-api.cern.ch/request/ddmendpoint/query/list/?json&state=ACTIVE&site_state=ACTIVE' - RESP = requests.get(url=URL) - DATA = json.loads(RESP.content) -except Exception as error: - print "Failed to get information from AGIS: %s" % str(error) - sys.exit(CRITICAL) - -# Map of RSE: hostname -RSE_HOST = {} -for endpoint in DATA: - host = urlparse(endpoint['se']).hostname - if host: - RSE_HOST[endpoint['name']] = host - -try: - RSES = [rse['rse'] for rse in client.list_rses()] -except Exception as error: - print "Failed to get RSEs from Rucio: %s" % str(error) - sys.exit(CRITICAL) - -# Get policy defined in config. Each section is called limitstoken -# {token: (relative limit in %, absolute limit in TB)} -policy = {} -try: - if server: - SECTIONS = config.sections('root') - for section in [s for s in SECTIONS if s.startswith('limits')]: - policy[section[6:].upper()] = (config.get(section, 'rellimit', 'root'), config.get(section, 'abslimit', 'root')) - else: - SECTIONS = config.get_config() - for section in [s for s in SECTIONS if s.startswith('limits')]: - policy[section[6:].upper()] = (SECTIONS[section]['rellimit'], SECTIONS[section]['abslimit']) - -except Exception as error: - print "Failed to get configuration information from Rucio: %s" % str(error) - sys.exit(CRITICAL) - -for rse in RSES: - - tokens = [token for token in policy if rse.endswith(token)] - if not tokens: - continue - - if len(tokens) != 1: - print "RSE %s has multiple limits defined" % rse - continue - - token = tokens[0] - - if not [r for r in DATA if r['name'] == rse]: - print "RSE %s not defined in AGIS" % rse - continue - rse_attr = client.list_rse_attributes(rse) - if 'spacetoken' in rse_attr: - spacetoken = client.list_rse_attributes(rse)['spacetoken'] - else: - print "No space token info for %s" % rse - continue - - # Client and server API are different for get_rse_usage - try: - if server: - spaceinfo = client.get_rse_usage(rse, None) - else: - spaceinfo = client.get_rse_usage(rse) - except Exception as error: - print "Could not get space information for %s: %s" % (rse, str(error)) - continue - - spaceinfo = [i for i in spaceinfo] # Generator can only be used once - - capacity = [source['total'] for source in spaceinfo if source['source'] == 'storage'] - storageused = [source['used'] for source in spaceinfo if source['source'] == 'storage'] - rucioused = [source['used'] for source in spaceinfo if source['source'] == 'rucio'] - if not capacity or not storageused or not rucioused: - print 'Missing space info for %s' % rse - continue - capacity = capacity[0] - storageused = storageused[0] - rucioused = rucioused[0] - - print "RSE %s: total capacity %sTB, Storage used %sTB, Rucio used %sTB" % (rse, toTB(capacity), toTB(storageused), toTB(rucioused)) - 
- # If this RSE shares space with others remove rucio used from total space - # to calculate the limit - used_others = 0 - for endpoint in DATA: - if endpoint['name'] != rse and (RSE_HOST[endpoint['name']] == RSE_HOST[rse] and spacetoken == endpoint['token']): - try: - if server: - used = client.get_rse_usage(endpoint['name'], None, source='rucio') - else: - used = client.get_rse_usage(endpoint['name'], filters={'source': 'rucio'}) - except Exception as error: - print "Could not get used Rucio space for %s: %s" % (endpoint['name'], str(error)) - continue - - used = [source['used'] for source in used if source['source'] == 'rucio'] - if not used: - print "No Rucio used space information for %s" % rse - continue - used = used[0] - - print "Removing %fTB used space in %s" % (toTB(used), endpoint['name']) - used_others += used - - capacity -= used_others - print "Remaining capacity for %s: %sTB" % (rse, toTB(capacity)) - - if 'MinFreeSpace' in rse_attr: - minfree = int(rse_attr['MinFreeSpace']) - print "RSE %s: Will apply forced value for minimum free space %sTB" % (rse, toTB(minfree)) - else: - minfree = min(capacity * policy[token][0] / 100.0, policy[token][1] * (1000**4)) - print "RSE %s: calculated minimum free space %sTB" % (rse, toTB(minfree)) - - try: - if server: - client.set_rse_limits(rse, 'MinFreeSpace', minfree, 'root') - client.set_rse_limits(rse, 'MaxBeingDeletedFiles', max_files_to_delete, 'root') - else: - client.set_rse_limits(rse, 'MinFreeSpace', minfree) - client.set_rse_limits(rse, 'MaxBeingDeletedFiles', max_files_to_delete) - except Exception as error: - print "Failed to set RSE limits for %s: %s" % (rse, str(error)) - continue - - print "Set MinFreeSpace for %s to %fTB" % (rse, toTB(minfree)) - -sys.exit(OK) diff --git a/tools/probes/common/check_site_status b/tools/probes/common/check_site_status deleted file mode 100755 index 76b3a60af8..0000000000 --- a/tools/probes/common/check_site_status +++ /dev/null @@ -1,170 +0,0 @@ -#!/usr/bin/env python -# Copyright 2012-2018 CERN for the benefit of the ATLAS collaboration. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -# Authors: -# - Cedric Serfon, , 2014-2019 -# -# PY3K COMPATIBLE - -from __future__ import print_function - -import json -import os -import sys - -import requests - -from rucio.common.config import config_get -from rucio.core.rse import list_rses, update_rse, get_rse_protocols -from rucio.transfertool.fts3 import FTS3Transfertool - - -UNKNOWN = 3 -CRITICAL = 2 -WARNING = 1 -OK = 0 - -DRYRUN = '-n' in sys.argv - -if __name__ == '__main__': - - STATUS = OK - try: - PROXY = config_get('nagios', 'proxy') - os.environ["X509_USER_PROXY"] = PROXY - CERT, KEY = os.environ['X509_USER_PROXY'], os.environ['X509_USER_PROXY'] - except Exception as error: - print(error) - print("Failed to get proxy from rucio.cfg") - sys.exit(CRITICAL) - - try: - FTSHOSTS = config_get('conveyor', 'ftshosts') - FTSHOSTS = FTSHOSTS.split(',') - except Exception as error: - print("Failed to get ftshosts") - STATUS = WARNING - - URL = 'http://atlas-agis-api.cern.ch/request/ddmendpointstatus/query/list/?json' - RESP = requests.get(url=URL) - DATA = json.loads(RESP.content) - MAPPING = {'r': 'availability_read', 'w': 'availability_write', 'd': 'availability_delete'} - BLACKLISTED_SITES = {'availability_read': [], 'availability_write': [], 'availability_delete': []} - DOWNTIMES = {} - for site in DATA: - for activity in DATA[site]: - if activity in ['r', 'w', 'd']: - try: - if DATA[site][activity]['status']['value'] == 'OFF': - BLACKLISTED_SITES[MAPPING[activity]].append(site) - if DATA[site][activity]['status']['provider'] == 'AGIS' and DATA[site][activity]['status']['reason'].find('https://goc.egi.eu/portal/index.php?Page_Type=Downtime') > -1: - DOWNTIMES[site] = DATA[site][activity]['status']['reason'] - except KeyError as error: - print(error) - - URL = 'http://atlas-agis-api.cern.ch/request/ddmendpoint/query/list/?json&state=ACTIVE&site_state=ACTIVE' - RESP = requests.get(url=URL) - DATA = json.loads(RESP.content) - AGIS_RSES = [] - for rse in DATA: - if (rse['name'] not in AGIS_RSES) and not (rse['state'] == 'DISABLED' or rse['site_state'] == 'DISABLED'): - AGIS_RSES.append(rse['name']) - - # Blacklisting - for activity in BLACKLISTED_SITES: - for site in list_rses({activity: True}): - if site['rse'] in BLACKLISTED_SITES[activity]: - print('%s will be blacklisted for %s' % (site['rse'], activity)) - try: - if not DRYRUN: - update_rse(site['rse'], {activity: False}) - if activity in ['availability_write']: - for rse in BLACKLISTED_SITES[activity]: - if rse in DOWNTIMES: - protocols = get_rse_protocols(rse) - for prot in protocols['protocols']: - if 'third_party_copy' in prot['domains']['wan']: - se = '%s://%s' % (prot['scheme'], prot['hostname']) - for fts_host in FTSHOSTS: - print("FTS3Transfertool(external_host='%s').set_se_status(storage_element='%s', message='%s', ban=True, timeout=60)" % (fts_host, se, DOWNTIMES[rse])) - FTS3Transfertool(external_host=fts_host).set_se_status(storage_element=se, message=DOWNTIMES[rse], ban=True, timeout=60) - except Exception as error: - print(error) - if STATUS == OK: - STATUS = WARNING - - # Whitelisting - for activity in BLACKLISTED_SITES: - for site in list_rses({activity: False}): - rse = site['rse'] - if rse not in AGIS_RSES: - print('%s does not exist in AGIS' % (rse)) - continue - if rse not in BLACKLISTED_SITES[activity]: - print('%s will be whitelisted for %s' % (rse, activity)) - try: - if not DRYRUN: - update_rse(rse, {activity: True}) - protocols = get_rse_protocols(rse) - for prot in protocols['protocols']: - if 'third_party_copy' in prot['domains']['wan']: - se = 
'%s://%s' % (prot['scheme'], prot['hostname']) - for fts_host in FTSHOSTS: - print("FTS3Transfertool(external_host='%s').set_se_status(storage_element='%s', message='%s', ban=False, timeout=60)" % (fts_host, se, DOWNTIMES[rse])) - FTS3Transfertool(external_host=fts_host).set_se_status(storage_element=se, message=DOWNTIMES[rse], ban=False, timeout=60) - except Exception as error: - print(error) - if STATUS == OK: - STATUS = WARNING - - FTS_BAN_SES = {} - for fts_host in FTSHOSTS: - try: - ban_ses = FTS3Transfertool(external_host=fts_host).list_se_status() - FTS_BAN_SES[fts_host] = ban_ses - except Exception as error: - print(error) - if STATUS == OK: - STATUS = WARNING - - SES = [] - for rse in DOWNTIMES: - protocols = get_rse_protocols(rse) - for prot in protocols['protocols']: - if 'third_party_copy' in prot['domains']['wan']: - se = '%s://%s' % (prot['scheme'], prot['hostname']) - if se not in SES: - SES.append(se) - for fts_host in FTS_BAN_SES: - if se not in [storage['se'] for storage in FTS_BAN_SES[fts_host]]: - try: - print("FTS3Transfertool(external_host='%s').set_se_status(storage_element='%s', message='%s', ban=True, timeout=60)" % (fts_host, se, DOWNTIMES[rse])) - FTS3Transfertool(external_host=fts_host).set_se_status(storage_element=se, message=DOWNTIMES[rse], ban=True, timeout=60) - except Exception as error: - print(error) - - for fts_host in FTS_BAN_SES: - ban_ses = FTS_BAN_SES[fts_host] - for banned_se in ban_ses: - if banned_se['se'] not in SES and banned_se['admin_dn'].find('ddmadmin') > -1: - try: - print('%s is banned but is not in downtime. Removing the ban' % (banned_se)) - FTS3Transfertool(external_host=fts_host).set_se_status(storage_element=banned_se['se'], message='', ban=False, timeout=60) - except Exception as error: - print(error) - if STATUS == OK: - STATUS = WARNING - - sys.exit(STATUS) diff --git a/tools/probes/common/check_srm_space b/tools/probes/common/check_srm_space deleted file mode 100755 index 0aff03beb0..0000000000 --- a/tools/probes/common/check_srm_space +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Wen Guan, , 2014 -# -import os -import sys - -from rucio.client import Client -from rucio.common.config import config_get -from rucio.rse import rsemanager as rsemgr -from rucio.api.rse import list_rse_attributes - -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - - -if __name__ == "__main__": - - rsename = sys.argv[1] - - retvalue = OK - usedsize = 0 - freesize = 0 - - # verifing, that json method is perfered - attr = list_rse_attributes(rsename) - if 'space_usage_method' in attr.keys(): - if attr['space_usage_method'] == 'json': - print "This rse(%s) is accounted by json method." 
% (rsename) - sys.exit(retvalue) - - try: - proxy = config_get('nagios', 'proxy') - os.environ["X509_USER_PROXY"] = proxy - except Exception as e: - print "Failed to get proxy from rucio.cfg" - retvalue = WARNING - - c = Client() - - rse_settings = rsemgr.get_rse_info(rsename) - for protocol in rse_settings['protocols']: - if protocol['scheme'] == "srm": - rse_settings['protocols'].remove(protocol) - protocol['impl'] = "rucio.rse.protocols.gfal.Default" - rse_settings['protocols'].append(protocol) - try: - gs, ret = rsemgr.get_space_usage(rse_settings, "srm") - if gs: - totalsize = long(ret["totalsize"]) - freesize = long(ret["unusedsize"]) - usedsize = totalsize - freesize - else: - print "Failed to get rse(%s) space information: %s" % (rsename, str(ret)) - retvalue = WARNING - except Exception as e: - print "Failed to get rse(%s) space information: %s" % (rsename, str(e)) - retvalue = WARNING - - if retvalue == OK: - print "Update RSE %s space usage (usedsize: %s, freesize: %s)" % (rsename, usedsize, freesize) - c.set_rse_usage(rsename, "srm", usedsize, freesize) - - sys.exit(retvalue) diff --git a/tools/probes/common/check_stalled_fts_connections b/tools/probes/common/check_stalled_fts_connections deleted file mode 100755 index ccb4dc50cb..0000000000 --- a/tools/probes/common/check_stalled_fts_connections +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Wen Guan, , 2015 - 2016 - -''' -Probe to check stalled fts connections. -''' - -import commands -import datetime -import os -import socket -import sys -import time -import traceback - -from rucio.core import monitor - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - - -def get_conveyor_services(): - services = {} - status, output = commands.getstatusoutput('/usr/bin/supervisorctl status') - if status == 0: - for line in output.split("\n"): - items = line.split() - name = items[0] - # name = ''.join([i for i in name if not i.isdigit()]) - pid = items[3].split(',')[0] - if name.startswith("conveyor"): - services[pid] = {'name': name, 'stalled': 0, 'cxns': []} - return services - - -def check_connections(services): - status, output = commands.getstatusoutput("/usr/sbin/lsof|/bin/grep fts") - if status == 0: - outputs = output.split("\n") - for line in outputs: - try: - items = line.split() - pid, cxn = items[1], items[8].split('->')[1] - if pid in services.keys(): - fd = items[3].replace('u', '') - file = "/proc/%s/fd/%s" % (pid, fd) - if os.path.exists(file): - mtime = os.lstat(file).st_mtime - if (time.time() - mtime) > 3600: - services[pid]['stalled'] += 1 - if cxn not in services[pid]['cxns']: - services[pid]['cxns'].append(cxn) - except: - print traceback.format_exc() - ret = {} - for pid in services: - if services[pid]['name'] not in ret: - ret[services[pid]['name']] = {'stalled': services[pid]['stalled'], 'cxns': services[pid]['cxns']} - else: - ret[services[pid]['name']]['stalled'] += services[pid]['stalled'] - for cxn in services[pid]['cxns']: - if cxn not in ret[services[pid]['name']]['cxns']: - ret[services[pid]['name']]['cxns'].append(cxn) - return ret - - -if __name__ == "__main__": - try: - hostname = socket.getfqdn() - - services = get_conveyor_services() - print services - services = 
check_connections(services) - print services - - for name in services: - stalled, cxns = services[name]['stalled'], services[name]['cxns'] - monitor.record_counter('daemons.conveyor.stalled_fts_connections.%s.%s' % (hostname.split('.')[0], name), stalled) - - fileoutput = '/tmp/connections_probes.txt.%i' % (time.time()) - g = open(fileoutput, 'w') - if stalled == 0: - g.write('%s\t%s : Connections stalled for more than 60 minutes\t%i\tNo stalled connections\n' % (hostname, name, OK)) - else: - g.write('%s\t%s : Connections stalled for more than 60 minutes\t%i\t%i stalled connections [%s]\n' % (hostname, name, CRITICAL, stalled, ','.join(cxns))) - g.close() - for i in xrange(0, 10): - s, o = commands.getstatusoutput('/usr/sbin/send_nsca rucio-nagios-prod.cern.ch -c /etc/nagios/send_nsca.cfg < %s' % (fileoutput)) - if not s: - break - print o - time.sleep(2) - os.remove(fileoutput) - - if stalled > 0: - dt = datetime.datetime.now() - if dt.hour > 8 and dt.hour < 18 and dt.isoweekday(): - pass - else: - status, output = commands.getstatusoutput("/usr/bin/supervisorctl restart %s" % name) - except: - print traceback.format_exc() - sys.exit(UNKNOWN) - sys.exit(OK) diff --git a/tools/probes/common/check_storage_space b/tools/probes/common/check_storage_space deleted file mode 100755 index 4ae7ae2819..0000000000 --- a/tools/probes/common/check_storage_space +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python -""" - Copyright European Organization for Nuclear Research (CERN) 2013 - - Licensed under the Apache License, Version 2.0 (the "License"); - You may not use this file except in compliance with the License. - You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 - - Authors: - - Wen Guan, , 2014 - - Sylvain Blunier, , 2016 - - Tomas Javurek, , 2016 - - Cedric Serfon, , 2015-2017 - - Dimitrios Christidis, , 2019 -""" - -import json -import os -import sys - -import gfal2 -import requests - -from rucio.client import Client -from rucio.common.config import config_get -from rucio.common.exception import RSEProtocolNotSupported -from rucio.rse import rsemanager as rsemgr -from rucio.api.rse import list_rse_attributes, get_rse - -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -gfal2.set_verbose(gfal2.verbose_level.normal) - - -def consume_atlas_json(data, key): - totalsize = int(data[key]['total_space']) - used = int(data[key]['used_space']) - unusedsize = totalsize - used - return totalsize, unusedsize - - -def consume_wlcg_json(data, key): - index = {share['name']: share for share in data['storageservice']['storageshares'] if 'name' in share} - totalsize = index[key]['totalsize'] - used = index[key]['usedsize'] - unusedsize = totalsize - used - return totalsize, unusedsize - - -def get_space_usage(rse_name): - """ - Get RSE space usage information. 
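consume_wlcg_json above indexes the storage shares by name and reads their totalsize/usedsize fields, so the WLCG-style usage file it expects looks roughly like the following (share names and sizes are made up for illustration; real SRR files carry more fields per share):

    data = {
        "storageservice": {
            "storageshares": [
                {"name": "ATLASDATADISK", "totalsize": 3000000000000000, "usedsize": 2500000000000000},
                {"name": "ATLASSCRATCHDISK", "totalsize": 100000000000000, "usedsize": 40000000000000},
            ]
        }
    }

    shares = {s['name']: s for s in data['storageservice']['storageshares']}
    total = shares['ATLASDATADISK']['totalsize']           # 3.0 PB
    unused = total - shares['ATLASDATADISK']['usedsize']   # 0.5 PB free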
- - :returns: a list with dict containing 'totalsize' and 'unusedsize' - - """ - dest = '/tmp/rucio-gsiftp-site-size_' + rse_name - space_usage_url = '' - # url of space usage json, woud be nicer to have it in rse_settings - agis = requests.get('http://atlas-agis-api.cern.ch/request/ddmendpoint/query/list/?json').json() - agis_token = '' - for res in agis: - if rse_name == res['name']: - agis_token = res['token'] - space_usage_url = res['space_usage_url'] - print space_usage_url - if os.path.exists(dest): - os.remove(dest) - ctx = gfal2.creat_context() - ctx.set_opt_string_list("SRM PLUGIN", "TURL_PROTOCOLS", ["gsiftp", "rfio", "gsidcap", "dcap", "kdcap"]) - params = ctx.transfer_parameters() - params.timeout = 3600 - ret = ctx.filecopy(params, str(space_usage_url), str('file://' + dest)) - if ret == 0: - data_file = open(dest) - data = json.load(data_file) - data_file.close() - if 'storageservice' in data: - return consume_wlcg_json(data, agis_token) - else: - return consume_atlas_json(data, agis_token) - - -def get_space(rsename, protocol, client): - retvalue = OK - rse_settings = rsemgr.get_rse_info(rsename) - if rse_settings['protocols']: - schemes = [domain['scheme'] for domain in rse_settings['protocols']] - if protocol == 'json': - print 'Running on %s, using json method' % (rsename) - try: - totalsize, unusedsize = get_space_usage(rsename) - freesize = long(unusedsize) - totalsize = long(totalsize) - usedsize = totalsize - freesize - except Exception as error: - print error - retvalue = WARNING - else: - if 'srm' not in schemes: - protocol = 'https' - try: - global_status, ret = rsemgr.get_space_usage(rse_settings, protocol) - if global_status: - totalsize = long(ret["totalsize"]) - freesize = long(ret["unusedsize"]) - usedsize = totalsize - freesize - else: - print "Failed to get rse(%s) space information: %s" % (rsename, str(ret)) - retvalue = WARNING - except RSEProtocolNotSupported as error: - print 'Protocol %s not found on %s' % (protocol, rsename) - retvalue = WARNING - except Exception as error: - print "Failed to get rse(%s) space information: %s" % (rsename, str(error)) - retvalue = WARNING - - if retvalue == OK: - print "Update RSE %s space usage (usedsize: %s, freesize: %s)" % (rsename, usedsize, freesize) - client.set_rse_usage(rsename, protocol, usedsize, freesize) - client.set_rse_usage(rsename, 'storage', usedsize, freesize) - return retvalue - return WARNING - - -if __name__ == "__main__": - - CLOUD = sys.argv[1] - - exitvalue = OK - retvalue = OK - usedsize = 0 - freesize = 0 - try: - PROXY = config_get('nagios', 'proxy') - os.environ["X509_USER_PROXY"] = PROXY - except Exception as error: - print "Failed to get proxy from rucio.cfg" - - CLIENT = Client() - RSES = CLIENT.list_rses('cloud=%s' % CLOUD) - for rse in sorted(RSES): - rsename = rse['rse'] - print rsename - rse_info = get_rse(rsename) - if rse_info['availability_read']: - attr = list_rse_attributes(rsename) - if 'space_usage_method' in attr.keys(): - retvalue = get_space(rsename, protocol=attr['space_usage_method'], client=CLIENT) - else: - retvalue = get_space(rsename, protocol='srm', client=CLIENT) - else: - print '%s blacklisted for read. 
Skipping storage space collection' % rsename - - exitvalue = max(exitvalue, retvalue) - sys.exit(exitvalue) diff --git a/tools/probes/common/check_stresstest_dids_requests_replicas_daily b/tools/probes/common/check_stresstest_dids_requests_replicas_daily deleted file mode 100755 index 8505719614..0000000000 --- a/tools/probes/common/check_stresstest_dids_requests_replicas_daily +++ /dev/null @@ -1,134 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Wen Guan, , 2014 - -''' -Probe to check dids, requests and replicas by daily. -''' - -import datetime -import sys -import time - -from rucio.core import monitor -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - - -if __name__ == "__main__": - - scope = "stresstest" - - try: - session = get_session() - utctime = datetime.datetime.utcnow() - utctimeInt = int(time.mktime(utctime.timetuple())) - timeStart = utctimeInt - 3600 * 24 - timeEnd = timeStart + 3600 * 24 - timezoneoffset = int((datetime.datetime.now() - datetime.datetime.utcnow()).seconds) - - # created datasets daily - sql = "select /*+ index_ffs(dids DIDS_PK) */ count(1) from atlas_rucio.dids where scope='tests' and did_type='D' \ - and account='ddmusr01' and project='step14' and created_at>= to_timestamp('" + str(datetime.datetime.fromtimestamp(timeStart)) + "','YYYY-MM-dd HH24:MI:SS') \ - and created_at , 2014 - -''' -Probe to check dids, requests and replicas by hourly. -''' - -import datetime -import sys -import time - -from rucio.core import monitor -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - - -if __name__ == "__main__": - - scope = "stresstest" - - try: - lines = [] - session = get_session() - utctime = datetime.datetime.utcnow() - utctimeInt = int(time.mktime(utctime.timetuple())) - timeStart = utctimeInt / 3600 * 3600 - 3600 - timeEnd = timeStart + 3600 - timezoneoffset = int((datetime.datetime.now() - datetime.datetime.utcnow()).seconds) - - # created datasets hourly - sql = "select /*+ index_ffs(dids DIDS_PK) */ count(1) from atlas_rucio.dids where scope='tests' and did_type='D' \ - and account='ddmusr01' and project='step14' and created_at>= to_timestamp('" + str(datetime.datetime.fromtimestamp(timeStart)) + "','YYYY-MM-dd HH24:MI:SS') \ - and created_at , 2014 - -''' -Probe to check replicas replicating time by hourly. 
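The stress-test probes above all select rows from a fixed window ending at the current hour or day; the hourly variant aligns the window to the previous complete hour by integer arithmetic on epoch seconds. A datetime-based sketch of the intended window (equivalent in effect, assuming whole-hour timezone offsets):

    from datetime import datetime, timedelta

    def previous_full_hour_window(now=None):
        """Return (start, end) of the last complete UTC hour as naive datetimes."""
        now = now or datetime.utcnow()
        end = now.replace(minute=0, second=0, microsecond=0)  # top of the current hour
        start = end - timedelta(hours=1)
        return start, end

    # At 14:23 UTC this yields (13:00, 14:00), matching
    # timeStart = utctimeInt / 3600 * 3600 - 3600 and timeEnd = timeStart + 3600.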
-''' - -import datetime -import sys -import time - -from rucio.core import monitor -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -if __name__ == "__main__": - - scope = "stresstest" - - try: - session = get_session() - utctime = datetime.datetime.utcnow() - utctimeInt = int(time.mktime(utctime.timetuple())) - timeStart = utctimeInt - 3600 - timeEnd = timeStart + 3600 - timezoneoffset = int((datetime.datetime.now() - datetime.datetime.utcnow()).seconds) - - # replicas time hourly - sql = "select t2.rse, t1.created_at, t1.timediff from (select rse_id, created_at, (cast(updated_at as timestamp)- cast(created_at as timestamp)) timediff \ - from atlas_rucio.replicas where scope='tests' and state='A' and \ - updated_at>= to_timestamp('" + str(datetime.datetime.fromtimestamp(timeStart)) + "','YYYY-MM-dd HH24:MI:SS') \ - and updated_at , 2014 - -''' -Probe to check stress test replicating time statistics every 2 hours. -''' - -import datetime -import sys -import time - -from rucio.core import monitor -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - - -if __name__ == "__main__": - - scope = "stresstest" - - try: - session = get_session() - utctime = datetime.datetime.utcnow() - utctimeInt = int(time.mktime(utctime.timetuple())) - timeStart = utctimeInt - 3600 * 2 - timeEnd = timeStart + 3600 * 2 - timezoneoffset = int((datetime.datetime.now() - datetime.datetime.utcnow()).seconds) - - # replicas time hourly - sql = "select t2.rse, t1.timediff from (select rse_id, (cast(updated_at as timestamp)- cast(created_at as timestamp)) timediff \ - from atlas_rucio.replicas where scope='tests' \ - and state='A' and updated_at>= to_timestamp('" + str(datetime.datetime.fromtimestamp(timeStart)) + "','YYYY-MM-dd HH24:MI:SS') \ - and updated_at , 2014 - -''' -Probe to check stress test replicating time statistics daily. -''' - -import datetime -import sys -import time - -from rucio.core import monitor -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - - -if __name__ == "__main__": - - scope = "stresstest" - - try: - session = get_session() - utctime = datetime.datetime.utcnow() - utctimeInt = int(time.mktime(utctime.timetuple())) - timeStart = utctimeInt - 3600 * 24 - timeEnd = timeStart + 3600 * 24 - timezoneoffset = int((datetime.datetime.now() - datetime.datetime.utcnow()).seconds) - - # replicas time hourly - sql = "select t2.rse, t1.timediff from (select rse_id, (cast(updated_at as timestamp)- cast(created_at as timestamp)) timediff \ - from atlas_rucio.replicas where scope='tests' \ - and state='A' and updated_at>= to_timestamp('" + str(datetime.datetime.fromtimestamp(timeStart)) + "','YYYY-MM-dd HH24:MI:SS') \ - and updated_at , 2014 - -''' -Probe to check the backlog of stuck rules. 
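The replicating-time probes select, per RSE, the interval between a replica's creation and the update that marked it AVAILABLE; the part of the diff that consumes those rows is truncated here. One plausible way to reduce such rows to a per-RSE gauge (the aggregation and the metric name below are assumptions for illustration, not taken from the probe):

    from rucio.core import monitor

    def report_replication_latency(rows):
        """rows: iterable of (rse, timediff) pairs, timediff a datetime.timedelta."""
        per_rse = {}
        for rse, timediff in rows:
            per_rse.setdefault(rse, []).append(timediff.total_seconds())
        for rse, seconds in per_rse.items():
            monitor.record_gauge(stat='stresstest.replication_time.%s.avg' % rse,
                                 value=sum(seconds) / len(seconds))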
-''' -import sys -import traceback - -from rucio.core import monitor -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -if __name__ == "__main__": - try: - session = get_session() - - result = session.execute('SELECT COUNT(1) FROM ATLAS_RUCIO.RULES where state=\'S\' and (error !=\'MissingSourceReplica\' or error IS NULL)').fetchone()[0] - monitor.record_gauge(stat='judge.stuck_rules_without_missing_source_replica', value=result) - - result = session.execute('SELECT COUNT(1) FROM ATLAS_RUCIO.RULES where state=\'S\' and error =\'MissingSourceReplica\'').fetchone()[0] - monitor.record_gauge(stat='judge.stuck_rules_with_missing_source_replica', value=result) - except: - print traceback.format_exc() - sys.exit(UNKNOWN) - sys.exit(OK) diff --git a/tools/probes/common/check_suspicious_logs b/tools/probes/common/check_suspicious_logs deleted file mode 100644 index 035ff60883..0000000000 --- a/tools/probes/common/check_suspicious_logs +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python -# Copyright 2012-2018 CERN for the benefit of the ATLAS collaboration. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Authors: -# - Cedric Serfon , 2019 -# -# PY3K COMPATIBLE - -from __future__ import print_function - -import sys - -from datetime import datetime, timedelta -from rucio.core.replica import get_suspicious_files, list_replicas, add_bad_pfns -from rucio.db.sqla.constants import BadPFNStatus - - -UNKNOWN = 3 -CRITICAL = 2 -WARNING = 1 -OK = 0 - -if __name__ == '__main__': - - try: - CNT_THRESHOLD = sys.argv[1] - except IndexError: - print('No threshold value defined for CNT_THRESHOLD, will use the default one : 10') - CNT_THRESHOLD = 10 - try: - NB_DAYS = sys.argv[2] - except IndexError: - print('No threshold value defined for NB_DAYS, will use the default one : 3') - NB_DAYS = 3 - - BAD_PFNS = [] - - try: - for bad_rep in get_suspicious_files(rse_expression='type=DATADISK', younger_than=datetime.now() - timedelta(days=NB_DAYS), nattempts=CNT_THRESHOLD, session=None): - scope, name, rse, cnt = bad_rep['scope'], bad_rep['name'], bad_rep['rse'], bad_rep['cnt'] - if bad_rep['name'].startswith('log'): - print('%s:%s declared %s times suspicious on %s' % (scope, name, cnt, rse)) - for rep in list_replicas([{'scope': scope, 'name': name}], rse_expression=rse): - pfn = rep['rses'][rse][0] - BAD_PFNS.append(pfn) - add_bad_pfns(pfns=BAD_PFNS, account='root', state=BadPFNStatus.BAD, reason='Lost log files', expires_at=None, session=None) - except Exception as error: - print(error) - sys.exit(CRITICAL) - sys.exit(OK) diff --git a/tools/probes/common/check_sync_rses_with_agis b/tools/probes/common/check_sync_rses_with_agis deleted file mode 100755 index 92e7f5a2a5..0000000000 --- a/tools/probes/common/check_sync_rses_with_agis +++ /dev/null @@ -1,267 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in 
compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Vincent Garonne, , 2013-2014 -# - Mario Lassnig, , 2014 -# - Cedric Serfon, , 2014-2018 -# - David Cameron, , 2015 - -import json -import os.path -import sys -import traceback -import urlparse - -import requests - -from rucio.core.account import list_accounts, list_account_attributes -from rucio.core.account_limit import set_account_limit -from rucio.core.rse import get_rse_protocols, add_protocol, add_rse, update_protocols -from rucio.common.exception import Duplicate, RSEProtocolPriorityError, RSEProtocolNotSupported, RSENotFound - - -UNKNOWN = 3 -CRITICAL = 2 -WARNING = 1 -OK = 0 - -if __name__ == '__main__': - - URL = 'http://atlas-agis-api.cern.ch/request/ddmendpoint/query/list/?json&state=ACTIVE&site_state=ACTIVE' - RESP = requests.get(url=URL) - DATA = json.loads(RESP.content) - RETVALUE = OK - - ACC_ATTRS = [(a['account'], list_account_attributes(a['account'])) for a in list_accounts()] - ADMIN_ACCOUNTS = [x[0] for x in ACC_ATTRS if {'key': 'admin', 'value': True} in x[1]] - for rse in DATA: - print rse['name'] - deterministic = False - try: - deterministic = rse['is_deterministic'] - volatile = False - add_rse(rse=rse['name'], deterministic=deterministic, volatile=volatile) - # Set infinite quotas for admin accounts - for account in ADMIN_ACCOUNTS: - set_account_limit(account, rse['name'], -1) - if not rse['is_tape']: - set_account_limit('sonar', rse['name'], 10000000000000) - except Duplicate as error: - pass - except Exception: - RETVALUE = CRITICAL - errno, errstr = sys.exc_info()[:2] - trcbck = traceback.format_exc() - print 'Interrupted processing with %s %s %s.' % (errno, errstr, trcbck) - - prefix = rse['endpoint'] - space_token = rse['token'] - - existing_protocols = [] - try: - rucio_protocols = get_rse_protocols(rse['name'], None) - for prot in rucio_protocols['protocols']: - existing_protocols.append((prot['scheme'], prot['hostname'], prot['port'])) - - except RSENotFound as error: - print error - continue - if rse['type'] in ['OS_ES', 'OS_LOGS']: - print 'This is a Object store endpoint. 
Skipping the protocols' - priority = {} - for activity in rse['arprotocols']: - index = 0 - if activity in ['read_lan', 'read_wan', 'write_lan', 'write_wan', 'delete_lan', 'delete_wan', 'third_party_copy']: - for protocol in rse['arprotocols'][activity]: - index += 1 - path = protocol['path'] - o = urlparse.urlparse(protocol['endpoint']) - if (o.scheme, o.netloc, path) not in priority: - priority[(o.scheme, o.netloc, path)] = {'read_lan': 0, 'read_wan': 0, 'write_lan': 0, 'write_wan': 0, 'delete_lan': 0, 'delete_wan': 0, 'third_party_copy': 0} - priority[(o.scheme, o.netloc, path)][activity] = index - for prio in priority: - scheme, host_with_port, prefix = prio - if not prefix.endswith('/'): - prefix += '/' - port = 443 - hostname = host_with_port - if host_with_port.find(':') > -1: - hostname, port = host_with_port.split(':') - impl = None - if scheme == 's3': - impl = 'rucio.rse.protocols.s3boto.Default' - elif scheme == 's3+rucio': - if rse['type'] == 'OS_ES': - impl = 'rucio.rse.protocols.ses3.Default' - else: - impl = 'rucio.rse.protocols.signeds3.Default' - params = {'hostname': hostname, - 'scheme': scheme, - 'port': port, - 'prefix': prefix, - 'impl': impl, - 'domains': {"lan": {"read": priority[prio]['read_lan'], - "write": priority[prio]['write_lan'], - "delete": priority[prio]['delete_lan']}, - "wan": {"read": priority[prio]['read_wan'], - "write": priority[prio]['write_wan'], - "delete": priority[prio]['delete_wan'], - "third_party_copy": priority[prio]['third_party_copy']}}} - print params - if impl: - try: - add_protocol(rse=rse['name'], parameter=params) - except Duplicate as error: - print error - else: - print 'No implementation defined for %s on RSE %s' % (scheme, rse['name']) - RETVALUE = CRITICAL - - else: - prot_read = [] - prot_write = [] - prot_delete = [] - priority = {} - for activity in rse['aprotocols']: - index = 0 - if activity in ['read_lan', 'read_wan', 'write_lan', 'write_wan', 'delete_lan', 'delete_wan', 'third_party_copy']: - for protocol, agis_prio, agis_prefix in rse['aprotocols'][activity]: - index += 1 - o = urlparse.urlparse(protocol) - if o.scheme not in ('https', 'http', 'srm', 'gsiftp', 'root', 'davs', 'dav'): - continue - if (o.scheme, o.netloc) not in priority: - priority[(o.scheme, o.netloc)] = {'read_lan': 0, 'read_wan': 0, 'write_lan': 0, 'write_wan': 0, 'delete_lan': 0, 'delete_wan': 0, 'third_party_copy': 0} - priority[(o.scheme, o.netloc)][activity] = index - if sum([act['read_lan'] for act in priority.values()]) == 0: - for key in priority: - priority[key]['read_lan'] = priority[key]['read_wan'] - if sum([act['write_lan'] for act in priority.values()]) == 0: - for key in priority: - priority[key]['write_lan'] = priority[key]['write_wan'] - if sum([act['delete_lan'] for act in priority.values()]) == 0: - for key in priority: - priority[key]['delete_lan'] = priority[key]['delete_wan'] - - for protocol in rse['protocols']: - try: - o = urlparse.urlparse(protocol) - if o.scheme not in ('https', 'http', 'srm', 'gsiftp', 'root', 'davs', 'dav'): - continue - - protocols = rse['protocols'][protocol] - - extended_attributes = None - if o.scheme == 'srm': - extended_attributes = {"web_service_path": o.path + '?SFN=', "space_token": space_token} - impl = 'rucio.rse.protocols.gfal.Default' - elif o.scheme in ('davs', 'dav'): - extended_attributes = None - if rse['is_mkdir'] is True: - impl = 'rucio.rse.protocols.gfalv2.Default' - else: - impl = 'rucio.rse.protocols.gfal.Default' - - elif o.scheme in ('https', 'http'): - extended_attributes = None - 
impl = 'rucio.rse.protocols.gfalv2.Default' - elif o.scheme == 'gsiftp': - extended_attributes = None - impl = 'rucio.rse.protocols.gfal.Default' - elif o.scheme == 'root': - extended_attributes = None - impl = 'rucio.rse.protocols.gfal.Default' - else: - continue - - port = 8443 - netloc = o.netloc - if o.port and str(o.port) in o.netloc: - netloc = o.netloc[:-len(':' + str(o.port))] - port = o.port - else: - if o.scheme in ('https', 'davs'): - port = 443 - elif o.scheme == 'gsiftp': - port = 2811 - elif o.scheme == 'root': - port = 1094 - - # For disk end-points not for tape - prefix = rse['protocols'][protocol][0][2] - if not rse['is_tape'] and deterministic and not prefix.endswith('/rucio') and not prefix.endswith('/rucio/'): - prefix = os.path.join(prefix, 'rucio/') - - params = {'hostname': netloc, - 'scheme': o.scheme, - 'port': port, - 'prefix': prefix, - 'impl': impl, - 'extended_attributes': extended_attributes, - 'domains': {"lan": {"read": priority[(o.scheme, o.netloc)]['read_lan'], - "write": priority[(o.scheme, o.netloc)]['write_lan'], - "delete": priority[(o.scheme, o.netloc)]['delete_lan']}, - "wan": {"read": priority[(o.scheme, o.netloc)]['read_wan'], - "write": priority[(o.scheme, o.netloc)]['write_wan'], - "delete": priority[(o.scheme, o.netloc)]['delete_wan'], - "third_party_copy": priority[(o.scheme, o.netloc)]['third_party_copy']}}} - - rucio_protocol = None - for prot in rucio_protocols['protocols']: - if prot['scheme'] == o.scheme and prot['hostname'] == netloc and prot['port'] == port: - rucio_protocol = prot - try: - existing_protocols.remove((o.scheme, netloc, port)) - except ValueError: - pass - break - if params != rucio_protocol: - if rucio_protocol: - try: - for domain in ['lan', 'wan']: - for act in ['read', 'write', 'delete']: - if rucio_protocol['domains'][domain][act] != priority[(o.scheme, o.netloc)]['%s_%s' % (act, domain)]: - print '%s : Protocol %s Activity %s_%s : priority in Rucio %s != priority in AGIS %s' % (rse['name'], o.scheme, act, domain, - rucio_protocol['domains'][domain][act], - priority[(o.scheme, o.netloc)]['%s_%s' % (act, domain)]) - update_protocols(rse['name'], o.scheme, {'domains': {domain: {act: priority[(o.scheme, o.netloc)]['%s_%s' % (act, domain)]}}}, hostname=netloc, port=port) - if rucio_protocol['domains']['wan']['third_party_copy'] != priority[(o.scheme, o.netloc)]['third_party_copy']: - print '%s : Protocol %s Activity %s : priority in Rucio %s != priority in AGIS %s' % (rse['name'], o.scheme, 'third_party_copy', - rucio_protocol['domains']['wan']['third_party_copy'], - priority[(o.scheme, o.netloc)]['third_party_copy']) - update_protocols(rse['name'], o.scheme, {'domains': {'wan': {'third_party_copy': priority[(o.scheme, o.netloc)]['third_party_copy']}}}, hostname=netloc, port=port) - - except RSEProtocolNotSupported as error: - print error - else: - print 'Will create protocol %s at %s with priorities read_lan,write_lan,delete_lan,read_wan,write_wan,delete_wan,third_party_copy : ' \ - '%s,%s,%s,%s,%s,%s,%s' % (o.scheme, rse['name'], - params['domains']['lan']['read'], params['domains']['lan']['write'], params['domains']['lan']['delete'], - params['domains']['wan']['read'], params['domains']['wan']['write'], params['domains']['wan']['delete'], - params['domains']['wan']['third_party_copy']) - try: - add_protocol(rse=rse['name'], parameter=params) - except Exception as error: - print error - except Duplicate as error: - pass - except RSEProtocolPriorityError as error: - print 'RSE %s protocol %s: %s' % (rse['name'], 
o.scheme, error) - if RETVALUE != CRITICAL: - RETVALUE = WARNING - except Exception: - RETVALUE = CRITICAL - errno, errstr = sys.exc_info()[:2] - trcbck = traceback.format_exc() - print 'RSE %s protocol %s : Interrupted processing with %s %s %s.' % (rse['name'], o.scheme, errno, errstr, trcbck) - if existing_protocols: - RETVALUE = WARNING - for scheme, hostname, port in existing_protocols: - print 'WARNING : Protocol %s://%s:%s is defined in Rucio but not in AGIS on RSE %s !!!!' % (scheme, hostname, port, rse['name']) - sys.exit(RETVALUE) diff --git a/tools/probes/common/check_transfer_queues_status b/tools/probes/common/check_transfer_queues_status deleted file mode 100755 index 1d04b16432..0000000000 --- a/tools/probes/common/check_transfer_queues_status +++ /dev/null @@ -1,51 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Mario Lassnig, , 2013-2015 -# - Cedric Serfon, , 2014 -# - Wen Guan, , 2015 - -''' -Probe to check the queues of the transfer service -''' - -import sys - -from rucio.core import monitor -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -active_queue = """SELECT -CASE - WHEN state = 'S' THEN 'queues.requests.submitted.' || replace(activity, ' ', '_') || '.' || nvl(replace(substr(external_host, 9, length(external_host)-13), '.', '_'), 'no_fts_host') - WHEN state = 'Q' THEN 'queues.requests.queued.' || replace(activity, ' ', '_') || '.' || nvl(replace(substr(external_host, 9, length(external_host)-13), '.', '_'), 'no_fts_host') - WHEN state = 'F' THEN 'queues.requests.failed.' || replace(activity, ' ', '_') || '.' || nvl(replace(substr(external_host, 9, length(external_host)-13), '.', '_'), 'no_fts_host') - WHEN state = 'D' THEN 'queues.requests.done.' || replace(activity, ' ', '_') || '.' || nvl(replace(substr(external_host, 9, length(external_host)-13), '.', '_'), 'no_fts_host') - WHEN state = 'L' THEN 'queues.requests.lost.' || replace(activity, ' ', '_') || '.' || nvl(replace(substr(external_host, 9, length(external_host)-13), '.', '_'), 'no_fts_host') - WHEN state = 'W' THEN 'queues.requests.waiting.' || replace(activity, ' ', '_') || '.' || nvl(replace(substr(external_host, 9, length(external_host)-13), '.', '_'), 'null_fts_host') - ELSE state -END state_desc, -num_rows -FROM -( -select state, count(*) num_rows, activity, external_host -FROM atlas_rucio.requests -GROUP BY state, activity, external_host -)""" - -if __name__ == "__main__": - try: - session = get_session() - for k in session.execute(active_queue).fetchall(): - print k[0], k[1], - monitor.record_gauge(stat=k[0], value=k[1]) - except: - sys.exit(UNKNOWN) - sys.exit(OK) diff --git a/tools/probes/common/check_unevaluated_dids b/tools/probes/common/check_unevaluated_dids deleted file mode 100755 index 9bb75232ac..0000000000 --- a/tools/probes/common/check_unevaluated_dids +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. 
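The SQL in check_transfer_queues_status folds each request state into a Graphite metric name of the form queues.requests.<state>.<activity>.<fts_host>. The same mapping expressed in Python (the helper is only illustrative; the probe does this entirely in SQL, and the host format shown in the comment is an assumption):

    STATE_NAMES = {'S': 'submitted', 'Q': 'queued', 'F': 'failed',
                   'D': 'done', 'L': 'lost', 'W': 'waiting'}

    def queue_metric(state, activity, external_host):
        """Build the Graphite path for one (state, activity, external_host) group."""
        # e.g. 'https://fts3-host.cern.ch:8446' -> 'fts3-host_cern_ch'
        host = external_host[8:-5].replace('.', '_') if external_host else 'no_fts_host'
        return 'queues.requests.%s.%s.%s' % (STATE_NAMES.get(state, state),
                                             activity.replace(' ', '_'), host)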
-# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Vincent Garonne, , 2013 - -''' -Probe to check the backlog of dids waiting for rule evaluation. -''' - -import sys - -from rucio.core import monitor -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -if __name__ == "__main__": - try: - session = get_session() - result = session.execute('SELECT COUNT(*) FROM ATLAS_RUCIO.updated_dids').fetchone()[0] - monitor.record_gauge(stat='judge.waiting_dids', value=result) - print result - except: - sys.exit(UNKNOWN) - sys.exit(OK) diff --git a/tools/probes/common/check_unlocked_replicas b/tools/probes/common/check_unlocked_replicas deleted file mode 100755 index 238916c933..0000000000 --- a/tools/probes/common/check_unlocked_replicas +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Vincent Garonne, , 2013 - -''' -Probe to check the backlog of unlocked replicas. -''' - -import sys - -from rucio.core import monitor -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -# select (select rse from "ATLAS_RUCIO".rses where id = rse_id), -# n -# from -# (SELECT /*+ index_FFS(replicas REPLICAS_TOMBSTONE_IDX) */ -# CASE WHEN ("ATLAS_RUCIO".replicas.tombstone IS NOT NULL) THEN "ATLAS_RUCIO".replicas.rse_id END as rse_id, -# count(*) as n -# FROM "ATLAS_RUCIO".replicas -# WHERE "ATLAS_RUCIO".replicas.tombstone is not null -# GROUP BY CASE WHEN ("ATLAS_RUCIO".replicas.tombstone IS NOT NULL) THEN "ATLAS_RUCIO".replicas.rse_id END) - -if __name__ == "__main__": - try: - session = get_session() - result = session.execute('select /*+ index_ffs(replicas REPLICAS_TOMBSTONE_IDX) */ count(1) from atlas_rucio.replicas where tombstone is not null').fetchone()[0] - monitor.record_gauge(stat='reaper.unlocked_replicas', value=result) - print result - result = session.execute('select /*+ index_ffs(replicas REPLICAS_TOMBSTONE_IDX) */ count(1) from atlas_rucio.replicas where tombstone is not null and tombstone < sysdate - 2/24').fetchone()[0] - monitor.record_gauge(stat='reaper.expired_replicas', value=result) - except: - sys.exit(UNKNOWN) - sys.exit(OK) diff --git a/tools/probes/common/check_updated_account_counters b/tools/probes/common/check_updated_account_counters deleted file mode 100755 index 70125925fa..0000000000 --- a/tools/probes/common/check_updated_account_counters +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Martin Barisits, , 2014 - -''' -Probe to check the backlog of updated account counters. 
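check_unevaluated_dids, check_unlocked_replicas and the counter probes that follow all share the same shape: run one COUNT query, push the result as a gauge, and exit UNKNOWN on any error. A condensed sketch of that pattern, assuming the same monitor and session helpers the probes import:

    import sys

    from rucio.core import monitor
    from rucio.db.sqla.session import get_session

    OK, UNKNOWN = 0, 3

    def count_probe(query, stat):
        """Run a single COUNT query and report the result as a Graphite gauge."""
        try:
            session = get_session()
            value = session.execute(query).fetchone()[0]
            monitor.record_gauge(stat=stat, value=value)
            print(value)
        except Exception:
            sys.exit(UNKNOWN)
        sys.exit(OK)

    # count_probe('select count(1) from atlas_rucio.updated_dids', 'judge.updated_dids')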
-''' -import sys -import traceback - -from rucio.core import monitor -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -if __name__ == "__main__": - try: - session = get_session() - result = session.execute('select count(1) from atlas_rucio.updated_account_counters').fetchone()[0] - monitor.record_gauge(stat='abbacus.updated_account_counters', value=result) - except: - print traceback.format_exc() - sys.exit(UNKNOWN) - sys.exit(OK) diff --git a/tools/probes/common/check_updated_dids b/tools/probes/common/check_updated_dids deleted file mode 100755 index 289b78b0d4..0000000000 --- a/tools/probes/common/check_updated_dids +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Vincent Garonne, , 2013 - -''' -Probe to check the backlog of updated dids. -''' -import sys -import traceback - -from rucio.core import monitor -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -if __name__ == "__main__": - try: - session = get_session() - result = session.execute('select count(1) from atlas_rucio.updated_dids').fetchone()[0] - monitor.record_gauge(stat='judge.updated_dids', value=result) - # created_at, count, max, min, avg, stdev = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 - # result = session.execute('select * from atlas_rucio.concurency_stats where created_at > sysdate - 1/1440') - # for row in result: - # created_at, count, max, min, avg, stdev = row - # monitor.record_gauge(stat='judge.updated_dids_per_min.count', value=count or 0) - # monitor.record_gauge(stat='judge.updated_dids_per_min.max', value=max or 0) - # monitor.record_gauge(stat='judge.updated_dids_per_min.min', value=min or 0) - # monitor.record_gauge(stat='judge.updated_dids_per_min.avg', value=avg or 0) - # monitor.record_gauge(stat='judge.updated_dids_per_min.stdev', value=stdev or 0) - # print created_at, count, max, min, avg, stdev - except: - print traceback.format_exc() - sys.exit(UNKNOWN) - sys.exit(OK) diff --git a/tools/probes/common/check_updated_rse_counters b/tools/probes/common/check_updated_rse_counters deleted file mode 100755 index de0878a724..0000000000 --- a/tools/probes/common/check_updated_rse_counters +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Martin Barisits, , 2014 - -''' -Probe to check the backlog of updated rse counters. 
-''' -import sys -import traceback - -from rucio.core import monitor -from rucio.db.sqla.session import get_session - -# Exit statuses -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -if __name__ == "__main__": - try: - session = get_session() - result = session.execute('select count(1) from atlas_rucio.updated_rse_counters').fetchone()[0] - monitor.record_gauge(stat='abbacus.updated_rse_counters', value=result) - except: - print traceback.format_exc() - sys.exit(UNKNOWN) - sys.exit(OK) diff --git a/tools/probes/common/check_used_space b/tools/probes/common/check_used_space deleted file mode 100755 index 843e4e6401..0000000000 --- a/tools/probes/common/check_used_space +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Cedric Serfon, , 2014 -# -import datetime -import sys -from rucio.client import Client - -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 -host = sys.argv[1] -c = Client() - -for dict in c.get_rse_usage(host): - if dict['source'] == 'rucio': - if datetime.datetime.utcnow() - dict['updated_at'] < datetime.timedelta(hours=1): - print 'Used space :', dict['used'] - sys.exit(OK) - else: - print 'Used space has not been refreshed for more than 1 hour. Last refresh on %s' % str(dict['updated_at']) - sys.exit(WARNING) diff --git a/tools/probes/common/check_voms b/tools/probes/common/check_voms deleted file mode 100755 index 717d990f73..0000000000 --- a/tools/probes/common/check_voms +++ /dev/null @@ -1,238 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. 
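check_used_space above only treats the 'rucio' usage record as healthy when it was refreshed within the last hour. The freshness test in isolation (same logic, written stand-alone for clarity):

    import datetime

    def is_fresh(updated_at, max_age=datetime.timedelta(hours=1)):
        """True if the usage record was refreshed within max_age (UTC timestamps)."""
        return datetime.datetime.utcnow() - updated_at < max_age

    # usage = next(u for u in c.get_rse_usage(host) if u['source'] == 'rucio')
    # sys.exit(OK if is_fresh(usage['updated_at']) else WARNING)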
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Vincent Garonne, , 2012-2013 -# - Cedric Serfon, , 2014-2018 -# - David Cameron, , 2015 -# - Dimitrios Christidis, , 2019 -# -# PY3K COMPATIBLE - -from __future__ import print_function - -import os -import sys -import time - -import ldap # pylint: disable=import-error - -from rucio.db.sqla.session import get_session -from rucio.client import Client -from rucio.common.config import config_get -from rucio.common.exception import Duplicate - -from VOMSAdmin.VOMSCommands import VOMSAdminProxy # pylint: disable=import-error - - -UNKNOWN = 3 -CRITICAL = 2 -WARNING = 1 -OK = 0 - -LDAP_HOSTS = ['ldaps://xldap.cern.ch'] -LDAP_OPTIONS = [ - # default configuration comes from /etc/openldap/ldap.conf - # (ldap.OPT_X_TLS_CACERTDIR, '/etc/pki/tls/certs'), - # (ldap.OPT_X_TLS_CACERTFILE, '/etc/pki/tls/certs/ca-bundle.crt'), - # (ldap.OPT_X_TLS_REQUIRE_CERT, ldap.OPT_X_TLS_NEVER), - (ldap.OPT_X_TLS_REQUIRE_CERT, ldap.OPT_X_TLS_DEMAND), - (ldap.OPT_REFERRALS, 0), -] -LDAP_BASE = 'OU=Users,OU=Organic Units,DC=cern,DC=ch' -LDAP_SCOPE = ldap.SCOPE_SUBTREE -LDAP_FILTER = '(&(objectClass=user)(!(memberOf=CN=cern-accounts-service,OU=e-groups,OU=Workgroups,DC=cern,DC=ch)))' -LDAP_ATTRS = ['cn', 'mail', 'proxyAddresses', 'cernExternalMail'] -LDAP_PAGE_SIZE = 1000 - - -def get_accounts_identities(): - session = get_session() - query = '''select b.identity, a.account, a.email from atlas_rucio.accounts a, atlas_rucio.account_map b where a.account=b.account and identity_type='X509' and account_type='USER' ''' - dns = {} - try: - result = session.execute(query) - for dn, account, email in result: - dns[dn] = (account, email) - return dns - except Exception as error: - print(error) - - -def get_ldap_identities(): - """Get user account info from CERN AD/LDAP""" - for opt_key, opt_val in LDAP_OPTIONS: - ldap.set_option(opt_key, opt_val) - - conn = ldap.initialize(",".join(LDAP_HOSTS)) - conn.simple_bind_s() - - paged_serverctrls = [] - old_paged_search = [int(x) for x in ldap.__version__.split('.')] < [2, 4, 0] - if old_paged_search: - paged_serverctrls.append(ldap.controls.SimplePagedResultsControl(ldap.LDAP_CONTROL_PAGE_OID, True, (LDAP_PAGE_SIZE, ''))) - else: - paged_serverctrls.append(ldap.controls.SimplePagedResultsControl(True, size=LDAP_PAGE_SIZE, cookie='')) - - ret = {} - while True: - msgid = conn.search_ext(LDAP_BASE, LDAP_SCOPE, filterstr=LDAP_FILTER, attrlist=LDAP_ATTRS, serverctrls=paged_serverctrls) - rtype, rdata, rmsgid, serverctrls = conn.result3(msgid=msgid) - for dn, attrs in rdata: - cn = attrs['cn'][0] - user = { - 'mails': [], - # 'x509': [], - } - - for attr in ['mail', 'cernExternalMail']: - if attr in attrs: - user[attr] = attrs[attr][0] - - for pmail in attrs.get('proxyAddresses', []): - if pmail.lower().startswith('smtp:'): - mail = pmail[len('smtp:'):] - if mail.lower() not in [umail.lower() for umail in user['mails']]: - user['mails'].append(mail) - for mail in attrs.get('mail', []): - if mail.lower() not in [umail.lower() for umail in user['mails']]: - user['mails'].append(mail) - - ret[cn] = user - - cookie = None - for serverctrl in serverctrls: - if old_paged_search: - if serverctrl.controlType == ldap.LDAP_CONTROL_PAGE_OID: - unused_est, cookie = serverctrl.controlValue - if cookie: - serverctrl.controlValue = (LDAP_PAGE_SIZE, cookie) - break - else: - if serverctrl.controlType == ldap.controls.SimplePagedResultsControl.controlType: - cookie = serverctrl.cookie - if cookie: - 
serverctrl.size = LDAP_PAGE_SIZE - break - - if not cookie: - break - - paged_serverctrls = serverctrls - return ret - - -if __name__ == '__main__': - try: - PROXY = config_get('nagios', 'proxy') - os.environ["X509_USER_PROXY"] = PROXY - CERT, KEY = os.environ['X509_USER_PROXY'], os.environ['X509_USER_PROXY'] - except Exception as error: - print("Failed to get proxy from rucio.cfg") - sys.exit(CRITICAL) - starttime = time.time() - status = OK - nbusers = 0 - nonicknames = [] - client = Client() - admin = VOMSAdminProxy(vo='atlas', host='voms2.cern.ch', port=8443, - user_cert=CERT, user_key=KEY) - res = admin.call_method('list-users') - accounts_email = {account['account']: account['email'] for account in client.list_accounts()} - accounts = accounts_email.keys() - scopes = [_ for _ in client.list_scopes()] - dns = get_accounts_identities() - ldap_accounts = get_ldap_identities() - for account in accounts_email: - if account in ldap_accounts: - if accounts_email[account] and not accounts_email[account].lower() in [email.lower() for email in ldap_accounts[account]['mails']]: - print('Bad email for %s : %s vs %s' % (account, accounts_email[account], ldap_accounts[account])) - try: - client.update_account(account=account, key='email', value=ldap_accounts[account]['mail']) - except Exception as error: - print(error) - else: - print('%s might not be in ATLAS anymore' % account) - voms_identities = {} - if isinstance(res, list): - for user in res: - valid_nickname = True - dn = user._DN - scope = None - if dn in dns: - # Check if scope exists - account = dns[dn][0] - scope = 'user.' + account - else: - nbusers += 1 - attempts = 0 - totattemps = 3 - for attempts in range(0, totattemps): - if attempts < totattemps - 1: - try: - dn = user._DN - ca = user._CA - email = user._mail - result = admin.call_method('list-user-attributes', dn, ca) - if result is None: - print("Failed to list-user-attributes for dn: %s" % dn) - continue - nickname = None - try: - nickname = result[0]._value - except TypeError as error: - print('ERROR : Failed to process DN: %s' % dn) - if nickname: - if nickname in accounts: - if nickname not in voms_identities: - voms_identities[nickname] = [identity['identity'] for identity in client.list_identities(account=nickname) if identity['type'] == 'X509'] - if dn not in voms_identities[nickname]: - try: - client.add_identity(account=nickname, identity=dn, authtype='X509', email=email, default=True) - print('Identity %(dn)s added' % locals()) - except Duplicate: - pass - scope = 'user.' + account - break - else: - if nickname in ldap_accounts: - if email.lower() not in [mail.lower() for mail in ldap_accounts[nickname]['mails']]: - print('Account %s does not exist. To create it : rucio-admin account add --type USER --email %s %s' % (nickname, email, nickname)) - break - account = nickname - if account not in accounts: - try: - client.add_account(account=account, type='USER', email=email) - client.add_identity(account=account, identity=dn, authtype='X509', email=email, default=True) - scope = 'user.' 
+ account - print('Account %(account)s added' % locals()) - except Exception: - pass - elif user._DN not in nonicknames: - nonicknames.append(user._DN) - except Exception as error: - print(error) - else: - try: - print('ERROR getting info for %s' % (user._DN)) - except UnicodeEncodeError: - print('ERROR getting info for %s' % (repr(user._DN))) - status = WARNING - if scope and scope not in scopes and valid_nickname: - try: - client.add_scope(account, scope) - print('Scope %(scope)s added' % locals()) - except Duplicate: - pass - else: - sys.exit(CRITICAL) - print('%i users extracted from VOMS' % nbusers) - if nonicknames != []: - print('Users with no nickname : %s' % str(nonicknames)) - - sys.exit(status) diff --git a/tools/probes/common/check_voms_admin b/tools/probes/common/check_voms_admin deleted file mode 100755 index 58a788f268..0000000000 --- a/tools/probes/common/check_voms_admin +++ /dev/null @@ -1,176 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - David Cameron, , 2015 -# - Cedric Serfon, , 2016 -# -# Get country groups from VOMS, get DNs with production role, -# map to Rucio account and set country-: admin -# Get all DNs in country and set country-: user -# Get all DNs in phys/perf group with production role and set group-: admin -# Get all names in cloud support e-group and set cloud-: admin - -import os -import re -import sys -import shlex -import subprocess - -from rucio.api.account import list_accounts, add_account_attribute, del_account_attribute, list_account_attributes -from rucio.common.config import config_get -from rucio.common.exception import RucioException, Duplicate, AccountNotFound - -from VOMSAdmin.VOMSCommands import VOMSAdminProxy - -UNKNOWN = 3 -CRITICAL = 2 -WARNING = 1 -OK = 0 - -result = OK - - -def set_account_attributes(dns): - - result = OK - if not dns: - print 'No user' - return result - - account_mapping = {} - for dn in dns: - # Get Rucio account - accounts = list_accounts(filter={'identity': dn, 'account_type': 'USER'}) - if not accounts: - print "Warning: no user accounts for %s" % dn - continue - account_type, role = dns[dn].split(':') - for acc in accounts: - account = acc['account'] - if not (account, account_type) in account_mapping: - account_mapping[(account, account_type)] = role - elif account_mapping[(account, account_type)] != role and role == 'admin': - print 'Will promote account %s to admin role' % (account) - account_mapping[(account, account_type)] = role - else: - pass - # print 'Do nothing for %s:%s DN : %s' % (account, account_type, dn) - for account, account_type in account_mapping: - role = account_mapping[(account, account_type)] - print 'Set %s role for %s for %s' % (role, account_type, account) - try: - attrs = dict([(dummy['key'], dummy['value']) for dummy in list_account_attributes(account)]) - if account_type in attrs and attrs[account_type] == role: - print 'Account %s already has the role %s for %s' % (account, role, account_type) - continue - elif account_type in attrs: - print 'Removing attribute %s from account %s' % (account_type, account) - del_account_attribute(account_type, account, 'root') - print 'Adding attribute %s for %s from account %s' % (role, account_type, account) - add_account_attribute(account_type, role, account, 
'root') - except Duplicate: - pass - except RucioException, error: - print "Failed to add account attribute: %s" % str(error) - result = WARNING - return result - - -def add_cloud_admins(): - - result = OK - for cloud in ['ca', 'de', 'es', 'fr', 'it', 'ng', 'nl', 'ru', 't0', 'tw', 'uk', 'us']: - egroup = 'atlas-support-cloud-%s' % cloud - if cloud == 'ru': - egroup = 'atlas-adc-cloud-ru' - cmd = "/usr/bin/ldapsearch -x -h xldap.cern.ch -b 'CN=%s,OU=e-groups,OU=Workgroups,DC=cern,DC=ch' member" % egroup - - proc = subprocess.Popen(shlex.split(cmd), stderr=subprocess.STDOUT, stdout=subprocess.PIPE) - (stdout, stderr) = proc.communicate() - if proc.returncode != 0: - print "Warning: Failed to run ldapsearch: %s" % stdout - result = WARNING - continue - - # Always exceptions... - cloud = cloud.replace('t0', 'cern') - cloud = cloud.replace('ng', 'nd') - for line in stdout.split('\n'): - match = re.match('member: CN=(\w+),OU=Users', line) - if match: - account = match.group(1) - print 'Set admin role for cloud-%s for %s' % (cloud, account) - try: - add_account_attribute('cloud-%s' % cloud, 'admin', account, 'root') - except Duplicate: - pass - except AccountNotFound: - print 'Warning: Account %s not in Rucio' % account - result = WARNING - except RucioException, error: - print "Failed to add account attribute: %s" % str(error) - result = WARNING - return result - - -if __name__ == '__main__': - result = OK - try: - PROXY = config_get('nagios', 'proxy') - os.environ["X509_USER_PROXY"] = PROXY - CERT, KEY = os.environ['X509_USER_PROXY'], os.environ['X509_USER_PROXY'] - except Exception as error: - print "Failed to get proxy from rucio.cfg" - sys.exit(CRITICAL) - - admin = VOMSAdminProxy(vo='atlas', host='voms2.cern.ch', port=8443, - user_cert=CERT, user_key=KEY) - res = admin.call_method('list-sub-groups', '/atlas') - if not res: - print 'Could not list VOMS groups' - sys.exit(CRITICAL) - - for group in res: - match_pattern = re.match('/atlas/(\w\w)$', group) or re.match('/atlas/(cern)$', group) or re.match('/atlas/(usatlas)$', group) - if not match_pattern: - match_pattern = re.match('/atlas/(.*-.*)$', group) - if not match_pattern: - continue - vomsgroup = match_pattern.group(0) - phys_group = match_pattern.group(1) - dns = {} - print 'Working on group-%s' % (phys_group) - list_dns = admin.call_method('list-users-with-role', vomsgroup, 'Role=production') or [] - for dn in list_dns: - dns[dn._DN] = 'group-%s:admin' % (phys_group) - set_account_attributes(dns) - continue - - vomsgroup = match_pattern.group(0) - country = match_pattern.group(1) - if country == 'usatlas': - country = 'us' - - print 'Working on country-%s' % (country) - dns = {} - # Get DNs in country group - list_dns = admin.call_method('list-members', vomsgroup) or [] - for dn in list_dns: - dns[dn._DN] = 'country-%s:user' % (country) - - # Get DNs with production role in each country (upgrades the role from user to admin if applicable) - list_dns = admin.call_method('list-users-with-role', vomsgroup, 'Role=production') or [] - for dn in list_dns: - dns[dn._DN] = 'country-%s:admin' % (country) - set_account_attributes(dns) - - # Add cloud admins from ldap egroups - add_cloud_admins() - - sys.exit(result) diff --git a/tools/probes/common/check_webDAV_service b/tools/probes/common/check_webDAV_service deleted file mode 100755 index d7f37adeff..0000000000 --- a/tools/probes/common/check_webDAV_service +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# 
Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Cedric Serfon, , 2014 - -''' -Probe to check the queues of the transfer service -''' - -import sys -from rucio.common.exception import ServiceUnavailable -from rucio.rse import rsemanager - -scheme = 'https' - -print sys.argv -site = sys.argv[1] -# Exit statuses recognized by Nagios -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - -rse_settings = rsemanager.get_rse_info(site) -dict = rsemanager.select_protocol(rse_settings, operation='write', scheme=scheme) -basepath = '%s://%s:%s%s' % (dict['scheme'], dict['hostname'], dict['port'], dict['prefix']) -print basepath -p = rsemanager.create_protocol(rse_settings, operation='write', scheme='https') -try: - p.connect() -except ServiceUnavailable, e: - sys.exit(CRITICAL) - -if not p.exists(basepath): - sys.exit(CRITICAL) -p.close() - -sys.exit(OK) diff --git a/tools/probes/common/check_webdav_space b/tools/probes/common/check_webdav_space deleted file mode 100755 index 8f3232e516..0000000000 --- a/tools/probes/common/check_webdav_space +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Wen Guan, , 2014 -# - Sylvain Blunier, , 2016 -# -import os -import sys - - -from rucio.client import Client -from rucio.common.config import config_get -from rucio.rse import rsemanager as rsemgr - -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 - - -if __name__ == "__main__": - - rsename = sys.argv[1] - - retvalue = OK - usedsize = 0 - freesize = 0 - - try: - proxy = config_get('nagios', 'proxy') - os.environ["X509_USER_PROXY"] = proxy - except Exception as e: - retvalue = WARNING - - c = Client() - - rse_settings = rsemgr.get_rse_info(rsename) - for protocol in rse_settings['protocols']: - if protocol['scheme'] == "https": - rse_settings['protocols'].remove(protocol) - protocol['impl'] = "rucio.rse.protocols.webdav.Default" - rse_settings['protocols'].append(protocol) - try: - gs, ret = rsemgr.get_space_usage(rse_settings, "https") - if gs: - totalsize = long(ret["totalsize"]) - freesize = long(ret["unusedsize"]) - usedsize = totalsize - freesize - else: - print "Failed to get rse(%s) space information: %s" % (rsename, str(ret)) - retvalue = WARNING - except Exception as e: - print "Failed to get rse(%s) space information: %s" % (rsename, str(e)) - retvalue = WARNING - - if retvalue == OK: - print "Update RSE %s space usage (usedsize: %s, freesize: %s)" % (rsename, usedsize, freesize) - c.set_rse_usage(rsename, "https", usedsize, freesize) - - sys.exit(retvalue) diff --git a/tools/probes/common/daemon_activity_reports b/tools/probes/common/daemon_activity_reports deleted file mode 100755 index 2f5b72e3bc..0000000000 --- a/tools/probes/common/daemon_activity_reports +++ /dev/null @@ -1,136 +0,0 @@ -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. 
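check_srm_space and check_webdav_space both rewrite the matching protocol's 'impl' before asking rsemanager for the space usage. A small sketch of the intended substitution (it avoids removing items from the protocol list while iterating over it, which the original probes do):

    def override_protocol_impl(rse_settings, scheme, impl):
        """Point every protocol of the given scheme at a specific implementation class."""
        for protocol in rse_settings['protocols']:
            if protocol['scheme'] == scheme:
                protocol['impl'] = impl
        return rse_settings

    # override_protocol_impl(rse_settings, 'https', 'rucio.rse.protocols.webdav.Default')
    # gs, ret = rsemgr.get_space_usage(rse_settings, 'https')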
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Ralph Vigne, 2015 - -import json -import locale -import requests -import smtplib -import urllib - -from datetime import datetime, timedelta -from email.mime.text import MIMEText - -ES_URL = 'cl-analytics.mwt2.org' -SEVERITY_LIST = ['critical', 'error', 'warning', 'info', 'debug'] -DATES = [(datetime.today() - timedelta(days=1)).strftime('%Y-%m-%d'), (datetime.today() - timedelta(days=2)).strftime('%Y-%m-%d')] -LINK = ("http://cl-analytics.mwt2.org:5601/#/dashboard/Rucio-Errror-Report?_g=(filters:!(),refreshInterval:(display:Off,pause:!f,section:0,value:0)," - "time:(from:now-1d%2Fd,mode:quick,to:now-1d%2Fd))&_a=(filters:!((meta:(apply:!t,disabled:!t,index:rucio-daemon-logs,key:severity_label,negate:!f,value:critical)," - "query:(match:(severity_label:(query:critical,type:phrase)))),(meta:(apply:!t,disabled:!t,index:rucio-daemon-logs,key:severity_label,negate:!f,value:error)," - "query:(match:(severity_label:(query:error,type:phrase)))),(meta:(apply:!t,disabled:!t,index:rucio-daemon-logs,key:severity_label,negate:!f,value:warning)," - "query:(match:(severity_label:(query:warning,type:phrase)))),(meta:(apply:!t,disabled:!f,index:rucio-daemon-logs,key:severity_label,negate:!t,value:info)," - "query:(match:(severity_label:(query:info,type:phrase)))),(meta:(apply:!t,disabled:!f,index:rucio-daemon-logs,key:severity_label,negate:!t,value:debug)," - "query:(match:(severity_label:(query:debug,type:phrase))))),query:(query_string:(analyze_wildcard:!t,lowercase_expanded_terms:!f,query:'{QUERY}')),title:'Rucio%20-%20Errror%20Report')") - -locale.setlocale(locale.LC_ALL, '') - - -def execute_query(selectors=[]): - query = {"size": 0, "query": {"filtered": {"filter": {"range": {"@timestamp": {"gte": "now-2d/d", "lte": "now-1d/d"}}}}}, - "aggs": {"daemons": {"terms": {"field": "daemon_raw", "size": 0, "order": {"_term": "asc"}}, - "aggs": {"days": {"date_histogram": {"field": "@timestamp", "interval": "day", "format": "yyyy-MM-dd"}, - "aggs": {"severity": {"terms": {"field": "severity_label", "size": 0}, - "aggs": {"hosts": {"terms": {"field": "host", "size": 0}}}}}}}}}} - # Adding wildcard to query - for tup in selectors: - if 'query' not in query['query']['filtered'].keys(): - query['query']['filtered']['query'] = {"wildcard": {}} - query['query']['filtered']['query']['wildcard'][tup.keys()[0]] = tup.values()[0] - - # Query data from ES using QueryDSL interface - resp = requests.post('http://%s:9200/rucio-daemon-logs/_search' % ES_URL, data=json.dumps(query)) - if resp.status_code != 200: - raise Exception('Failed querying data: %s' % resp.text) - return json.loads(resp.text) - - -def html_table_row(tmpDaemon, pp_nodes=False): - html = '' - html += '' - current = DATES[0] - base = DATES[1] - query = urllib.quote_plus(('(daemon:%s) AND (host:rucio-daemon-int*)' if pp_nodes else '(daemon:%s)') % tmpDaemon['name']) - html += '%s' % (LINK.replace('{QUERY}', query), tmpDaemon['name']) - - for sl in SEVERITY_LIST: - delta = tmpDaemon[current][sl] - tmpDaemon[base][sl] - color = "black" - if delta > 0: - color = 'red' - elif delta < 0: - color = 'green' - html += '%s
<br><font color="%s">%s<br>(%s)</font>' % (locale.format("%.0f", tmpDaemon[current][sl], grouping=True), - color, - locale.format("%+.0f", delta, grouping=True), - locale.format("%+.1f", (100.0 / tmpDaemon[base][sl] * delta), grouping=True) + '%' if tmpDaemon[base][sl] != 0 else '-') - html += '\n' - return html - -
-def create_table(buckets, pp_hosts=False): - html = '' - - html += '<table>\n' - html += '<tr>' - for sl in (['Daemon'] + SEVERITY_LIST): - html += '<th>%s</th>' % sl - html += '</tr>\n' - - tmpDaemon = {'name': buckets[0]['key'].split('-')[0]} - for date in DATES: - tmpDaemon[date] = {} - for sl in SEVERITY_LIST: - tmpDaemon[date][sl] = 0 - - for bDaemon in buckets: - if tmpDaemon['name'] != bDaemon['key'].split('-')[0]: - html += html_table_row(tmpDaemon, pp_hosts) - tmpDaemon = {'name': bDaemon['key'].split('-')[0]} - for date in DATES: - tmpDaemon[date] = {} - for sl in SEVERITY_LIST: - tmpDaemon[date][sl] = 0 - - for bDate in bDaemon['days']['buckets']: - if bDate['key_as_string'] not in DATES: - continue - for bSeverity in bDate['severity']['buckets']: - if pp_hosts: - for bHost in bSeverity['hosts']['buckets']: - if bHost['key'].find('-int-') != -1: - tmpDaemon[bDate['key_as_string']][bSeverity['key']] += bHost['doc_count'] - else: - tmpDaemon[bDate['key_as_string']][bSeverity['key']] += bSeverity['doc_count'] - html += html_table_row(tmpDaemon) - html += '</table>\n' - return html - -
-if __name__ == "__main__": - buckets = execute_query()['aggregations']['daemons']['buckets'] - html = '\n' - html += '<br><br><b># Total Events (%s)</b><br><br>\n' % (DATES[0]) - html += create_table(buckets) - html += '<br><br><b># Pre-Production Events (%s)</b><br><br>\n' % (DATES[0]) - html += create_table(buckets, True) - html += '<br><br><b>Additional Links</b><br><br>
' - html += '' - html += '\n' - - msg = MIMEText(html, 'html') - - msg['Subject'] = '[RUCIO] Daemon Activity Report' - msg['From'] = 'rucio-dev@cern.ch' - msg['To'] = 'rucio-dev@cern.ch; atlas-adc-ddm-support@cern.ch' - - # Send the message via our own SMTP server, but don't include the - # envelope header. - s = smtplib.SMTP('localhost') - s.sendmail('rucio-dev@cern.ch', ['rucio-dev@cern.ch', 'atlas-adc-ddm-support@cern.ch'], msg.as_string()) diff --git a/tools/probes/common/graphite2nagios b/tools/probes/common/graphite2nagios deleted file mode 100755 index 401c623c0f..0000000000 --- a/tools/probes/common/graphite2nagios +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Ralph Vigne 2015 -# - Cedric Serfon, 2017 - -''' -Probe arbitrary Graphite time series against a provided threshold. Examples: - -1) Check how many minutes in the last half hour had an average submit time higher than 5 seconds. - Command: python tools/probes/common/graphite2nagios --target aliasByNode(removeBelowValue(stats.timers.rucio.transfertool.fts3.submit_transfer.*.mean,5000),6) --critical 25 --from 30 --warning 20 - Explained: - *) Average per minute is indicated by using the mean - attribute at the end of the target metric. Other options are count, count_ps, lower, upper, and sum - *) The example above uses removeBelowValue to filter data points below the intended threshold, which is 5 seconds. Other options are: removeAboveValue, removeAbovePercentile, removeBelowPercentile - Note the sliding time frame, thus messages can be sent multiple times. - -2) Check if the current availability of the load balancer is above 20% - Command: python tools/probes/common/graphite2nagios -target aliasByNode(removeAboveValue(stats.rucio.monitoring.loadbalancer.rucio-lb-prod..Idle_pct,20),4) --critical 1 --from 1 - => check if the availability if the load balancer is below 20 percent, and sends a critical if so. Must be executed every minute. - -More information on supported function can be found at: http://graphite.readthedocs.org/en/latest/functions.html -''' - -import logging -import pprint -import sys - -import requests - -from optparse import OptionParser - -OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3 -GRAPHITE_URL = "rucio-graphite-prod-02.cern.ch" - - -def parse_options(): - parser = OptionParser() - parser.add_option('-t', '--target', help='Graphite metric to be validated', action='store', dest='target') - parser.add_option('-c', '--critical', help='Threshold for number of data points triggering a critical message.', action='store', dest='critical') - parser.add_option('-f', '--from', help='Optional: Time period of data to be requested in minutes e.g. for last 30 minutes = 30', action='store', dest='period') - parser.add_option('-w', '--warning', help='Optional: Threshold for number of data points triggering a waning message.', action='store', dest='warning') - parser.add_option('-v', '--verbose', help='Optional: For the curious.', action='store_true', dest='debug', default=False) - (options, args) = parser.parse_args() - - # Checking mandatory arguments - if not options.target: - parser.error('Target metric not specified. E.g. 
stats.timers.rucio.transfertool.fts3.submit_transfer.*.mean') - if not options.critical: - parser.error('No threshold value for critical defined, but mandatory. E.g. 10') - - # Checking optional arguments and print warning - if options.debug: - logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) - else: - logging.basicConfig(stream=sys.stdout, level=logging.ERROR) - - if not options.period: - logging.debug('No timer period provided. Using the last 60 minites as default period') - options.period = 60 - if not options.warning: - logging.debug('No threshold for warning message provided. Warningsd will not be printed.') - options.warning = options.critical - return options - - -if __name__ == "__main__": - options = parse_options() - pp = pprint.PrettyPrinter(indent=4) - exit_code = OK - URL = 'http://%s/render?format=json&from=-%smin&target=%s' % (GRAPHITE_URL, int(options.period) + 1, options.target) # Adding 1 minute to aviod null values in case of "in the last minute" requests - logging.debug('Requesting metric: %s' % URL) - req = requests.get(URL) - if req.status_code != 200: - logging.debug('Failed requesting data from Graphite with: %s. Use --verbose for details.' % req.status_code) - JSON = req.json() - if options.debug: - pp.pprint(JSON) - for target in JSON: - counter = 0 - for db in target['datapoints']: - if db[0] is not None: - counter += 1 - if int(options.warning) < counter and counter < int(options.critical): - print '%s INCIDENTS %s INTERVAL %s min.' % (target['target'], counter, options.period) - if exit_code in [OK]: - exit_code = WARNING - if int(options.critical) <= counter: - print '%s INCIDENTS %s INTERVAL %s min.' % (target['target'], counter, options.period) - if exit_code in [OK, WARNING]: - exit_code = CRITICAL - logging.debug('%s incidents observed for target %s.' % (counter, target['target'])) - if exit_code != OK: - print 'Details: https://%s/render?from=-%smin&target=%s' % (GRAPHITE_URL, int((int(options.period) * 1.2) + 1), options.target) # Adding 20% of time to make plot more comprehensive - sys.exit(exit_code) diff --git a/tools/probes/common/monitor_apache b/tools/probes/common/monitor_apache deleted file mode 100755 index 606e1d9cc7..0000000000 --- a/tools/probes/common/monitor_apache +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. 
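Stripped of the HTTP plumbing, the check above counts non-null datapoints per Graphite target and maps that count onto a Nagios exit code. A condensed sketch of that thresholding, run on hypothetical datapoints:

OK, WARNING, CRITICAL = 0, 1, 2

def classify(datapoints, warning, critical):
    # Graphite's render API returns (value, timestamp) pairs; only non-null values count as incidents.
    counter = sum(1 for value, _ts in datapoints if value is not None)
    if counter >= critical:
        return CRITICAL
    if counter > warning:
        return WARNING
    return OK

print(classify([(5200.0, 1), (None, 2), (6100.0, 3)], warning=1, critical=3))  # 1 -> WARNING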
-# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Ralph Vigne, , 2014 -# - Mario Lassnig, , 2015 - -import argparse -import datetime -import logging -import requests -import sys - -from pystatsd import Client -from sys import stdout - -logging.basicConfig(stream=stdout, - level=logging.ERROR, - format='%(asctime)s\t%(process)d\t%(levelname)s\t%(message)s') - -logger = logging.getLogger(__name__) - - -def monitor_nginx(uri): - raise NotImplementedError - - -def monitor_apache(uri): - fqdn = uri - logger.info('Requesting status from Apache running on %s' % fqdn) - - response = requests.get(fqdn + '/server-status?auto', verify='/etc/pki/tls/certs/CERN-bundle.pem', timeout=5) - if response.status_code != 200: - logger.error('Invalid HTTP status code') - logger.error(response) - - text = response.text.split('\n') - # Total Accesses: 2975146 - # Total kBytes: 14555116 - # CPULoad: .601986 - # Uptime: 111858 - # ReqPerSec: 26.5975 - # BytesPerSec: 133244 - # BytesPerReq: 5009.65 - # BusyWorkers: 23 - # IdleWorkers: 7 - - stats = dict() - IGNORE = ['BytesPerSec', 'BytesPerReq', 'Uptime', 'Total_kBytes', 'Total_Accesses', 'ReqPerSec'] - - server_name = get_server_name(uri) - stats['server_name'] = server_name - for i in range(9): - tmp = text[i].split(': ') - key = tmp[0].replace(' ', '_') - if key not in IGNORE: - stats[key] = tmp[1] - ascii = ''.join(text[9:]) - logger.debug(ascii) - stats['Keepalive'] = int(ascii.count('K')) - stats['Sending'] = int(ascii.count('W')) - stats['Reading'] = int(ascii.count('R')) - stats['Waiting'] = int(ascii.count('_')) - stats['availability'] = 100 / ((int(stats['BusyWorkers']) + int(stats['IdleWorkers']))) * (stats['Keepalive'] + stats['Waiting']) - return stats - - -def backend_graphite(url, stats, prefix): - server, port = url.split(':') - try: - pystatsd_client = Client(host=server, port=port, prefix='%s.%s' % (prefix, stats['server_name'])) - except Exception, e: - logger.error('Unable to connect to Graphite backend %s: %s' % (url, e)) - raise - for s in stats: - if s in ['server_name']: - continue - try: - logger.debug('%s.%s.%s => %s' % (prefix, stats['server_name'], s, float(stats[s]))) - pystatsd_client.gauge(s, float(stats[s])) - except Exception as e: - logger.error('Failed reporting %s (%s): %s' % (s, stats[s], e)) - - -def get_server_name(fqdn): - return fqdn.split('//')[1].split('.')[0] - - -def backend_xsls(fqdn, stats): - xml_str = '' - xml_str += 'rucio.%s.httpd' % stats['server_name'] - xml_str += '%s' % stats['availability'] - xml_str += '%s of %s worker processes idel.' % ((int(stats['BusyWorkers']) + int(stats['IdleWorkers'])), int(stats['IdleWorkers'])) - xml_str += '%s' % (datetime.datetime.now().isoformat().split('.')[0]) - xml_str += '' - for s in stats: - if s in ['server_name']: - continue - xml_str += '%s' % (stats['server_name'], s, stats[s]) - xml_str += '' - xml_str += '' - logger.debug(xml_str) - r = requests.post(fqdn, files={'file': xml_str}) - return r.status_code - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Takes backend classes and Apaches URLs') - parser.add_argument('--backend', metavar='B', type=str, nargs='+', help='a list of type::URL[:port][::scope] to which the script will report to. E.g. --backend graphite::rucio-graphite-int.cern.ch:8025/listen/now::rucio.backends xsls::xsls.cern.ch') - parser.add_argument('--server', metavar='S', type=str, nargs='+', help='a list of type::FQDN tuples. Type is either apache or nginx. E.g. 
--sever apache::www.example.com nginx::www.example2.net') - parser.add_argument('--verbose', help='makes it chatty', action="store_true") - args = parser.parse_args() - - if args.verbose: - logger.setLevel(level=logging.DEBUG) - args = vars(args) - - if not args['server']: - logger.critical("No server to monitor provided. Exiting.") - sys.exit(1) - servers = [] - for server in args['server']: - try: - t, fqdn = server.split('::') - logger.debug('Monitoring server => Type: %s\tFQDN: %s' % (t, fqdn)) - servers.append((t, fqdn)) - except ValueError: - logger.critical('Can not unpack server information: %s' % server) - sys.exit(1) - if not len(servers): - logger.critical("No server to monitor provided. Exiting.") - sys.exit(1) - - backends = [] - for backend in args['backend']: - try: - tmp = backend.split('::') - if len(tmp) < 2 or len(tmp) > 3: - raise ValueError - t, url, prefix = tmp if len(tmp) == 3 else [tmp[0], tmp[1], None] - logger.debug('Reporting to backend => Type: %s\tURL: %s\tPrefix: %s' % (t, url, prefix)) - backends.append((t, url, prefix)) - except ValueError: - logger.critical('Can not unpack backend information: %s' % backend) - sys.exit(1) - if not len(backends): - logger.critical("No backend provided. Exiting.") - sys.exit(1) - - for server in servers: - try: - t, fqdn = server - stats = {} - if t == 'apache': - stats = monitor_apache(fqdn) - elif t == 'nginx': - stats = monitor_nginx(fqdn) - else: - logger.critical('Can not monitor server of type %s: Not supported' % type) - sys.exit(1) - logger.info('Stats gathering from %s OK.' % fqdn) - except Exception as e: - logger.error('Failed requesting data from %s server: %s' % (t, fqdn)) - logger.error(e) - stats = {'server_name': get_server_name(fqdn), - 'availability': 0} # Resetting Gauge values in Graphite - - for backend in backends: - t, url, prefix = backend - if t == 'graphite': - if not prefix: - logger.critical('Can not report to graphite without prefix') - sys.exit(1) - try: - backend_graphite(url, stats, prefix) - logger.info('Reporting to %s OK.' % url) - except Exception as e: - logger.error('Unable to report to Graphite backend: %s' % e) - elif t == 'xsls': - try: - sc = backend_xsls(url, stats) - except Exception: - sc = -1 - if sc != 200: - logger.error('Unable to report to XSLS backend. Statuscode: %s' % sc) - else: - logger.info('Reporting to %s OK.' % url) - else: - logger.critical('Can not report to backend of type %s: Not supported' % type) - sys.exit(1) - sys.exit(0) diff --git a/tools/probes/common/monitor_client b/tools/probes/common/monitor_client deleted file mode 100755 index f6e5fb30ca..0000000000 --- a/tools/probes/common/monitor_client +++ /dev/null @@ -1,126 +0,0 @@ -#!/usr/bin/env python - -# Copyright European Organization for Nuclear Research (CERN) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. 
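The Apache probe above scrapes mod_status in its machine-readable form ("?auto") and splits each "Key: Value" line into a stats dictionary before deriving worker counts. A minimal sketch of that parsing on hypothetical status text:

# Hypothetical mod_status "?auto" excerpt in the same "Key: Value" format.
sample = ("BusyWorkers: 23\n"
          "IdleWorkers: 7\n"
          "CPULoad: .601986")

stats = {}
for line in sample.split('\n'):
    key, value = line.split(': ')
    stats[key.replace(' ', '_')] = value

busy, idle = int(stats['BusyWorkers']), int(stats['IdleWorkers'])
print('%s busy, %s idle' % (busy, idle))  # 23 busy, 7 idle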
-# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Joaquin Bogado, , 2014 - -import sys -import time -import datetime -import requests -import socket -from rucio.client import Client -from rucio.common.utils import uuid - -c = Client() -scp = '' -fl = [] -dsname = 'tests.rucio_client_test_bm_' + uuid().urn.split(':')[2] - - -def test_create_dataset(file_list, dsn=dsname): - c.add_dataset(scp, dsn) - tick = time.time() - c.add_files_to_dataset(scp, dsn, file_list[0:100]) - tick = time.time() - tick - c.add_files_to_dataset(scp, dsn, file_list[100:600]) - c.add_files_to_dataset(scp, dsn, file_list[600:900]) - c.add_files_to_dataset(scp, dsn, file_list[900:1200]) - return tick - - -def test_read_big_dataset(): - tick = time.time() - c.list_files(scp, dsname) - return time.time() - tick - - -def test_quering_replicas_chuncked(file_list): - tick = time.time() - c.list_replicas(file_list[0:100]) - c.list_replicas(file_list[100:200]) - c.list_replicas(file_list[200:300]) - c.list_replicas(file_list[300:400]) - c.list_replicas(file_list[400:500]) - c.list_replicas(file_list[500:600]) - c.list_replicas(file_list[600:700]) - c.list_replicas(file_list[700:800]) - c.list_replicas(file_list[800:900]) - c.list_replicas(file_list[900:1000]) - return (time.time() - tick) / 10 - - -def test_create_rule(): - tick = time.time() - c.add_replication_rule([{'scope': scp, 'name': dsname}], 1, 'MOCK', lifetime=3600) # lifetime = 1 hour. Should not trigger replications because all the files are in MOCK rse - return time.time() - tick - - -def get_filelist(): - # get the offset for the subset of files. - try: - f = open('/tmp/.slice') - s = int(f.read()) - f.close() - except IOError: - s = 0 - f = open('/tmp/.slice', 'w') - f.write(str((s + 1) % 20)) # the offset is a number between 0 and 19 - f.close() - files = ('rucio_test_mock_file_{0:05}'.format(x) for x in xrange(20000)) - f = [] - f.extend(files) - fl = [] - for i in (({'scope': 'mock', 'name': x}) for x in f[(s * 1000): (s * 1000) + 1000]): - fl.append(i) - return fl - - -def main(): - global scp - import argparse - parser = argparse.ArgumentParser(description='Rucio client test') - parser.add_argument('-r', '--report', dest='report', default=False, action='store_true', help='Report the results to xsls.cern.ch') - parser.add_argument('-s', '--scope', dest='scp', default='', help='Scope for testing. 
Usually user.', required=True) - parser.add_argument('-q', '--quiet', dest='quiet', default=False, action='store_true', help='No xml output') - args = parser.parse_args() - - scp = args.scp - - fl = get_filelist() - - timestamp = datetime.datetime.now().isoformat().split('.')[0] - - createtime = test_create_dataset(fl) - readtime = test_read_big_dataset() - querytime = test_quering_replicas_chuncked(fl) - ruletime = test_create_rule() - - xmlmetric = """ - -rucio.{5}.client_benchmark -{0} -100 - -{2} -{1} -{3} -{4} - -""".format(timestamp, readtime, createtime, querytime, ruletime, socket.getfqdn()[:-8]) - if not args.quiet: - print xmlmetric - if args.report is True: - if not args.quiet: - print "Reporting to http://xsls.cern.ch" - r = requests.post('http://xsls.cern.ch', files={'file': xmlmetric}) - sys.exit(not(r.status_code == 200)) - - -if __name__ == '__main__': - main() diff --git a/tools/probes/common/monitor_flume b/tools/probes/common/monitor_flume deleted file mode 100755 index 15263e3b30..0000000000 --- a/tools/probes/common/monitor_flume +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Ralph Vigne, , 2014 -# - Mario Lassnig, , 2015 - -import argparse -import json -import logging -import requests -import sys - -from pystatsd import Client -from sys import stdout - -logging.basicConfig(stream=stdout, - level=logging.ERROR, - format='%(asctime)s\t%(process)d\t%(levelname)s\t%(message)s') - -logger = logging.getLogger(__name__) - - -def monitor_agent(uri): - logger.info('Requesting status from Apache running on %s' % uri) - - response = requests.get(uri, timeout=1) - if response.status_code != 200: - logger.error('Invalid HTTP status code') - logger.error(response) - - all_stats = json.loads(response.text) - INCLUDE = ['ChannelFillPercentage', 'EventDrainSuccessCount', 'EventAcceptedCount', 'ConnectionFailedCount'] - # Proposed thresholds: CHANNEL.ChannelFillPercentage 50% - # SINK.ConnectionFailedCount > 0 - # SINK.EventAcceptedCount - SINK.EventDrainSuccessCount > 200 (becaus read in chunks of hundred) - - server_name = get_server_name(uri) - stats = dict() - stats['server_name'] = server_name - for s in all_stats: - for m in INCLUDE: - if m in all_stats[s]: - stats['%s.%s' % (s, m)] = all_stats[s][m] - logger.debug(stats) - return stats - - -def backend_graphite(url, stats, prefix): - server, port = url.split(':') - try: - pystatsd_client = Client(host=server, port=port, prefix='%s.%s' % (prefix, stats['server_name'])) - except Exception, e: - logger.error('Unable to connect to Graphite backend %s: %s' % (url, e)) - raise - for s in stats: - if s in ['server_name']: - continue - try: - logger.debug('%s.%s.%s => %s' % (prefix, stats['server_name'], s, float(stats[s]))) - pystatsd_client.gauge(s, float(stats[s])) - except Exception as e: - logger.error('Failed reporting %s (%s): %s' % (s, stats[s], e)) - - -def get_server_name(fqdn): - return fqdn.split('//')[1].split('.')[0] - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Takes backend classes and Apaches URLs') - parser.add_argument('--backend', metavar='B', type=str, nargs='+', help='a list of type::URL[:port][::scope] to which the script will report to. E.g. 
--backend graphite::rucio-graphite-int.cern.ch:8025/listen/now::rucio.backends xsls::xsls.cern.ch') - parser.add_argument('--server', metavar='S', type=str, nargs='+', help='a list of FQDNs. --server http://rucio-daemon-prod-01.cern.ch:34545/metrics') - parser.add_argument('--verbose', help='makes it chatty', action="store_true") - args = parser.parse_args() - - if args.verbose: - logger.setLevel(level=logging.DEBUG) - args = vars(args) - - servers = args['server'] - if not len(servers): - logger.critical("No server to monitor provided. Exiting.") - sys.exit(1) - - backends = [] - for backend in args['backend']: - try: - tmp = backend.split('::') - if len(tmp) < 2 or len(tmp) > 3: - raise ValueError - t, url, prefix = tmp if len(tmp) == 3 else [tmp[0], tmp[1], None] - logger.debug('Reporting to backend => Type: %s\tURL: %s\tPrefix: %s' % (t, url, prefix)) - backends.append((t, url, prefix)) - except ValueError: - logger.critical('Can not unpack backend information: %s' % backend) - sys.exit(1) - if not len(backends): - logger.critical("No backend provided. Exiting.") - sys.exit(1) - - for server in servers: - try: - url = server - stats = monitor_agent(url) - logger.info('Stats gathering from %s OK.' % url) - except Exception as e: - logger.error('Failed requesting data from %s' % (server)) - logger.error(e) - stats = {'server_name': get_server_name(server), - 'availability': 0} # Resetting Gauge values in Graphite - - for backend in backends: - t, url, prefix = backend - if t == 'graphite': - if not prefix: - logger.critical('Can not report to graphite without prefix') - sys.exit(1) - try: - backend_graphite(url, stats, prefix) - logger.info('Reporting to %s OK.' % url) - except Exception as e: - logger.error('Unable to report to Graphite backend: %s' % e) - else: - logger.critical('Can not report to backend of type %s: Not supported' % type) - sys.exit(1) - sys.exit(0) diff --git a/tools/probes/common/monitor_haproxy_local b/tools/probes/common/monitor_haproxy_local deleted file mode 100755 index b1a113a992..0000000000 --- a/tools/probes/common/monitor_haproxy_local +++ /dev/null @@ -1,147 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. 
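Both monitoring scripts above accept reporting backends as "type::url[::prefix]" strings; the unpacking they perform is essentially the following (example specifiers taken from the help text above):

def parse_backend(spec):
    # Split "type::url[::prefix]" into its parts; the prefix is optional.
    parts = spec.split('::')
    if len(parts) < 2 or len(parts) > 3:
        raise ValueError('Can not unpack backend information: %s' % spec)
    return tuple(parts) if len(parts) == 3 else (parts[0], parts[1], None)

print(parse_backend('graphite::rucio-graphite-int.cern.ch:8025/listen/now::rucio.backends'))
print(parse_backend('xsls::xsls.cern.ch'))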
-# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Ralph Vigne , 2014 - - -import argparse -import socket -import traceback -import logging -import logging.handlers -import sys - -from pystatsd import Client -from sys import stdout - -# Define logger -logging.basicConfig(stream=stdout, - level=logging.INFO, - format='%(asctime)s\t%(process)d\t%(levelname)s\t%(message)s') - -logger = logging.getLogger('rucio-haproxy-monitoring') - -# Adding syslog handler -handler = logging.handlers.SysLogHandler('/dev/log') -handler.setFormatter(logging.Formatter('%(name)s[%(process)d] %(message)s')) -logger.addHandler(handler) - - -def monitor_haproxy(socket_name): - data = {} - INCLUDE_INFO = ['Process_num', 'Idle_pct'] - INCLUDE_STAT = ['scur', 'qcur', 'chkfail', 'status', 'weight', 'rate', 'hrsp_1xx', 'hrsp_2xx', 'hrsp_3xx', 'hrsp_4xx', 'hrsp_5xx', 'req_rate', 'qtime', 'ctime', 'rtime', 'ttime'] - - # Request data from socket - logger.debug('Connecting to socket: %s' % socket_name) - try: - s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - s.connect(socket_name) - logger.debug('Requesting info') - s.send('show info\n') - raw_info = s.recv(4096) - s.close() # Note: socket is not reusable - logger.debug('Requesting stat') - s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - s.connect(socket_name) - s.send('show stat\n') - raw_stat = s.recv(8192) - s.close() - except Exception as e: - logger.error('Failed requesting data from socket %s with execption %s' % (socket_name, e)) - logger.error(traceback.format_exc(e)) - return None - logger.debug('Successfully requested data from socket.') - - # Transforming info response into dictonary - logger.debug('Parsing info response') - for entry in raw_info.split('\n'): - tmp = entry.split(': ') - try: - if tmp[0] in INCLUDE_INFO: - data[tmp[0]] = float(tmp[1]) - except Exception as e: - logger.error('Entry: %s failed with exception: %s' % (tmp, e)) - logger.error(traceback.format_exc(e)) - logger.debug('Done parsing info response.') - - # Transforming stat response into dictonary - logger.debug('Parsing stat response') - raw_stat = raw_stat.split('\n') - headers = raw_stat.pop(0).split(',')[2:-1] # Get the column headers and remove pxname and svname - for stat in raw_stat: - stat = stat.split(',') - if len(stat) == 1: - logger.debug('Ignored line: %s' % stat[0]) - continue # Line is something else than stats - prefix = '%s.%s' % (stat.pop(0), stat.pop(0)) # Build metric prefix using pxname and svname - for column in range(len(headers)): - try: - if headers[column] in INCLUDE_STAT: - if (headers[column] == 'status') and (stat[column] in ['UP', 'DOWN', 'MAINT']) and (data['Process_num'] == 1.0): - for s in ['UP', 'DOWN', 'MAINT']: - data[prefix + '.' + headers[column] + '.' + s] = 0 # set all status to zero to support gauge values - data[prefix + '.' + headers[column] + '.' + stat[column]] = 1 - else: - data[prefix + '.' 
+ headers[column]] = float(stat[column]) - except Exception as e: - logger.debug('Igonring data: %s -> %s' % (headers[column], stat[column])) - logger.debug('Done parsing stat response.') - return data - - -def backend_graphite(url, stats, prefix): - process_num = stats['Process_num'] - del(stats['Process_num']) - server_name = socket.getfqdn().split('.')[0] - prefix = '%s.%s.%s' % (prefix, server_name, int(process_num)) - logger.debug('Reporting to prefix: %s' % prefix) - server, port = url.split(':') - try: - pystatsd_client = Client(host=server, port=port, prefix=prefix) - except Exception, e: - logger.error('Unable to connect to Graphite backend %s: %s' % (url, e)) - raise - - for s in stats: - try: - pystatsd_client.gauge(s, float(stats[s])) - logger.debug('%s.%s => %s' % (prefix, s, float(stats[s]))) - except Exception as e: - logger.error('Failed reporting %s (%s): %s' % (s, stats[s], e)) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('--backend', metavar='B', type=str, nargs=1, help=' Graphite server URL[:port][::scope] to which the script will report to. E.g. --backend rucio-graphite-int.cern.ch:8025/listen/now::rucio.loadbalancer') - parser.add_argument('--sockets', metavar='S', type=str, nargs='+', help='a list of socket files e.g. /var/run/haproxy_admin_process_no_1.sock') - parser.add_argument('--verbose', help='makes it chatty', action="store_true") - - args = parser.parse_args() - - if args.verbose: - logger.setLevel(level=logging.DEBUG) - args = vars(args) - - try: - url, prefix = args['backend'][0].split('::') - logger.debug('Reporting to backend => URL: %s\tPrefix: %s' % (url, prefix)) - except ValueError: - logger.critical('Can not unpack backend information: %s' % args['backend'][0]) - sys.exit(1) - - logger.info('Cheking sockets: %s' % args['sockets']) - for socket_name in args['sockets']: - try: - data = monitor_haproxy(socket_name) - backend_graphite(url, data, prefix) - except Exception as e: - logger.error(e) - logger.error(traceback.format_exc(e)) - sys.exit(1) - sys.exit(0) diff --git a/tools/probes/common/service_monitoring.cfg b/tools/probes/common/service_monitoring.cfg deleted file mode 100644 index 3963fb4291..0000000000 --- a/tools/probes/common/service_monitoring.cfg +++ /dev/null @@ -1,112 +0,0 @@ -[ - { "id": "rucio.daemons.automatix", - "from": "-10minutes", - "until": "-1minutes", - "targets": [ ], - "webpage": "https://rucio-graphite-prod-02.cern.ch/dashboard/#Monitoring-Automatix", - "availability": { "metric": "summarize(sumSeries(stats.rucio.monitoring.daemons.automatix.*.*.count),'10min',true)", - "mapping": "100 if ( {value} > 0) else 0", - "info": "If log activity within the last 10 minutes = 100% else 0%" - } - }, - { "id": "rucio.daemons.conveyor", - "from": "-10minutes", - "until": "-1minutes", - "targets": [ ], - "webpage": "https://rucio-graphite-prod-02.cern.ch/dashboard/#Monitoring-Conveyor", - "availability": { "metric": "summarize(sumSeries(stats.rucio.monitoring.daemons.conveyor*.*.*.count),'10min',true)", - "mapping": "100 if ( {value} > 0) else 0", - "info": "If log activity within the last 10 minutes = 100% else 0%" - } - }, - { "id": "rucio.daemons.hermes", - "from": "-10minutes", - "until": "-1minutes", - "targets": [ ], - "webpage": "https://rucio-graphite-prod-02.cern.ch/dashboard/#Monitoring-Hermes", - "availability": { "metric": "summarize(sumSeries(stats.rucio.monitoring.daemons.hermes*.*.*.count),'10min',true)", - "mapping": "100 if ( {value} > 0) else 0", - "info": "If log 
activity within the last 10 minutes = 100% else 0%" - } - }, - { "id": "rucio.daemons.judge", - "from": "-10minutes", - "until": "-1minutes", - "targets": [ ], - "webpage": "https://rucio-graphite-prod-02.cern.ch/dashboard/#Monitoring-Judge", - "availability": { "metric": "summarize(sumSeries(stats.rucio.monitoring.daemons.judge*.*.*.count),'10min',true)", - "mapping": "100 if ( {value} > 0) else 0", - "info": "If log activity within the last 10 minutes = 100% else 0%" - } - }, - { "id": "rucio.daemons.kronos", - "from": "-10minutes", - "until": "-1minutes", - "targets": [ ], - "webpage": "https://rucio-graphite-prod-02.cern.ch/dashboard/#Monitoring-Kronos", - "availability": { "metric": "summarize(sumSeries(stats.rucio.monitoring.daemons.kronos*.*.*.count),'10min',true)", - "mapping": "100 if ( {value} > 0) else 0", - "info": "If log activity within the last 10 minutes = 100% else 0%" - } - }, - { "id": "rucio.daemons.necromancer", - "from": "-10minutes", - "until": "-1minutes", - "targets": [ ], - "webpage": "https://rucio-graphite-prod-02.cern.ch/dashboard/#Monitoring-Necromancer", - "availability": { "metric": "summarize(sumSeries(stats.rucio.monitoring.daemons.necromancer*.*.*.count),'10min',true)", - "mapping": "100 if ( {value} > 0) else 0", - "info": "If log activity within the last 10 minutes = 100% else 0%" - } - }, - { "id": "rucio.daemons.reaper", - "from": "-10minutes", - "until": "-1minutes", - "targets": [ ], - "webpage": "https://rucio-graphite-prod-02.cern.ch/dashboard/#Monitoring-Reaper", - "availability": { "metric": "summarize(sumSeries(stats.rucio.monitoring.daemons.reaper*.*.*.count),'10min',true)", - "mapping": "100 if ( {value} > 0) else 0", - "info": "If log activity within the last 10 minutes = 100% else 0%" - } - }, - { "id": "rucio.daemons.transmogrifier", - "from": "-10minutes", - "until": "-1minutes", - "targets": [ ], - "webpage": "https://rucio-graphite-prod-02.cern.ch/dashboard/#Monitoring-Transmogrifier", - "availability": { "metric": "summarize(sumSeries(stats.rucio.monitoring.daemons.transmogrifier*.*.*.count),'10min',true)", - "mapping": "100 if ( {value} > 0) else 0", - "info": "If log activity within the last 10 minutes = 100% else 0%" - } - }, - { "id": "rucio.daemons.undertaker", - "from": "-10minutes", - "until": "-1minutes", - "targets": [ ], - "webpage": "https://rucio-graphite-prod-02.cern.ch/dashboard/#Monitoring-Undertaker", - "availability": { "metric": "summarize(sumSeries(stats.rucio.monitoring.daemons.undertaker*.*.*.count),'10min',true)", - "mapping": "100 if ( {value} > 0) else 0", - "info": "If log activity within the last 10 minutes = 100% else 0%" - } - }, - { "id": "rucio.loadbalancer", - "from": "-10minutes", - "until": "-1minutes", - "targets": [ ], - "webpage": "https://rucio-graphite-prod-02.cern.ch/dashboard/#Monitoring-HAProxy", - "availability": { "metric": "averageSeries(summarize(stats.rucio.monitoring.loadbalancer.rucio-lb-prod-*.*.Idle_pct, '10min', 'avg', true))", - "mapping": "100 if ( {value} > 10) else 0", - "info": "Idle percentage as reported by HAProxy" - } - }, - { "id": "rucio.httpd", - "from": "-10minutes", - "until": "-1minutes", - "targets": [ ], - "webpage": "https://rucio-graphite-prod-02.cern.ch/dashboard/#Monitoring-RucioBackendServer", - "availability": { "metric": "averageSeries(summarize(stats.rucio.monitoring.backends.rucio-server-prod-*.availability,'10min','avg',true))", - "mapping": "100 if ( {value} > 10) else 0", - "info": "Ratio between busy and idle workers as reported by Apache's scoreboards" - 
} - } -] diff --git a/tools/probes/common/service_monitoring.py b/tools/probes/common/service_monitoring.py deleted file mode 100644 index a734a863ff..0000000000 --- a/tools/probes/common/service_monitoring.py +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env python -# Copyright European Organization for Nuclear Research (CERN) 2013 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# You may not use this file except in compliance with the License. -# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 -# -# Authors: -# - Ralph Vigne 2015 -# - Cedric Serfon, 2017 - -import datetime -import logging -import json -import sys - -import requests - -from sys import stdout - -logging.basicConfig(stream=stdout, - level=logging.ERROR, - format='%(asctime)s\t%(process)d\t%(levelname)s\t%(message)s') - -LOGGER = logging.getLogger(__name__) - -GRAPHITE_URL = "rucio-graphite-prod-02.cern.ch" - -if len(sys.argv) != 2: - print "Usage: service_monitoring [path to config file]" - -with open(sys.argv[1]) as f: - SERVICES = json.load(f) - -for service in SERVICES: - data = {} - - LOGGER.debug('Working on service %s' % service['id']) - for target in service['targets']: - # Requesting raw data - LOGGER.debug('Requesting data for %s' % target['target']) - url = 'http://%s/render?from=%s&until=%s&format=json&target=%s' % (GRAPHITE_URL, service['from'], service['until'], target['target']) - r = requests.get(url) - if r.status_code != 200: - LOGGER.error('Failed with status code %s when requesting data from %s' % (r.status_code, url)) - continue - - # Create list of numericValue for XML report - for metric in r.json(): - valueID = target['name'].replace('{target}', metric['target']) - data[valueID] = {'value': metric['datapoints'][-1][0], 'desc': target['desc'] if 'desc' in target else "None given"} - LOGGER.debug('Setting %s to %s' % (valueID, data[valueID]['value'])) - - # Derive availability - LOGGER.debug('Derive availability based on %s' % service['availability']['metric']) - if service['availability']['metric'] != '': - url = 'http://%s/render?from=%s&until=%s&format=json&target=alias(%s, "availability")' % (GRAPHITE_URL, service['from'], service['until'], service['availability']['metric']) - LOGGER.debug(url) - r = requests.get(url) - if r.status_code != 200: - LOGGER.error('Failed with status code %s when requesting data from %s' % (r.status_code, url)) - continue - LOGGER.debug(r.json()) - value = r.json()[0]['datapoints'][-1][0] - if value is None or (value == 0 and len(r.json()[0]['datapoints']) > 1): # Happens occasionally - try: - value = r.json()[0]['datapoints'][-2][0] - except: - pass - if 'mapping' in service['availability'] and service['availability']['mapping'] != '': - mapping = service['availability']['mapping'].replace('{value}', str(value)) - LOGGER.debug('Availability mapping function: %s' % (mapping)) - try: - availability = eval(mapping) - except: - LOGGER.error('Failed to derive availability.\nURL: %s\nMapping: %s\nDatapoints: %s' % (url, mapping, r.json())) - LOGGER.error(sys.exc_info()[0]) - else: - availability = value - LOGGER.debug('Availability of %s: %s' % (service['id'], availability)) - - if availability != 100: # For a week or so, we print if not 100 and set report 100 to SLS - print 'Availability of %s: %s (value: %s)' % (service['id'], availability, value) - if availability > 10: - status = 'available' - elif availability <= 10: - status = 'degraded' - else: - status = 'unavailable' - - # Creating XML report - xml_str = '' - 
xml_str += '%s' % service['id'] - xml_str += '%s' % status - xml_str += '%s' % service['availability']['info'] - xml_str += '' + service['webpage'] + '' - xml_str += 'rucio-admin@cern.ch' - xml_str += '%s' % (datetime.datetime.now().isoformat().split('.')[0]) - xml_str += '%s' % availability - for metric in data: - xml_str += '%s' % (metric, data[metric]['desc'], data[metric]['value']) - xml_str += '' - xml_str += '' - LOGGER.debug(xml_str) - r = requests.post("http://xsls.cern.ch", files={'file': xml_str})
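The availability derivation above substitutes the last Graphite datapoint into the "mapping" expression supplied by the configuration file and evaluates it. A condensed sketch (the datapoint value is hypothetical, and eval should only ever see trusted, operator-controlled config such as service_monitoring.cfg):

mapping = "100 if ( {value} > 0) else 0"   # as in service_monitoring.cfg above
value = 42.0                               # hypothetical last datapoint from the render API

availability = eval(mapping.replace('{value}', str(value)))

if availability > 10:
    status = 'available'
elif availability <= 10:
    status = 'degraded'

print('%s (%s)' % (status, availability))  # available (100)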