From 8242c4c7fc20b73f0569d2828a1e727c30341058 Mon Sep 17 00:00:00 2001 From: Emmanuel Raviart Date: Tue, 14 Mar 2017 14:04:16 +0100 Subject: [PATCH] Remove trailing spaces. --- dlstats/fetchers/world_bank.py | 370 ++++++++++++++++----------------- 1 file changed, 185 insertions(+), 185 deletions(-) diff --git a/dlstats/fetchers/world_bank.py b/dlstats/fetchers/world_bank.py index eea8162..176d02d 100644 --- a/dlstats/fetchers/world_bank.py +++ b/dlstats/fetchers/world_bank.py @@ -65,10 +65,10 @@ def f(*args,**kwargs): return try_it class WorldBankAPI(Fetcher): - + def __init__(self, **kwargs): super().__init__(provider_name='WORLDBANK', version=VERSION, **kwargs) - + self.provider = Providers(name=self.provider_name, long_name='World Bank', version=VERSION, @@ -78,12 +78,12 @@ def __init__(self, **kwargs): fetcher=self) self.api_url = 'http://api.worldbank.org/v2/' - + self.requests_client = requests.Session() - + self.blacklist = [ '13', # Enterprise Surveys - '26', # Corporate scorecard # datacatalog id="89" + '26', # Corporate scorecard # datacatalog id="89" '29', # Global Social Protection '31', # Country Policy and Institutional Assessment (CPIA) '36', # Statistical Capacity Indicators # datacatalog id="8" @@ -92,34 +92,34 @@ def __init__(self, **kwargs): '44', # Readiness for Investment in Sustainable Energy (RISE) '45', # INDO-DAPOER ] - + """ A Exclure: economycoverage: WLD, EAP, ECA, LAC, MNA, SAS, SSA, HIC, LMY, IBRD, IDA numberofeconomies: 214 - topics: + topics: mobileapp: ??? > Les données agrégés par régions sont aussi dans les countries mais avec un id="NA" dans region Aggregates """ - + self._available_countries = None self._available_countries_by_name = None @retry(tries=5, sleep_time=2) def download_or_raise(self, url, params={}): - + if not os.path.exists(self.store_path): os.makedirs(self.store_path, exist_ok=True) - + filename = hashlib.sha224(url.encode("utf-8")).hexdigest() filepath = os.path.abspath(os.path.join(self.store_path, filename)) if os.path.exists(filepath): os.remove(filepath) - + response = self.requests_client.get(url, params=params, stream=True) #response = requests.get(url, params=params) - + logger.info("download url[%s] - filepath[%s]" % (response.url, filepath)) response.raise_for_status() @@ -127,20 +127,20 @@ def download_or_raise(self, url, params={}): with open(filepath, mode='wb') as f: for chunk in response.iter_content(): f.write(chunk) - + self.for_delete.append(filepath) - + with open(filepath) as f: #, mode='rb' return json.load(f) - + def download_json(self, url, parameters={}): #TODO: settings per_page = 1000 payload = {'format': 'json', 'per_page': per_page} payload.update(parameters) - + response_json = self.download_or_raise(self.api_url + url, params=payload) - + first_page = response_json#.json() if isinstance(first_page, list): number_of_pages = int(first_page[0]['pages']) @@ -157,46 +157,46 @@ def download_json(self, url, parameters={}): def available_countries_by_name(self): if self._available_countries_by_name: return self._available_countries_by_name - + self._available_countries_by_name = OrderedDict() for page in self.download_json('countries'): for source in page[1]: self._available_countries_by_name[source['name']] = source - + return self._available_countries_by_name #@property def available_countries(self): if self._available_countries: return self._available_countries - + self._available_countries = {} - + """ { "id": "ABW", "iso2Code": "AW", "name": "Aruba", - "region": + "region": { "id": "LCN", "iso2code": "ZJ", "value": "Latin America & Caribbean (all income levels)" }, - "adminregion": + "adminregion": { "id": "", "iso2code": "", "value": "" }, - "incomeLevel": + "incomeLevel": { "id": "NOC", "iso2code": "XR", "value": "High income: nonOECD" }, - "lendingType": + "lendingType": { "id": "LNX", "iso2code": "XX", @@ -205,13 +205,13 @@ def available_countries(self): "capitalCity": "Oranjestad", "longitude": "-70.0167", "latitude": "12.5167" - }, + }, """ - + for page in self.download_json('countries'): for source in page[1]: self._available_countries[source['id']] = source - + return self._available_countries @@ -225,11 +225,11 @@ def build_data_tree(self): http://api.worldbank.org/v2/datacatalog/metatypes/type;url;lastrevisiondate?format=json&per_page=50 > Voir si numberofeconomies = nombre de series ? - + > calendar: updatefrequency, updateschedule - + > use: detailpageurl pour doc_href - + datacatalog": [ { id": "3", @@ -248,29 +248,29 @@ def build_data_tree(self): }, { "id": "url", - "value": "http://databank.worldbank.org/data/views/variableselection/selectvariables.aspx?source=global-economic-monitor-(gem)" + "value": "http://databank.worldbank.org/data/views/variableselection/selectvariables.aspx?source=global-economic-monitor-(gem)" }, { "id": "apisourceid", !!! lien avec id source ! "value": "15" - } - - }, - ] + } + + }, + ] """ categories = [] - + position = 0 - + for page in self.download_json('sources'): for source in page[1]: - + if source["id"] in self.blacklist: continue - + position += 1 - + cat = { "provider_name": self.provider_name, "category_code": source["code"], @@ -278,16 +278,16 @@ def build_data_tree(self): #TODO: "doc_href": ?, "position": position, "datasets": [{ - "name": source["name"], + "name": source["name"], "dataset_code": source["code"], - "last_update": None, + "last_update": None, "metadata": {"id": source["id"]} }] } categories.append(cat) return categories - + """ http://api.worldbank.org/v2/datacatalog?format=json&per_page=20 @@ -308,11 +308,11 @@ def build_data_tree(self): QEDS/SDDS | Quarterly External Debt Statistics SDDS (New) | 2016-01-28 SE4ALL | Sustainable Energy for All | 2015-09-09 WDI | World Development Indicators | 2016-02-17 - WGI | Worldwide Governance Indicators | 2015-09-25 + WGI | Worldwide Governance Indicators | 2015-09-25 """ - + for page in self.download_json('datacatalog'): - + for source in page["datacatalog"]: name = None is_time_series = False @@ -338,20 +338,20 @@ def build_data_tree(self): if value["value"].lower() == "current": last_update = clean_datetime() else: - try: + try: last_update = clean_datetime(datetime.strptime(value["value"], '%d-%b-%Y')) #17-Feb-2016 - except: + except: pass elif value["id"] == "updatefrequency": - metadata["updatefrequency"] = value["value"] + metadata["updatefrequency"] = value["value"] elif value["id"] == "updateschedule": - metadata["updateschedule"] = value["value"] - + metadata["updateschedule"] = value["value"] + if not dataset_id or is_time_series is False or not dataset_code or dataset_id in self.blacklist: continue - + position += 1 - + cat = { "provider_name": self.provider_name, "category_code": dataset_code, @@ -360,68 +360,68 @@ def build_data_tree(self): "position": position, "datasets": [{ "dataset_code": dataset_code, - "name": name, - "last_update": last_update or clean_datetime(), + "name": name, + "last_update": last_update or clean_datetime(), "metadata": metadata }] } categories.append(cat) - + return categories def upsert_dataset(self, dataset_code): - + self.get_selected_datasets() - + dataset_settings = self.selected_datasets[dataset_code] dataset = Datasets(provider_name=self.provider_name, dataset_code=dataset_code, name=dataset_settings["name"], fetcher=self) - + if dataset_code in DATASETS: dataset.series.data_iterator = ExcelData(dataset, DATASETS[dataset_code]["url"]) dataset.doc_href = DATASETS[dataset_code]["doc_href"] else: dataset.last_update = clean_datetime() dataset.series.data_iterator = WorldBankAPIData(dataset, dataset_settings) - + return dataset.update_database() - + class WorldBankAPIData(SeriesIterator): def __init__(self, dataset, settings): super().__init__(dataset) - + self.wb_id = settings["metadata"]["id"] - + if self.dataset_code in ONLY_WORLD_COUNTRY: self.available_countries = {"WLD": self.fetcher.available_countries().get("WLD")} else: self.available_countries = self.fetcher.available_countries() - + self.dataset.dimension_keys = ["indicator", "country", "frequency"] self.dataset.concepts["country"] = "Country" self.dataset.concepts["indicator"] = "Indicator" self.dataset.concepts["frequency"] = "Frequency" - + self.dataset.codelists["country"] = dict([(k, c["name"]) for k, c in self.available_countries.items()]) self.dataset.codelists["indicator"] = {} self.dataset.codelists["frequency"] = {} - + self.dataset.set_dimension_country("country") - + self.obs_status = {"E": "estimate", "F": "forecast"} - + """ Chaque entrée de series_listed: { "id": "CPTOTNSXN", "name": "CPI Price, nominal", "unit": "", - "source": + "source": { "id": "15", "value": "Global Economic Monitor" @@ -429,19 +429,19 @@ def __init__(self, dataset, settings): "sourceNote": "The consumer price index reflects the change in prices for the average consumer of a constant basket of consumer goods. Data is not seasonally adjusted.", "sourceOrganization": "World Bank staff calculations based on Datastream data.", "topics": [ ] - }, + }, """ self.indicators = self._download_indicators(self.wb_id) - + if not self.dataset.metadata: self.dataset.metadata = {} - + if not "indicators" in self.dataset.metadata: self.dataset.metadata["indicators"] = {} self.countries_to_process = list(self.available_countries.keys()) #self.countries_str = ";".join(list(self.available_countries.keys())) - + self.blacklist_indicator = [ "IC.DCP.COST", "IC.DCP.PROC", @@ -479,13 +479,13 @@ def __init__(self, dataset, settings): "IC.REG.COST", "IC.RP.COST", "IC.RP.PROC", - "IC.RP.TIME", + "IC.RP.TIME", ] - + self.release_date = None self.current_indicator = None self.current_country = None - + self.rows = self._process() def _download_indicators(self, dataset_code): @@ -494,13 +494,13 @@ def _download_indicators(self, dataset_code): Indicator du dataset 15 ? http://api.worldbank.org/v2/indicators/CPTOTNSXN?format=json - + http://api.worldbank.org/v2/sources/15/indicators?format=json { "id": "CPTOTNSXN", "name": "CPI Price, nominal", "unit": "", - "source": + "source": { "id": "15", "value": "Global Economic Monitor" @@ -508,12 +508,12 @@ def _download_indicators(self, dataset_code): "sourceNote": "The consumer price index reflects the change in prices for the average consumer of a constant basket of consumer goods. Data is not seasonally adjusted.", "sourceOrganization": "World Bank staff calculations based on Datastream data.", "topics": [ ] - }, - + }, + Values de l'indicator ci-dessus: http://api.worldbank.org/v2/countries/br/indicators/CPTOTNSXN?format=json - - """ + + """ output = [] for page in self.fetcher.download_json('/'.join(['sources', dataset_code, @@ -528,18 +528,18 @@ def _download_values(self, country_code, indicator_code): # définition d'un indicator : http://api.worldbank.org/v2/indicators?format=json 15608 indicators - 313 pages - + > all: tous pays http://api.worldbank.org/v2/countries/all/indicators/IQ.CPA.ECON.XQ?format=json - + > values d'un indicator pour le BR (brésil) http://api.worldbank.org/countries/br/indicators/NY.GDP.MKTP.CD?format=json - + > Le champs lastupdated est dans les meta seulement en V2 http://api.worldbank.org/v2/countries/br/indicators/NY.GDP.MKTP.CD?format=json - - http://api.worldbank.org/v2/countries/FRA/indicators/NY.GDP.PCAP.CD?format=json - + + http://api.worldbank.org/v2/countries/FRA/indicators/NY.GDP.PCAP.CD?format=json + sample values in series: [ { @@ -557,8 +557,8 @@ def _download_values(self, country_code, indicator_code): "unit": "", "obs_status": "", "decimal": ​1 - }, - ] + }, + ] """ """ Pas de données: http://api.worldbank.org/v2/countries/all/indicators/DPANUSIFS?format=json&per_page=100 @@ -570,10 +570,10 @@ def _download_values(self, country_code, indicator_code): "lastupdated": null, "total": ​0 }, - null - ] + null + ] """ - + datas = [] release_date = None @@ -582,75 +582,75 @@ def _download_values(self, country_code, indicator_code): country_code, 'indicators', indicator_code])): - + if not release_date: release_date = page[0]['lastupdated'] - + datas.extend(page[1]) - + except Exception as err: logger.critical("dataset[%s] - country[%s] - indicator[%s] - error[%s]" % (self.dataset_code, country_code, indicator_code, str(err))) - + return release_date, datas def _process(self): - + for current_indicator in self.indicators: self.current_indicator = current_indicator - + #if not self.current_indicator["id"] == "CC.EST": # continue - + count = 0 - + if self.current_indicator["id"] in self.blacklist_indicator: continue - + is_release_controled = False is_rejected = False - + slug_indicator = slugify(self.current_indicator["id"], save_order=True) - + for current_country in self.countries_to_process: self.current_country = current_country - - logger.info("Fetching dataset[%s] - indicator[%s] - country[%s]" % (self.dataset_code, - self.current_indicator["id"], + + logger.info("Fetching dataset[%s] - indicator[%s] - country[%s]" % (self.dataset_code, + self.current_indicator["id"], self.current_country)) - + release_date, datas = self._download_values(self.current_country, self.current_indicator["id"]) - + if not datas: continue - + self.release_date = clean_datetime(datetime.strptime(release_date, '%Y-%m-%d')) if is_release_controled is False: - + is_release_controled = True - + if self.dataset.metadata["indicators"].get(slug_indicator): - + if self.release_date >= self.dataset.metadata["indicators"][slug_indicator]: msg = "Reject series updated for provider[%s] - dataset[%s] - key[%s]" - logger.info(msg % (self.provider_name, - self.dataset_code, + logger.info(msg % (self.provider_name, + self.dataset_code, self.current_indicator["id"])) - + is_rejected = True break self.dataset.metadata["indicators"][slug_indicator] = self.release_date self.dataset.last_update = clean_datetime() - + count += 1 - + yield {"datas": datas}, None - + if not is_rejected: logger.info("TOTAL - dataset[%s] - indicator[%s] - count[%s]" % (self.dataset_code, self.current_indicator["id"], @@ -672,32 +672,32 @@ def _search_frequency(self, data): return 'D' def build_series(self, datas): - + datas = datas["datas"] - + series = {} series["last_update"] = self.release_date series['frequency'] = self._search_frequency(datas[0]) - - series['key'] = "%s.%s.%s" % (self.current_indicator["id"], - self.current_country, + + series['key'] = "%s.%s.%s" % (self.current_indicator["id"], + self.current_country, series['frequency']) - - series['name'] = "%s - %s - %s" % (self.current_indicator["name"], - self.available_countries[self.current_country]["name"], + + series['name'] = "%s - %s - %s" % (self.current_indicator["name"], + self.available_countries[self.current_country]["name"], constants.FREQUENCIES_DICT[series["frequency"]]) #if self.current_indicator.get("sourceNote"): # series["notes"] = self.current_indicator.get("sourceNote") - + values = [] value_found = False for point in datas: - + frequency = self._search_frequency(point) if frequency != series['frequency']: raise Exception("Diff frequency [%s] != [%s] - series[%s]" % (frequency, series['frequency'], series['key'])) - + value = { 'attributes': None, 'value': str(point["value"]).replace("None", ""), @@ -706,7 +706,7 @@ def build_series(self, datas): } if not value_found and value["value"] != "": value_found = True - + if "obs_status" in point: obs_status = point.get("obs_status") if obs_status and len(obs_status) > 0: @@ -717,23 +717,23 @@ def build_series(self, datas): self.dataset.concepts["obs_status"] = "Observation Status" if not "obs_status" in self.dataset.attribute_keys: self.dataset.attribute_keys.append("obs_status") - + values.append(value) if not value_found: - msg = {"provider_name": self.provider_name, - "dataset_code": self.dataset_code} - raise errors.RejectEmptySeries(**msg) + msg = {"provider_name": self.provider_name, + "dataset_code": self.dataset_code} + raise errors.RejectEmptySeries(**msg) keyfunc = lambda x: x["ordinal"] series['values'] = sorted(values, key=keyfunc) series['provider_name'] = self.provider_name series['dataset_code'] = self.dataset_code - + series['start_date'] = series['values'][0]["ordinal"] series['end_date'] = series['values'][-1]["ordinal"] - + #PATCH for v in series['values']: v.pop("ordinal") @@ -745,21 +745,21 @@ def build_series(self, datas): } if not self.current_indicator["id"] in self.dataset.codelists['indicator']: self.dataset.codelists['indicator'][self.current_indicator["id"]] = self.current_indicator["name"] - + if not series["frequency"] in self.dataset.codelists['frequency']: self.dataset.codelists['frequency'][series["frequency"]] = constants.FREQUENCIES_DICT[series["frequency"]] - + series['attributes'] = None - + self.dataset.add_frequency(series["frequency"]) return series class ExcelData(SeriesIterator): - + def __init__(self, dataset, url, is_autoload=True): SeriesIterator.__init__(self, dataset) - + self.release_date = None self.url = url self.available_countries = self.fetcher.available_countries_by_name() @@ -771,15 +771,15 @@ def __init__(self, dataset, url, is_autoload=True): if not "country" in self.dataset.concepts: self.dataset.concepts["country"] = "Country" - + if not "country" in self.dataset.codelists: self.dataset.codelists["country"] = {} - + self.dataset.set_dimension_country("country") - + if is_autoload: self._load_file() - + self.rows = self._get_datas() def _get_manual_countries(self): @@ -810,24 +810,24 @@ def _get_manual_countries(self): 'South Asia developing': 'SAAS', 'Sub-Saharan Africa developing': 'SSA', 'Middle Income Countries': 'MIC', - 'Developing Asia': 'WIDUKIND-ASIA-DEV', + 'Developing Asia': 'WIDUKIND-ASIA-DEV', } def _load_file(self): filename = "data-%s.zip" % (self.dataset_code) - download = Downloader(url=self.url, + download = Downloader(url=self.url, filename=filename, store_filepath=self.get_store_path(), use_existing_file=self.fetcher.use_existing_file,) self.filepath, response = download.get_filepath_and_response() - + if self.filepath: self.fetcher.for_delete.append(self.filepath) release_date_str = response.headers['Last-Modified'] - #Last-Modified: Tue, 05 Apr 2016 15:05:11 GMT - self.release_date = clean_datetime(datetime.strptime(release_date_str, + #Last-Modified: Tue, 05 Apr 2016 15:05:11 GMT + self.release_date = clean_datetime(datetime.strptime(release_date_str, "%a, %d %b %Y %H:%M:%S GMT")) if self.dataset.last_update and self.dataset.last_update >= self.release_date: @@ -835,7 +835,7 @@ def _load_file(self): raise errors.RejectUpdatedDataset(provider_name=self.provider_name, dataset_code=self.dataset_code, comments=comments) - + self.dataset.last_update = self.release_date def _get_datas(self): @@ -844,7 +844,7 @@ def _get_datas(self): for fname in _zipfile.namelist(): info = _zipfile.getinfo(fname) - + #bypass directory if info.file_size == 0 or info.filename.endswith('/'): continue @@ -852,57 +852,57 @@ def _get_datas(self): if 'Commodity Prices' in fname: logger.warning("bypass %s" % fname) continue - + #if not self.release_date: # last_update = clean_datetime(datetime(*self.zipfile.getinfo(fname).date_time[0:6])) - + series_name = fname[:-5] logger.info("open excel file[%s] - series.name[%s]" % (fname, series_name)) - + excel_book = xlrd.open_workbook(file_contents = _zipfile.read(fname)) - + for sheet in excel_book.sheets(): if sheet.name in ['Sheet1','Sheet2','Sheet3','Sheet4', 'Feuille1','Feuille2','Feuille3','Feuille4']: continue - + periods = sheet.col_slice(0, start_rowx=2) start_period = periods[0].value end_period = periods[-1].value - + frequency = None start_date = None end_date = None - - if sheet.name == 'annual': + + if sheet.name == 'annual': frequency = 'A' start_date = get_ordinal_from_period(str(int(start_period)), freq='A') end_date = get_ordinal_from_period(str(int(end_period)), freq='A') periods = [str(int(p.value)) for p in periods] - elif sheet.name == 'quarterly': + elif sheet.name == 'quarterly': frequency = 'Q' start_date = get_ordinal_from_period(start_period,freq='Q') end_date = get_ordinal_from_period(end_period,freq='Q') periods = [p.value for p in periods] - elif sheet.name == 'monthly': + elif sheet.name == 'monthly': frequency = 'M' start_date = get_ordinal_from_period(start_period.replace('M','-'),freq='M') end_date = get_ordinal_from_period(end_period.replace('M','-'),freq='M') periods = [p.value.replace('M','-') for p in periods] - #elif sheet.name == 'daily': + #elif sheet.name == 'daily': # frequency = 'D' # start_date = self._translate_daily_dates(start_period) # end_date = self._translate_daily_dates(end_period) # TODO: periods = [p.value for p in periods] else: - msg = {"provider_name": self.provider_name, + msg = {"provider_name": self.provider_name, "dataset_code": self.dataset_code, "frequency": sheet.name} raise errors.RejectFrequency(**msg) - + self.dataset.add_frequency(frequency) - + columns = iter(range(1, sheet.row_len(0))) - + for column in columns: settings = { "column": column, @@ -916,14 +916,14 @@ def _get_datas(self): } } yield settings, None - + def _translate_daily_dates(self,value): date = xlrd.xldate_as_tuple(value, self.excel_book.datemode) return pandas.Period(year=date[0], month=date[1], day=date[2], freq=self.frequency) - + def _get_country(self, col_header): - + country = None country_item = None if col_header in self.available_countries: @@ -931,7 +931,7 @@ def _get_country(self, col_header): country_item = col_header else: for k, v in self.manual_countries.items(): - if k.lower() == col_header.lower(): + if k.lower() == col_header.lower(): country = v country_item = k @@ -941,7 +941,7 @@ def _get_country(self, col_header): if country and not country in self.dataset.codelists["country"]: self.dataset.codelists["country"][country] = country_item - + return country def build_series(self, settings): @@ -950,26 +950,26 @@ def build_series(self, settings): periods = settings["periods"] series_name = settings["series_name"] bson = settings["bson"] - + dimensions = {} - + col_header = sheet.cell_value(0, column) dimensions['country'] = self._get_country(col_header) - + values = [] _values = [str(v) for v in sheet.col_values(column, start_rowx=2)] - - for i, v in enumerate(_values): + + for i, v in enumerate(_values): value = { 'attributes': None, 'period': str(periods[i]), - 'value': str(v).replace(",", ".") + 'value': str(v).replace(",", ".") } values.append(value) - - bson['values'] = values + + bson['values'] = values bson['name'] = series_name + ' - ' + col_header + ' - ' + constants.FREQUENCIES_DICT[bson['frequency']] - + series_key = slugify(bson['name'], save_order=True) bson['provider_name'] = self.provider_name @@ -978,6 +978,6 @@ def build_series(self, settings): bson['attributes'] = None bson['dimensions'] = dimensions bson['last_update'] = self.dataset.last_update - + return bson - +