In [1]:
# -*- coding: utf-8 -*-
#
# imports
import pandas as pd
import requests
import re
import os
from yaml import safe_load, dump
from bs4 import BeautifulSoup as bs
import random
from modules import prevent_bad_replacement, delete_random_values, write_csv, standardise_cell_values, fill_gaps, csvs_to_pandas, find_csv_urls, get_mapping_dicts, override_writer


"""
sdg-csv-data-filler is the first module in a data pipeline to take
data from the SDG data repo and make it exportable as CSVW.
"""

# Locations used by the rest of the notebook: the remote page handed to
# find_csv_urls, plus local folders/files resolved against the current
# working directory.
remote_data_url = "https://github.com/ONSdigital/sdg-data/tree/develop/data"
cwd = os.getcwd()
data_path, out_path = (os.path.join(cwd, folder) for folder in ('data', 'out'))
overrides_yam = os.path.join(cwd, "overrides_dict.yaml")


In [4]:
def entry_point(data_url):
    """Run the override pipeline over every CSV URL found for ``data_url``.

    For each URL yielded by ``find_csv_urls`` the dataset is loaded with
    ``csvs_to_pandas``, passed through ``override_writer`` together with the
    mapping returned by ``get_mapping_dicts``, and handed to ``write_csv``
    (targeting the module-level ``out_path``).

    Parameters
    ----------
    data_url
        Value passed to ``find_csv_urls`` to discover the CSV URLs.

    Returns
    -------
    dict
        Maps each ``indicator_<n>-<n>-<n>.csv`` file name to the value
        returned by ``write_csv``, or to ``False`` when no dataframe (or an
        empty one) was loaded. A URL that does not end in an indicator file
        name is recorded under the URL itself with ``False`` and is skipped.
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence
    # and makes Python emit a warning when the cell is compiled.
    # Compiled once, outside the loop, because it is reused for every URL.
    name_pattern = re.compile(r"(indicator_\d{1,2}-\d{1,2}-\d+\.csv)$")

    results = {}

    for _url in find_csv_urls(data_url):
        # Work out the dataset name first: it is cheap, and a URL without a
        # recognisable name cannot be written out later anyway.
        match = name_pattern.search(_url)
        if match is None:
            results[_url] = False
            continue
        file_name = match.group(0)

        # Per-dataset overrides are resolved by get_mapping_dicts from the
        # module-level overrides file path.
        overrides_dict = get_mapping_dicts(overrides_yam, _url)
        df = csvs_to_pandas(_url)

        # Sometimes no df is returned, so the dataset has to be skipped.
        if df is None or df.empty:
            results[file_name] = False
            continue

        # Apply the transformations, then write the result to csv locally.
        df = override_writer(df, overrides_dict)
        results[file_name] = write_csv(df, out_path, file_name)

    return results

In [5]:
# Run the pipeline against the remote data location and keep the per-file outcome.
results = entry_point(remote_data_url)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [6]:
# Show the mapping returned by entry_point (file name -> write result).
results

{'indicator_1-1-1.csv': True,
 'indicator_1-2-1.csv': True,
 'indicator_1-2-2.csv': True,
 'indicator_1-4-2.csv': True,
 'indicator_1-5-1.csv': True,
 'indicator_1-5-3.csv': True,
 'indicator_1-5-4.csv': True,
 'indicator_2-1-1.csv': True,
 'indicator_2-1-2.csv': True,
 'indicator_2-2-2.csv': True,
 'indicator_2-3-1.csv': True,
 'indicator_2-3-2.csv': True,
 'indicator_2-5-1.csv': True,
 'indicator_2-5-2.csv': True,
 'indicator_3-1-1.csv': True,
 'indicator_3-1-2.csv': True,
 'indicator_3-2-1.csv': True,
 'indicator_3-2-2.csv': True,
 'indicator_3-3-1.csv': True,
 'indicator_3-3-2.csv': True,
 'indicator_3-3-3.csv': True,
 'indicator_3-3-4.csv': True,
 'indicator_3-3-5.csv': True,
 'indicator_3-4-1.csv': True,
 'indicator_3-4-2.csv': True,
 'indicator_3-5-2.csv': True,
 'indica

In [13]:
# Difference between the number of URLs find_csv_urls yields and the number of entries in `results`.
print(f"number of CSVs missing from output = {sum(1 for _ in find_csv_urls(remote_data_url)) - len(results)}")

number of CSVs missing from output = 13


95