# In [None]:
# -*- coding: utf-8 -*-

# imports
import io
import os
import re

import pandas as pd
import requests
import yaml  # provided by the PyYAML package
from bs4 import BeautifulSoup as bs

"""
sdg-csv-data-filler is the first module in a data pipeline to take
data from the SDG data repo and make it exportable as CSVW.
"""

# Remote data folder plus the local working directories and mapping files.
# Local paths are resolved against the directory the script is run from.
remote_data_url = "https://github.com/ONSdigital/sdg-data/tree/develop/data"
cwd = os.getcwd()
data_path, out_path = (os.path.join(cwd, folder) for folder in ('data', 'out'))
gap_filler_yam_path = os.path.join(data_path, "gap_filler.yaml")
header_mapping_yam_path = os.path.join(data_path, "header_mapping.yaml")

def get_mapping_dicts(gap_filler_yaml, header_mapping_yaml):
    """
    Reads the two locally stored .yaml files that drive the pipeline and
    returns their parsed contents.

        Parameters:
            gap_filler_yaml (string): Path to the yaml file holding, per
                column, the value used to fill gaps
            header_mapping_yaml (string): Path to the yaml file holding, per
                column, the header name to change to
        Returns:
            dict: gap filler dict
            dict: header mapping dict
    """
    parsed = []
    # Same order as the arguments: gap filler first, header mapping second.
    for yaml_path in (gap_filler_yaml, header_mapping_yaml):
        with open(yaml_path) as handle:
            parsed.append(yaml.safe_load(handle))
    gap_filler_dict, header_mapping_dict = parsed
    return gap_filler_dict, header_mapping_dict

def find_csv_urls(url):
    """
    Provided with a data folder URL, this function downloads the folder
    page and finds the URLs of the indicator CSV files linked from it.
    Each link is converted to its raw.githubusercontent.com equivalent.

    This is a generator: the page is only requested when the first item
    is consumed, so network errors surface at iteration time.

        Parameters:
            url (string): the URL of the repo/folder which contains
                the CSV files to be captured
        Yields:
            string: the raw-content URL of the next CSV file in the
            remote data folder (each file is yielded once)
        Raises:
            requests.HTTPError: if the folder page cannot be fetched
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Name the parser explicitly so results do not depend on which
    # optional parsers happen to be installed.
    soup = bs(response.text, "html.parser")

    # The goal number may have two digits (there are 17 goals).
    # NOTE(review): lettered targets such as indicator_1-a-1.csv are not
    # matched by this pattern - confirm whether they should be included.
    csv_link_pattern = re.compile(
        r"/ONSdigital/sdg-data/blob/develop/data/"
        r"indicator_\d{1,2}-\d{1,2}-\d{1,2}\.csv")
    to_repl_pattern = "/sdg-data/blob/develop"
    replacement_pattern = "/sdg-data/develop"

    seen = set()
    for link in soup.find_all('a', attrs={'href': csv_link_pattern}):
        href = link.get('href')
        # A page can link to the same file more than once.
        if href in seen:
            continue
        seen.add(href)
        raw_path = href.replace(to_repl_pattern, replacement_pattern)
        yield "https://raw.githubusercontent.com" + raw_path

def extract_name(url):
    """
    Extracts the name of the dataset from the url, for use later when
        saving the modified dataset locally. The name is the file name
        of the CSV without its ".csv" extension.

        Parameters:
            url (string): the URL of the remotely hosted csv file
            being captured
        Returns:
            string: the name of the dataset,
            e.g. "indicator_1-2-1" for ".../data/indicator_1-2-1.csv"
        Raises:
            ValueError: if the url does not end in a "<name>.csv" segment
    """
    # Capture the last path segment, minus the .csv extension.
    patt_catch = r".*/([^/]+)\.csv$"
    capture_all = re.match(patt_catch, url)
    if capture_all is None:
        raise ValueError(
            "Cannot extract a dataset name from URL: {!r}".format(url))
    return capture_all.group(1)
                       
def csvs_to_pandas(url):
    """
    Provided with a URL of a file, the function will check if the CSV
    is populated and, if it is not empty, return a Pandas dataframe of
    the CSV. The file is downloaded once and parsed from memory.

        Parameters:
            url (string): the URL of a CSV file to be captured
        Returns:
            pd.DataFrame: a Pandas dataframe of the CSV, or None when the
            file is empty or only holds the "no data" placeholder text
        Raises:
            requests.HTTPError: if the file cannot be fetched
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    csv_text = response.text

    if not csv_text.strip():
        return None
    if "no data for this indicator yet" in csv_text:
        return None

    try:
        return pd.read_csv(io.StringIO(csv_text))
    except pd.errors.EmptyDataError:
        # Nothing parseable in the file: treat it like an empty file.
        return None
                       
def fill_gaps(pd_df, gap_filler_dict):
    """
    Given a Pandas dataframe and a dictionary containing the column names
    and the correct 'fillers' for each column, this function will fill
    each listed column with its filler value wherever empty cells are found.
    Columns not named in the dictionary are left untouched.

        Parameters:
            pd_df (pd.Dataframe): the variable to which the dataframe
                containing the csv data is assigned
            gap_filler_dict (dict): a dictionary with column name and value
                to fill gaps as key value pairs, e.g.
                {"Age":"All","Sex":"T"}. None or an empty dict means
                there is nothing to fill.
        Returns:
            pd.Dataframe: A new dataframe with the gaps in the listed
            columns filled; the input dataframe is not modified
    """
    if not gap_filler_dict:
        return pd_df.copy()
    # A dict is applied column by column; pandas rejects axis=1 together
    # with a dict value, so no axis is passed.
    return pd_df.fillna(value=gap_filler_dict)

def standardise_cell_values(pd_df, dict_of_nonstandard_standard):
    """
    Maps non-standard values e.g. "Males" to standard values like "M".
    Mapping is carried out on a column-specific basis.

        Parameters:
            pd_df (pd.Dataframe): the dataframe holding the csv data
            dict_of_nonstandard_standard (dict): nested dictionary of the
                form {column: {non_standard_value: standard_value}}, e.g.
                {"Sex": {"Males": "M", "Females": "F"}}. None or an empty
                dict means there is nothing to map.
        Returns:
            pd.Dataframe: A new dataframe with the values replaced; the
            input dataframe is not modified
    """
    if not dict_of_nonstandard_standard:
        return pd_df.copy()
    # No `value` argument: the replacements come from the dict itself.
    # Passing value=None explicitly is read by recent pandas versions as
    # "replace with None", which fails for a nested mapping.
    return pd_df.replace(to_replace=dict_of_nonstandard_standard)
                       
def standardise_headers(pd_df, dict_of_nonstandard_standard=None):
    """
    Changes the non-standard CSV column headers to harmonised-data
    column headers.
    e.g. 'Age Group' to 'Age'

    Each header is first tidied (surrounding whitespace stripped, title
    case applied) and then looked up in the mapping. The mapping is
    checked against the header as it appears in the CSV first, then
    against the tidied header.

        Parameters:
            pd_df (pd.Dataframe): the dataframe holding the csv data
            dict_of_nonstandard_standard (dict, optional): mapping of
                non-standard header to standard header, e.g.
                {"Age Group": "Age"}. When omitted or empty, headers are
                only tidied.
        Returns:
            pd.Dataframe: A new dataframe with the renamed columns
    """
    # Do whitespaces need to be replaced with underscores?
    header_map = dict_of_nonstandard_standard or {}
    correction_dict = {}
    for col_nm in pd_df.columns:
        # Only string headers can be stripped and title-cased.
        tidy_nm = col_nm.strip().title() if isinstance(col_nm, str) else col_nm
        if col_nm in header_map:
            correction_dict[col_nm] = header_map[col_nm]
        else:
            correction_dict[col_nm] = header_map.get(tidy_nm, tidy_nm)
    return pd_df.rename(columns=correction_dict)
                       
def write_csv(pd_df, out_path, data_name=None):
    """
    Converts a Pandas dataframe to CSV and writes it out to a local folder.
    Any missing folders on the way to the file are created.

        Parameters:
            pd_df (pd.Dataframe): The pandas data frame of the data
            out_path (string): the path of the local "out" folder when
                data_name is given; otherwise the full path of the CSV
                file to write
            data_name (string, optional): the name of the dataset. When
                given, the file is written to <out_path>/<data_name>.csv
        Returns:
            bool: True if the file was written, False if writing failed
            (the error is printed)
    """
    # code adapted from https://github.com/open-sdg/sdg-build/blob/797c7848d48de122d9eddfff6b2c8a9898f7e225/sdg/data.py#L35
    if data_name is None:
        csv_file = out_path
    else:
        csv_file = os.path.join(out_path, "{}.csv".format(data_name))

    try:
        # If the csv dir isn't there, make it. Only the parent folder is
        # created, never the target path itself.
        csv_dir = os.path.dirname(csv_file)
        if csv_dir:
            os.makedirs(csv_dir, exist_ok=True)
        pd_df.to_csv(csv_file, index=False)
    except Exception as e:
        # Deliberate best effort: report the problem and signal failure
        # so one bad dataset does not stop the rest of the run.
        print(e)
        return False

    return True

def entry_point(data_url):
    """
    Runs the pipeline: finds the CSV files in the remote data folder,
    loads each one, fills its gaps, standardises its headers and writes
    the result to the local "out" folder as <dataset name>.csv.
    Files with no data are skipped.

        Parameters:
            data_url (string): the URL of the remote folder which
                contains the CSV files
        Returns:
            None
    """
    gap_filler_dict, header_mapping_dict = (get_mapping_dicts
                                            (gap_filler_yam_path,
                                             header_mapping_yam_path))
    for _url in find_csv_urls(data_url):
        df = csvs_to_pandas(_url)
        # `not df` is ambiguous for a dataframe and raises; None is the
        # "no data" signal.
        if df is None:
            continue
        data_name = extract_name(_url)
        df = fill_gaps(df, gap_filler_dict)
        df = standardise_headers(df, header_mapping_dict)
        # One output file per dataset so later files do not overwrite
        # earlier ones.
        write_csv(df, os.path.join(out_path, data_name + ".csv"))

# Run the pipeline against the remote SDG data folder.
entry_point(remote_data_url)