# Proof of concept 2

Demonstrating that all the functions work with the actual override dictionaries and full data sets (rather than samples). 

In [1]:
import pandas as pd
import os
import re
from yaml import safe_load, dump
from modules import prevent_bad_replacement, delete_random_values, write_csv, standardise_cell_values, fill_gaps, csvs_to_pandas, find_csv_urls, get_mapping_dicts

In [2]:
# Raw CSVs for two indicators on the develop branch of the ONSdigital/sdg-data
# repository. Each literal is split in two purely to keep the lines short.
csv_1_path = (
    "https://raw.githubusercontent.com/ONSdigital/sdg-data/"
    "develop/data/indicator_1-2-1.csv"
)
csv_4_path = (
    "https://raw.githubusercontent.com/ONSdigital/sdg-data/"
    "develop/data/indicator_8-4-1.csv"
)

In [3]:
# All local paths are resolved against the directory the notebook runs from:
# the overrides YAML file, the input data folder and the output folder.
cwd = os.getcwd()
overrides_yam, data_path, out_path = (
    os.path.join(cwd, leaf)
    for leaf in ("overrides_dict.yaml", "data", "out")
)

In [4]:
# Build the overrides dictionary from the YAML file, passing the first
# indicator CSV URL as the dataset to build it for.
overrides_dict = get_mapping_dicts(overrides_yam, csv_1_path)
# Bare expression so the notebook displays the resulting dictionary.
overrides_dict

{'Age': {'FILL_NA': 'Age_gap_filler_value',
  'OldValue1': 'NewValue1',
  'OldValue2': 'NewValue2',
  'OldValue3': 'NewValue3',
  'to': 'correct_header_for_Age'},
 'Observation status': {'FILL_NA': 'Observation_status_gap_filler_value',
  'OldValue1': 'NewValue1',
  'OldValue2': 'NewValue2',
  'OldValue3': 'NewValue3',
  'to': 'correct_header_for_Observation_status'},
 'Sex': {'FILL_NA': 'Sex_gap_filler_value',
  'OldValue1': 'NewValue1',
  'OldValue2': 'NewValue2',
  'OldValue3': 'NewValue3',
  'to': 'correct_header_for_Sex'},
 'Unit measure': {'FILL_NA': 'Unit_measure_gap_filler_value',
  'OldValue1': 'NewValue1',
  'OldValue2': 'NewValue2',
  'OldValue3': 'NewValue3',
  'to': 'correct_header_for_Unit_measure'},
 'Unit multiplier': {'FILL_NA': 'Unit_multiplier_gap_filler_value',
  'OldValue1': 'NewValue1',
  'OldValue2': 'NewValue2',
  'OldValue3': 'NewValue3',
  'to': 'correct_header_for_Unit_multiplier'},
 'Year': {'FILL_NA': 'Year_gap_filler_value',
  'OldValue1': 'NewValue1',
  '

In [5]:
def override_writer(df, overrides_dict):
    """Apply column-specific value replacements and gap filling to a data frame.

    Behaviour is controlled by three optional boolean flags stored in
    ``overrides_dict``:

    * ``fix_headers`` (default False): intended to rename headers to the
      name given in the overrides dict. Not implemented yet.
    * ``standardise_cells`` (default True): for each column, any cell whose
      value is a key in that column's overrides is replaced by the mapped
      value. Cells with no matching key are left unchanged.
    * ``fill_gaps`` (default True): missing values in each column are
      replaced with that column's ``'FILL_NA'`` value.

    The ``'value'``/``'Value'`` column is never touched, and columns with no
    entry in ``overrides_dict`` are skipped.

    Parameters:
        df (pd.DataFrame): dataframe to be processed; its columns are
            reassigned, so the frame passed in is modified.
        overrides_dict (dict): the overrides dictionary specific to the
            dataset being processed, keyed by column name, plus the
            optional flags described above.

    Returns:
        pd.DataFrame: the same data frame, complete with the requested
        value overrides.
    """
    # Entries of a column's dict that configure the override itself rather
    # than describe a cell replacement ('to' looks like the corrected header
    # name, so it must not be applied to cell values).
    meta_keys = ('FILL_NA', 'to')
    skipped_columns = ('value', 'Value')

    standardise_cells = overrides_dict.get('standardise_cells', True)
    # Named do_fill_gaps so the fill_gaps helper imported at the top of the
    # file is not shadowed inside this function.
    do_fill_gaps = overrides_dict.get('fill_gaps', True)

    if overrides_dict.get('fix_headers', False):
        # Header renaming is not used at the moment.
        pass

    for column in df.columns:
        if column in skipped_columns:
            continue  # Value is never a key in the overrides dict
        column_overrides = overrides_dict.get(column)
        if not isinstance(column_overrides, dict):
            continue  # nothing configured for this column

        if standardise_cells:
            replacements = {
                old: new
                for old, new in column_overrides.items()
                if old not in meta_keys
            }
            if replacements:
                # Series.map(dict) would turn every value without a key
                # into NaN; looking the cell up with itself as the default
                # keeps unmatched values. na_action='ignore' leaves gaps
                # alone so they can be filled below.
                df[column] = df[column].map(
                    lambda cell, lookup=replacements: lookup.get(cell, cell),
                    na_action='ignore')

        if do_fill_gaps and 'FILL_NA' in column_overrides:
            # Assign the result back instead of fillna(inplace=True) on
            # df[column]: the chained in-place form may act on a temporary
            # copy and leave df unchanged.
            df[column] = df[column].fillna(column_overrides['FILL_NA'])
    return df

# Struggling with pd.Series.map(dict): values that have no key in the dict come back as NaN instead of being left unchanged

In [None]:
def proof_of_concept_2(url_to_csv,
                     overrides=overrides_yam,
                     out_path=out_path):
    """Download one indicator CSV, apply its overrides and write it locally.

    The output file is named ``poc2_<indicator file name>``, where the
    indicator file name (e.g. ``indicator_1-2-1.csv``) is taken from the end
    of ``url_to_csv``.

    Parameters:
        url_to_csv (str): URL of the indicator CSV; must end in
            ``indicator_<n>-<n>-<n>.csv``.
        overrides (str): path to the overrides YAML file.
        out_path (str): directory the processed CSV is written to.

    Returns:
        tuple: ``(df, was_written)`` - the processed data frame and the
        value returned by ``write_csv`` (presumably a success flag).

    Raises:
        ValueError: if ``url_to_csv`` does not end in an indicator CSV file
            name, so no output file name can be derived.
    """
    # Raw string so that \d and \. reach the regex engine untouched instead
    # of being treated as (invalid) string escape sequences.
    pattern = r"(indicator_\d{1,2}-\d{1,2}-\d+\.csv)$"
    # Work out the output name first so a bad URL fails before any download.
    match = re.search(pattern, url_to_csv)
    if match is None:
        raise ValueError(
            f"Cannot derive an indicator file name from URL: {url_to_csv!r}")
    file_name = f"poc2_{match.group(0)}"

    # Create df.
    df = csvs_to_pandas(url_to_csv)
    # Get the overrides dict for this dataset.
    overrides_dict = get_mapping_dicts(overrides, url_to_csv)
    # Testing that the overrides work.
    df = override_writer(df, overrides_dict)
    # Writing the df to csv locally.
    was_written = write_csv(df, out_path, file_name)
    return df, was_written

# Run the proof of concept on the first indicator CSV, using the default
# overrides file and output folder.
poc_df, was_written = proof_of_concept_2(csv_1_path)

In [7]:
# Bare expression so the notebook displays the flag returned by the run above.
was_written


True

In [None]:
# GitHub folder URL for the indicator data; not used by the loop below.
remote_data_url = r"https://github.com/ONSdigital/sdg-data/tree/develop/data"

# Same file-name pattern as inside proof_of_concept_2. It is redefined here
# because that function's `pattern` is local and not visible in this cell.
pattern = r"(indicator_\d{1,2}-\d{1,2}-\d+\.csv)$"

# Collects, per indicator file name, the processed frame and the write flag.
results_dict = {}

for url in [csv_1_path, csv_4_path]:
    # Pass the loop variable itself; `url_to_csv` only exists as a parameter
    # name inside proof_of_concept_2.
    df, written = proof_of_concept_2(url,
                                     overrides=overrides_yam,
                                     out_path=out_path)
    file_name = re.search(pattern, url).group(0)
    results_dict[file_name] = {'df': df, 'written_out': written}