# Simon UN Data
This notebook attempts to:
- Read in raw data (currently split into multiple files)
- Assemble the raw data into a single .csv file
- Save the combined file to a local directory
- Assess the data quality of the combined .csv

In [1]:
%load_ext lab_black
%load_ext autoreload
%load_ext watermark

In [2]:
%autoreload 2

In [3]:
%watermark -ntz -p pandas -a Simon-Lee-UK -u -d -t -z

Author: Simon-Lee-UK

Last updated: 2021-02-18 22:01:57GMT

pandas: 1.2.1



In [4]:
import sys
from pathlib import Path
import pandas as pd
from pandas_profiling import ProfileReport
from pyprojroot import here

split_data_path = (
    here() / "raw_data"
)  # here() returns the root of the repository as a pathlib object
interim_data_path = here() / "data" / "interim"

## Read in split data
To start, read in the first file:

In [5]:
file_prefix = "all_energy_statistics"
example_path = split_data_path / (file_prefix + "1.csv")
df_first = pd.read_csv(example_path, header=0, nrows=10)

In [6]:
df_first.head(5)

Unnamed: 0,country_or_area,commodity_transaction,year,unit,quantity,quantity_footnotes,category
0,Austria,Additives and Oxygenates - Exports,1996,"Metric tons, thousand",5,,additives_and_oxygenates
1,Austria,Additives and Oxygenates - Exports,1995,"Metric tons, thousand",17,,additives_and_oxygenates
2,Belgium,Additives and Oxygenates - Exports,2014,"Metric tons, thousand",0,,additives_and_oxygenates
3,Belgium,Additives and Oxygenates - Exports,2013,"Metric tons, thousand",0,,additives_and_oxygenates
4,Belgium,Additives and Oxygenates - Exports,2012,"Metric tons, thousand",35,,additives_and_oxygenates


Read in the second file:

In [7]:
second_path = split_data_path / (file_prefix + "2.csv")
df_second = pd.read_csv(second_path, header=None, nrows=10)

In [8]:
df_second.head(5)

Unnamed: 0,0,1,2,3,4,5,6
0,India,Brown coal briquettes - final consumption,1994,"Metric tons, thousand",1580,,brown_coal_briquettes
1,India,Brown coal briquettes - final consumption,1993,"Metric tons, thousand",1494,,brown_coal_briquettes
2,India,Brown coal briquettes - final consumption,1992,"Metric tons, thousand",1139,,brown_coal_briquettes
3,India,Brown coal briquettes - final consumption,1991,"Metric tons, thousand",1777,,brown_coal_briquettes
4,India,Brown coal briquettes - final consumption,1990,"Metric tons, thousand",1678,,brown_coal_briquettes


In [9]:
def load_un_data(
    raw_data_dir,
    interim_data_dir,
    un_data_filename="un_energy_data.csv",
    file_prefix="all_energy_statistics",
    first_suffix=1,
    force_refresh=False,
):
    """
    Desc.

    """
    combined_file_path = Path(interim_data_dir) / un_data_filename

    if not combined_file_path.is_file() or force_refresh:
        print(
            f"No existing combined data found at: {combined_file_path}\n"
            f"Attempting to combine split data from: {raw_data_dir}...\n"
        )
        combined_df = combine_split_data(
            raw_data_dir, file_prefix=file_prefix, first_suffix=first_suffix
        )
        # execute code to pre-process data and save to the interim directory
        print(f"Saving combined data set to: {combined_file_path}\n")
        combined_df.to_csv(combined_file_path, index=False)
    else:
        combined_df = pd.read_csv(combined_file_path)

    return combined_df


def combine_split_data(raw_data_dir, file_prefix, first_suffix):
    """
    Combines .csv files in a given directory that match an input prefix-suffix convention; returns as a DataFrame.

    Parameters
    ----------
    raw_data_dir : str / pathlib.PosixPath
        The path to the directory containing all .csv files to be combined.
    file_prefix : str
        The consistent filename prefix used to identify those .csv files to be combined;
        default value = 'all_energy_statistics'.
    first_suffix : int / str
        The numeric suffix found in the first .csv's filename (function assumes numerically ordered filenames);
        default value = 1. The function assumes that only this file contains column titles and that column order
        in all subsequent files is identical to that of the first file.

    Returns
    -------
    combined_df : pandas.DataFrame
        A pandas DataFrame containing data from all .csv files matching the convention defined via prefix and
        suffix arguments.

    Raises
    ------
    ValueError
        Raised when the initial .csv file cannot be found based on input 'raw_data_dir', 'file_prefix' and 'first_suffix'.

    """
    indv_files = []
    live_suffix = int(first_suffix)
    while True:
        try:
            live_filename = f"{file_prefix}{live_suffix}.csv"
            live_path = raw_data_dir / live_filename
            if len(indv_files) == 0:
                first_file = pd.read_csv(live_path, header=0)
                column_titles = first_file.columns.to_list()
                indv_files.append(first_file)
            else:
                subsequent_file = pd.read_csv(live_path, header=0, names=column_titles)
                indv_files.append(subsequent_file)
            live_suffix += 1
        except FileNotFoundError as err:
            if len(indv_files) == 0:
                raise ValueError(
                    f"No initial .csv file found at: {live_path}\n"
                    f"Please check the 'raw_data_dir', 'file_prefix' and 'first_suffix' arguments."
                ) from err
            else:
                print(f"Successfully combined {len(indv_files)} files.")
                break

    combined_df = pd.concat(indv_files, axis="index", ignore_index=True)

    return combined_df

In [10]:
combined_file = combine_split_data(split_data_path, file_prefix, 1)

Successfully combined 12 files.


In [11]:
combined_file.dtypes

country_or_area           object
commodity_transaction     object
year                       int64
unit                      object
quantity                 float64
quantity_footnotes       float64
category                  object
dtype: object

In [12]:
len(combined_file)

1189471

In [13]:
report = ProfileReport(combined_file, title="Pandas Profiling Overview")

In [14]:
# report