# Simon UN Data
This notebook attempts to:
- Read in raw data (currently split into multiple files)
- Assemble the raw data into a single .csv file
- Save the combined file to a local directory
- Assess the data quality of the combined .csv

In [1]:
%load_ext lab_black
%load_ext autoreload
%load_ext watermark

In [2]:
%autoreload 2

In [3]:
%watermark -ntz -p pandas -a Simon-Lee-UK -u -d -t -z

Author: Simon-Lee-UK

Last updated: 2021-02-16 22:14:34GMT

pandas: 1.2.1



In [4]:
import sys
from pathlib import Path
import pandas as pd
from pandas_profiling import ProfileReport
from pyprojroot import here

split_data_path = (
    here() / "raw_data"
)  # here() returns the root of the repository as a pathlib object
interim_data_path = here() / "data" / "interim"

## Read in split data

In [5]:
file_prefix = "all_energy_statistics"
example_path = split_data_path / (file_prefix + "1.csv")
df_first = pd.read_csv(example_path, header=0, nrows=10)

In [6]:
df_first.head(5)

Unnamed: 0,country_or_area,commodity_transaction,year,unit,quantity,quantity_footnotes,category
0,Austria,Additives and Oxygenates - Exports,1996,"Metric tons, thousand",5,,additives_and_oxygenates
1,Austria,Additives and Oxygenates - Exports,1995,"Metric tons, thousand",17,,additives_and_oxygenates
2,Belgium,Additives and Oxygenates - Exports,2014,"Metric tons, thousand",0,,additives_and_oxygenates
3,Belgium,Additives and Oxygenates - Exports,2013,"Metric tons, thousand",0,,additives_and_oxygenates
4,Belgium,Additives and Oxygenates - Exports,2012,"Metric tons, thousand",35,,additives_and_oxygenates


In [7]:
second_path = split_data_path / (file_prefix + "2.csv")
df_second = pd.read_csv(second_path, header=None, nrows=10)

In [8]:
df_second.head(5)

Unnamed: 0,0,1,2,3,4,5,6
0,India,Brown coal briquettes - final consumption,1994,"Metric tons, thousand",1580,,brown_coal_briquettes
1,India,Brown coal briquettes - final consumption,1993,"Metric tons, thousand",1494,,brown_coal_briquettes
2,India,Brown coal briquettes - final consumption,1992,"Metric tons, thousand",1139,,brown_coal_briquettes
3,India,Brown coal briquettes - final consumption,1991,"Metric tons, thousand",1777,,brown_coal_briquettes
4,India,Brown coal briquettes - final consumption,1990,"Metric tons, thousand",1678,,brown_coal_briquettes


In [9]:
df_first.columns

Index(['country_or_area', 'commodity_transaction', 'year', 'unit', 'quantity',
       'quantity_footnotes', 'category'],
      dtype='object')

In [10]:
df_first.columns.to_list()

['country_or_area',
 'commodity_transaction',
 'year',
 'unit',
 'quantity',
 'quantity_footnotes',
 'category']

In [11]:
df_second.columns = df_first.columns

In [12]:
df_second.head(5)

Unnamed: 0,country_or_area,commodity_transaction,year,unit,quantity,quantity_footnotes,category
0,India,Brown coal briquettes - final consumption,1994,"Metric tons, thousand",1580,,brown_coal_briquettes
1,India,Brown coal briquettes - final consumption,1993,"Metric tons, thousand",1494,,brown_coal_briquettes
2,India,Brown coal briquettes - final consumption,1992,"Metric tons, thousand",1139,,brown_coal_briquettes
3,India,Brown coal briquettes - final consumption,1991,"Metric tons, thousand",1777,,brown_coal_briquettes
4,India,Brown coal briquettes - final consumption,1990,"Metric tons, thousand",1678,,brown_coal_briquettes


In [13]:
df_third = pd.read_csv(
    second_path, header=0, names=df_first.columns.to_list(), nrows=10
)
df_third.head()

Unnamed: 0,country_or_area,commodity_transaction,year,unit,quantity,quantity_footnotes,category
0,India,Brown coal briquettes - final consumption,1993,"Metric tons, thousand",1494,,brown_coal_briquettes
1,India,Brown coal briquettes - final consumption,1992,"Metric tons, thousand",1139,,brown_coal_briquettes
2,India,Brown coal briquettes - final consumption,1991,"Metric tons, thousand",1777,,brown_coal_briquettes
3,India,Brown coal briquettes - final consumption,1990,"Metric tons, thousand",1678,,brown_coal_briquettes
4,Ireland,Brown coal briquettes - final consumption,2014,"Metric tons, thousand",27,,brown_coal_briquettes


In [14]:
def some_function(input_dir, file_prefix, first_suffix, output_dir, output_filename):
    """
    Some doc
    """
    indv_files = []
    live_suffix = int(first_suffix)
    while True:
        try:
            live_filename = f"{file_prefix}{live_suffix}.csv"
            live_path = input_dir / live_filename
            if len(indv_files) == 0:
                first_file = pd.read_csv(live_path, header=0)
                column_titles = first_file.columns.to_list()
                indv_files.append(first_file)
            else:
                subsequent_file = pd.read_csv(live_path, header=0, names=column_titles)
                indv_files.append(subsequent_file)
            live_suffix += 1
        except FileNotFoundError as err:
            if len(indv_files) == 0:
                raise ValueError(
                    f"No initial .csv file found at: {live_path}\n"
                    f"Please check the 'input_dir', 'file_prefix' and 'first_suffix' arguments."
                ) from err
            else:
                print(f"Successfully combined {len(indv_files)} files.")
                break

    return pd.concat(indv_files, axis="index", ignore_index=True)

In [15]:
combined_file = some_function(
    split_data_path, file_prefix, 1, interim_data_path, "temp"
)

Successfully combined 12 files.


In [21]:
len(combined_file)

1189471

In [19]:
report = ProfileReport(combined_file, title="Pandas Profiling Overview")

In [20]:
report

Summarize dataset:   0%|          | 0/21 [00:00<?, ?it/s]

KeyboardInterrupt: 