In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

## Loading the 12 split CSVs and combining back into a single dataframe

- As the original CSV has been split into 12, only the first file contains the proper column names.
- Therefore, we need to ensure that files 2 through to 12 are read in a way where their first row isn't treated as the column names.

In [None]:
# Read the first of the split files, addressed relative to the working directory.
# Its header row supplies the column names, which are kept so they can be applied
# to the remaining split files.
df = pd.read_csv('Raw Data/all_energy_statistics1.csv')
column_names = list(df.columns)

In [None]:
# Read split files 2 through 12 and join them onto the first file.
# These files are read with header=None so their first row is kept as data, and the
# column names taken from file 1 are applied via `names=`.
#
# DataFrame.append was removed in pandas 2.0, and appending inside a loop re-copies the
# accumulated frame on every iteration. Collect the pieces and concatenate once instead.
# Row labels are left as read (not renumbered), matching what repeated append produced.
remaining_chunks = [
    pd.read_csv(
        f'Raw Data/all_energy_statistics{file_number}.csv',
        header=None,
        names=column_names,
    )
    for file_number in range(2, 13)
]
df = pd.concat([df, *remaining_chunks])

In [None]:
# Show the combined dataframe as the cell output for a visual check
df

In [5]:
# Sanity check on the split/rejoin round trip: the rebuilt dataframe must have the same
# number of rows as the dataframe from the Kaggle kernel.
# https://www.kaggle.com/alexanderklarge/checking-out-data-set-for-seeyoudata-project
row_num_kaggle = 1189482
row_num_here = len(df)
assert row_num_here == row_num_kaggle, "Dataframes don't match!"

## Cleaning the dataframe

- First of all, I'd like to pivot the `commodity_transaction` column so that each distinct `commodity_transaction` value becomes its own column

In [6]:
# Peek at the first three rows to see the column layout before reshaping
df.head(3)

Unnamed: 0,country_or_area,commodity_transaction,year,unit,quantity,quantity_footnotes,category
0,Austria,Additives and Oxygenates - Exports,1996,"Metric tons, thousand",5.0,,additives_and_oxygenates
1,Austria,Additives and Oxygenates - Exports,1995,"Metric tons, thousand",17.0,,additives_and_oxygenates
2,Belgium,Additives and Oxygenates - Exports,2014,"Metric tons, thousand",0.0,,additives_and_oxygenates


In [7]:
# Sum quantity for every (country_or_area, year, commodity_transaction) combination.
# Only the three keys and the quantity are carried through, so string columns such as
# unit are dropped here and would have to be joined back on later if needed.
cols_of_interest = ['country_or_area', 'year', 'commodity_transaction', 'quantity']
df_groupby = (
    df.loc[:, cols_of_interest]
    .groupby(cols_of_interest[:-1])  # every column except quantity is a grouping key
    .sum()
)
df_groupby.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,quantity
country_or_area,year,commodity_transaction,Unnamed: 3_level_1
Afghanistan,1990,Aviation gasoline - Consumption by transport,3.0
Afghanistan,1990,Aviation gasoline - Consumption in domestic aviation,3.0
Afghanistan,1990,Aviation gasoline - Final consumption,3.0


In [8]:
# Pivot the innermost index level (commodity_transaction) out into the columns, giving
# one column per commodity_transaction value.
df_groupby = df_groupby.unstack(level='commodity_transaction')

In [9]:
# Inspect the unstacked frame: the columns are now two-level (quantity, commodity_transaction)
df_groupby.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,quantity,quantity,quantity,quantity,quantity,quantity,quantity,quantity,quantity,quantity,quantity,quantity,quantity,quantity,quantity,quantity,quantity,quantity,quantity,quantity,quantity
Unnamed: 0_level_1,commodity_transaction,Additives and Oxygenates - Exports,Additives and Oxygenates - Imports,Additives and Oxygenates - Production,Additives and Oxygenates - Receipts from other sources,Additives and Oxygenates - Stock changes,Additives and Oxygenates - Total energy supply,Additives and Oxygenates - Transformation,Additives and Oxygenates - Transformation in oil refineries,Additives and Oxygenates - transfers and recycled products,Animal waste - Consumption by commerce and public services,...,White spirit and special boiling point industrial spirits - Transfers and recycled products,White spirit and special boiling point industrial spirits - Transformation,White spirit and special boiling point industrial spirits - Transformation in petrochemical plants,White spirit and special boiling point industrial spirits - consumption by other industries and construction,White spirit and special boiling point industrial spirits - final consumption,White spirit and special boiling point industrial spirits - production from plants,White spirit and special boiling point industrial spirits - production from refineries,Wind – Autoproducer,Wind – Main activity,animal waste - Transformation
country_or_area,year,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
Afghanistan,1990,,,,,,,,,,,...,,,,,21.0,,,,,
Afghanistan,1991,,,,,,,,,,,...,,,,,18.0,,,,,
Afghanistan,1992,,,,,,,,,,,...,,,,,11.0,,,,,


In [10]:
# Unstacking leaves a two-level column index whose outer level is just 'quantity'.
# Keep only the commodity_transaction level so the columns are a plain, single-level index.
df_groupby.columns = df_groupby.columns.get_level_values('commodity_transaction')

In [11]:
# The columns are now single-level, which makes the frame easier to navigate.
# Note: despite its name, df_groupby is a regular DataFrame (the result of .sum()), not a
# groupby object; what remains from the grouping is the (country_or_area, year) row index.
df_groupby

Unnamed: 0_level_0,commodity_transaction,Additives and Oxygenates - Exports,Additives and Oxygenates - Imports,Additives and Oxygenates - Production,Additives and Oxygenates - Receipts from other sources,Additives and Oxygenates - Stock changes,Additives and Oxygenates - Total energy supply,Additives and Oxygenates - Transformation,Additives and Oxygenates - Transformation in oil refineries,Additives and Oxygenates - transfers and recycled products,Animal waste - Consumption by commerce and public services,...,White spirit and special boiling point industrial spirits - Transfers and recycled products,White spirit and special boiling point industrial spirits - Transformation,White spirit and special boiling point industrial spirits - Transformation in petrochemical plants,White spirit and special boiling point industrial spirits - consumption by other industries and construction,White spirit and special boiling point industrial spirits - final consumption,White spirit and special boiling point industrial spirits - production from plants,White spirit and special boiling point industrial spirits - production from refineries,Wind – Autoproducer,Wind – Main activity,animal waste - Transformation
country_or_area,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Afghanistan,1990,,,,,,,,,,,...,,,,,21.0,,,,,
Afghanistan,1991,,,,,,,,,,,...,,,,,18.0,,,,,
Afghanistan,1992,,,,,,,,,,,...,,,,,11.0,,,,,
Afghanistan,1993,,,,,,,,,,,...,,,,,10.0,,,,,
Afghanistan,1994,,,,,,,,,,,...,,,,,9.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zimbabwe,2010,,,,,,,,,,,...,,,,,,,,,,
Zimbabwe,2011,,,,,,,,,,,...,,,,,,,,,,
Zimbabwe,2012,,,,,,,,,,,...,,,,,,,,,,
Zimbabwe,2013,,,,,,,,,,,...,,,,,,,,,,


In [12]:
# Inspect the row index: it is still a MultiIndex of (country_or_area, year) pairs,
# which would ideally be flattened as well
df_groupby.index

MultiIndex([('Afghanistan', 1990),
            ('Afghanistan', 1991),
            ('Afghanistan', 1992),
            ('Afghanistan', 1993),
            ('Afghanistan', 1994),
            ('Afghanistan', 1995),
            ('Afghanistan', 1996),
            ('Afghanistan', 1997),
            ('Afghanistan', 1998),
            ('Afghanistan', 1999),
            ...
            (   'Zimbabwe', 2005),
            (   'Zimbabwe', 2006),
            (   'Zimbabwe', 2007),
            (   'Zimbabwe', 2008),
            (   'Zimbabwe', 2009),
            (   'Zimbabwe', 2010),
            (   'Zimbabwe', 2011),
            (   'Zimbabwe', 2012),
            (   'Zimbabwe', 2013),
            (   'Zimbabwe', 2014)],
           names=['country_or_area', 'year'], length=5568)

In [13]:
# Move both index levels (country_or_area, year) back into ordinary columns, leaving a
# default integer index.
df_groupby = df_groupby.reset_index()

In [14]:
# Use country_or_area as the row index; year stays as a regular column.
df_groupby = df_groupby.set_index('country_or_area')

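The dataframe is now in a flat format: a single-level index (`country_or_area`), a `year` column, and one column per `commodity_transaction` value. Note that the index is not unique, as each country appears once per year. It can now be investigated like any ordinary dataframe.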

This also seems to me like it's ready to be put into a SQL table, which will be a cool project. If we can get this data hosted in a SQL server, we can then connect it to a website. Using PHP and JavaScript, you could have user dropdowns to select for example country x in year y, and then return those values in a table, or maybe even make a live graph.

In [15]:
# Show the reshaped frame: country_or_area as the index, then year and one column per commodity_transaction
df_groupby

commodity_transaction,year,Additives and Oxygenates - Exports,Additives and Oxygenates - Imports,Additives and Oxygenates - Production,Additives and Oxygenates - Receipts from other sources,Additives and Oxygenates - Stock changes,Additives and Oxygenates - Total energy supply,Additives and Oxygenates - Transformation,Additives and Oxygenates - Transformation in oil refineries,Additives and Oxygenates - transfers and recycled products,...,White spirit and special boiling point industrial spirits - Transfers and recycled products,White spirit and special boiling point industrial spirits - Transformation,White spirit and special boiling point industrial spirits - Transformation in petrochemical plants,White spirit and special boiling point industrial spirits - consumption by other industries and construction,White spirit and special boiling point industrial spirits - final consumption,White spirit and special boiling point industrial spirits - production from plants,White spirit and special boiling point industrial spirits - production from refineries,Wind – Autoproducer,Wind – Main activity,animal waste - Transformation
country_or_area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,1990,,,,,,,,,,...,,,,,21.0,,,,,
Afghanistan,1991,,,,,,,,,,...,,,,,18.0,,,,,
Afghanistan,1992,,,,,,,,,,...,,,,,11.0,,,,,
Afghanistan,1993,,,,,,,,,,...,,,,,10.0,,,,,
Afghanistan,1994,,,,,,,,,,...,,,,,9.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zimbabwe,2010,,,,,,,,,,...,,,,,,,,,,
Zimbabwe,2011,,,,,,,,,,...,,,,,,,,,,
Zimbabwe,2012,,,,,,,,,,...,,,,,,,,,,
Zimbabwe,2013,,,,,,,,,,...,,,,,,,,,,
