### Importing the required libraries

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Reading data from the CSV file

In [51]:
# Load the raw inventory CSV; the path is relative, so the file must be in the working directory.
emission_table = pd.read_csv("greenhouse_gas_inventory_data_data.csv")

In [52]:
# Preview the first rows to check which columns were parsed.
emission_table.head()

Unnamed: 0,country_or_area,year,value,category
0,Australia,2014,393126.946994,carbon_dioxide_co2_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent
1,Australia,2013,396913.93653,carbon_dioxide_co2_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent
2,Australia,2012,406462.847704,carbon_dioxide_co2_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent
3,Australia,2011,403705.528314,carbon_dioxide_co2_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent
4,Australia,2010,406200.993184,carbon_dioxide_co2_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent


As we can see, the category column holds very long strings that may not fit in the default pandas DataFrame display, so we raise the `display.max_colwidth` option to make the full category names visible.

In [53]:
# Show full cell contents instead of truncating long strings. None means "no limit";
# the old sentinel -1 was deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)

In [54]:
# Group the rows by their category name; nothing is aggregated until a method is called on the result.
by_category  = emission_table.groupby(['category'])

In [55]:
# For each category, count the non-null entries in every remaining column.
category_count = by_category.count()

In [56]:
# Show the per-category counts; the index holds the full category names.
category_count

Unnamed: 0_level_0,country_or_area,year,value
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
carbon_dioxide_co2_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent,1074,1074,1074
greenhouse_gas_ghgs_emissions_including_indirect_co2_without_lulucf_in_kilotonne_co2_equivalent,949,949,949
greenhouse_gas_ghgs_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent,1074,1074,1074
hydrofluorocarbons_hfcs_emissions_in_kilotonne_co2_equivalent,975,975,975
methane_ch4_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent,1074,1074,1074
nitrogen_trifluoride_nf3_emissions_in_kilotonne_co2_equivalent,248,248,248
nitrous_oxide_n2o_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent,1074,1074,1074
perfluorocarbons_pfcs_emissions_in_kilotonne_co2_equivalent,831,831,831
sulphur_hexafluoride_sf6_emissions_in_kilotonne_co2_equivalent,1032,1032,1032
unspecified_mix_of_hydrofluorocarbons_hfcs_and_perfluorocarbons_pfcs_emissions_in_kilotonne_co2_equivalent,75,75,75


We need to slice each category name into parts to get the exact name of the greenhouse gas. Every category name ends with the same suffix, *"_in_kilotonne_co2_equivalent"*. So my workflow for the next few hours will comprise the following steps.

  * Slicing the common part from each category name and updating a copy of the main dataframe with the result.
  * Slicing out the GHG name and the other qualifiers, such as indirect CO2 or land use and forestry (LULUCF).

### Breaking the problem into smaller parts and preparing a small algorithm

In [57]:
# The index of category_count holds the distinct category names to be cleaned.
strp = category_count.index

In [58]:
# Take the first category name as a sample string to experiment on.
io = strp[0]
# Index 108 is the last character of this sample (its length is 109 for the current data).
io[108]

't'

In [59]:
# Display the whole sample name.
io

'carbon_dioxide_co2_emissions_without_land_use_land_use_change_and_forestry_lulucf_in_kilotonne_co2_equivalent'

In [60]:
# Length of the sample category name.
hdd = len(io)
hdd

109

In [61]:
# Offset at which the shared unit suffix starts in the sample name (-1 would mean it is absent).
# str.find searches the whole string by default, so no explicit bounds are needed.
io.find("_in_kilotonne_co2_equivalent")

81

In [62]:
# Slicing up to offset 81 (the value str.find returned for the suffix) removes the unit suffix.
io[:81]

'carbon_dioxide_co2_emissions_without_land_use_land_use_change_and_forestry_lulucf'

**Conclusion: the algorithm is to run a for loop that updates each category name one by one, then update the main category dataframe.**

### Algorithm Begins

In [63]:
# Strip the shared unit suffix from every category name.
new_category_index = []
for string in strp:
    p = len(string)
    pos = string.find("_in_kilotonne_co2_equivalent",0,p)
    # str.find returns -1 when the suffix is missing; slicing with [:-1] would
    # silently chop the last character, so such names are kept unchanged.
    if pos != -1:
        string = string[:pos]
    new_category_index.append(string)

In [64]:
# Inspect the category names after the unit suffix has been stripped.
new_category_index

['carbon_dioxide_co2_emissions_without_land_use_land_use_change_and_forestry_lulucf',
 'greenhouse_gas_ghgs_emissions_including_indirect_co2_without_lulucf',
 'greenhouse_gas_ghgs_emissions_without_land_use_land_use_change_and_forestry_lulucf',
 'hydrofluorocarbons_hfcs_emissions',
 'methane_ch4_emissions_without_land_use_land_use_change_and_forestry_lulucf',
 'nitrogen_trifluoride_nf3_emissions',
 'nitrous_oxide_n2o_emissions_without_land_use_land_use_change_and_forestry_lulucf',
 'perfluorocarbons_pfcs_emissions',
 'sulphur_hexafluoride_sf6_emissions',
 'unspecified_mix_of_hydrofluorocarbons_hfcs_and_perfluorocarbons_pfcs_emissions']

In [83]:
# Keep only the gas name: cut each stripped category name at its first "_emissions".
new_category_index_reborn = []
for lingo in new_category_index:
    # No end bound is passed, so the search always covers the whole of the
    # current name and does not depend on the length of any other string.
    pos = lingo.find("_emissions")
    # -1 means the marker is absent; keep the name unchanged rather than
    # dropping its last character via lingo[:-1].
    if pos != -1:
        lingo = lingo[:pos]
    new_category_index_reborn.append(lingo)

In [84]:
# Inspect the names after everything from "_emissions" onward has been cut.
new_category_index_reborn

['carbon_dioxide_co2',
 'greenhouse_gas_ghgs',
 'greenhouse_gas_ghgs',
 'hydrofluorocarbons_hfcs',
 'methane_ch4',
 'nitrogen_trifluoride_nf3',
 'nitrous_oxide_n2o',
 'perfluorocarbons_pfcs',
 'sulphur_hexafluoride_sf6',
 'unspecified_mix_of_hydrofluorocarbons_hfcs_and_perfluorocarbons_pfcs']

In [87]:
# Break every gas name into its underscore-separated tokens.
splitted_indexes = [gas_name.split("_") for gas_name in new_category_index_reborn]

In [89]:
# Inspect the token lists produced by splitting each name on "_".
splitted_indexes

[['carbon', 'dioxide', 'co2'],
 ['greenhouse', 'gas', 'ghgs'],
 ['greenhouse', 'gas', 'ghgs'],
 ['hydrofluorocarbons', 'hfcs'],
 ['methane', 'ch4'],
 ['nitrogen', 'trifluoride', 'nf3'],
 ['nitrous', 'oxide', 'n2o'],
 ['perfluorocarbons', 'pfcs'],
 ['sulphur', 'hexafluoride', 'sf6'],
 ['unspecified',
  'mix',
  'of',
  'hydrofluorocarbons',
  'hfcs',
  'and',
  'perfluorocarbons',
  'pfcs']]

In [None]:
# That's all for today ------------------------ Signing Off