# Subsidy Efficiency Evaluation

### Database Import - SQL Query

In [None]:
-- Disabled preview query: all columns of Fossils.fossil_fuel_subsidies (optional row limit also commented out).
-- SELECT * 
-- FROM Fossils.fossil_fuel_subsidies 
-- -- limit 10
-- ;

In [None]:
-- Disabled preview query: all columns of Fossils.fossil_fuel_subsidies_gdp (optional row limit also commented out).
-- SELECT * 
-- FROM Fossils.fossil_fuel_subsidies_gdp
-- -- limit 10
-- ;

In [None]:
-- Disabled preview query: all columns of Fossils.fossil_fuel_subsidies_per_capita (optional row limit also commented out).
-- SELECT * 
-- FROM Fossils.fossil_fuel_subsidies_per_capita
-- -- limit 10
-- ;

In [None]:
-- Combine the three subsidy measures (total, share of GDP, per capita) into a single
-- result keyed by Entity, Code and Year. Both joins are inner joins, so a key must be
-- present in all three tables to appear in the output.
SELECT DISTINCT
    tot.Entity AS Entity,
    tot.Code AS Code,
    tot.Year AS Year,
    tot."12.c.1 - Fossil-fuel subsidies (consumption and production) (billions of nominal United States dollars) - ER_FFS_CMPT_CD" AS ER_FFS_CMPT_CD,
    gdp."12.c.1 - Fossil-fuel subsidies (consumption and production) as a proportion of total GDP (%) - ER_FFS_CMPT_GDP" AS ER_FFS_CMPT_GDP,
    pc."12.c.1 - Fossil-fuel subsidies (consumption and production) per capita (nominal United States dollars) - ER_FFS_CMPT_PC_CD" AS ER_FFS_CMPT_PC_CD
FROM Fossils.fossil_fuel_subsidies AS tot
-- Match the GDP-share table on the full (Entity, Code, Year) key
INNER JOIN Fossils.fossil_fuel_subsidies_gdp AS gdp
    ON gdp.Entity = tot.Entity AND gdp.Code = tot.Code AND gdp.Year = tot.Year
-- Match the per-capita table on the same key
INNER JOIN Fossils.fossil_fuel_subsidies_per_capita AS pc
    ON pc.Entity = tot.Entity AND pc.Code = tot.Code AND pc.Year = tot.Year
;

- **Note**: The row counts differ between the three tables, so the inner joins drop rows that have no match in every table (e.g. years missing from one of the sources); a left or full outer join could be used to find the dropped rows.

### Data Loading from Excel Sheets

In [None]:
# Read from Excel
# NOTE(review): `%ntbl pull datasets <path>` presumably makes the named workbook available
# in the local datasets directory before it is opened with pandas — confirm against the platform docs.
%ntbl pull datasets "Excel data/FossilFuelSubsidiesTracker_GlobalData.xlsx"
%ntbl pull datasets "Excel data/Fossil fuel consumption subsidies, 2010-2021.xlsx"

In [None]:
# pandas is imported here because this cell runs before the notebook's main import cell;
# without it a fresh-kernel "Run All" fails with NameError on `pd`.
import pandas as pd

# Open both workbooks as ExcelFile handles so individual sheets can be parsed later.
subsidies_xlsx = pd.ExcelFile("../../datasets/Excel data/FossilFuelSubsidiesTracker_GlobalData.xlsx")
consumptionSub_xlsx = pd.ExcelFile("../../datasets/Excel data/Fossil fuel consumption subsidies, 2010-2021.xlsx")

### Preliminary EDA & Preprocessing

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Formatting the year for easier analyses and later manipulation.
# The conversion is guarded so the cell can be re-run safely: once Year is already
# datetime64, casting it back to int32 would raise instead of being a no-op.
if not pd.api.types.is_datetime64_any_dtype(df_join["Year"]):
    df_join["Year"] = pd.to_datetime(df_join["Year"].astype("int32"), format="%Y")

In [None]:
# Quick look at the first rows of the joined frame
display(df_join.head(n=5))

- **Note**: The data contain regional, continental/international, and global (UN) aggregates -- proper analyses would require careful selection (inclusion or exclusion) of the particular rows

In [None]:
# Basic summary of stats
# info() prints the column / dtype / non-null overview; describe() is the cell's last
# expression, so its summary table is what the cell renders as output.
df_join.info()
df_join.describe()

#### Simple Trends

In [None]:
# Global annual trend: rows whose Entity is the 'World' aggregate
display(df_join.loc[df_join['Entity'].eq('World')])

In [None]:
# UN aggregates (continents & regions) are the rows with an empty Code
display(df_join.loc[df_join['Code'].eq('')])

In [None]:
# Annual trend for Canada only
df_canada = df_join.loc[df_join['Entity'].eq('Canada')]
display(df_canada)

In [None]:
# Main sets of interest: Canada, the World total, and the rows with an empty Code
df_filt = df_join.loc[
    df_join['Entity'].isin(['Canada', 'World']) | df_join['Code'].eq('')
]
display(df_filt)

#### Checking distributions

In [None]:
# Basic distribution plots to look out for outliers & abnormalities
# The Year x-axis on the plot needs to be fixed as discrete time-points:
# Year is reduced to its integer calendar year (.dt.year) and plotted with discrete=True.
sns.displot(df_join.Year.dt.year, discrete=True)

# One distribution plot per subsidy measure: total, share of GDP, and per capita
sns.displot(df_join, x="ER_FFS_CMPT_CD")
sns.displot(df_join, x="ER_FFS_CMPT_GDP")
sns.displot(df_join, x="ER_FFS_CMPT_PC_CD")

- **Note**: The data may be incomplete for certain years, leading to the non-uniform distribution
- **Note**: Consider removing the zeroes (empty/non-reported values)

#### Aggregations

In [None]:
# Group the joined frame per entity (name + code); aggregations are applied in later cells
df_gb_ent = df_join.groupby(by=['Entity', 'Code'])

In [None]:
# Per-(Entity, Code) mean of the remaining columns.
# NOTE(review): Year is still a column of the grouped frame (datetime after the preprocessing
# cell), so it may be averaged as well unless dropped — confirm this is intended.
# display(df_gb_ent.sum())#.drop(['Year'], axis=1))
display(df_gb_ent.mean())#.drop(['Year'], axis=1))

In [None]:
# Per-(Entity, Code) totals of the subsidy measures.
# TODO: filter to the sets of interest before aggregating (the original cell left this unfinished).
# numeric_only=True keeps the datetime Year column out of the sum; datetimes cannot be
# summed, and recent pandas raises instead of silently dropping the column.
display(df_gb_ent.sum(numeric_only=True))

In [None]:
# Yearly grouping over individual entities only: rows with an empty Code (UN aggregates)
# and the 'World' row are excluded so they do not distort the per-year aggregates
df_gb_yr = df_join.loc[
    ~(df_join['Code'].eq('') | df_join['Entity'].eq('World'))
].groupby(by=['Year'])

In [None]:
# Per-year mean of the subsidy measures.
# numeric_only=True restricts the mean to numeric columns: Entity and Code are string
# columns still present in the grouped frame, and recent pandas raises on them instead
# of silently dropping them.
display(df_gb_yr.mean(numeric_only=True))

### TODOs & Future Considerations
- Bubble plot of UN regions/continents and World against Canada
- Filter out zeroes and even out distributions