# Creating Correlations Data to map the over time relationship of energy consumption and GDP

In [1]:
#package import of DA standard packages
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#reading in the data from the updated consumptions table
consume_df = pd.read_csv('/Users/sebastianlorenzen-schmidt/neuefische/capstone_project/data/energy-consumption-by-source-and-region-solar-zero.csv')

In [3]:
consume_df

Unnamed: 0.1,Unnamed: 0,iso_code,country,year,nuclear_consumption,coal_consumption,hydro_consumption,oil_consumption,gas_consumption,wind_consumption,solar_consumption,other_renewable_consumption,biofuel_consumption,low_carbon_consumption,renewables_consumption
0,0,AGO,Angola,1990,0.0,0.0,2.051429,15.325936,5.834961,0.0,0.0,0.0,0.0,2.051429,2.051429
1,1,AGO,Angola,1991,0.0,0.0,2.042857,16.144753,6.142351,0.0,0.0,0.0,0.0,2.042857,2.042857
2,2,AGO,Angola,1992,0.0,0.0,2.377143,16.129702,6.182138,0.0,0.0,0.0,0.0,2.377143,2.377143
3,3,AGO,Angola,1993,0.0,0.0,2.517143,15.644819,6.073679,0.0,0.0,0.0,0.0,2.517143,2.517143
4,4,AGO,Angola,1994,0.0,0.0,2.531429,15.469970,5.638678,0.0,0.0,0.0,0.0,2.531429,2.531429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7170,7170,,Yugoslavia,2012,,,,,,,0.0,,,,
7171,7171,,Yugoslavia,2013,,,,,,,0.0,,,,
7172,7172,,Yugoslavia,2014,,,,,,,0.0,,,,
7173,7173,,Yugoslavia,2015,,,,,,,0.0,,,,


In [4]:
consume_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7175 entries, 0 to 7174
Data columns (total 15 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   7175 non-null   int64  
 1   iso_code                     6433 non-null   object 
 2   country                      7175 non-null   object 
 3   year                         7175 non-null   int64  
 4   nuclear_consumption          3787 non-null   float64
 5   coal_consumption             4267 non-null   float64
 6   hydro_consumption            3787 non-null   float64
 7   oil_consumption              4267 non-null   float64
 8   gas_consumption              4267 non-null   float64
 9   wind_consumption             3787 non-null   float64
 10  solar_consumption            7175 non-null   float64
 11  other_renewable_consumption  3787 non-null   float64
 12  biofuel_consumption          4308 non-null   float64
 13  low_carbon_consump

### Creating the missing columns for correlation calculation

In [5]:
consume_df['total_consumption'] =(consume_df['nuclear_consumption']
                                +consume_df['coal_consumption']
                                +consume_df['gas_consumption']
                                +consume_df['oil_consumption']
                                +consume_df['biofuel_consumption']
                                +consume_df['wind_consumption']
                                +consume_df['solar_consumption']
                                +consume_df['hydro_consumption']
                                +consume_df['other_renewable_consumption'])

consume_df['fossil_consumption'] = (consume_df['coal_consumption']
                                    +consume_df['oil_consumption']
                                    +consume_df['gas_consumption'])
                

In [6]:
#checking for missing values as a measure of precaution and to have an overview of the amount of missing data
#Missing data might come from countries, that don't exist in their form any more, that renamed themselves or simply have no data for specific years or categories. 
consume_df['total_consumption'].isna().value_counts()

False    3787
True     3388
Name: total_consumption, dtype: int64

In [7]:
consume_df['country'][consume_df['total_consumption'] == 0.0].nunique()

0

## Merging the GDP table to the Consumption table

In [8]:
gdp_frame = pd.read_csv('/Users/sebastianlorenzen-schmidt/neuefische/capstone_project/data/gdp-ppp-current-international-dollar-clean.csv')
gdp_frame.head()

Unnamed: 0,Country Name,Country Code,Year,GDP_PPP
0,Aruba,ABW,1990,1447709000.0
1,Africa Eastern and Southern,AFE,1990,565349500000.0
2,Afghanistan,AFG,1990,
3,Africa Western and Central,AFW,1990,354456400000.0
4,Angola,AGO,1990,38853490000.0


### standardizing the columns of the GDP table

In [9]:
cols = gdp_frame.columns.to_list()

cols = [col.replace(' ','_')for col in cols]
cols = [col.replace('Country_Code','iso_code')for col in cols]
cols = [col.lower() for col in cols]

gdp_frame.columns = cols

gdp_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8246 entries, 0 to 8245
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   country_name  8246 non-null   object 
 1   iso_code      8246 non-null   object 
 2   year          8246 non-null   int64  
 3   gdp_ppp       7241 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 257.8+ KB


### Merging the frames

In [10]:
consume_gdp_df = consume_df.merge(gdp_frame,how = 'inner', on =['year','iso_code'])
consume_gdp_df

Unnamed: 0.1,Unnamed: 0,iso_code,country,year,nuclear_consumption,coal_consumption,hydro_consumption,oil_consumption,gas_consumption,wind_consumption,solar_consumption,other_renewable_consumption,biofuel_consumption,low_carbon_consumption,renewables_consumption,total_consumption,fossil_consumption,country_name,gdp_ppp
0,0,AGO,Angola,1990,0.0,0.0,2.051429,15.325936,5.834961,0.0,0.0,0.0,0.0,2.051429,2.051429,23.212326,21.160898,Angola,3.885349e+10
1,1,AGO,Angola,1991,0.0,0.0,2.042857,16.144753,6.142351,0.0,0.0,0.0,0.0,2.042857,2.042857,24.329961,22.287104,Angola,4.056562e+10
2,2,AGO,Angola,1992,0.0,0.0,2.377143,16.129702,6.182138,0.0,0.0,0.0,0.0,2.377143,2.377143,24.688982,22.311839,Angola,3.906777e+10
3,3,AGO,Angola,1993,0.0,0.0,2.517143,15.644819,6.073679,0.0,0.0,0.0,0.0,2.517143,2.517143,24.235641,21.718499,Angola,3.040193e+10
4,4,AGO,Angola,1994,0.0,0.0,2.531429,15.469970,5.638678,0.0,0.0,0.0,0.0,2.531429,2.531429,23.640077,21.108648,Angola,3.146702e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6012,7143,YEM,Yemen,2015,,,,,,,0.0,,0.0,,,,,"Yemen, Rep.",
6013,7144,YEM,Yemen,2016,,,,,,,0.0,,0.0,,,,,"Yemen, Rep.",
6014,7145,YEM,Yemen,2017,,,,,,,0.0,,0.0,,,,,"Yemen, Rep.",
6015,7146,YEM,Yemen,2018,,,,,,,0.0,,0.0,,,,,"Yemen, Rep.",


## Adding ht eCorrelation Columns for the different correlations of GDP and Energy Sources

In [11]:
#using iso_code since it is on of the parameters we merged on, country is also an option for list iteration
codes_list = consume_df['iso_code'].unique().tolist()
print(codes_list)

#empty list for storing the individual data frames for seperation by country
li = []

#iterationg through the list of iso_codes to seperate the countries and create indivisual correlation and not have the corr() method do an overall correlation 
for code in codes_list:
    df = consume_gdp_df[consume_gdp_df['iso_code']== code]
    df['corr_gdp_totalcons'] = df['gdp_ppp'].corr(consume_gdp_df['total_consumption'], method= 'pearson')
    df['corr_gdp_fossilcons'] = df['gdp_ppp'].corr(consume_gdp_df['fossil_consumption'], method= 'pearson')
    df['corr_gdp_renewcons'] = df['gdp_ppp'].corr(consume_gdp_df['renewables_consumption'], method= 'pearson')
    df['corr_gdp_gascons'] = df['gdp_ppp'].corr(consume_gdp_df['gas_consumption'], method= 'pearson')
    df['corr_gdp_coalcons'] = df['gdp_ppp'].corr(consume_gdp_df['coal_consumption'], method= 'pearson')
    df['corr_gdp_oilcons'] = df['gdp_ppp'].corr(consume_gdp_df['oil_consumption'], method= 'pearson')
    df['corr_gdp_hydrocons'] = df['gdp_ppp'].corr(consume_gdp_df['hydro_consumption'], method= 'pearson')
    df['corr_gdp_windcons'] = df['gdp_ppp'].corr(consume_gdp_df['wind_consumption'], method= 'pearson')
    df['corr_gdp_biofuelcons'] = df['gdp_ppp'].corr(consume_gdp_df['biofuel_consumption'], method= 'pearson')
    df['corr_gdp_othercons'] = df['gdp_ppp'].corr(consume_gdp_df['other_renewable_consumption'], method= 'pearson')
    df['corr_gdp_nuclearcons'] = df['gdp_ppp'].corr(consume_gdp_df['nuclear_consumption'], method= 'pearson')


    li.append(df)

#concatenation the dataframes in the list to a single dataframe
consume_gdp_df2 = pd.concat(li, axis=0, ignore_index=True)

consume_gdp_df2
#Yes the slicing and copying of the dataframe is intended! 
#But still thanks pandas for pointing it out and remind me, that this is not always a best practice or a good idea!

['AGO', 'BEN', 'BWA', 'BFA', 'BDI', 'CPV', 'CMR', 'CAF', 'TCD', 'COM', 'COG', 'COD', 'CIV', 'DJI', 'ERI', 'SWZ', 'ETH', 'GAB', 'GMB', 'GHA', 'GIN', 'GNB', 'KEN', 'LSO', 'LBR', 'LBY', 'MDG', 'MWI', 'MLI', 'MRT', 'MUS', 'MOZ', 'NAM', 'NER', 'NGA', 'RWA', 'STP', 'SEN', 'SYC', 'SLE', 'SOM', 'SSD', 'SDN', 'TZA', 'TGO', 'TUN', 'UGA', 'ZMB', 'ZWE', 'AFG', nan, 'ALB', 'DZA', 'ASM', 'ATG', 'ARG', 'ARM', 'ABW', 'AUS', 'AUT', 'AZE', 'BHS', 'BHR', 'BGD', 'BRB', 'BLR', 'BEL', 'BLZ', 'BMU', 'BTN', 'BOL', 'BIH', 'BRA', 'VGB', 'BRN', 'BGR', 'KHM', 'CAN', 'CYM', 'CHL', 'CHN', 'COL', 'COK', 'CRI', 'HRV', 'CUB', 'CYP', 'CZE', 'DNK', 'DMA', 'DOM', 'ECU', 'EGY', 'SLV', 'GNQ', 'EST', 'FRO', 'FJI', 'FIN', 'FRA', 'GUF', 'PYF', 'GEO', 'DEU', 'GIB', 'GRC', 'GRL', 'GRD', 'GLP', 'GUM', 'GTM', 'GUY', 'HTI', 'HND', 'HKG', 'HUN', 'ISL', 'IND', 'IDN', 'IRN', 'IRQ', 'IRL', 'ISR', 'ITA', 'JAM', 'JPN', 'JOR', 'KAZ', 'KIR', 'OWID_KOS', 'KWT', 'KGZ', 'LAO', 'LVA', 'LBN', 'LTU', 'LUX', 'MAC', 'MYS', 'MDV', 'MLT', 'MTQ', 'M

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['corr_gdp_totalcons'] = df['gdp_ppp'].corr(consume_gdp_df['total_consumption'], method= 'pearson')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['corr_gdp_fossilcons'] = df['gdp_ppp'].corr(consume_gdp_df['fossil_consumption'], method= 'pearson')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
 

Unnamed: 0.1,Unnamed: 0,iso_code,country,year,nuclear_consumption,coal_consumption,hydro_consumption,oil_consumption,gas_consumption,wind_consumption,...,corr_gdp_fossilcons,corr_gdp_renewcons,corr_gdp_gascons,corr_gdp_coalcons,corr_gdp_oilcons,corr_gdp_hydrocons,corr_gdp_windcons,corr_gdp_biofuelcons,corr_gdp_othercons,corr_gdp_nuclearcons
0,0,AGO,Angola,1990,0.0,0.0,2.051429,15.325936,5.834961,0.0,...,0.986419,0.980582,0.209357,,0.984368,0.98247,,0.484164,0.869216,
1,1,AGO,Angola,1991,0.0,0.0,2.042857,16.144753,6.142351,0.0,...,0.986419,0.980582,0.209357,,0.984368,0.98247,,0.484164,0.869216,
2,2,AGO,Angola,1992,0.0,0.0,2.377143,16.129702,6.182138,0.0,...,0.986419,0.980582,0.209357,,0.984368,0.98247,,0.484164,0.869216,
3,3,AGO,Angola,1993,0.0,0.0,2.517143,15.644819,6.073679,0.0,...,0.986419,0.980582,0.209357,,0.984368,0.98247,,0.484164,0.869216,
4,4,AGO,Angola,1994,0.0,0.0,2.531429,15.469970,5.638678,0.0,...,0.986419,0.980582,0.209357,,0.984368,0.98247,,0.484164,0.869216,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6012,7143,YEM,Yemen,2015,,,,,,,...,,,,,,,,,,
6013,7144,YEM,Yemen,2016,,,,,,,...,,,,,,,,,,
6014,7145,YEM,Yemen,2017,,,,,,,...,,,,,,,,,,
6015,7146,YEM,Yemen,2018,,,,,,,...,,,,,,,,,,


In [12]:
#checking on the top positive relationships 
consume_gdp_df2[['corr_gdp_totalcons','country','year']][consume_gdp_df2['corr_gdp_totalcons']> 0.994]

#it is very surprising to find countries that have such a strong relation between energy consumption and GDP generation

Unnamed: 0,corr_gdp_totalcons,country,year
3321,0.997815,India,1990
3322,0.997815,India,1991
3323,0.997815,India,1992
3324,0.997815,India,1993
3325,0.997815,India,1994
...,...,...,...
5982,0.998179,Vietnam,2016
5983,0.998179,Vietnam,2017
5984,0.998179,Vietnam,2018
5985,0.998179,Vietnam,2019


In [13]:
#checking on who has decoupled/a negative relationship between GDP and energy consumption
consume_gdp_df2[['corr_gdp_totalcons','country','year']][consume_gdp_df2['corr_gdp_totalcons']< -0.8]

Unnamed: 0,corr_gdp_totalcons,country,year
2928,-0.832578,Germany,1990
2929,-0.832578,Germany,1991
2930,-0.832578,Germany,1992
2931,-0.832578,Germany,1993
2932,-0.832578,Germany,1994
...,...,...,...
5146,-0.806036,Slovakia,2016
5147,-0.806036,Slovakia,2017
5148,-0.806036,Slovakia,2018
5149,-0.806036,Slovakia,2019


In [14]:
#saving it to csv to share and export. Bag it and ship it!
consume_gdp_df2.to_csv(path_or_buf= 'data/correlations.csv')