In [75]:
import pandas as pd
import numpy as np
import geopandas as gpd

In [76]:
# Load the raw sustainable-energy indicators. Judging by the preview, each row
# is one (Entity, Year) observation — TODO confirm there are no duplicate pairs.
data = pd.read_csv("../data/raw/global_data_sustainable_energy.csv")
# Show the first rows to sanity-check the parsed columns.
data.head()

Unnamed: 0,Entity,Year,Access to electricity (% of population),Access to clean fuels for cooking,Renewable-electricity-generating-capacity-per-capita,Financial flows to developing countries (US $),Renewable energy share in the total final energy consumption (%),Electricity from fossil fuels (TWh),Electricity from nuclear (TWh),Electricity from renewables (TWh),...,Primary energy consumption per capita (kWh/person),Energy intensity level of primary energy (MJ/$2017 PPP GDP),Value_co2_emissions_kt_by_country,Renewables (% equivalent primary energy),gdp_growth,gdp_per_capita,Density\n(P/Km2),Land Area(Km2),Latitude,Longitude
0,Afghanistan,2000,1.613591,6.2,9.22,20000.0,44.99,0.16,0.0,0.31,...,302.59482,1.64,760.0,,,,60,652230.0,33.93911,67.709953
1,Afghanistan,2001,4.074574,7.2,8.86,130000.0,45.6,0.09,0.0,0.5,...,236.89185,1.74,730.0,,,,60,652230.0,33.93911,67.709953
2,Afghanistan,2002,9.409158,8.2,8.47,3950000.0,37.83,0.13,0.0,0.56,...,210.86215,1.4,1029.999971,,,179.426579,60,652230.0,33.93911,67.709953
3,Afghanistan,2003,14.738506,9.5,8.09,25970000.0,36.66,0.31,0.0,0.63,...,229.96822,1.4,1220.000029,,8.832278,190.683814,60,652230.0,33.93911,67.709953
4,Afghanistan,2004,20.064968,10.9,7.75,,44.24,0.33,0.0,0.56,...,204.23125,1.2,1029.999971,,1.414118,211.382074,60,652230.0,33.93911,67.709953


In [77]:
# Natural Earth 1:110m "admin 0 countries" archive, read directly from the CDN.
# NOTE: this cell needs network access every time it runs; nothing is cached
# locally here.
url = "https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip"
world_countries = gpd.read_file(url)
# Preview the attribute table and geometry column.
world_countries.head()

Unnamed: 0,featurecla,scalerank,LABELRANK,SOVEREIGNT,SOV_A3,ADM0_DIF,LEVEL,TYPE,TLC,ADMIN,...,FCLASS_TR,FCLASS_ID,FCLASS_PL,FCLASS_GR,FCLASS_IT,FCLASS_NL,FCLASS_SE,FCLASS_BD,FCLASS_UA,geometry
0,Admin-0 country,1,6,Fiji,FJI,0,2,Sovereign country,1,Fiji,...,,,,,,,,,,"MULTIPOLYGON (((180.00000 -16.06713, 180.00000..."
1,Admin-0 country,1,3,United Republic of Tanzania,TZA,0,2,Sovereign country,1,United Republic of Tanzania,...,,,,,,,,,,"POLYGON ((33.90371 -0.95000, 34.07262 -1.05982..."
2,Admin-0 country,1,7,Western Sahara,SAH,0,2,Indeterminate,1,Western Sahara,...,Unrecognized,Unrecognized,Unrecognized,,,Unrecognized,,,,"POLYGON ((-8.66559 27.65643, -8.66512 27.58948..."
3,Admin-0 country,1,2,Canada,CAN,0,2,Sovereign country,1,Canada,...,,,,,,,,,,"MULTIPOLYGON (((-122.84000 49.00000, -122.9742..."
4,Admin-0 country,1,2,United States of America,US1,1,2,Country,1,United States of America,...,,,,,,,,,,"MULTIPOLYGON (((-122.84000 49.00000, -120.0000..."


In [78]:
# Trim the Natural Earth table down to the attributes used later on.
# NOTE: this cell rebinds `world_countries`, so re-running it without first
# re-running the download cell fails (the upper-case columns are gone).
world_countries = world_countries[
    ["NAME", "CONTINENT", "SUBREGION", "POP_EST", "ECONOMY", "INCOME_GRP", "GDP_MD", "geometry"]
]

# Lowercase column names
world_countries = world_countries.rename(columns=str.lower)

# Per-capita GDP derived from the Natural Earth attributes.
# Presumably `gdp_md` is expressed in millions of USD, hence the 1e6 factor —
# TODO confirm against the Natural Earth field documentation.
world_countries = world_countries.assign(
    gdp_per_capita=lambda frame: frame["gdp_md"] / frame["pop_est"] * 1e6
)

# Drop Antarctica from the country table.
world_countries = world_countries[world_countries["continent"] != "Antarctica"]
world_countries.head()

Unnamed: 0,name,continent,subregion,pop_est,economy,income_grp,gdp_md,geometry,gdp_per_capita
0,Fiji,Oceania,Melanesia,889953.0,6. Developing region,4. Lower middle income,5496,"MULTIPOLYGON (((180.00000 -16.06713, 180.00000...",6175.607026
1,Tanzania,Africa,Eastern Africa,58005463.0,7. Least developed region,5. Low income,63177,"POLYGON ((33.90371 -0.95000, 34.07262 -1.05982...",1089.156033
2,W. Sahara,Africa,Northern Africa,603253.0,7. Least developed region,5. Low income,907,"POLYGON ((-8.66559 27.65643, -8.66512 27.58948...",1503.515109
3,Canada,North America,Northern America,37589262.0,1. Developed region: G7,1. High income: OECD,1736425,"MULTIPOLYGON (((-122.84000 49.00000, -122.9742...",46194.708478
4,United States of America,North America,Northern America,328239523.0,1. Developed region: G7,1. High income: OECD,21433226,"MULTIPOLYGON (((-122.84000 49.00000, -120.0000...",65297.517508


In [79]:
# Natural Earth spells or abbreviates some country names differently from the
# `Entity` values of the energy dataset. Keys are the Natural Earth spelling,
# values are the spelling the later merge on Entity/name expects.
name_mapping = {
    'United States of America': 'United States',
    'S. Sudan': 'South Sudan',
    'Bosnia and Herz.': 'Bosnia and Herzegovina',
    'Central African Rep.': 'Central African Republic',
    'Dominican Rep.': 'Dominican Republic',
    'Eq. Guinea': 'Equatorial Guinea',
    'eSwatini': 'Eswatini',
    'Solomon Is.': 'Solomon Islands'
}

# Replace values in the DataFrame.
# `.assign` builds a new frame instead of writing into the existing one in
# place: the frame here is the result of a filter and was already displayed by
# an earlier cell, and this keeps the cell in the same chained style used when
# the table was built. Names absent from the mapping are left unchanged.
world_countries = world_countries.assign(
    name=world_countries['name'].replace(name_mapping)
)

In [80]:
# Attach the Natural Earth country attributes to every dataset row.
# - how='left' keeps all rows of `data`; entities without a matching country
#   name get nulls in the Natural Earth columns.
# - validate='many_to_one' makes pandas raise MergeError if a country name
#   occurs more than once in `world_countries`, which would otherwise silently
#   duplicate dataset rows.
# - Both inputs carry a `gdp_per_capita` column, so the default suffixes apply:
#   `gdp_per_capita_x` comes from `data`, `gdp_per_capita_y` from
#   `world_countries`.
merged_df = pd.merge(
    data,
    world_countries,
    how='left',
    left_on='Entity',
    right_on='name',
    validate='many_to_one',
)
merged_df.head()

Unnamed: 0,Entity,Year,Access to electricity (% of population),Access to clean fuels for cooking,Renewable-electricity-generating-capacity-per-capita,Financial flows to developing countries (US $),Renewable energy share in the total final energy consumption (%),Electricity from fossil fuels (TWh),Electricity from nuclear (TWh),Electricity from renewables (TWh),...,Longitude,name,continent,subregion,pop_est,economy,income_grp,gdp_md,geometry,gdp_per_capita_y
0,Afghanistan,2000,1.613591,6.2,9.22,20000.0,44.99,0.16,0.0,0.31,...,67.709953,Afghanistan,Asia,Southern Asia,38041754.0,7. Least developed region,5. Low income,19291.0,"POLYGON ((66.51861 37.36278, 67.07578 37.35614...",507.100698
1,Afghanistan,2001,4.074574,7.2,8.86,130000.0,45.6,0.09,0.0,0.5,...,67.709953,Afghanistan,Asia,Southern Asia,38041754.0,7. Least developed region,5. Low income,19291.0,"POLYGON ((66.51861 37.36278, 67.07578 37.35614...",507.100698
2,Afghanistan,2002,9.409158,8.2,8.47,3950000.0,37.83,0.13,0.0,0.56,...,67.709953,Afghanistan,Asia,Southern Asia,38041754.0,7. Least developed region,5. Low income,19291.0,"POLYGON ((66.51861 37.36278, 67.07578 37.35614...",507.100698
3,Afghanistan,2003,14.738506,9.5,8.09,25970000.0,36.66,0.31,0.0,0.63,...,67.709953,Afghanistan,Asia,Southern Asia,38041754.0,7. Least developed region,5. Low income,19291.0,"POLYGON ((66.51861 37.36278, 67.07578 37.35614...",507.100698
4,Afghanistan,2004,20.064968,10.9,7.75,,44.24,0.33,0.0,0.56,...,67.709953,Afghanistan,Asia,Southern Asia,38041754.0,7. Least developed region,5. Low income,19291.0,"POLYGON ((66.51861 37.36278, 67.07578 37.35614...",507.100698


In [81]:
# After the left merge, a null `name` marks a dataset row whose Entity had no
# counterpart in world_countries. List each such Entity once.
unmatched_entities = merged_df.loc[merged_df["name"].isna()]
unmatched_entities["Entity"].unique()

array(['Antigua and Barbuda', 'Aruba', 'Bahrain', 'Barbados', 'Bermuda',
       'Cayman Islands', 'Comoros', 'Dominica', 'French Guiana',
       'Grenada', 'Kiribati', 'Maldives', 'Malta', 'Mauritius', 'Nauru',
       'Saint Kitts and Nevis', 'Saint Lucia',
       'Saint Vincent and the Grenadines', 'Samoa',
       'Sao Tome and Principe', 'Seychelles', 'Singapore', 'Tonga',
       'Tuvalu'], dtype=object)

In [82]:
# Columns carried into the preprocessed table, in output order.
important_cols = (
    # Row identifiers
    ["Entity", "Year"]
    # Energy indicators coming from the CSV dataset
    + [
        "Renewable energy share in the total final energy consumption (%)",
        "Access to electricity (% of population)",
        "Financial flows to developing countries (US $)",
        "Electricity from nuclear (TWh)",
        "Electricity from renewables (TWh)",
        "Electricity from fossil fuels (TWh)",
    ]
    # Country attributes brought in by the merge with world_countries;
    # `gdp_per_capita_y` is the merge-suffixed column from that right-hand frame.
    + [
        "continent",
        "subregion",
        "pop_est",
        "economy",
        "income_grp",
        "gdp_md",
        "geometry",
        "gdp_per_capita_y",
    ]
)

preprocessed_df = merged_df.loc[:, important_cols]
preprocessed_df.head()

Unnamed: 0,Entity,Year,Renewable energy share in the total final energy consumption (%),Access to electricity (% of population),Financial flows to developing countries (US $),Electricity from nuclear (TWh),Electricity from renewables (TWh),Electricity from fossil fuels (TWh),continent,subregion,pop_est,economy,income_grp,gdp_md,geometry,gdp_per_capita_y
0,Afghanistan,2000,44.99,1.613591,20000.0,0.0,0.31,0.16,Asia,Southern Asia,38041754.0,7. Least developed region,5. Low income,19291.0,"POLYGON ((66.51861 37.36278, 67.07578 37.35614...",507.100698
1,Afghanistan,2001,45.6,4.074574,130000.0,0.0,0.5,0.09,Asia,Southern Asia,38041754.0,7. Least developed region,5. Low income,19291.0,"POLYGON ((66.51861 37.36278, 67.07578 37.35614...",507.100698
2,Afghanistan,2002,37.83,9.409158,3950000.0,0.0,0.56,0.13,Asia,Southern Asia,38041754.0,7. Least developed region,5. Low income,19291.0,"POLYGON ((66.51861 37.36278, 67.07578 37.35614...",507.100698
3,Afghanistan,2003,36.66,14.738506,25970000.0,0.0,0.63,0.31,Asia,Southern Asia,38041754.0,7. Least developed region,5. Low income,19291.0,"POLYGON ((66.51861 37.36278, 67.07578 37.35614...",507.100698
4,Afghanistan,2004,44.24,20.064968,,0.0,0.56,0.33,Asia,Southern Asia,38041754.0,7. Least developed region,5. Low income,19291.0,"POLYGON ((66.51861 37.36278, 67.07578 37.35614...",507.100698


In [83]:
# Print dtypes and non-null counts per column to see where values are missing.
preprocessed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3649 entries, 0 to 3648
Data columns (total 16 columns):
 #   Column                                                            Non-Null Count  Dtype   
---  ------                                                            --------------  -----   
 0   Entity                                                            3649 non-null   object  
 1   Year                                                              3649 non-null   int64   
 2   Renewable energy share in the total final energy consumption (%)  3455 non-null   float64 
 3   Access to electricity (% of population)                           3639 non-null   float64 
 4   Financial flows to developing countries (US $)                    1560 non-null   float64 
 5   Electricity from nuclear (TWh)                                    3523 non-null   float64 
 6   Electricity from renewables (TWh)                                 3628 non-null   float64 
 7   Electricity from fossil 

In [84]:
def _format_population(value):
    """Render one population estimate as display text.

    Parameters
    ----------
    value : float or missing
        Population estimate, or NaN/None when the entity had no match.

    Returns
    -------
    str
        'Not Available' for a missing value, otherwise the estimate rounded
        to the nearest integer and converted to a string.
    """
    if pd.isnull(value):
        return 'Not Available'
    return str(round(value))


# One population label per Entity.
# `pop_est` comes from the country table, so it repeats on every row of an
# entity; reduce to a single value per entity first and format afterwards,
# rather than formatting every row and then discarding most of them.
# `first()` yields NaN for an entity whose rows are all missing, which the
# formatter turns into 'Not Available'. The result is already a string column,
# so no extra cast is needed.
population_df = (
    preprocessed_df
    .groupby("Entity")["pop_est"]
    .first()
    .map(_format_population)
    .reset_index()
)
population_df

Unnamed: 0,Entity,pop_est
0,Afghanistan,38041754
1,Albania,2854191
2,Algeria,43053054
3,Angola,31825295
4,Antigua and Barbuda,Not Available
...,...,...
171,Uzbekistan,33580650
172,Vanuatu,299882
173,Yemen,29161922
174,Zambia,17861030


In [85]:
# Persist the per-entity population labels (columns: Entity, pop_est).
population_df.to_parquet("../data/preprocessed/population_df.parquet")

In [86]:
# Promote the selected columns to a GeoDataFrame and leave out the rows for
# 'French Guiana'.
# NOTE(review): the reason for excluding this one entity is not stated in the
# notebook — worth documenting.
preprocessed_gdf = gpd.GeoDataFrame(preprocessed_df).loc[
    lambda frame: frame['Entity'] != 'French Guiana'
]

In [87]:
# Write the preprocessed outputs next to the population table:
# the merged indicator/geometry table ...
preprocessed_gdf.to_parquet("../data/preprocessed/preprocessed_gdf.parquet")
# ... and the trimmed, renamed country table on its own.
world_countries.to_parquet("../data/preprocessed/world_countries.parquet")