# Import libraries

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import world_bank_data as wb

# Load data:

In [2]:
CO2_EMISSION_DATA = "EN.ATM.CO2E.KT"
COUNTRY_POPULATION_SIZE_DATA = "SP.POP.TOTL"
POVERTY_HEADCOUNT_RATIO_DATA = "SI.POV.DDAY"
COUNTRY_LAND_AREA_DATA = "AG.LND.TOTL.K2"
COUNTRY_GDP_DATA = "NY.GDP.MKTP.CD"

## CO2 emissions data:

In [3]:
co2_emissions_series = (
    wb.get_series(CO2_EMISSION_DATA, simplify_index=True)
    .reset_index()
    .rename(columns={CO2_EMISSION_DATA: "co2 emissions"})
)

In [4]:
co2_emissions_df = pd.DataFrame(co2_emissions_series)

In [5]:
co2_emissions_df.head()

Unnamed: 0,Country,Year,co2 emissions
0,Arab World,1960,56005.299798
1,Arab World,1961,62578.60559
2,Arab World,1962,70562.050726
3,Arab World,1963,85085.751513
4,Arab World,1964,99693.913771


## Country population data:

In [6]:
# year 2019 has the most recent non-missing data
country_population_series = (
    wb.get_series(COUNTRY_POPULATION_SIZE_DATA, simplify_index=True, date="2019")
    .reset_index()
    .rename(columns={COUNTRY_POPULATION_SIZE_DATA: "population size"})
)

In [7]:
country_population_df = pd.DataFrame(country_population_series)

In [8]:
country_population_df.head()

Unnamed: 0,Country,population size
0,Arab World,427870300.0
1,Caribbean small states,7401381.0
2,Central Europe and the Baltics,102378600.0
3,Early-demographic dividend,3290291000.0
4,East Asia & Pacific,2340628000.0


## Country poverty percentage data:

In [9]:
poverty_percentages_series = (
    wb.get_series(POVERTY_HEADCOUNT_RATIO_DATA, simplify_index=True)
    .reset_index()
    .rename(columns={POVERTY_HEADCOUNT_RATIO_DATA: "poverty percentage"})
)

In [10]:
poverty_percentages_df = pd.DataFrame(poverty_percentages_series)

In [11]:
poverty_percentages_df.head()

Unnamed: 0,Country,Year,poverty percentage
0,Arab World,1960,
1,Arab World,1961,
2,Arab World,1962,
3,Arab World,1963,
4,Arab World,1964,


## Country land area data:

In [12]:
# year 2017 has the most recent non-missing data
country_land_area_series = (
    wb.get_series(COUNTRY_LAND_AREA_DATA, simplify_index=True, date="2017")
    .reset_index()
    .rename(columns={COUNTRY_LAND_AREA_DATA: "land area"})
)

In [13]:
country_land_area_df = pd.DataFrame(country_land_area_series)

In [14]:
country_land_area_df.head()

Unnamed: 0,Country,land area
0,Arab World,11232650.0
1,Caribbean small states,404850.0
2,Central Europe and the Baltics,1105054.0
3,Early-demographic dividend,33107750.0
4,East Asia & Pacific,24396750.0


## Country GDP data:

In [15]:
country_gdp_series = (
    wb.get_series(COUNTRY_GDP_DATA, simplify_index=True)
    .reset_index()
    .rename(columns={COUNTRY_GDP_DATA: "GDP"})
)

In [16]:
country_gdp_df = pd.DataFrame(country_gdp_series)

In [17]:
country_gdp_df.head()

Unnamed: 0,Country,Year,GDP
0,Arab World,1960,
1,Arab World,1961,
2,Arab World,1962,
3,Arab World,1963,
4,Arab World,1964,


## Country income group data:

In [18]:
countries_df = wb.get_countries().reset_index()

In [19]:
countries_df.head()

Unnamed: 0,id,iso2Code,name,region,adminregion,incomeLevel,lendingType,capitalCity,longitude,latitude
0,ABW,AW,Aruba,Latin America & Caribbean,,High income,Not classified,Oranjestad,-70.0167,12.5167
1,AFG,AF,Afghanistan,South Asia,South Asia,Low income,IDA,Kabul,69.1761,34.5228
2,AFR,A9,Africa,Aggregates,,Aggregates,Aggregates,,,
3,AGO,AO,Angola,Sub-Saharan Africa,Sub-Saharan Africa (excluding high income),Lower middle income,IBRD,Luanda,13.242,-8.81155
4,ALB,AL,Albania,Europe & Central Asia,Europe & Central Asia (excluding high income),Upper middle income,IBRD,Tirane,19.8172,41.3317


# Preprocess data:

## Convert `dtyps`:

### CO2 emissions data:

In [20]:
co2_emissions_df.dtypes

Country           object
Year              object
co2 emissions    float64
dtype: object

In [21]:
co2_emissions_df["Year"] = co2_emissions_df["Year"].astype(int)

In [22]:
co2_emissions_df.dtypes

Country           object
Year               int64
co2 emissions    float64
dtype: object

### Country population data:

In [23]:
country_population_df.dtypes

Country             object
population size    float64
dtype: object

### Country poverty percentage data:

In [24]:
poverty_percentages_series.dtypes

Country                object
Year                   object
poverty percentage    float64
dtype: object

In [25]:
poverty_percentages_df["Year"] = poverty_percentages_df["Year"].astype(int)

In [26]:
poverty_percentages_df.dtypes

Country                object
Year                    int64
poverty percentage    float64
dtype: object

### Country land area data:

In [27]:
country_land_area_df.dtypes

Country       object
land area    float64
dtype: object

### Country GDP data:

In [28]:
country_gdp_df.dtypes

Country     object
Year        object
GDP        float64
dtype: object

In [29]:
country_gdp_df["Year"] = country_gdp_df["Year"].astype(int)

In [30]:
country_gdp_df.dtypes

Country     object
Year         int64
GDP        float64
dtype: object

## Drop non-country rows

In [31]:
non_countries = countries_df.loc[lambda x: x["region"] == "Aggregates"]["name"].values

In [32]:
non_countries = np.append(non_countries, "Latin America & Caribbean")
non_countries = np.append(non_countries, "Sub-Saharan Africa")

In [33]:
for df in [
    co2_emissions_df,
    country_population_df,
    poverty_percentages_df,
    country_land_area_df,
    country_gdp_df,
]:
    print(f"{df.columns[-1]} shape before: {df.shape}")
    df.drop(index=df[df["Country"].isin(non_countries)].index, inplace=True)
    print(f"{df.columns[-1]} shape after: {df.shape}")

co2 emissions shape before: (16104, 3)
co2 emissions shape after: (13237, 3)
population size shape before: (264, 2)
population size shape after: (217, 2)
poverty percentage shape before: (16104, 3)
poverty percentage shape after: (13237, 3)
land area shape before: (264, 2)
land area shape after: (217, 2)
GDP shape before: (16104, 3)
GDP shape after: (13237, 3)


## Create aggregated data:

In [34]:
co2_emissions_agg_df = (
    co2_emissions_df.groupby(by="Country")
    .agg({"co2 emissions": "sum"})
    .reset_index()
    .sort_values(by="co2 emissions", ascending=False)
)

In [35]:
co2_emissions_agg_df.head()

Unnamed: 0,Country,co2 emissions
206,United States,269847800.0
41,China,189678900.0
161,Russian Federation,133252600.0
98,Japan,54256950.0
89,India,43495700.0


## Merge data into two data frames:

In [88]:
columns_to_drop = [
    "id",
    "iso2Code",
    "name",
    "adminregion",
    "lendingType",
    "capitalCity",
    "longitude",
    "latitude",
]

In [91]:
overall_stats_df = (
    co2_emissions_agg_df.merge(right=country_land_area_df, on="Country")
    .merge(right=country_population_df, on="Country")
    .merge(right=countries_df, left_on="Country", right_on="name")
    .drop(columns=columns_to_drop)
    .sort_values(by="co2 emissions")
)

In [92]:
overall_stats_df.head()

Unnamed: 0,Country,co2 emissions,land area,population size,region,incomeLevel
216,Isle of Man,0.0,570.0,84584.0,Europe & Central Asia,High income
207,Guam,0.0,540.0,167294.0,East Asia & Pacific,High income
208,Virgin Islands (U.S.),0.0,350.0,106631.0,Latin America & Caribbean,High income
209,American Samoa,0.0,200.0,55312.0,East Asia & Pacific,Upper middle income
210,St. Martin (French part),0.0,54.4,38002.0,Latin America & Caribbean,High income
