In [1]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the owid/co2-data CSV directly from its raw GitHub URL.
# NOTE(review): the URL tracks the `master` branch, so the shape shown below may
# change whenever the upstream file is updated.
url = 'https://raw.githubusercontent.com/owid/co2-data/master/owid-co2-data.csv'
df = pd.read_csv(url)
# Show (rows, columns) as a quick sanity check on the download.
df.shape

(25989, 60)

In [3]:
# Rows without an iso_code are not individual countries (e.g. 'International Transport',
# 'North America (excl. USA)'), and 'OWID_WRL' is the global total. Neither is needed
# for a per-country analysis, so both are removed with one combined mask.
has_iso_code = df['iso_code'].notna()
is_world_total = df['iso_code'] == 'OWID_WRL'
df = df[has_iso_code & ~is_world_total]

# Preview the ten largest emitters for 2020 (the most recent year) to get an idea
# of which countries are worth forecasting.
df.loc[df['year'] == 2020].sort_values('co2', ascending=False).head(10)

Unnamed: 0,iso_code,country,year,co2,co2_per_capita,trade_co2,cement_co2,cement_co2_per_capita,coal_co2,coal_co2_per_capita,...,ghg_excluding_lucf_per_capita,methane,methane_per_capita,nitrous_oxide,nitrous_oxide_per_capita,population,gdp,primary_energy_consumption,energy_per_capita,energy_per_gdp
4723,CHN,China,2020,10667.887,7.412,,858.233,0.596,7421.101,5.156,...,,,,,,1439324000.0,,40404.443,28071.824,
24661,USA,United States,2020,4712.771,14.238,,40.795,0.123,888.649,2.685,...,,,,,,331002600.0,,24387.388,73677.319,
11277,IND,India,2020,2441.792,1.769,,122.908,0.089,1587.552,1.15,...,,,,,,1380004000.0,,8884.38,6437.936,
19597,RUS,Russia,2020,1577.136,10.807,,20.306,0.139,356.946,2.446,...,,,,,,145934500.0,,7865.165,53895.194,
12336,JPN,Japan,2020,1030.775,8.15,,25.376,0.201,402.979,3.186,...,,,,,,126476500.0,,4730.588,37402.909,
11595,IRN,Iran,2020,745.035,8.87,,23.877,0.284,5.114,0.061,...,,,,,,83992950.0,,3341.689,39785.35,
9619,DEU,Germany,2020,644.31,7.69,,13.287,0.159,199.077,2.376,...,,,,,,83783940.0,,3364.175,40152.984,
20269,SAU,Saudi Arabia,2020,625.508,17.967,,25.593,0.735,,,...,,,,,,34813870.0,,2933.478,84261.772,
21590,KOR,South Korea,2020,597.605,11.656,,22.871,0.446,282.883,5.518,...,,,,,,51269180.0,,3274.316,63865.188,
11409,IDN,Indonesia,2020,589.5,2.155,,33.804,0.124,300.518,1.099,...,,,,,,273523600.0,,2120.519,7752.6,


In [4]:
# Shape after removing the rows with a missing iso_code and the 'OWID_WRL' rows.
df.shape

(21689, 60)

In [5]:
# Many columns are of little interest and are largely empty. Keep only the columns
# that have at most 1180 null values (1180 being the number of nulls in the
# co2_growth_abs column); everything sparser than that is discarded.
null_counts = df.isnull().sum()
df = df.loc[:, null_counts <= 1180]

In [6]:
# Shape after discarding the columns with more than 1180 null values.
df.shape

(21689, 11)

In [7]:
# Rows with no co2 value cannot contribute to an emissions analysis (co2 is the
# quantity being analysed), so list them here for inspection.
missing_co2 = df['co2'].isna()
df[missing_co2]

Unnamed: 0,iso_code,country,year,co2,co2_per_capita,co2_growth_prct,co2_growth_abs,cumulative_co2,share_global_co2,share_global_cumulative_co2,population
556,ATA,Antarctica,2008,,,0.0,,,,,
557,ATA,Antarctica,2009,,,0.0,,,,,
558,ATA,Antarctica,2010,,,0.0,,,,,
559,ATA,Antarctica,2011,,,0.0,,,,,
560,ATA,Antarctica,2012,,,0.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
25195,VEN,Venezuela,1907,,,0.0,,,,,2919560.0
25197,VEN,Venezuela,1909,,,0.0,,,,,2982568.0
25198,VEN,Venezuela,1910,,,0.0,,,,,3011475.0
25199,VEN,Venezuela,1911,,,0.0,,,,,3038601.0


In [8]:
# Discard every row whose co2 value is NaN, then show the resulting shape.
df = df.dropna(subset=['co2'])
df.shape

(20841, 11)

In [9]:
# Report how many null values remain in each column, to confirm the dataframe
# holds the information needed for the analysis.
for column, null_count in df.isnull().sum().items():
    print(f"Column {column} has {null_count} null values")

Column iso_code has 0 null values
Column country has 0 null values
Column year has 0 null values
Column co2 has 0 null values
Column co2_per_capita has 205 null values
Column co2_growth_prct has 218 null values
Column co2_growth_abs has 332 null values
Column cumulative_co2 has 0 null values
Column share_global_co2 has 0 null values
Column share_global_cumulative_co2 has 0 null values
Column population has 205 null values


In [10]:
# Write the cleaned dataframe to 'CO2_emissions.csv' without the index column.
df.to_csv('CO2_emissions.csv',index=False)

In [11]:
# Quality check: read the exported file back and verify that it has the same shape
# as the dataframe that was written. A mismatch (or a missing file) means the export
# failed or the notebook is running from a different working directory.
df_check = pd.read_csv('CO2_emissions.csv')
assert df_check.shape == df.shape, (
    f"Exported CSV has shape {df_check.shape}, expected {df.shape}"
)
df_check.shape

(20841, 11)

In [12]:
# Download the country geoJSON that the emissions data will be attached to.
import requests

# The timeout stops the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() turns an HTTP error into an exception instead of letting an
# error page be parsed as if it were the geoJSON payload.
r = requests.get(
    'https://raw.githubusercontent.com/ahalota/Leaflet.CountrySelect/master/countries.geo.json',
    timeout=30,
)
r.raise_for_status()
geo_data = r.json()

In [13]:
# Save the geoJSON data to its own JSON file so it can be read back in later.
import json

geo_json_text = json.dumps(geo_data)
with open('geo_data.json', 'w') as f:
    f.write(geo_json_text)

In [36]:
# Keep only the columns needed to merge with the geoJSON data.
merge_columns = ['country', 'year', 'co2']
df = df.loc[:, merge_columns]
df

Unnamed: 0,country,year,co2
0,Afghanistan,1949,0.015
1,Afghanistan,1950,0.084
2,Afghanistan,1951,0.092
3,Afghanistan,1952,0.092
4,Afghanistan,1953,0.106
...,...,...,...
25984,Zimbabwe,2016,10.738
25985,Zimbabwe,2017,9.582
25986,Zimbabwe,2018,11.854
25987,Zimbabwe,2019,10.949


In [18]:
# Collect the distinct country names in the dataframe so they can be compared with
# the 'name' property of each geoJSON feature.
unique_countries = df['country'].unique()

In [19]:
# Display the unique country names for a visual comparison with the geoJSON names.
unique_countries

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Anguilla', 'Antarctica', 'Antigua and Barbuda', 'Argentina',
       'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan',
       'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
       'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia',
       'Bonaire Sint Eustatius and Saba', 'Bosnia and Herzegovina',
       'Botswana', 'Brazil', 'British Virgin Islands', 'Brunei',
       'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon',
       'Canada', 'Cape Verde', 'Central African Republic', 'Chad',
       'Chile', 'China', 'Christmas Island', 'Colombia', 'Comoros',
       'Congo', 'Cook Islands', 'Costa Rica', "Cote d'Ivoire", 'Croatia',
       'Cuba', 'Curacao', 'Cyprus', 'Czechia',
       'Democratic Republic of Congo', 'Denmark', 'Djibouti', 'Dominica',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador',
       'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiop

In [40]:
# Build a {country: {year: co2}} lookup in one pass over the dataframe, rather than
# re-filtering the whole frame for every geoJSON feature. Years are cast to strings
# because they become JSON object keys, and .tolist() converts the numpy scalars to
# plain Python values before they are stored in the geoJSON structure.
emissions_by_country = {
    country: dict(zip(group['year'].astype(str).tolist(), group['co2'].tolist()))
    for country, group in df.groupby('country')
}

# Attach the year -> co2 mapping to each feature whose 'name' matches a country in
# the dataframe. Names with no match are collected rather than silently skipped, so
# spelling differences between the two sources are visible.
unmatched_names = []
for feature in geo_data['features']:
    geo_name = feature['properties']['name']
    yearly_co2 = emissions_by_country.get(geo_name)
    if yearly_co2 is None:
        unmatched_names.append(geo_name)
        continue
    # Add the key 'years' holding the dictionary of year:co2 emissions
    feature['properties']['years'] = yearly_co2

# geoJSON feature names that received no emissions data
unmatched_names

In [41]:
# Inspect the first feature's properties to confirm the 'years' dictionary was added.
geo_data['features'][0]['properties']

{'name': 'Afghanistan',
 'years': {'1949': 0.015,
  '1950': 0.084,
  '1951': 0.092,
  '1952': 0.092,
  '1953': 0.106,
  '1954': 0.106,
  '1955': 0.154,
  '1956': 0.183,
  '1957': 0.293,
  '1958': 0.33,
  '1959': 0.385,
  '1960': 0.414,
  '1961': 0.491,
  '1962': 0.689,
  '1963': 0.707,
  '1964': 0.839,
  '1965': 1.007,
  '1966': 1.091,
  '1967': 1.282,
  '1968': 1.223,
  '1969': 0.941,
  '1970': 1.67,
  '1971': 1.894,
  '1972': 1.53,
  '1973': 1.635,
  '1974': 1.913,
  '1975': 2.121,
  '1976': 1.981,
  '1977': 2.384,
  '1978': 2.153,
  '1979': 2.233,
  '1980': 1.756,
  '1981': 1.978,
  '1982': 2.095,
  '1983': 2.52,
  '1984': 2.822,
  '1985': 3.501,
  '1986': 3.134,
  '1987': 3.114,
  '1988': 2.857,
  '1989': 2.765,
  '1990': 2.603,
  '1991': 2.427,
  '1992': 1.379,
  '1993': 1.333,
  '1994': 1.282,
  '1995': 1.23,
  '1996': 1.165,
  '1997': 1.084,
  '1998': 1.029,
  '1999': 0.81,
  '2000': 0.758,
  '2001': 0.798,
  '2002': 1.052,
  '2003': 1.186,
  '2004': 0.889,
  '2005': 1.303,
  '2

In [42]:
# Replace the contents of geo_data.json with the geoJSON that now carries the
# per-country 'years' dictionaries.
updated_geo_json_text = json.dumps(geo_data)
with open('geo_data.json', 'w') as f:
    f.write(updated_geo_json_text)

In [47]:
# Reload the exported .json file to confirm the update was written correctly.
# A context manager is used so the file handle is closed as soon as it is parsed.
with open('geo_data.json') as f:
    data = json.load(f)
data['features'][0]

{'type': 'Feature',
 'id': 'AFG',
 'properties': {'name': 'Afghanistan',
  'years': {'1949': 0.015,
   '1950': 0.084,
   '1951': 0.092,
   '1952': 0.092,
   '1953': 0.106,
   '1954': 0.106,
   '1955': 0.154,
   '1956': 0.183,
   '1957': 0.293,
   '1958': 0.33,
   '1959': 0.385,
   '1960': 0.414,
   '1961': 0.491,
   '1962': 0.689,
   '1963': 0.707,
   '1964': 0.839,
   '1965': 1.007,
   '1966': 1.091,
   '1967': 1.282,
   '1968': 1.223,
   '1969': 0.941,
   '1970': 1.67,
   '1971': 1.894,
   '1972': 1.53,
   '1973': 1.635,
   '1974': 1.913,
   '1975': 2.121,
   '1976': 1.981,
   '1977': 2.384,
   '1978': 2.153,
   '1979': 2.233,
   '1980': 1.756,
   '1981': 1.978,
   '1982': 2.095,
   '1983': 2.52,
   '1984': 2.822,
   '1985': 3.501,
   '1986': 3.134,
   '1987': 3.114,
   '1988': 2.857,
   '1989': 2.765,
   '1990': 2.603,
   '1991': 2.427,
   '1992': 1.379,
   '1993': 1.333,
   '1994': 1.282,
   '1995': 1.23,
   '1996': 1.165,
   '1997': 1.084,
   '1998': 1.029,
   '1999': 0.81,
   '20