## Table of Contents
1. Import Libraries and Load Data
2. Data Filtering
3. Data Cleaning

## 1. Import Libraries and Load Data

In [3]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import skimpy
from summarytools import dfSummary

In [5]:
# Load the raw global CO2 emissions table from the project's "Original Data" folder.
raw_co2_csv_path = '/Users/samabrams/Data Analysis Projects/CO2_Emissions_Project/02 Data/Original Data/visualizing_global_co2_data.csv'
df_co2 = pd.read_csv(raw_co2_csv_path)

In [7]:
# (rows, columns) of the raw dataframe - baseline to compare the filtered frames against.
df_co2.shape

(50598, 79)

In [9]:
# Dataframe Summary with dfSummary
# Builds a per-column overview of the raw data (dtype, stats / top values,
# frequencies, missing counts) and shows it as the cell output.
co2_summary = dfSummary(df_co2)
co2_summary

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,country [object],1. Lower-middle-income countries 2. Tuvalu 3. Brunei 4. Cambodia 5. South America 6. Cote d'Ivoire 7. Iceland 8. Singapore 9. Sierra Leone 10. North America 11. other,"272 (0.5%) 272 (0.5%) 272 (0.5%) 272 (0.5%) 272 (0.5%) 272 (0.5%) 272 (0.5%) 272 (0.5%) 272 (0.5%) 272 (0.5%) 47,878 (94.6%)",,0 (0.0%)
2,year [int64],Mean (sd) : 1925.4 (59.9) min < med < max: 1750.0 < 1929.0 < 2021.0 IQR (CV) : 93.0 (32.2),272 distinct values,,0 (0.0%)
3,iso_code [object],1. nan 2. AUS 3. BRN 4. PNG 5. KHM 6. TUV 7. GUF 8. GLP 9. GBR 10. GIN 11. other,"8,456 (16.7%) 272 (0.5%) 272 (0.5%) 272 (0.5%) 272 (0.5%) 272 (0.5%) 272 (0.5%) 272 (0.5%) 272 (0.5%) 272 (0.5%) 39,694 (78.4%)",,"8,456 (16.7%)"
4,population [float64],Mean (sd) : 59268096.5 (322286565.6) min < med < max: 21.0 < 2323117.0 < 7909295104.0 IQR (CV) : 9482889.5 (0.2),"39,348 distinct values",,"10,590 (20.9%)"
5,gdp [float64],Mean (sd) : 267758587335.4 (2103150997479.2) min < med < max: 49980000.0 < 25979985920.0 < 113630171365376.0 IQR (CV) : 105777557120.0 (0.1),"14,561 distinct values",,"36,034 (71.2%)"
6,cement_co2 [float64],Mean (sd) : 8.4 (63.0) min < med < max: 0.0 < 0.0 < 1672.6 IQR (CV) : 0.7 (0.1),"5,261 distinct values",,"25,624 (50.6%)"
7,cement_co2_per_capita [float64],Mean (sd) : 0.1 (0.1) min < med < max: 0.0 < 0.0 < 2.6 IQR (CV) : 0.1 (0.5),691 distinct values,,"27,884 (55.1%)"
8,co2 [float64],Mean (sd) : 380.2 (1801.5) min < med < max: 0.0 < 3.1 < 37123.9 IQR (CV) : 43.6 (0.2),"15,779 distinct values",,"19,249 (38.0%)"
9,co2_growth_abs [float64],Mean (sd) : 5.7 (58.7) min < med < max: -1818.5 < 0.0 < 1859.8 IQR (CV) : 0.8 (0.1),"9,103 distinct values",,"21,588 (42.7%)"
10,co2_growth_prct [float64],Mean (sd) : 20.5 (699.6) min < med < max: -100.0 < 3.8 < 102318.5 IQR (CV) : 11.1 (0.0),"15,301 distinct values",,"25,566 (50.5%)"


In [15]:
# Lift pandas' row-display cap so the full per-year listing is shown.
# NOTE: this is a session-wide option, so later cells also print untruncated output.
pd.options.display.max_rows = None

# Number of rows recorded for each year, in chronological order.
rows_per_year = df_co2['year'].value_counts()
rows_per_year.sort_index()

year
1750     34
1751     34
1752     34
1753     34
1754     34
1755     34
1756     34
1757     34
1758     34
1759     34
1760     34
1761     34
1762     34
1763     34
1764     34
1765     34
1766     34
1767     34
1768     34
1769     34
1770     34
1771     34
1772     34
1773     34
1774     34
1775     34
1776     34
1777     34
1778     34
1779     34
1780     34
1781     34
1782     34
1783     34
1784     34
1785     35
1786     35
1787     35
1788     35
1789     35
1790     35
1791     35
1792     37
1793     37
1794     37
1795     37
1796     37
1797     37
1798     37
1799     37
1800     39
1801     39
1802     41
1803     39
1804     40
1805     39
1806     39
1807     40
1808     39
1809     39
1810     40
1811     40
1812     40
1813     40
1814     40
1815     40
1816     40
1817     40
1818     40
1819     41
1820     41
1821     41
1822     41
1823     41
1824     41
1825     41
1826     41
1827     41
1828     41
1829     42
1830     60
1831     60
1832     59

## 2. Data Filtering

In [20]:
# Restrict the raw data to the 1851-2019 window (both ends inclusive), keeping every column.
# Filter first and copy afterwards: copying the whole raw frame before discarding rows
# duplicates data that is thrown away immediately, while the trailing .copy() still gives
# an independent frame that can be modified without touching df_co2.
# (The column-subset cell further down reassigns df_co2_time.)
df_co2_time = df_co2.loc[df_co2['year'].between(1851, 2019)].copy()

In [22]:
# (rows, columns) after the year filter - the column count should be unchanged.
df_co2_time.shape

(45900, 79)

In [24]:
# Confirm the raw dataframe was not altered by the filtering step above.
df_co2.shape

(50598, 79)

In [40]:
# Create new dataframe filtered to 1851-2019 - 1851 is the first year with complete data for most countries, 2019 is the last pre-pandemic year
# Also keeping only several key columns instead of pulling all 79
columns_to_keep = ['country', 'year', 'iso_code', 'co2', 'co2_per_capita', 'co2_growth_abs', 'coal_co2', 'oil_co2', 'gas_co2', 'cumulative_co2', 'population']

# Select rows and columns in a single .loc call, then copy the (much smaller) result.
# Copying the full raw frame before filtering would duplicate every column only to drop most of them.
# Series.between is inclusive on both ends, matching `>= 1851` and `<= 2019`.
df_co2_time = df_co2.loc[df_co2['year'].between(1851, 2019), columns_to_keep].copy()

In [42]:
# (rows, columns) of the year- and column-filtered frame; columns should equal len(columns_to_keep).
df_co2_time.shape

(45900, 11)

In [44]:
# Summary of new dataframe
# Same dfSummary overview as for the raw data, now on the 1851-2019 / key-column subset,
# to see how much of each kept column is missing.
co2_time_summary = dfSummary(df_co2_time)
co2_time_summary

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,country [object],1. Afghanistan 2. Niue 3. North Macedonia 4. North Korea 5. North America (excl. USA) 6. North America (GCP) 7. North America 8. Non-OECD (GCP) 9. Nigeria 10. Qatar 11. other,"169 (0.4%) 169 (0.4%) 169 (0.4%) 169 (0.4%) 169 (0.4%) 169 (0.4%) 169 (0.4%) 169 (0.4%) 169 (0.4%) 169 (0.4%) 44,210 (96.3%)",,0 (0.0%)
2,year [int64],Mean (sd) : 1935.1 (48.8) min < med < max: 1851.0 < 1935.0 < 2019.0 IQR (CV) : 84.0 (39.7),169 distinct values,,0 (0.0%)
3,iso_code [object],1. nan 2. PAK 3. NLD 4. ANT 5. NCL 6. NZL 7. NIC 8. NER 9. NGA 10. NIU 11. other,"6,692 (14.6%) 169 (0.4%) 169 (0.4%) 169 (0.4%) 169 (0.4%) 169 (0.4%) 169 (0.4%) 169 (0.4%) 169 (0.4%) 169 (0.4%) 37,687 (82.1%)",,"6,692 (14.6%)"
4,co2 [float64],Mean (sd) : 425.2 (1871.6) min < med < max: 0.0 < 4.6 < 37082.6 IQR (CV) : 56.8 (0.2),"15,108 distinct values",,"19,036 (41.5%)"
5,co2_per_capita [float64],Mean (sd) : 4.0 (15.7) min < med < max: 0.0 < 1.1 < 824.5 IQR (CV) : 4.3 (0.3),"8,571 distinct values",,"21,645 (47.2%)"
6,co2_growth_abs [float64],Mean (sd) : 6.7 (59.5) min < med < max: -863.6 < 0.0 < 1800.3 IQR (CV) : 1.1 (0.1),"8,830 distinct values",,"21,257 (46.3%)"
7,coal_co2 [float64],Mean (sd) : 161.1 (753.8) min < med < max: 0.0 < 0.6 < 15051.5 IQR (CV) : 16.0 (0.2),"9,876 distinct values",,"22,828 (49.7%)"
8,oil_co2 [float64],Mean (sd) : 116.1 (634.8) min < med < max: 0.0 < 1.6 < 12345.7 IQR (CV) : 14.8 (0.2),"10,324 distinct values",,"22,756 (49.6%)"
9,gas_co2 [float64],Mean (sd) : 51.0 (306.7) min < med < max: 0.0 < 0.0 < 7647.5 IQR (CV) : 1.6 (0.2),"5,953 distinct values",,"22,853 (49.8%)"
10,cumulative_co2 [float64],Mean (sd) : 12094.7 (67183.8) min < med < max: 0.0 < 69.2 < 1664542.0 IQR (CV) : 1097.2 (0.2),"19,527 distinct values",,"20,996 (45.7%)"


In [32]:
# Rows per country in the filtered window, most frequent first.
# Presumably used to spot entities with fewer rows than the rest (i.e. missing years) - verify against the removal list below.
df_co2_time['country'].value_counts()

country
Afghanistan                                      169
Niue                                             169
North Macedonia                                  169
North Korea                                      169
North America (excl. USA)                        169
North America (GCP)                              169
North America                                    169
Non-OECD (GCP)                                   169
Nigeria                                          169
Qatar                                            169
Niger                                            169
Nicaragua                                        169
New Zealand                                      169
New Caledonia                                    169
Netherlands Antilles                             169
Netherlands                                      169
Norway                                           169
OECD (GCP)                                       169
OECD (Jones et al. 2023)              

In [46]:
# Entities to drop because they do not have an entry for every year in the window
places_to_remove = [
    "Panama Canal Zone (GCP)",
    "St. Kitts-Nevis-Anguilla (GCP)",
    "Ryukyu Islands (GCP)",
    "French Equatorial Africa (GCP)",
    "French West Africa (GCP)",
    "Leeward Islands (GCP)",
    "Kuwaiti Oil Fires (GCP)",
]

# Flag the rows that belong to those entities, then keep everything else
is_removed_place = df_co2_time['country'].isin(places_to_remove)
df_co2_time2 = df_co2_time.loc[~is_removed_place]


In [50]:
# (rows, columns) after dropping the entities in places_to_remove.
df_co2_time2.shape

(45799, 11)

In [52]:
# Missing values per column, fewest first
missing_per_column = df_co2_time2.isna().sum()
missing_per_column.sort_values()

country               0
year                  0
iso_code           6591
population         8338
co2               19036
cumulative_co2    20895
co2_growth_abs    21156
co2_per_capita    21544
oil_co2           22756
coal_co2          22828
gas_co2           22853
dtype: int64

In [58]:
# Save the filtered time series to the project's "Prepared Data" folder.
# The dataframe index is written too (to_csv default), so it appears as an unnamed first column in the file.
time_series_csv_path = '/Users/samabrams/Data Analysis Projects/CO2_Emissions_Project/02 Data/Prepared Data/co2 time series.csv'
df_co2_time2.to_csv(time_series_csv_path)

### Filtering for Most Complete Data

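Because there are so many missing values, I'm going to further filter this data down to only those countries with the most complete datasets. I'm treating a country as complete when it has a non-null `co2` value for every year it appears in the filtered data.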

In [63]:
# Per country: how many non-null `co2` values exist (co2_count) and
# how many distinct years that country appears in (total_years).
country_data = (
    df_co2_time2
    .groupby('country')
    .agg(
        co2_count=('co2', 'count'),
        total_years=('year', 'nunique'),
    )
)

In [65]:
# A country counts as complete when it has a non-null 'co2' value for each of its years
has_full_co2 = country_data['co2_count'].eq(country_data['total_years'])
complete_data_countries = country_data.loc[has_full_co2]

In [67]:
# Report which countries passed the completeness check (or that none did)
complete_country_names = complete_data_countries.index.to_list()
if complete_country_names:
    print("Countries with complete data for the 'co2' column:")
    print(complete_country_names)
else:
    print("No countries have complete data for the 'co2' column.")

Countries with complete data for the 'co2' column:
['Africa', 'Africa (GCP)', 'Andorra', 'Asia', 'Asia (GCP)', 'Asia (excl. China and India)', 'Australia', 'Austria', 'Belgium', 'Brunei', 'Cambodia', 'Canada', 'Central America (GCP)', "Cote d'Ivoire", 'Denmark', 'Europe', 'Europe (GCP)', 'Europe (excl. EU-27)', 'Europe (excl. EU-28)', 'European Union (27)', 'European Union (27) (GCP)', 'European Union (28)', 'France', 'French Guiana', 'Germany', 'Guadeloupe', 'Guinea', 'High-income countries', 'Hungary', 'Iceland', 'Low-income countries', 'Lower-middle-income countries', 'Martinique', 'Mauritania', 'Mayotte', 'Middle East (GCP)', 'Netherlands', 'Non-OECD (GCP)', 'North America', 'North America (GCP)', 'North America (excl. USA)', 'Norway', 'OECD (GCP)', 'Oceania', 'Oceania (GCP)', 'Papua New Guinea', 'Poland', 'Reunion', 'Sierra Leone', 'Singapore', 'South America', 'South America (GCP)', 'Spain', 'Sweden', 'Tuvalu', 'United Kingdom', 'United States', 'Upper-middle-income countries', '

In [69]:
# Count Countries
# Number of entities (rows of complete_data_countries) that have a complete `co2` series.
len(complete_data_countries)

59

In [71]:
# Build the final dataframe: only rows whose country has a complete `co2` series
complete_data_country_list = complete_data_countries.index.tolist()

in_complete_country = df_co2_time2['country'].isin(complete_data_country_list)
df_co2_complete = df_co2_time2.loc[in_complete_country]

In [73]:
# Peek at the first rows of the complete-data frame as a sanity check.
df_co2_complete.head()

Unnamed: 0,country,year,iso_code,co2,co2_per_capita,co2_growth_abs,coal_co2,oil_co2,gas_co2,cumulative_co2,population
273,Africa,1851,,0.0,0.0,0.0,,,,0.0,112843149.0
274,Africa,1852,,0.0,0.0,0.0,,,,0.0,113379285.0
275,Africa,1853,,0.0,0.0,0.0,,,,0.0,113793047.0
276,Africa,1854,,0.0,0.0,0.0,,,,0.0,114216849.0
277,Africa,1855,,0.0,0.0,0.0,,,,0.0,114631842.0


In [75]:
# Summary of new dataframe
# dfSummary overview of the complete-data subset; `co2` should now show no missing values,
# while the other columns may still have gaps.
co2_complete_summary = dfSummary(df_co2_complete)
co2_complete_summary

No,Variable,Stats / Values,Freqs / (% of Valid),Graph,Missing
1,country [object],1. Africa 2. Oceania (GCP) 3. Martinique 4. Mauritania 5. Mayotte 6. Middle East (GCP) 7. Netherlands 8. Non-OECD (GCP) 9. North America 10. North America (GCP) 11. other,"169 (1.7%) 169 (1.7%) 169 (1.7%) 169 (1.7%) 169 (1.7%) 169 (1.7%) 169 (1.7%) 169 (1.7%) 169 (1.7%) 169 (1.7%) 8,281 (83.1%)",,0 (0.0%)
2,year [int64],Mean (sd) : 1935.0 (48.8) min < med < max: 1851.0 < 1935.0 < 2019.0 IQR (CV) : 84.0 (39.7),169 distinct values,,0 (0.0%)
3,iso_code [object],1. nan 2. AND 3. GBR 4. TUV 5. SWE 6. ESP 7. SGP 8. SLE 9. REU 10. POL 11. other,"4,732 (47.5%) 169 (1.7%) 169 (1.7%) 169 (1.7%) 169 (1.7%) 169 (1.7%) 169 (1.7%) 169 (1.7%) 169 (1.7%) 169 (1.7%) 3,718 (37.3%)",,"4,732 (47.5%)"
4,co2 [float64],Mean (sd) : 1055.1 (2936.0) min < med < max: 0.0 < 41.0 < 37082.6 IQR (CV) : 528.2 (0.4),"6,544 distinct values",,0 (0.0%)
5,co2_per_capita [float64],Mean (sd) : 3.9 (5.9) min < med < max: 0.0 < 1.7 < 245.1 IQR (CV) : 6.0 (0.7),"4,427 distinct values",,"2,309 (23.2%)"
6,co2_growth_abs [float64],Mean (sd) : 16.8 (99.0) min < med < max: -863.6 < 0.0 < 1800.3 IQR (CV) : 5.9 (0.2),"4,677 distinct values",,"1,860 (18.7%)"
7,coal_co2 [float64],Mean (sd) : 531.0 (1329.2) min < med < max: 0.0 < 45.9 < 15051.5 IQR (CV) : 421.4 (0.4),"5,113 distinct values",,"3,713 (37.2%)"
8,oil_co2 [float64],Mean (sd) : 377.3 (1174.6) min < med < max: 0.0 < 7.8 < 12345.7 IQR (CV) : 119.6 (0.3),"4,312 distinct values",,"3,717 (37.3%)"
9,gas_co2 [float64],Mean (sd) : 168.4 (569.5) min < med < max: 0.0 < 0.0 < 7647.5 IQR (CV) : 24.9 (0.3),"2,586 distinct values",,"3,744 (37.5%)"
10,cumulative_co2 [float64],Mean (sd) : 34283.9 (114056.0) min < med < max: 0.0 < 511.7 < 1664542.0 IQR (CV) : 11254.7 (0.3),"6,103 distinct values",,"1,859 (18.6%)"


In [77]:
# Missing values per column in the complete-data frame, fewest first
df_co2_complete.isna().sum().sort_values()

country              0
year                 0
co2                  0
cumulative_co2    1859
co2_growth_abs    1860
co2_per_capita    2309
population        3154
coal_co2          3713
oil_co2           3717
gas_co2           3744
iso_code          4732
dtype: int64

In [79]:
# (rows, columns) of the complete-data frame.
df_co2_complete.shape

(9971, 11)

In [83]:
# Save the complete-data time series to the project's "Prepared Data" folder.
# The dataframe index is written too (to_csv default), so it appears as an unnamed first column in the file.
complete_csv_path = '/Users/samabrams/Data Analysis Projects/CO2_Emissions_Project/02 Data/Prepared Data/co2_time_series_complete.csv'
df_co2_complete.to_csv(complete_csv_path)