<center><h1>Exploratory Data Analysis (EDA)</h1></center>

In [1]:
# Standard Library
import os
import warnings

# Data Manipulation
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from ydata_profiling import ProfileReport

# Visualizations
import seaborn as sns
import missingno as mso
import matplotlib.pyplot as plt
import plotly.express as px

# Cloud Operations
from aws_data_processor import DataProcessor

# Notebook-wide settings: silence every warning and show all columns when a frame is displayed.
warnings.simplefilter("ignore")
pd.options.display.max_columns = None

# Names of the DynamoDB tables holding the NOAA data, one table per gas (CO2, CH4, N2O, SF6).
co2_table = 'CO2DataNOAA'
ch4_table = 'CH4DataNOAA'
n2o_table = 'N2ODataNOAA'
sf6_table = 'SF6DataNOAA'

<center><h1>Carbon Dioxide (CO<sub>2</sub>)</h1></center>

## Fetch Data From AWS

In [None]:
# Fetch all CO2 data from the AWS table through the project's DataProcessor
processor = DataProcessor(co2_table)
fetched_co2_data = processor.fetch_all_data()

# An empty result means the table is empty or DynamoDB could not be read.
# Fail here instead of printing and carrying on: otherwise `co2_df` is never assigned,
# and later cells either raise a NameError or silently reuse a stale `co2_df`
# left in the kernel by an earlier run.
if fetched_co2_data.empty:
    raise RuntimeError("No data fetched. Check DynamoDB access or table details.")

# Turn the raw fetch into the structured table used by the rest of the notebook
co2_df = processor.process_data(fetched_co2_data)

'Full table data retrieval completed.'

Unnamed: 0,datetime,site,ppm,latitude,longitude,altitude,elevation,intake_height,qcflag,year,month,day,season,co2_change_rate,rolling_avg_ppm,min_ppm_year,max_ppm_year,total_site_ppm_annual,gas
0,1968-01-16 20:04:00+00:00,NWR,322.36,40.05,-105.63,3526.0,3523.0,3.0,...,1968,1,16,Winter,0.0,322.36,322.36,322.36,82,CO2
1,1968-01-16 20:35:00+00:00,NWR,322.42,40.05,-105.63,3526.0,3523.0,3.0,...,1968,1,16,Winter,0.0600000000000022,322.39,322.36,322.42,82,CO2
2,1968-01-16 21:00:00+00:00,NWR,322.65,40.05,-105.63,3526.0,3523.0,3.0,...,1968,1,16,Winter,0.2299999999999613,322.476667,322.36,322.65,82,CO2
3,1968-01-16 21:03:00+00:00,NWR,322.46,40.05,-105.63,3526.0,3523.0,3.0,...,1968,1,16,Winter,-0.1899999999999977,322.4725,322.36,322.65,82,CO2
4,1968-01-16 21:30:00+00:00,NWR,322.56,40.05,-105.63,3526.0,3523.0,3.0,...,1968,1,16,Winter,0.1000000000000227,322.49,322.36,322.65,82,CO2


## Data Wrangling

In [8]:
# Summarise each column's dtype and non-null count before any type coercion
co2_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1217013 entries, 0 to 1217012
Data columns (total 19 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   datetime               1217013 non-null  object 
 1   site                   1217013 non-null  object 
 2   ppm                    1217013 non-null  object 
 3   latitude               1217013 non-null  object 
 4   longitude              1217013 non-null  object 
 5   altitude               1217013 non-null  object 
 6   elevation              1217013 non-null  object 
 7   intake_height          1217013 non-null  object 
 8   qcflag                 1217013 non-null  object 
 9   year                   1217013 non-null  object 
 10  month                  1217013 non-null  object 
 11  day                    1217013 non-null  object 
 12  season                 1217013 non-null  object 
 13  co2_change_rate        1217013 non-null  object 
 14  rolling_avg_ppm   

#### Data Type Coercion

In [9]:
# Parse the timestamp column into pandas datetimes
co2_df['datetime'] = pd.to_datetime(co2_df['datetime'])

# Numeric measurements and derived statistics that arrive as object dtype.
# 'rolling_avg_ppm' and 'total_site_ppm_annual' are included alongside their sibling
# statistics ('min_ppm_year', 'max_ppm_year') so no numeric column is left as object.
# NOTE: errors='coerce' turns unparseable values into NaN rather than raising.
numeric_columns = [
    'ppm',
    'latitude',
    'longitude',
    'altitude',
    'elevation',
    'intake_height',
    'co2_change_rate',
    'rolling_avg_ppm',
    'min_ppm_year',
    'max_ppm_year',
    'total_site_ppm_annual',
]
for column in numeric_columns:
    co2_df[column] = pd.to_numeric(co2_df[column], errors='coerce')

# Calendar parts are whole numbers, so store them as integers
for column in ['year', 'month', 'day']:
    co2_df[column] = co2_df[column].astype(int)

# 'site', 'qcflag', 'season', and 'gas' can remain as object (string) since they are categorical
co2_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1217013 entries, 0 to 1217012
Data columns (total 19 columns):
 #   Column                 Non-Null Count    Dtype              
---  ------                 --------------    -----              
 0   datetime               1217013 non-null  datetime64[ns, UTC]
 1   site                   1217013 non-null  object             
 2   ppm                    1217013 non-null  float64            
 3   latitude               1217013 non-null  float64            
 4   longitude              1217013 non-null  float64            
 5   altitude               1217013 non-null  float64            
 6   elevation              1217013 non-null  float64            
 7   intake_height          1217013 non-null  float64            
 8   qcflag                 1217013 non-null  object             
 9   year                   1217013 non-null  int32              
 10  month                  1217013 non-null  int32              
 11  day                    1

## Fetch Data From PostgreSQL

**To avoid accruing AWS read costs during the EDA phase, I query the local PostgreSQL backup instead of DynamoDB. Reading millions of rows from AWS on a daily basis is unnecessary when a backup database is stored locally, and querying it on my local machine is also much faster. AWS is mainly for deployment purposes.**

In [None]:
# Read the connection URL from the environment so real credentials never have to be
# committed with the notebook; the fallback is the local development URL.
db_url = os.environ.get('GML_GHG_DB_URL', 'postgresql://postgres:password@localhost:5432/gml_ghg')
engine = create_engine(db_url)
try:
    # Load the full CO2 table from the local PostgreSQL backup
    co2_df = pd.read_sql_query('SELECT * FROM "CO2DataNOAA"', engine)
finally:
    # Release the engine's connections even if the query fails
    engine.dispose()

In [4]:
# Print the frame's (rows, columns), then render a styled preview of the first five rows
print(co2_df.shape)
(
    co2_df.head(5)
    .style
    .format(precision=2)
    .set_properties(border='1.5px solid blue')
    .background_gradient(cmap='coolwarm')
)

(1330405, 16)


Unnamed: 0,id,datetime,site,ppm,latitude,longitude,altitude,elevation,intake_height,qcflag,year,month,day,season,co2_change_rate,gas
0,1,1968-01-16 12:04:00-08:00,NWR,322.36,40.05,-105.63,3526.0,3523.0,3.0,...,1968,1,16,Winter,0.0,CO2
1,2,1968-01-16 12:35:00-08:00,NWR,322.42,40.05,-105.63,3526.0,3523.0,3.0,...,1968,1,16,Winter,0.06,CO2
2,3,1968-01-16 13:00:00-08:00,NWR,322.65,40.05,-105.63,3526.0,3523.0,3.0,...,1968,1,16,Winter,0.23,CO2
3,4,1968-01-16 13:03:00-08:00,NWR,322.46,40.05,-105.63,3526.0,3523.0,3.0,...,1968,1,16,Winter,-0.19,CO2
4,5,1968-01-16 13:30:00-08:00,NWR,322.56,40.05,-105.63,3526.0,3523.0,3.0,...,1968,1,16,Winter,0.1,CO2


In [8]:
# Render the summary statistics of the numeric columns as a styled table
(
    co2_df.describe()
    .style
    .format(precision=2)
    .set_properties(border='1.5px solid blue')
    .background_gradient(cmap='coolwarm')
)

Unnamed: 0,id,ppm,latitude,longitude,altitude,elevation,intake_height,year,month,day,co2_change_rate
count,1330405.0,1330405.0,1330405.0,1330405.0,1330405.0,1330405.0,1330405.0,1330405.0,1330405.0,1330405.0,1330405.0
mean,665203.0,403.12,37.5,-93.36,771.56,624.89,146.67,2013.65,6.5,15.71,0.0
std,384054.99,19.28,21.15,38.34,798.73,850.87,153.88,7.82,3.45,8.79,4.07
min,1.0,315.71,-89.98,-177.38,-1.2,0.0,-3234.2,1968.0,1.0,1.0,-132.46
25%,332602.0,393.06,33.41,-113.72,340.7,52.4,17.1,2010.0,3.0,8.0,-0.71
50%,665203.0,404.62,40.05,-91.35,570.0,251.0,99.0,2015.0,6.0,16.0,-0.01
75%,997804.0,416.47,45.03,-78.35,850.0,611.43,244.0,2019.0,10.0,23.0,0.66
max,1330405.0,559.02,82.45,174.87,4469.0,4464.0,484.0,2024.0,12.0,31.0,140.13


#### Check For Missing Values

In [9]:
# Count the missing entries in every column and print the per-column totals
missing_per_column = co2_df.isna().sum()
print(missing_per_column)

id                 0
datetime           0
site               0
ppm                0
latitude           0
longitude          0
altitude           0
elevation          0
intake_height      0
qcflag             0
year               0
month              0
day                0
season             0
co2_change_rate    0
gas                0
dtype: int64


#### Check For Duplicate Values

In [10]:
def duplicates(df):
    """Report how many rows of ``df`` are exact duplicates of an earlier row.

    The first occurrence of a row is not counted, so the result equals the number
    of rows ``df.drop_duplicates()`` would remove. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to check.

    Returns
    -------
    int
        Number of duplicated rows (0 for an empty frame).
    """
    # duplicated() flags the repeats directly, so no deduplicated copy of the
    # (potentially very large) frame has to be built just to compare row counts.
    total_duplicates = int(df.duplicated().sum())
    print(f"There are a total of {total_duplicates} duplicates in this dataset.")
    return total_duplicates
# Report how many fully duplicated rows the CO2 frame contains
duplicates(co2_df)

There are a total of 0 duplicates in this dataset.


Since 'altitude' is just the sum of 'elevation' and 'intake_height', the latter two features can be dropped, as they add no further information for emission analysis. That breakdown is useful for aerial intake operations, where the user needs to know exactly where in the atmosphere a sample is being recorded, but since all data for this project is taken from surface repos, 'altitude' alone is enough to show where the gas is being recorded. Also, since 'qcflag' was only necessary for database ingestion, it can now be dropped for the EDA phase.

In [11]:
# Drop the columns that are redundant for EDA. errors='ignore' keeps this cell idempotent:
# re-running it after the columns are already gone is a no-op instead of a KeyError.
co2_df = co2_df.drop(columns=['elevation', 'intake_height', 'qcflag'], errors='ignore')

In [13]:
# Print the frame's (rows, columns) after the drop, then render a styled preview of the first rows
print(co2_df.shape)
(
    co2_df.head()
    .style
    .format(precision=2)
    .set_properties(border='1.5px solid blue')
    .background_gradient(cmap='coolwarm')
)

(1330405, 13)


Unnamed: 0,id,datetime,site,ppm,latitude,longitude,altitude,year,month,day,season,co2_change_rate,gas
0,1,1968-01-16 12:04:00-08:00,NWR,322.36,40.05,-105.63,3526.0,1968,1,16,Winter,0.0,CO2
1,2,1968-01-16 12:35:00-08:00,NWR,322.42,40.05,-105.63,3526.0,1968,1,16,Winter,0.06,CO2
2,3,1968-01-16 13:00:00-08:00,NWR,322.65,40.05,-105.63,3526.0,1968,1,16,Winter,0.23,CO2
3,4,1968-01-16 13:03:00-08:00,NWR,322.46,40.05,-105.63,3526.0,1968,1,16,Winter,-0.19,CO2
4,5,1968-01-16 13:30:00-08:00,NWR,322.56,40.05,-105.63,3526.0,1968,1,16,Winter,0.1,CO2


## Time Series Analysis