<center><h1>Exploratory Data Analysis (EDA)</h1></center>

In [None]:
# Data Manipulation
import pandas as pd
import numpy as np
import warnings
from aws_data_processor import DataProcessor

# Visualizations
import seaborn as sns
import missingno as mso
import matplotlib.pyplot as plt
import plotly.express as px

# Silence all warnings for the session so cell output stays readable.
# NOTE(review): this is a blanket filter and also hides pandas deprecation/dtype warnings — confirm that is intended.
warnings.filterwarnings("ignore") 
# Never truncate the column list when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

# AWS tables created specifically for NOAA CO2, CH4, N2O, and SF6 data within the DynamoDB environment.
# Each name is handed to DataProcessor when the corresponding gas is loaded.
co2_table = 'CO2DataNOAA'
ch4_table = 'CH4DataNOAA'
n2o_table = 'N2ODataNOAA'
sf6_table = 'SF6DataNOAA'

<center><h1>Carbon Dioxide (CO<sub>2</sub>)</h1></center>

## Data Wrangling

In [None]:
# Pull every CO2 record from the AWS table through the project's DataProcessor helper
processor = DataProcessor(co2_table)
fetched_co2_data = processor.fetch_all_data()

# Build co2_df only when the fetch returned rows; otherwise report the empty
# result (this cell then does not assign co2_df at all)
if not fetched_co2_data.empty:
    co2_df = processor.process_data(fetched_co2_data)
else:
    print("No data fetched. Check DynamoDB access or table details.")

'Full table data retrieval completed.'

Unnamed: 0,datetime,site,ppm,latitude,longitude,altitude,elevation,intake_height,qcflag,year,month,day,season,co2_change_rate,rolling_avg_ppm,min_ppm_year,max_ppm_year,total_site_ppm_annual,gas
0,1968-01-16 20:04:00+00:00,NWR,322.36,40.05,-105.63,3526.0,3523.0,3.0,...,1968,1,16,Winter,0.0,322.36,322.36,322.36,82,CO2
1,1968-01-16 20:35:00+00:00,NWR,322.42,40.05,-105.63,3526.0,3523.0,3.0,...,1968,1,16,Winter,0.0600000000000022,322.39,322.36,322.42,82,CO2
2,1968-01-16 21:00:00+00:00,NWR,322.65,40.05,-105.63,3526.0,3523.0,3.0,...,1968,1,16,Winter,0.2299999999999613,322.476667,322.36,322.65,82,CO2
3,1968-01-16 21:03:00+00:00,NWR,322.46,40.05,-105.63,3526.0,3523.0,3.0,...,1968,1,16,Winter,-0.1899999999999977,322.4725,322.36,322.65,82,CO2
4,1968-01-16 21:30:00+00:00,NWR,322.56,40.05,-105.63,3526.0,3523.0,3.0,...,1968,1,16,Winter,0.1000000000000227,322.49,322.36,322.65,82,CO2


#### Data Type Coercion

In [8]:
# Inspect column dtypes and non-null counts before any type coercion is applied
co2_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1217013 entries, 0 to 1217012
Data columns (total 19 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   datetime               1217013 non-null  object 
 1   site                   1217013 non-null  object 
 2   ppm                    1217013 non-null  object 
 3   latitude               1217013 non-null  object 
 4   longitude              1217013 non-null  object 
 5   altitude               1217013 non-null  object 
 6   elevation              1217013 non-null  object 
 7   intake_height          1217013 non-null  object 
 8   qcflag                 1217013 non-null  object 
 9   year                   1217013 non-null  object 
 10  month                  1217013 non-null  object 
 11  day                    1217013 non-null  object 
 12  season                 1217013 non-null  object 
 13  co2_change_rate        1217013 non-null  object 
 14  rolling_avg_ppm   

#### TODO: Insert the ET steps from the ETL phase below and continue the EDA process without AWS. Now that the code for the AWS configuration is known, the only data that needs to be pulled is the data straight from NOAA GML. Once the full EDA phase is complete, including any feature engineering steps, create a full preprocessor file to prepare the data for a neural network. Think about what data is needed and what data is redundant: some features, such as the min and max values for the year, are useful for EDA but not for the neural network.

In [9]:
# Parse the timestamp strings into pandas datetimes
co2_df['datetime'] = pd.to_datetime(co2_df['datetime'])

# Measurement and derived columns that must be numeric for describe()/plots.
# 'rolling_avg_ppm' and 'total_site_ppm_annual' are included so this cell alone
# guarantees numeric dtypes; to_numeric leaves already-numeric columns unchanged.
# errors='coerce' turns unparseable values into NaN instead of raising, so the
# missing-value check further down doubles as a check on this conversion.
numeric_columns = [
    'ppm', 'latitude', 'longitude', 'altitude', 'elevation', 'intake_height',
    'co2_change_rate', 'rolling_avg_ppm', 'min_ppm_year', 'max_ppm_year',
    'total_site_ppm_annual',
]
for column in numeric_columns:
    co2_df[column] = pd.to_numeric(co2_df[column], errors='coerce')

# Calendar parts are whole numbers, so store 'year', 'month', and 'day' as integers
for column in ['year', 'month', 'day']:
    co2_df[column] = co2_df[column].astype(int)

# 'site', 'qcflag', 'season', and 'gas' can remain as object (string) since they are categorical
co2_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1217013 entries, 0 to 1217012
Data columns (total 19 columns):
 #   Column                 Non-Null Count    Dtype              
---  ------                 --------------    -----              
 0   datetime               1217013 non-null  datetime64[ns, UTC]
 1   site                   1217013 non-null  object             
 2   ppm                    1217013 non-null  float64            
 3   latitude               1217013 non-null  float64            
 4   longitude              1217013 non-null  float64            
 5   altitude               1217013 non-null  float64            
 6   elevation              1217013 non-null  float64            
 7   intake_height          1217013 non-null  float64            
 8   qcflag                 1217013 non-null  object             
 9   year                   1217013 non-null  int32              
 10  month                  1217013 non-null  int32              
 11  day                    1

In [30]:
# Set 'datetime' as the index to facilitate time-based grouping.
# Guarded and reassigned (rather than inplace=True) so re-running the cell is a
# no-op instead of a KeyError once 'datetime' has already moved into the index.
if 'datetime' in co2_df.columns:
    co2_df = co2_df.set_index('datetime')

In [23]:
# Report the frame's dimensions, then render the first rows as a styled preview:
# two-decimal numbers, blue cell borders and a coolwarm gradient on the values
print(co2_df.shape)
(
    co2_df.head()
    .style
    .format(precision=2)
    .set_properties(**{'border': '1.5px solid blue'})
    .background_gradient(cmap='coolwarm')
)

(1217013, 19)


Unnamed: 0,datetime,site,ppm,latitude,longitude,altitude,elevation,intake_height,qcflag,year,month,day,season,co2_change_rate,rolling_avg_ppm,min_ppm_year,max_ppm_year,total_site_ppm_annual,gas
0,1968-01-16 20:04:00+00:00,NWR,322.36,40.05,-105.63,3526.0,3523.0,3.0,...,1968,1,16,Winter,0.0,322.36,322.36,322.36,82,CO2
1,1968-01-16 20:35:00+00:00,NWR,322.42,40.05,-105.63,3526.0,3523.0,3.0,...,1968,1,16,Winter,0.06,322.39,322.36,322.42,82,CO2
2,1968-01-16 21:00:00+00:00,NWR,322.65,40.05,-105.63,3526.0,3523.0,3.0,...,1968,1,16,Winter,0.23,322.48,322.36,322.65,82,CO2
3,1968-01-16 21:03:00+00:00,NWR,322.46,40.05,-105.63,3526.0,3523.0,3.0,...,1968,1,16,Winter,-0.19,322.47,322.36,322.65,82,CO2
4,1968-01-16 21:30:00+00:00,NWR,322.56,40.05,-105.63,3526.0,3523.0,3.0,...,1968,1,16,Winter,0.1,322.49,322.36,322.65,82,CO2


In [20]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
co2_df.describe()

Unnamed: 0,ppm,latitude,longitude,altitude,elevation,intake_height,year,month,day,co2_change_rate,rolling_avg_ppm,min_ppm_year,max_ppm_year,total_site_ppm_annual
count,1217013.0,1217013.0,1217013.0,1217013.0,1217013.0,1217013.0,1217013.0,1217013.0,1217013.0,1217013.0,1217013.0,1217013.0,1217013.0,1217013.0
mean,401.0242,37.08131,-92.73487,758.941,608.5974,150.3436,2012.775,6.577405,15.71893,0.003762656,401.0187,384.8675,434.8153,6639.868
std,18.61633,21.7714,39.02159,786.4138,837.8323,155.9775,7.604845,3.451018,8.795548,4.16998,18.12243,19.79547,31.74333,2889.133
min,315.71,-89.98,-177.38,-1.2,0.0,-3234.2,1968.0,1.0,1.0,-132.462,319.3918,315.71,322.36,1.0
25%,391.896,33.4057,-105.004,340.7,52.4,18.3,2010.0,4.0,8.0,-0.727,392.3632,372.578,419.226,6470.0
50%,402.761,40.05,-91.3529,486.0,251.0,107.0,2014.0,7.0,16.0,-0.006,402.8895,386.97,435.495,8052.0
75%,413.673,45.0345,-78.35,716.0,611.43,244.0,2018.0,10.0,23.0,0.678,413.5391,399.112,452.909,8450.0
max,559.019,82.4508,174.871,4469.0,4464.0,484.0,2023.0,12.0,31.0,140.13,488.3543,441.104,559.019,9104.0


#### Check For Missing Values

In [12]:
# Count the missing (null/NaN) values in each column
print(co2_df.isnull().sum())

datetime                 0
site                     0
ppm                      0
latitude                 0
longitude                0
altitude                 0
elevation                0
intake_height            0
qcflag                   0
year                     0
month                    0
day                      0
season                   0
co2_change_rate          0
rolling_avg_ppm          0
min_ppm_year             0
max_ppm_year             0
total_site_ppm_annual    0
gas                      0
dtype: int64


#### Check For Duplicate Values

In [21]:
def duplicates(df):
    """Print and return the number of fully duplicated rows in ``df``.

    A row counts as a duplicate when all of its column values match an
    earlier row; the first occurrence is not counted. The index is not part
    of the comparison, and ``df`` itself is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to check.

    Returns
    -------
    int
        Number of duplicated rows (0 when there are none or ``df`` is empty).
    """
    # duplicated() flags repeats in a boolean Series, which avoids building a
    # de-duplicated copy of the whole frame just to compare row counts.
    total_duplicates = int(df.duplicated().sum())
    print(f"There are a total of {total_duplicates} duplicates in this dataset.")
    return total_duplicates
# Report how many fully duplicated rows the CO2 frame contains
duplicates(co2_df)

There are a total of 0 duplicates in this dataset.


Since 'altitude' is just the sum of 'elevation' and 'intake_height', the 'elevation' and 'intake_height' columns are redundant and can be dropped while 'altitude' is kept; they don't add any useful information with regard to emission analysis. Also, since 'qcflag' was only necessary for database ingestion, it can now be dropped for the EDA phase.

In [27]:
# Drop 'elevation' and 'intake_height' (their sum is already captured by 'altitude')
# and 'qcflag' (only needed for database ingestion).
# errors='ignore' keeps the cell idempotent: re-running it after the columns are
# gone is a no-op instead of a KeyError.
co2_df = co2_df.drop(columns=['elevation', 'intake_height', 'qcflag'], errors='ignore')

In [32]:
# Confirm the new shape after dropping columns, then preview the first rows
print(co2_df.shape)
co2_df.head()

(1217013, 15)


Unnamed: 0_level_0,site,ppm,latitude,longitude,altitude,year,month,day,season,co2_change_rate,rolling_avg_ppm,min_ppm_year,max_ppm_year,total_site_ppm_annual,gas
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1968-01-16 20:04:00+00:00,NWR,322.36,40.05,-105.63,3526.0,1968,1,16,Winter,0.0,322.36,322.36,322.36,82,CO2
1968-01-16 20:35:00+00:00,NWR,322.42,40.05,-105.63,3526.0,1968,1,16,Winter,0.06,322.39,322.36,322.42,82,CO2
1968-01-16 21:00:00+00:00,NWR,322.65,40.05,-105.63,3526.0,1968,1,16,Winter,0.23,322.476667,322.36,322.65,82,CO2
1968-01-16 21:03:00+00:00,NWR,322.46,40.05,-105.63,3526.0,1968,1,16,Winter,-0.19,322.4725,322.36,322.65,82,CO2
1968-01-16 21:30:00+00:00,NWR,322.56,40.05,-105.63,3526.0,1968,1,16,Winter,0.1,322.49,322.36,322.65,82,CO2


## Time Series Analysis

## Spatial Analysis

## Correlation Analysis