# Environment Setup

In [1]:
import os
from dotenv import load_dotenv

# Loading environment variables from .env
load_dotenv()

# Changing directory to main directory for easy data access
working_directory = os.getenv("WORKING_DIRECTORY")
os.chdir(working_directory)

# Checking the change
%pwd

'/workspaces/Live-Air-Quality'

In [2]:
from pathlib import Path

# Confirm the current working directory contains a .git folder
git_folder = Path(".git")
print("Git folder exists:", git_folder.exists())

Git folder exists: True


# 3. Data Check

In [7]:
import duckdb as ddb
import pandas as pd
from pathlib import Path

# Relative path: resolved against the current working directory.
location = Path("research/sql/air_quality.db")
if not location.is_file():
    # duckdb.connect() creates a new, empty database when the file is missing,
    # which would only surface later as a missing-table error on the first query.
    raise FileNotFoundError(f"DuckDB database not found: {location.resolve()}")
con = ddb.connect(location)

# Display the connection object as the cell output
con

<duckdb.duckdb.DuckDBPyConnection at 0x76e624b34570>

In [11]:
# Table to inspect
table_address = "raw.air_quality_data"

# Pull every column of the so2 / pm10 / pm25 rows into a pandas DataFrame.
pollutant_query = f"""
                 SELECT *
                 FROM {table_address}
                 WHERE parameter in ('so2', 'pm10', 'pm25')
                 """
data = con.query(pollutant_query).to_df()

# Row/column counts, then a two-row preview
print(data.shape)
data.head(2)

(719, 12)


Unnamed: 0,location_id,sensors_id,location,datetime,lat,lon,parameter,units,value,month,year,ingestion_datetime
0,384,673,CCNY-384,2025-06-01 05:00:00,40.8197,-73.9481,pm25,µg/m³,3.9,6,2025,2025-08-13 13:59:25.126
1,384,673,CCNY-384,2025-06-01 06:00:00,40.8197,-73.9481,pm25,µg/m³,3.9,6,2025,2025-08-13 13:59:25.126


In [13]:
# Per-column dtype and non-null count; verbose=True requests the full column listing.
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 719 entries, 0 to 718
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   location_id         719 non-null    int64         
 1   sensors_id          719 non-null    int64         
 2   location            719 non-null    object        
 3   datetime            719 non-null    datetime64[us]
 4   lat                 719 non-null    float64       
 5   lon                 719 non-null    float64       
 6   parameter           719 non-null    object        
 7   units               719 non-null    object        
 8   value               719 non-null    float64       
 9   month               719 non-null    object        
 10  year                719 non-null    int64         
 11  ingestion_datetime  719 non-null    datetime64[us]
dtypes: datetime64[us](2), float64(3), int64(3), object(4)
memory usage: 67.5+ KB


In [None]:
# Missing Data
# Null count per column, then the same figure as a percentage of all rows.
missing_data = data.isna().sum()
missing_data_percentage = missing_data.div(len(data)).mul(100)

print(missing_data, '\n')
print(missing_data_percentage)

location_id           0
sensors_id            0
location              0
datetime              0
lat                   0
lon                   0
parameter             0
units                 0
value                 0
month                 0
year                  0
ingestion_datetime    0
dtype: int64 

location_id           0.0
sensors_id            0.0
location              0.0
datetime              0.0
lat                   0.0
lon                   0.0
parameter             0.0
units                 0.0
value                 0.0
month                 0.0
year                  0.0
ingestion_datetime    0.0
dtype: float64


In [16]:
# Summary statistics (count, mean, min, quartiles, max, std) for the numeric and datetime columns.
data.describe()

Unnamed: 0,location_id,sensors_id,datetime,lat,lon,value,year,ingestion_datetime
count,719.0,719.0,719,719.0,719.0,719.0,719.0,719
mean,384.0,673.0,2025-06-16 04:55:04.589708,40.8197,-73.9481,11.146871,2025.0,2025-08-13 13:59:25.126000
min,384.0,673.0,2025-06-01 05:00:00,40.8197,-73.9481,2.1,2025.0,2025-08-13 13:59:25.126000
25%,384.0,673.0,2025-06-08 17:30:00,40.8197,-73.9481,6.7,2025.0,2025-08-13 13:59:25.126000
50%,384.0,673.0,2025-06-16 05:00:00,40.8197,-73.9481,10.2,2025.0,2025-08-13 13:59:25.126000
75%,384.0,673.0,2025-06-23 16:30:00,40.8197,-73.9481,14.1,2025.0,2025-08-13 13:59:25.126000
max,384.0,673.0,2025-07-01 04:00:00,40.8197,-73.9481,38.9,2025.0,2025-08-13 13:59:25.126000
std,0.0,0.0,,7.110374e-15,1.422075e-14,5.758563,0.0,


In [17]:
# Summary of the object-dtype columns: count, number of unique values, most frequent value and its frequency.
data.describe(include='O')

Unnamed: 0,location,parameter,units,month
count,719,719,719,719
unique,1,1,1,1
top,CCNY-384,pm25,µg/m³,6
freq,719,719,719,719


In [18]:
# Show rows that repeat an earlier row on all of the key columns below
# (the first occurrence is not flagged).
# NOTE(review): `value` is part of the key, so two different readings for the
# same location/parameter/datetime would not be reported here. Confirm that is intended.
duplicate_keys = ["location_id", "parameter", "units", "value", "datetime"]
data.loc[data.duplicated(subset=duplicate_keys)]

Unnamed: 0,location_id,sensors_id,location,datetime,lat,lon,parameter,units,value,month,year,ingestion_datetime


In [20]:
# Non-null count of every other column, grouped by `parameter`.
data.groupby(by="parameter").count()

Unnamed: 0_level_0,location_id,sensors_id,location,datetime,lat,lon,units,value,month,year,ingestion_datetime
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
pm25,719,719,719,719,719,719,719,719,719,719,719


In [21]:
# Close the DuckDB connection opened earlier in the notebook.
con.close()