# Environment Setup

In [1]:
import os
from dotenv import load_dotenv

# Loading environment variables from .env
load_dotenv()

# Changing directory to main directory for easy data access
working_directory = os.getenv("WORKING_DIRECTORY")
os.chdir(working_directory)

# Checking the change
%pwd

'/workspaces/Live-Air-Quality'

In [2]:
from pathlib import Path

# Sanity check on the working directory: report whether a .git entry is
# present here, which is what we expect at the repository root.
git_dir = Path(".git")
print("Git folder exists:", git_dir.exists())

Git folder exists: True


# 3. Data Check

In [3]:
import duckdb as ddb
import pandas as pd
from pathlib import Path

# DuckDB database file, given relative to the current working directory.
location = Path("research/sql/air_quality.db")

# This notebook only inspects data, so open the database read-only: the
# checks cannot modify the file, and the connection does not claim DuckDB's
# exclusive write access while the notebook kernel stays alive.
# NOTE(review): in read-only mode a missing file should raise here instead of
# silently creating an empty database — confirm against the DuckDB version in use.
con = ddb.connect(location, read_only=True)

con

<duckdb.duckdb.DuckDBPyConnection at 0x7441c81218f0>

In [4]:
# Qualified name of the table under inspection.
table_address = "raw.air_quality_data"

# Build the statement first, then run it and materialise the result as a
# pandas DataFrame.
select_all_sql = f"""
                 SELECT *
                 FROM {table_address}
                 """
data = con.query(select_all_sql).to_df()

# Print the (rows, columns) shape, then show a two-row preview as the cell output.
print(data.shape)
data.head(2)

(1401, 12)


Unnamed: 0,location_id,sensors_id,location,datetime,lat,lon,parameter,units,value,month,year,ingestion_datetime
0,384,673,CCNY-384,2025-06-01 05:00:00,40.8197,-73.9481,pm25,µg/m³,3.9,6,2025,2025-08-13 13:59:25.126
1,384,673,CCNY-384,2025-06-01 06:00:00,40.8197,-73.9481,pm25,µg/m³,3.9,6,2025,2025-08-13 13:59:25.126


In [5]:
# Per-column overview: dtype and non-null count for every column.
# NOTE(review): in the recorded output `month` is object while `year` is
# int64 — confirm whether that dtype difference is intended upstream.
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1401 entries, 0 to 1400
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   location_id         1401 non-null   int64         
 1   sensors_id          1401 non-null   int64         
 2   location            1401 non-null   object        
 3   datetime            1401 non-null   datetime64[us]
 4   lat                 1401 non-null   float64       
 5   lon                 1401 non-null   float64       
 6   parameter           1401 non-null   object        
 7   units               1401 non-null   object        
 8   value               1401 non-null   float64       
 9   month               1401 non-null   object        
 10  year                1401 non-null   int64         
 11  ingestion_datetime  1401 non-null   datetime64[us]
dtypes: datetime64[us](2), float64(3), int64(3), object(4)
memory usage: 131.5+ KB


In [6]:
# Missing data: null count per column, and the same count expressed as a
# percentage of all rows.
total_rows = len(data)
missing_data = data.isna().sum()
missing_data_percentage = missing_data.div(total_rows).mul(100)

print(missing_data, '\n')
print(missing_data_percentage)

location_id           0
sensors_id            0
location              0
datetime              0
lat                   0
lon                   0
parameter             0
units                 0
value                 0
month                 0
year                  0
ingestion_datetime    0
dtype: int64 

location_id           0.0
sensors_id            0.0
location              0.0
datetime              0.0
lat                   0.0
lon                   0.0
parameter             0.0
units                 0.0
value                 0.0
month                 0.0
year                  0.0
ingestion_datetime    0.0
dtype: float64


In [7]:
# Summary statistics with pandas' default column selection (the numeric and
# datetime columns in the recorded output).
data.describe()

Unnamed: 0,location_id,sensors_id,datetime,lat,lon,value,year,ingestion_datetime
count,1401.0,1401.0,1401,1401.0,1401.0,1401.0,1401.0,1401
mean,384.0,672.02641,2025-06-16 03:36:44.710920,40.8197,-73.9481,5.73801,2025.0,2025-08-13 13:59:25.126000
min,384.0,671.0,2025-06-01 05:00:00,40.8197,-73.9481,0.002,2025.0,2025-08-13 13:59:25.126000
25%,384.0,671.0,2025-06-08 13:00:00,40.8197,-73.9481,0.034,2025.0,2025-08-13 13:59:25.126000
50%,384.0,673.0,2025-06-15 22:00:00,40.8197,-73.9481,3.1,2025.0,2025-08-13 13:59:25.126000
75%,384.0,673.0,2025-06-23 19:00:00,40.8197,-73.9481,10.4,2025.0,2025-08-13 13:59:25.126000
max,384.0,673.0,2025-07-01 04:00:00,40.8197,-73.9481,38.9,2025.0,2025-08-13 13:59:25.126000
std,0.0,1.000008,,7.107965e-15,1.421593e-14,6.918959,0.0,


In [8]:
# Summary restricted to object-dtype columns: count, number of distinct
# values, most frequent value and its frequency.
data.describe(include='O')

Unnamed: 0,location,parameter,units,month
count,1401,1401,1401,1401
unique,1,2,2,1
top,CCNY-384,pm25,µg/m³,6
freq,1401,719,719,1401


In [9]:
# Show rows that repeat an earlier row on every column of the key below;
# an empty frame means no such repeats.
# NOTE(review): `value` is part of the key, so two readings that share a
# timestamp but differ in value are not flagged — confirm that is intended.
duplicate_key = ["location_id", "parameter", "units", "value", "datetime"]
is_repeat = data.duplicated(subset=duplicate_key)
data.loc[is_repeat]

Unnamed: 0,location_id,sensors_id,location,datetime,lat,lon,parameter,units,value,month,year,ingestion_datetime


In [10]:
# Non-null count of every other column within each `parameter` group.
data.groupby(by="parameter").count()

Unnamed: 0_level_0,location_id,sensors_id,location,datetime,lat,lon,units,value,month,year,ingestion_datetime
parameter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
o3,682,682,682,682,682,682,682,682,682,682,682
pm25,719,719,719,719,719,719,719,719,719,719,719


In [11]:
# Close the DuckDB connection now that the checks are done.
con.close()