In [None]:
import os
import sys
import requests
import logging
import urllib
import json
import datetime as dt
from dateutil.relativedelta import relativedelta
import time

#####################################################################
class aqms_api_class(object):
    def __init__(self):
        self.logger = logging.getLogger("aqms_logger")
        self.url_api = "https://data.airquality.nsw.gov.au"
        self.headers = {'content-type': 'application/json', 'accept': 'application/json'}
        self.get_observations = 'api/Data/get_Observations'

    def get_Obs(self, ObsRequest):
        query = urllib.parse.urljoin(self.url_api, self.get_observations)
        response = requests.post(url=query, data=json.dumps(ObsRequest), headers=self.headers)
        return response

#####################################################################
def ObsRequest_init(site_id, start_date, end_date):
    ObsRequest = {}
    ObsRequest['Parameters'] = [
        'PM10', 'PM2.5', 'NO2', 'CO', 'OZONE',
        'WSP', 'WDR', 'SD1', 'TEMP', 'HUMID'
    ]
    ObsRequest['Sites'] = [site_id]
    ObsRequest['StartDate'] = start_date.strftime('%Y-%m-%d')
    ObsRequest['EndDate'] = end_date.strftime('%Y-%m-%d')
    ObsRequest['Categories'] = ['Averages']
    ObsRequest['SubCategories'] = ['Hourly']
    ObsRequest['Frequency'] = ['Hourly average']
    return ObsRequest

#####################################################################
if __name__ == '__main__':
    AQMS = aqms_api_class()

    site_ids = [39, 1141, 919, 2560, 107]  # Site IDs
    start_date = dt.date(2015 , 6, 1)
    end_date = dt.date(2025, 6, 1)

    current_date = start_date

    os.makedirs("batches", exist_ok=True)

    while current_date < end_date:
        next_month = current_date + relativedelta(months=1)

        for site_id in site_ids:
            print(f"ðŸ“¡ Fetching Site {site_id} from {current_date} to {next_month}")
            ObsRequest = ObsRequest_init(site_id, current_date, next_month)
            try:
                response = AQMS.get_Obs(ObsRequest)
                filename = f"batches/site_{site_id}_{current_date}.json"

                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(response.text)
                print(f" Saved to {filename}")
            except Exception as e:
                print(f" Failed for Site {site_id} on {current_date}: {e}")

            time.sleep(1.5)  #

        current_date = next_month


![image.png](attachment:image.png)  
![image-2.png](attachment:image-2.png)  
![image-3.png](attachment:image-3.png)  
![image-4.png](attachment:image-4.png)  
![image-5.png](attachment:image-5.png)    
![image-6.png](attachment:image-6.png)  
![image-7.png](attachment:image-7.png)  
![image-11.png](attachment:image-11.png)  
![image-9.png](attachment:image-9.png)  
![image-10.png](attachment:image-10.png)  
![image-12.png](attachment:image-12.png)  

In [None]:
import os
import sys
import requests
import logging
import urllib
import json
import datetime as dt
from dateutil.relativedelta import relativedelta
import time

#####################################################################
class aqms_api_class(object):
    def __init__(self):
        self.logger = logging.getLogger("aqms_logger")
        self.url_api = "https://data.airquality.nsw.gov.au"
        self.headers = {'content-type': 'application/json', 'accept': 'application/json'}
        self.get_observations = 'api/Data/get_Observations'

    def get_Obs(self, ObsRequest):
        query = urllib.parse.urljoin(self.url_api, self.get_observations)
        response = requests.post(url=query, data=json.dumps(ObsRequest), headers=self.headers)
        return response

#####################################################################
def ObsRequest_init(site_id, start_date, end_date):
    ObsRequest = {}
    ObsRequest['Parameters'] = [
        'PM10', 'PM2.5', 'NO2', 'CO', 'OZONE',
        'WSP', 'WDR', 'SD1', 'TEMP', 'HUMID'
    ]
    ObsRequest['Sites'] = [site_id]
    ObsRequest['StartDate'] = start_date.strftime('%Y-%m-%d')
    ObsRequest['EndDate'] = end_date.strftime('%Y-%m-%d')
    ObsRequest['Categories'] = ['Averages']
    ObsRequest['SubCategories'] = ['Hourly']
    ObsRequest['Frequency'] = ['Hourly average']
    return ObsRequest

#####################################################################
if __name__ == '__main__':
    AQMS = aqms_api_class()

    site_ids = [39, 1141, 919, 2560, 107]  # Site IDs
    start_date = dt.date(2018 , 8, 1)
    end_date = dt.date(2025, 6, 1)

    current_date = start_date

    os.makedirs("batches", exist_ok=True)

    while current_date < end_date:
        next_month = current_date + relativedelta(months=1)

        for site_id in site_ids:
            print(f"ðŸ“¡ Fetching Site {site_id} from {current_date} to {next_month}")
            ObsRequest = ObsRequest_init(site_id, current_date, next_month)
            try:
                response = AQMS.get_Obs(ObsRequest)
                filename = f"batches/site_{site_id}_{current_date}.json"

                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(response.text)
                print(f" Saved to {filename}")
            except Exception as e:
                print(f" Failed for Site {site_id} on {current_date}: {e}")

            time.sleep(1.5)  # be kind to the API

        current_date = next_month


Data fetched from 2018-08-01 to 2022-06-01

In [None]:
import os
import sys
import requests
import logging
import urllib
import json
import datetime as dt
from dateutil.relativedelta import relativedelta
import time

#####################################################################
class aqms_api_class(object):
    def __init__(self):
        self.logger = logging.getLogger("aqms_logger")
        self.url_api = "https://data.airquality.nsw.gov.au"
        self.headers = {'content-type': 'application/json', 'accept': 'application/json'}
        self.get_observations = 'api/Data/get_Observations'

    def get_Obs(self, ObsRequest):
        query = urllib.parse.urljoin(self.url_api, self.get_observations)
        response = requests.post(url=query, data=json.dumps(ObsRequest), headers=self.headers)
        return response

#####################################################################
def ObsRequest_init(site_id, start_date, end_date):
    ObsRequest = {}
    ObsRequest['Parameters'] = [
        'PM10', 'PM2.5', 'NO2', 'CO', 'OZONE',
        'WSP', 'WDR', 'SD1', 'TEMP', 'HUMID'
    ]
    ObsRequest['Sites'] = [site_id]
    ObsRequest['StartDate'] = start_date.strftime('%Y-%m-%d')
    ObsRequest['EndDate'] = end_date.strftime('%Y-%m-%d')
    ObsRequest['Categories'] = ['Averages']
    ObsRequest['SubCategories'] = ['Hourly']
    ObsRequest['Frequency'] = ['Hourly average']
    return ObsRequest

#####################################################################
if __name__ == '__main__':
    AQMS = aqms_api_class()

    site_ids = [39, 1141, 919, 2560, 107]  # Site IDs
    start_date = dt.date(2022 , 5, 1)
    end_date = dt.date(2025, 6, 1)

    current_date = start_date

    os.makedirs("batches", exist_ok=True)

    while current_date < end_date:
        next_month = current_date + relativedelta(months=1)

        for site_id in site_ids:
            print(f"ðŸ“¡ Fetching Site {site_id} from {current_date} to {next_month}")
            ObsRequest = ObsRequest_init(site_id, current_date, next_month)
            try:
                response = AQMS.get_Obs(ObsRequest)
                filename = f"batches/site_{site_id}_{current_date}.json"

                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(response.text)
                print(f" Saved to {filename}")
            except Exception as e:
                print(f" Failed for Site {site_id} on {current_date}: {e}")

            time.sleep(1.5)  # be kind to the API

        current_date = next_month


Data fetched from 2022-05-01 to 2025-011-01

In [None]:
import os
import sys
import requests
import logging
import urllib
import json
import datetime as dt
from dateutil.relativedelta import relativedelta
import time

#####################################################################
class aqms_api_class(object):
    def __init__(self):
        self.logger = logging.getLogger("aqms_logger")
        self.url_api = "https://data.airquality.nsw.gov.au"
        self.headers = {'content-type': 'application/json', 'accept': 'application/json'}
        self.get_observations = 'api/Data/get_Observations'

    def get_Obs(self, ObsRequest):
        query = urllib.parse.urljoin(self.url_api, self.get_observations)
        response = requests.post(url=query, data=json.dumps(ObsRequest), headers=self.headers)
        return response

#####################################################################
def ObsRequest_init(site_id, start_date, end_date):
    ObsRequest = {}
    ObsRequest['Parameters'] = [
        'PM10', 'PM2.5', 'NO2', 'CO', 'OZONE',
        'WSP', 'WDR', 'SD1', 'TEMP', 'HUMID'
    ]
    ObsRequest['Sites'] = [site_id]
    ObsRequest['StartDate'] = start_date.strftime('%Y-%m-%d')
    ObsRequest['EndDate'] = end_date.strftime('%Y-%m-%d')
    ObsRequest['Categories'] = ['Averages']
    ObsRequest['SubCategories'] = ['Hourly']
    ObsRequest['Frequency'] = ['Hourly average']
    return ObsRequest

#####################################################################
if __name__ == '__main__':
    AQMS = aqms_api_class()

    site_ids = [39, 1141, 919, 2560, 107]  # Site IDs
    start_date = dt.date(2022 , 5, 1)
    end_date = dt.date(2025, 6, 1)

    current_date = start_date

    os.makedirs("batches", exist_ok=True)

    while current_date < end_date:
        next_month = current_date + relativedelta(months=1)

        for site_id in site_ids:
            print(f"ðŸ“¡ Fetching Site {site_id} from {current_date} to {next_month}")
            ObsRequest = ObsRequest_init(site_id, current_date, next_month)
            try:
                response = AQMS.get_Obs(ObsRequest)
                filename = f"batches/site_{site_id}_{current_date}.json"

                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(response.text)
                print(f" Saved to {filename}")
            except Exception as e:
                print(f" Failed for Site {site_id} on {current_date}: {e}")

            time.sleep(1.5)  # be kind to the API

        current_date = next_month


In [None]:
df=pd.read_csv("combined.csv")
print(df.shape)
df.head()
df.isna().sum()
df.head()

(110400, 14)


Unnamed: 0,Site_Id,Date,Hour,HourDescription,Value,AirQualityCategory,DeterminingPollutant,Parameter.ParameterCode,Parameter.ParameterDescription,Parameter.Units,Parameter.UnitsDescription,Parameter.Category,Parameter.SubCategory,Parameter.Frequency
0,107,2017-06-01,1,12 am - 1 am,0.137018,,,CO,Carbon monoxide,ppm,parts per million,Averages,Hourly,Hourly average
1,107,2017-06-01,1,12 am - 1 am,61.126,,,HUMID,Humidity,%,percent,Averages,Hourly,Hourly average
2,107,2017-06-01,1,12 am - 1 am,1.063535,GOOD,,NO2,Nitrogen Dioxide,pphm,parts per hundred million,Averages,Hourly,Hourly average
3,107,2017-06-01,1,12 am - 1 am,1.1793,GOOD,,OZONE,Ozone,pphm,parts per hundred million,Averages,Hourly,Hourly average
4,107,2017-06-01,1,12 am - 1 am,6.899,GOOD,,PM10,PM10,Âµg/mÂ³,microgram per cubic meter,Averages,Hourly,Hourly average


In [None]:
import pandas as pd
df=pd.read_csv("combined.csv")
# Convert Date to datetime (only date part)
df['ParsedDate'] = pd.to_datetime(df['Date'], dayfirst=True, errors='coerce')

# Extract start hour string from HourDesc
df['StartHourStr'] = df['HourDescription'].str.extract(r'(^[\d]+ ?[ap]m)', expand=False)

# Convert start hour to integer (24-hour format)
df['HourInt'] = pd.to_datetime(df['StartHourStr'], format='%I %p', errors='coerce').dt.hour
df['Timestamp'] = df['ParsedDate'] + pd.to_timedelta(df['HourInt'], unit='h')
df = df.dropna(subset=['Timestamp'])

# Pivoting the data
pivoted_df = df.pivot_table(
    index=['Site_Id', 'Timestamp'],
    columns='Parameter.ParameterCode',
    values='Value',
    aggfunc='first'
).reset_index()

pivoted_df.columns.name = None
pivoted_df.to_csv("finalfiles/simplified.csv", index=False)


In [23]:
df=pd.read_csv("finalfiles/ simplified.csv")
df.head()
df.shape
print(df["Site_Id"].value_counts())

Site_Id
39      864
107     862
2560    862
Name: count, dtype: int64


In [None]:
import pandas as pd

# Load the simplified, pivoted data
df = pd.read_csv("finalfiles/simplified.csv", parse_dates=["Timestamp"])

# Sort the data by Site and Timestamp to ensure chronological order
df = df.sort_values(["Site_Id", "Timestamp"])

# Apply forward-fill within each site group
df_filled = df.groupby("Site_Id").apply(lambda group: group.ffill()).reset_index(drop=True)

# Save the cleaned file
df_filled.to_csv("finalfiles/cleaned.csv", index=False)
print(" Forward-filled dataset saved as 'finalfiles/cleaned.csv'")


âœ… Forward-filled dataset saved as 'finalfiles/2017_cleaned.csv'


  df_filled = df.groupby("Site_Id").apply(lambda group: group.ffill()).reset_index(drop=True)


In [None]:
import pandas as pd
    
# Load the cleaned dataset
df = pd.read_csv("finalfiles/2017_cleaned.csv", parse_dates=["Timestamp"])
# Shape of the dataset
print(f"#Dataset Shape: {df.shape}")
# Missing values check
print("#Missing Values per Column: (Already performeed ffill)")
print(df.isna().sum())

#Dataset Shape: (2588, 12)
#Missing Values per Column: (Already performeed ffill)
Site_Id      0
Timestamp    0
CO           0
HUMID        0
NO2          0
OZONE        0
PM10         0
PM2.5        0
SD1          0
TEMP         0
WDR          0
WSP          0
dtype: int64


In [38]:
df.describe()

Unnamed: 0,Site_Id,Timestamp,CO,HUMID,NO2,OZONE,PM10,PM2.5,SD1,TEMP,WDR,WSP
count,2588.0,2588,2588.0,2588.0,2588.0,2588.0,2588.0,2588.0,2588.0,2588.0,2588.0,2588.0
mean,901.333076,2017-06-23 03:24:53.972179200,0.256871,66.817367,1.388309,1.267129,13.142309,7.039517,35.863798,11.866009,237.074375,1.73092
min,39.0,2017-01-06 00:00:00,-0.02696,21.986,0.057273,-0.078,-8.184,-2.458,6.451,0.817,0.212,0.026
25%,39.0,2017-04-06 00:00:00,0.124004,49.898,0.570484,0.1782,7.928,2.561,16.87175,8.7755,207.501,0.7145
50%,107.0,2017-07-06 00:00:00,0.19224,68.53,1.28435,1.124087,11.111,5.56,25.9045,12.0325,241.065,1.4475
75%,2560.0,2017-10-06 00:00:00,0.308281,84.964,2.090433,2.266956,16.164,9.93275,50.85725,15.205,297.06675,2.35325
max,2560.0,2017-12-08 23:00:00,2.069617,99.318,4.301296,3.3792,151.62,59.231,141.939,25.579,359.896,8.051
std,1172.730318,,0.231401,20.282277,0.896284,1.049052,8.904825,6.87932,24.260289,4.355184,76.43672,1.316892


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2588 entries, 0 to 2587
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Site_Id    2588 non-null   int64         
 1   Timestamp  2588 non-null   datetime64[ns]
 2   CO         2588 non-null   float64       
 3   HUMID      2588 non-null   float64       
 4   NO2        2588 non-null   float64       
 5   OZONE      2588 non-null   float64       
 6   PM10       2588 non-null   float64       
 7   PM2.5      2588 non-null   float64       
 8   SD1        2588 non-null   float64       
 9   TEMP       2588 non-null   float64       
 10  WDR        2588 non-null   float64       
 11  WSP        2588 non-null   float64       
dtypes: datetime64[ns](1), float64(10), int64(1)
memory usage: 242.8 KB
