# Pipeline script for data collection, processing, and lakeFS storage (work in progress).

In [1]:
import requests
import os
import pandas as pd

In [2]:
# --- Data collection: configuration ---
# Endpoint queried by air4thai_request_data(); the response is read as JSON.
url = "http://air4thai.pcd.go.th/services/getNewAQI_JSON.php"

In [3]:
def air4thai_request_data(url, timeout=30):
    """
    Fetch station data from the API endpoint and return it as a DataFrame.

    Parameters:
    url: Your API endpoint where the data is accessible.
    timeout: Seconds to wait for the server before giving up (default 30).

    Returns:
    df: DataFrame built from the 'stations' entry of the JSON response.

    Raises:
    requests.exceptions.RequestException: if the request fails, times out,
        or the server answers with an HTTP error status.
    ValueError: if the response body is not valid JSON.
    KeyError: if the JSON response has no 'stations' entry.
    """

    # Without an explicit timeout, requests can wait indefinitely on an
    # unresponsive server and stall the whole pipeline.
    response = requests.get(url=url, timeout=timeout)

    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    response.raise_for_status()

    # transform data into dataframe
    payload = response.json()
    stations = payload['stations']
    df = pd.DataFrame(stations)

    return df

In [4]:
# Pull the current station data from the configured endpoint.
df = air4thai_request_data(url)

In [5]:
# Data Processing
# Flatten each row's nested 'AQILast' record into dotted columns
# (e.g. 'PM25.aqi') and append them next to the existing columns.
expanded_aqi = pd.json_normalize(df['AQILast'])
df = pd.concat([df, expanded_aqi], axis=1)

# Convert data types: each pollutant (and the overall AQI) carries a
# 'color_id' and an 'aqi' column that are cast to integers.
numeric_cols = [
    f'{pollutant}.{field}'
    for pollutant in ('PM25', 'PM10', 'O3', 'CO', 'NO2', 'SO2', 'AQI')
    for field in ('color_id', 'aqi')
]
df[numeric_cols] = df[numeric_cols].astype(int)

# Overwrite every row's date/time with the most frequent value so the whole
# snapshot shares a single timestamp, then combine them into one column.
df['date'] = df['date'].mode()[0]
df['time'] = df['time'].mode()[0]
df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])

# Remove unnecessary columns: the raw date/time parts, the nested source
# columns, and the per-pollutant '.value' columns.
df = df.drop(
    columns=['date', 'time', 'forecast', 'AQILast']
    + [f'{pollutant}.value' for pollutant in ('PM25', 'PM10', 'O3', 'CO', 'NO2', 'SO2')]
)

In [6]:
# Date and time of the snapshot, taken from the row labelled 0; the time is
# rendered with dashes (HH-MM-SS).
current_date = df['datetime'][0].date()
current_time = df['datetime'][0].strftime("%H-%M-%S")

In [7]:
# Staging area: target directory plus a file name stamped with the
# snapshot's date and time.
output_folder = 'staging_input_data'
output_file = '{}_{}_aqi-data.csv'.format(current_date, current_time)

output_path = os.path.join(output_folder, output_file)

In [12]:
# save file to staging area
# to_csv does not create missing directories, so make sure the staging
# folder exists first (no-op when it is already there).
os.makedirs(output_folder, exist_ok=True)
df.to_csv(output_path, encoding='utf-8', index=False)
print(f'AQI data for "{current_date}" at "{current_time}" has been successfully saved in the "{output_folder}" folder.')

AQI data for "2025-05-01" at "23-00-00" has been successfully saved in the "staging_input_data" folder.
