In [1]:
import pandas as pd
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

In [3]:
# Hourly pollutant readings. station_id is parsed as text so that IDs such as
# '001001' keep their leading zeros.
df = pd.read_csv(
    'data/airquality.csv',
    dtype={'station_id': str},
)
# Distinct station IDs in order of first appearance; this order defines the
# station axis used by the arrays built later in the notebook.
station_ids = pd.unique(df['station_id'])
len(station_ids)

437

In [3]:
# Station metadata, trimmed of the descriptive columns that are not used below.
location_df = (
    pd.read_csv('data/station.csv', dtype={'station_id': str})
    .drop(columns=['name_chinese', 'name_english', 'district_id'])
)
location_ids = pd.unique(location_df['station_id'])
location_ids

array(['001001', '001002', '001003', '001004', '001005', '001006',
       '001007', '001008', '001009', '001010', '001011', '001012',
       '001013', '001014', '001015', '001016', '001017', '001018',
       '001019', '001020', '001021', '001022', '001023', '001024',
       '001025', '001026', '001027', '001028', '001029', '001030',
       '001031', '001032', '001033', '001034', '001035', '001036',
       '004002', '004003', '004007', '004008', '004009', '004011',
       '004014', '004017', '004018', '004019', '004020', '006001',
       '006002', '006003', '006004', '006005', '006006', '006007',
       '006008', '006010', '006011', '006012', '006013', '006014',
       '006015', '006016', '006017', '006019', '006020', '006021',
       '006022', '006023', '006024', '006025', '006026', '006027',
       '006028', '006040', '009016', '009017', '009018', '009019',
       '009020', '009021', '009022', '009023', '009024', '009025',
       '009026', '009027', '009028', '009029', '009030', '0090

In [4]:
# Attach each station's coordinates to its readings.
#
# Columns that come from location_df are dropped from df first, so re-running
# this cell does not create suffixed duplicates (latitude_x / latitude_y) that
# would break the later `drop(columns=['longitude', 'latitude'])` calls.
# On a fresh kernel the drop is a no-op.
location_cols = [col for col in location_df.columns if col != 'station_id']
df = pd.merge(
    df.drop(columns=location_cols, errors='ignore'),
    location_df,
    how='left',
    on='station_id',
    # Each reading must match at most one station row; a duplicated station_id
    # in station.csv would otherwise silently duplicate readings.
    validate='many_to_one',
)

concentration_cols = ['PM25_Concentration', 'PM10_Concentration', 'NO2_Concentration',
                      'CO_Concentration', 'O3_Concentration', 'SO2_Concentration']

# Per-pollutant statistics over all stations and timestamps (NaNs are skipped,
# std uses the pandas default ddof=1).
mean_vals = df[concentration_cols].mean()
std_vals = df[concentration_cols].std()

# Z-score every pollutant column; the Series operands align on column name.
# Standardizing already-standardized columns changes nothing beyond rounding,
# so a re-run of this cell is harmless.
df[concentration_cols] = (df[concentration_cols] - mean_vals) / std_vals

df

Unnamed: 0,station_id,time,PM25_Concentration,PM10_Concentration,NO2_Concentration,CO_Concentration,O3_Concentration,SO2_Concentration,latitude,longitude
0,001001,2014-05-01 00:00:00,0.997510,0.424657,0.424169,-0.350256,-0.105732,-0.391067,40.090679,116.173553
1,001001,2014-05-01 01:00:00,0.794916,0.468201,-0.116648,-0.350256,-0.099804,-0.376706,40.090679,116.173553
2,001001,2014-05-01 02:00:00,0.838329,0.318219,0.402659,-0.266886,-0.572079,-0.403376,40.090679,116.173553
3,001001,2014-05-01 03:00:00,0.867271,0.291126,0.713014,-0.266886,-0.917887,-0.401324,40.090679,116.173553
4,001001,2014-05-01 04:00:00,0.722561,0.036641,0.749888,-0.266886,-1.070042,-0.405427,40.090679,116.173553
...,...,...,...,...,...,...,...,...,...,...
2891388,372002,2015-04-30 18:00:00,-0.218055,-0.246873,-0.814178,0.004898,0.550314,-0.292595,24.417777,111.526388
2891389,372002,2015-04-30 19:00:00,-0.073345,-0.314606,-0.937091,-0.184351,0.234147,-0.313110,24.417777,111.526388
2891390,372002,2015-04-30 20:00:00,-0.116758,-0.333959,-0.414711,0.003231,-0.338906,-0.313110,24.417777,111.526388
2891391,372002,2015-04-30 21:00:00,-0.189113,-0.304930,-0.721994,-0.148502,-0.595791,-0.313110,24.417777,111.526388


In [5]:
# Find the station whose readings span the most distinct calendar days. Its
# sorted list of days becomes the shared day axis for every station.
# The `time` strings look like 'YYYY-MM-DD HH:MM:SS', so dropping the last nine
# characters leaves just the date part.
longest = 0
longest_subdf = None
for station_id, subdf in df.groupby('station_id'):
    subdf = (
        subdf
        .drop(columns=['station_id', 'longitude', 'latitude'])
        .set_index('time')
        .sort_index()
    )
    dates = subdf.index.str[:-9].unique()
    # Strict comparison: on a tie the station seen first is kept.
    if longest < len(dates):
        longest, longest_station, longest_subdf = len(dates), station_id, subdf

# Day label <-> position on the day axis.
dates = longest_subdf.index.str[:-9].unique()
date_to_int = dict(zip(dates, range(len(dates))))
int_to_date = dict(enumerate(dates))

# Hour label ('00:00:00' ... '23:00:00') <-> hour of day.
hour_to_int = {f'{h:02d}:00:00': h for h in range(24)}
int_to_hour = {h: label for label, h in hour_to_int.items()}

# Station ID <-> position on the station axis (order of `station_ids`).
station_id_to_int = dict(zip(station_ids, range(len(station_ids))))
int_to_station_id = dict(enumerate(station_ids))

In [6]:
# Dense tensor of standardized readings indexed as [station, day, hour, pollutant].
# Cells without a reading hold the sentinel -10 (this assumes no real standardized
# value is ever exactly -10).
n_days, n_hours = 365, 24
if len(date_to_int) > n_days:
    # The day axis is fixed at 365 because later cells reshape with 365 * 24.
    raise ValueError(
        f'date_to_int has {len(date_to_int)} days but the tensor only holds {n_days}'
    )

data = np.full(
    (len(station_ids), n_days, n_hours, len(concentration_cols)),
    -10,
    dtype=np.float32,
)

# Translate every row's station / date / hour into tensor coordinates in one pass.
# Labels missing from a lookup table map to NaN and are skipped below, which
# mirrors the `if date in date_to_int and hour in hour_to_int` filter of the
# row-by-row version this replaces.
row_station = df['station_id'].map(station_id_to_int).to_numpy(dtype=np.float64)
row_day = df['time'].str[:-9].map(date_to_int).to_numpy(dtype=np.float64)
row_hour = df['time'].str[-8:].map(hour_to_int).to_numpy(dtype=np.float64)
known = ~(np.isnan(row_station) | np.isnan(row_day) | np.isnan(row_hour))

# One fancy-indexed assignment writes all six pollutant channels per reading.
# concentration_cols is ordered PM25, PM10, NO2, CO, O3, SO2, i.e. channels 0..5.
# If a (station, time) pair were duplicated, the last row in df would win.
data[
    row_station[known].astype(np.intp),
    row_day[known].astype(np.intp),
    row_hour[known].astype(np.intp),
] = df[concentration_cols].to_numpy(dtype=np.float32)[known]

# Readings that exist as rows but are NaN also count as missing.
data[np.isnan(data)] = -10

data

array([[[[ 9.97510433e-01,  4.24657494e-01,  4.24169093e-01,
          -3.50255996e-01, -1.05732352e-01, -3.91066700e-01],
         [ 7.94916213e-01,  4.68200535e-01, -1.16647609e-01,
          -3.50255996e-01, -9.98042226e-02, -3.76706213e-01],
         [ 8.38329256e-01,  3.18218976e-01,  4.02659327e-01,
          -2.66886473e-01, -5.72078824e-01, -4.03375685e-01],
         ...,
         [ 7.22561121e-01,  1.59548128e+00, -1.11224198e+00,
          -7.67103612e-01,  5.81931055e-01, -5.12104988e-01],
         [-3.04880887e-01, -5.27483463e-01, -1.17984402e+00,
          -8.50473106e-01,  5.42410195e-01, -5.40825963e-01],
         [-4.64062035e-01, -7.41328180e-01, -1.18291688e+00,
          -8.50473106e-01,  5.20673692e-01, -5.61340928e-01]],

        [[-2.90409863e-01, -4.39429790e-01, -1.11224198e+00,
          -8.50473106e-01,  4.43607956e-01, -5.61340928e-01],
         [-3.91706944e-01, -5.79735100e-01, -1.03542137e+00,
          -7.67103612e-01,  3.68518263e-01, -5.42877436e-01],


In [7]:
# Merge the (day, hour) axes into a single hourly timeline per station.
data_flat = data.reshape(len(station_ids), 365 * 24, 6)

# Every 24-hour window with stride 1 along the timeline.
# sliding_window_view appends the window axis last: (station, start, pollutant, hour).
# The transpose reorders to (start, station, hour, pollutant), and .copy() turns
# the strided view into an independent, writable array.
data_windowed = (
    sliding_window_view(data_flat, window_shape=24, axis=1)
    .transpose(1, 0, 3, 2)
    .copy()
)

# Untouched snapshot, kept so the original missing-value pattern stays available
# after data_windowed is gap-filled.
original_data_windowed = data_windowed.copy()
data_windowed.shape

(8737, 437, 24, 6)

In [8]:
# A (window, station, pollutant) series is "null" when all 24 of its hourly
# values are the -10 sentinel. The mask is computed once and reused, because each
# evaluation scans the whole windowed array and allocates a same-sized boolean
# temporary.
null_mask = np.all(original_data_windowed == -10, axis=2)

# Count and identify time series where all values are -10
null_time_series_count = np.sum(null_mask)
print(f'Number of time series with all values -10: {null_time_series_count}')

# Get indices of null time series: one (window, station, pollutant) row per series
null_time_series = np.argwhere(null_mask)
print(f'Shape of null_time_series: {null_time_series.shape}')

Number of time series with all values -10: 3729128
Shape of null_time_series: (3729128, 3)


In [26]:
# Distinct (window, station) pairs that have at least one entirely-missing
# pollutant series. Rows of null_time_series are (window, station, pollutant),
# so the first two columns identify the sample.
# np.unique(axis=0) de-duplicates the rows without a Python-level loop and
# returns them in sorted order, so the saved array is deterministic. The set of
# pairs is the same as before; only the row order differs from the set-based
# version this replaces.
null_samples = np.unique(null_time_series[:, :2], axis=0)
null_samples.shape

(751081, 2)

In [10]:
# Pollutant-major view (window, station, pollutant, hour) of the untouched snapshot.
# A separate name is used instead of rebinding original_data_windowed to the
# transposed view and back, so an interrupted or failed run of this cell cannot
# leave the global in the wrong axis order.
original_by_series = original_data_windowed.transpose(0, 1, 3, 2)

# Keep only series with at least one observed hour; the result is (n_series, 24).
original_filtered = original_by_series[~np.all(original_by_series == -10, axis=3)]

print(f'Shape of filtered data: {original_filtered.shape}')
print(f'Number of -10 values in data_windowed: {np.sum(original_data_windowed == -10)}')
print(f'Number of -10 values in filtered data: {np.sum(original_filtered == -10)}')

Shape of filtered data: (19179286, 24)
Number of -10 values in data_windowed: 147439296
Number of -10 values in filtered data: 57940224


In [11]:
def _fill_missing_runs(series):
    """Return a copy of a 1-D series with each run of -10 values filled in.

    A run touching the start takes the first observed value after it, a run
    touching the end takes the last observed value before it, and an interior
    run takes the mean of its two observed neighbours (one constant for the
    whole run). Returns None when there is nothing to do: the series is either
    fully observed or entirely -10.
    """
    filled = series.copy()
    is_missing = filled == -10
    if is_missing.all() or not is_missing.any():
        return None

    n = len(filled)
    start = 0
    while start < n:
        if not is_missing[start]:
            start += 1
            continue
        # [start, stop) is a maximal run of missing values.
        stop = start
        while stop < n and is_missing[stop]:
            stop += 1
        if start == 0:
            filled[start:stop] = filled[stop]
        elif stop == n:
            filled[start:stop] = filled[start - 1]
        else:
            filled[start:stop] = (filled[start - 1] + filled[stop]) / 2
        start = stop
    return filled


# Gap-fill every (window, station, pollutant) series of data_windowed in place.
# Series that are entirely -10 are left untouched.
n_windows, n_stations, _, n_channels = data_windowed.shape
for cycle_idx in range(n_windows):
    for station_idx in range(n_stations):
        for channel in range(n_channels):
            arr = _fill_missing_runs(data_windowed[cycle_idx, station_idx, :, channel])
            if arr is not None:
                data_windowed[cycle_idx, station_idx, :, channel] = arr

        

In [21]:
# Count the remaining sentinels once. The array is not modified in this cell, so
# the same number serves both report lines below.
remaining_missing = np.sum(data_windowed == -10)
print(f"Number of -10 values in data_windowed after filling: {remaining_missing}")

# Pollutant-major view (window, station, pollutant, hour) of the filled data.
# A separate name is used instead of rebinding data_windowed to the transposed
# view and back, so an interrupted run cannot leave the global in the wrong axis
# order.
filled_by_series = data_windowed.transpose(0, 1, 3, 2)

# Keep the series that had at least one observed hour before filling. The mask
# comes from the untouched snapshot and has shape (window, station, pollutant).
filtered = filled_by_series[~np.all(original_data_windowed == -10, axis=2)]

print(f'Shape of filtered data: {filtered.shape}')
print(f'Number of -10 values in data_windowed: {remaining_missing}')
print(f'Number of -10 values in filtered data: {np.sum(filtered == -10)}')

Number of -10 values in data_windowed after filling: 89499072
Shape of filtered data: (19179286, 24)
Number of -10 values in data_windowed: 89499072
Number of -10 values in filtered data: 0


In [21]:
# Persist the gap-filled windows, laid out as (window, station, hour, pollutant).
# Series that were entirely missing still hold the -10 sentinel.
np.save(file='data/data_windowed.npy', arr=data_windowed)

In [None]:
# Persist the (window, station) index pairs that have at least one
# entirely-missing pollutant series.
np.save(file='data/null_stations.npy', arr=null_samples)

In [23]:
# Per-station static features: column 0 is longitude, column 1 is latitude,
# with rows in the same order as the station axis of the data tensors.
coords_by_station = (
    location_df
    .drop_duplicates(subset='station_id')  # first row wins if an ID repeats
    .set_index('station_id')[['longitude', 'latitude']]
)

ordered_ids = [int_to_station_id[i] for i in range(len(station_ids))]

# Fail with the offending IDs spelled out, rather than an IndexError from
# `.values[0]` on an empty selection.
missing_ids = [sid for sid in ordered_ids if sid not in coords_by_station.index]
if missing_ids:
    raise KeyError(f'No coordinates in station.csv for station_id(s): {missing_ids}')

static = coords_by_station.loc[ordered_ids].to_numpy(dtype=np.float32)

static.shape

(437, 2)

In [None]:
# Persist the per-station (longitude, latitude) table.
np.save(file='data/static.npy', arr=static)