In [4]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from util import *
from joblib import Parallel, delayed


def batch_get_sediment_quantity(date_range, site_codes):
    """Load the sediment and discharge tables restricted to ``date_range``.

    Both CSV files are read with their ``date`` column parsed as datetimes,
    rows whose date is not contained in ``date_range`` are dropped, and the
    ``date`` column becomes the index of the returned frames.

    Args:
        date_range: Collection of dates to keep (anything accepted by
            ``Series.isin``).
        site_codes: Accepted for interface compatibility; not used here.

    Returns:
        Tuple ``(sediment_data, discharge_data)`` of date-indexed DataFrames.
        Missing values are replaced by 0 in the sediment table only; the
        discharge table keeps its NaNs.
    """
    def _load_window(csv_path, fill_missing):
        # Read one table, optionally zero-fill it, then keep the wanted dates.
        table = pd.read_csv(csv_path, parse_dates=['date'])
        if fill_missing:
            table = table.fillna(0)
        wanted_rows = table['date'].isin(date_range)
        return table.loc[wanted_rows].set_index(['date'])

    sediment_table = _load_window('discharge/SSC_sediment.csv', fill_missing=True)
    discharge_table = _load_window('discharge/SSC_discharge.csv', fill_missing=False)
    return sediment_table, discharge_table


def process_site(site, date_range, sediment_data, discharge_data):
    """Build the (time, feature) matrix for a single site.

    Column 0 holds the site's sediment values and column 1 its discharge
    values, aligned to ``date_range``. Dates absent from a table's index,
    or a site whose column is absent from a table, yield 0. Values that are
    present in the table are copied unchanged, including NaN.

    Args:
        site: Site identifier; it is converted with ``str`` to look up the
            column name.
        date_range: Ordered dates defining the rows of the result.
        sediment_data: Date-indexed DataFrame with one column per site.
        discharge_data: Date-indexed DataFrame with one column per site.

    Returns:
        ``numpy.ndarray`` of shape ``(len(date_range), 2)`` and float dtype.
    """
    column = str(site)  # hoisted: the column label is the same for every date
    data = np.zeros((len(date_range), 2))
    for feature, table in enumerate((sediment_data, discharge_data)):
        if column not in table.columns:
            continue  # unknown site in this table: leave the zeros in place
        # One vectorized alignment replaces the per-date membership test and
        # scalar .loc lookup. fill_value only applies to dates missing from
        # the index, so NaNs already stored in the table are preserved.
        aligned = table[column].reindex(date_range, fill_value=0)
        data[:, feature] = aligned.to_numpy(dtype=float)
    return data


def generate_spatio_temporal_data(start_date, end_date, site_codes):
    """Assemble the per-site sediment/discharge array for a date window.

    Args:
        start_date: First day of the window as a ``'%Y-%m-%d'`` string.
        end_date: Last day of the window (inclusive) as a ``'%Y-%m-%d'`` string.
        site_codes: Iterable of site identifiers; one slice of the result is
            produced per entry, in the same order.

    Returns:
        ``numpy.ndarray`` obtained by stacking the ``process_site`` result of
        every site along a new leading axis, i.e. (sites, days, 2).
    """
    first_day = datetime.strptime(start_date, '%Y-%m-%d')
    last_day = datetime.strptime(end_date, '%Y-%m-%d')
    days = pd.date_range(first_day, last_day, freq='D')

    sediment_table, discharge_table = batch_get_sediment_quantity(days, site_codes)

    # One joblib task per site, spread over all available cores.
    site_task = delayed(process_site)
    tasks = (
        site_task(code, days, sediment_table, discharge_table)
        for code in site_codes
    )
    per_site_arrays = Parallel(n_jobs=-1)(tasks)

    return np.stack(per_site_arrays, axis=0)

All data (2015-04-15 -> 2022-12-24)

In [5]:
# Full period: build and save the encoder array for every available day.
start_date = '2015-04-15'
end_date = '2022-12-24'
site_codes = ["4178000", "4182000", "4183000", "4183500", "4184500", "4185000", "4185318", "4185440", "4186500", "4188100",
              "4188496", "4189000", "4190000", "4191058", "4191444", "4191500", "4192500", "4192574", "4192599", "4193500"]

spatio_temporal_data = generate_spatio_temporal_data(start_date, end_date, site_codes)
# perpare_train is not defined in this notebook (presumably provided by the
# `from util import *` star import); called here with its defaults.
# NOTE(review): it appears to emit the "Dialogues and ST data generated" message — confirm.
perpare_train()
print(f"Data shape: {spatio_temporal_data.shape}")
# Array layout is (N, T, F) = (number of sites, number of time steps, number of features);
# for this window the recorded shape is (20, 2811, 2).
# F=0: sediment, F=1: discharge
# print(spatio_temporal_data)
np.save('data/discharge/data_encoder.npy', spatio_temporal_data)

Dialogues and ST data generated successfully!
Data shape: (20, 2811, 2)


New training data (2017-12-19 -> 2022-12-24)

In [6]:
# Later sub-period: build and save the encoder array from 2017-12-19 onwards.
start_date = '2017-12-19'
end_date = '2022-12-24'
site_codes = ["4178000", "4182000", "4183000", "4183500", "4184500", "4185000", "4185318", "4185440", "4186500", "4188100",
              "4188496", "4189000", "4190000", "4191058", "4191444", "4191500", "4192500", "4192574", "4192599", "4193500"]

spatio_temporal_data = generate_spatio_temporal_data(start_date, end_date, site_codes)
# perpare_train is not defined in this notebook (presumably provided by the
# `from util import *` star import). end_date is not passed here, so the
# helper's own default applies — TODO confirm it matches end_date above.
perpare_train(start_date=start_date, st_name='st_data_new.pkl', gpt_name='st_gpt_new.json')
print(f"Data shape: {spatio_temporal_data.shape}")
# Array layout is (N, T, F) = (number of sites, number of time steps, number of features);
# for this window the recorded shape is (20, 1832, 2).
# F=0: sediment, F=1: discharge
# print(spatio_temporal_data)
np.save('data/discharge/data_encoder_new.npy', spatio_temporal_data)

Dialogues and ST data generated successfully!
Data shape: (20, 1832, 2)


In [8]:
# Earlier sub-period: build and save the encoder array for 2015-04-15 .. 2017-12-18,
# i.e. the days preceding the window that starts on 2017-12-19.
start_date = '2015-04-15'
end_date = '2017-12-18'
site_codes = ["4178000", "4182000", "4183000", "4183500", "4184500", "4185000", "4185318", "4185440", "4186500", "4188100",
              "4188496", "4189000", "4190000", "4191058", "4191444", "4191500", "4192500", "4192574", "4192599", "4193500"]

spatio_temporal_data = generate_spatio_temporal_data(start_date, end_date, site_codes)
# perpare_train is not defined in this notebook (presumably provided by the
# `from util import *` star import); both window bounds are passed explicitly.
perpare_train(start_date=start_date, end_date=end_date, st_name='st_data_nan.pkl', gpt_name='st_gpt_nan.json')
print(f"Data shape: {spatio_temporal_data.shape}")
# Array layout is (N, T, F) = (number of sites, number of time steps, number of features);
# for this window the recorded shape is (20, 979, 2).
# F=0: sediment, F=1: discharge
# print(spatio_temporal_data)
np.save('data/discharge/data_encoder_nan.npy', spatio_temporal_data)

Dialogues and ST data generated successfully!
Data shape: (20, 979, 2)
