In [1]:
import pandas as pd
import numpy as np
import os
from numpy.lib.stride_tricks import sliding_window_view

In [9]:
# Load every raw station file in `directory` into a single DataFrame `df`
# (keeping only the columns listed in `columns`) and report, per retained
# column, the NaN rate averaged over files.
directory = "/Users/victorli/Downloads/Sensorscope/stbernard-meteo/"
original_columns = ['station_id', 'year', 'month', 'day', 'hour', 'minute', 'second', 'time_since_epoch', 'ambient_temp', 'surface_temp', 'solar_radiation', 'relative_humidity', 'soil_moisture', 'watermark', 'rain_meter', 'wind_speed', 'wind_direction']
dropped_columns = ['time_since_epoch', 'second', 'solar_radiation', 'wind_speed', 'relative_humidity', 'soil_moisture', 'watermark']
columns = [col for col in original_columns if col not in dropped_columns]

# Per-column list holding one NaN rate per loaded file.
nan_rates = {col: [] for col in columns}

# Frames are collected here and concatenated once after the loop; growing
# `df` with pd.concat on every iteration re-copies all rows each time.
frames = []

# sorted() makes the load order reproducible (os.listdir order is arbitrary).
for filename in sorted(os.listdir(directory)):
    file_path = os.path.join(directory, filename)
    # Skip sub-directories and hidden files such as macOS's .DS_Store.
    if filename.startswith('.') or not os.path.isfile(file_path):
        continue

    cur = pd.read_csv(file_path, sep=' ', header=None)
    # Assigning (rather than passing names=) keeps the column-count check.
    cur.columns = original_columns
    cur = cur.drop(dropped_columns, axis=1)
    frames.append(cur)

    for col in cur.columns:
        nan_rates[col].append(cur[col].isna().mean())

# Fall back to an empty, correctly-labelled frame when no file was found.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=columns)

avg_nan_rates = {col: np.mean(nan_rates[col]) for col in columns}
for col, avg_rate in avg_nan_rates.items():
    print(f"{col}: {avg_rate:.4f}")

  df = pd.concat([df, cur], ignore_index=True)


station_id: 0.0000
year: 0.0000
month: 0.0000
day: 0.0000
hour: 0.0000
minute: 0.0000
ambient_temp: 0.0014
surface_temp: 0.0005
rain_meter: 0.0000
wind_direction: 0.0015


In [11]:
# Scatter the long-format rows of `df` into a dense array indexed by
# (station, year, month, day, hour, minute, feature). Slots with no
# matching row stay NaN.
num_stations = 31
num_years = 1
num_months = 2
num_days = 30
num_hours = 24
num_minutes = 60

# The first six entries of `columns` are the index columns
# (station_id, year, month, day, hour, minute); the rest are measurements.
feature_columns = columns[6:]

data = np.full((num_stations, num_years, num_months, num_days, num_hours, num_minutes, len(feature_columns)), np.nan)
print(data.shape)

# (column, value subtracted to obtain a zero-based index, axis length)
index_spec = [
    ('station_id', 2, num_stations),
    ('year', 2007, num_years),
    ('month', 9, num_months),
    ('day', 1, num_days),
    ('hour', 0, num_hours),
    ('minute', 0, num_minutes),
]

index_arrays = []
for col, offset, axis_len in index_spec:
    idx = df[col].astype('int64').to_numpy() - offset
    # Negative indices would silently wrap around to the end of the axis,
    # so reject anything outside [0, axis_len) explicitly.
    out_of_range = (idx < 0) | (idx >= axis_len)
    if out_of_range.any():
        raise IndexError(
            f"{int(out_of_range.sum())} row(s) have '{col}' outside the expected range "
            f"[{offset}, {offset + axis_len - 1}]"
        )
    index_arrays.append(idx)

# One vectorised assignment replaces the row-by-row loop. When the same slot
# appears more than once, the last row wins, as it did with sequential writes.
data[tuple(index_arrays)] = df[feature_columns].to_numpy(dtype=float)

(31, 1, 2, 30, 24, 60, 4)


In [12]:
# Merge the year/month/day/hour/minute axes into one flat time axis, leaving
# the station axis first and the feature axis last.
flat_time_steps = num_years * num_months * num_days * num_hours * num_minutes
reshaped_data = data.reshape((num_stations, flat_time_steps, data.shape[-1]))
print(reshaped_data.shape)

(31, 86400, 4)


In [13]:
# Turn every run of 24 consecutive time steps into one sample. The result is
# a view over `reshaped_data` with the window axis appended last.
data = sliding_window_view(reshaped_data, 24, axis=1)
print(data.shape)

(31, 86377, 4, 24)


In [23]:
# Drop windows that carry too little information:
#   1. windows in which every sensor is entirely NaN, and
#   2. windows in which fewer than `min_active_sensors` sensors have at
#      least one non-NaN value.
# Both filters come from a single per-(sensor, window) emptiness mask and
# are applied in one selection, so the large array is copied only once.
# The `data_non_null` intermediate is therefore no longer created.
min_active_sensors = 23

# sensor_window_empty[s, t] is True when sensor s has only NaNs in window t.
# It is filled one sensor at a time to keep the temporary boolean array small.
sensor_window_empty = np.empty(data.shape[:2], dtype=bool)
for sensor_idx in range(data.shape[0]):
    sensor_window_empty[sensor_idx] = np.isnan(data[sensor_idx]).all(axis=(1, 2))

# Number of sensors with any valid value, per window.
active_sensor_counts = (~sensor_window_empty).sum(axis=0)

window_all_empty = active_sensor_counts == 0
null_samples_indices = np.flatnonzero(window_all_empty).tolist()
print(f"Number of samples with NaN values: {len(null_samples_indices)}")

# Indices are relative to the windows that remain after removing the fully
# empty ones, matching the two-step filtering this replaces.
not_enough_sensors_indices = np.flatnonzero(
    active_sensor_counts[~window_all_empty] < min_active_sensors
).tolist()

print(f"Number of samples with not enough sensors: {len(not_enough_sensors_indices)}")

keep_window = ~window_all_empty & (active_sensor_counts >= min_active_sensors)
data_final = data[:, keep_window]
data = data_final
print(data.shape)

Number of samples with NaN values: 25656
Number of samples with not enough sensors: 48756
(31, 11965, 4, 24)


In [24]:
# Record every (time_idx, sensor_idx) pair whose window contains only NaNs.
fully_missing = np.isnan(data).all(axis=(2, 3))  # indexed [sensor_idx, time_idx]

# argwhere walks the mask sensor by sensor with time ascending, so pairs enter
# the set in the same order as a sensor-outer / time-inner nested loop.
null_set = {
    (time_pos, sensor_pos)
    for sensor_pos, time_pos in np.argwhere(fully_missing).tolist()
}

print(len(null_set))

95720


In [25]:
# Fill gaps inside each usable (sensor, window) sample, feature by feature:
#   * two or more observed steps -> linear interpolation (np.interp holds the
#     first/last observed value constant beyond the observed range),
#   * exactly one observed step  -> that value is broadcast over the window,
#   * no observed step           -> the row is left as NaN.
# NOTE(review): because of the last case, `data_filled` can still contain NaNs
# in samples that are not in `null_set` -- confirm downstream code copes.
data_filled = data.copy()
step_positions = np.arange(data.shape[-1])  # positions along the window axis

for sensor_idx, time_idx in np.ndindex(data.shape[0], data.shape[1]):
    # Samples that are entirely NaN have nothing to interpolate from.
    if (time_idx, sensor_idx) in null_set:
        continue

    window = data[sensor_idx, time_idx]
    repaired = window.copy()

    for feature_idx, series in enumerate(window):
        observed = ~np.isnan(series)
        n_observed = int(observed.sum())

        if n_observed >= 2:
            repaired[feature_idx] = np.interp(
                step_positions, step_positions[observed], series[observed]
            )
        elif n_observed == 1:
            repaired[feature_idx] = series[observed][0]

    data_filled[sensor_idx, time_idx] = repaired

In [26]:
# Reorder axes from (sensor, window, feature, step) to
# (window, sensor, step, feature), then persist the filled array together
# with the (time_idx, sensor_idx) pairs of the all-NaN samples.
data_filled = np.transpose(data_filled, (1, 0, 3, 2))
null_pairs = np.array(list(null_set))

print(data_filled.shape)
print(null_pairs.shape)

np.save('../data/stbernard_null_set.npy', null_pairs)
np.save('../data/stbernard_data_filled.npy', data_filled)

(11965, 31, 24, 4)
(95720, 2)


In [27]:
# Read the tab-separated station location table, keep its second and third
# columns as the per-station static features, and save them.
# NOTE(review): the source file is named "..._XY.txt" but the columns are
# labelled latitude/longitude here -- confirm the coordinate convention.
station_table = pd.read_csv(
    '/Users/victorli/Downloads/Sensorscope/stbernard-location/station_gsb_XY.txt',
    sep='\t',
    header=0,
)
static = station_table.iloc[:, [1, 2]].copy()
static.columns = ['latitude', 'longitude']
np.save('../data/stbernard_static.npy', static)

In [28]:
# Sanity check: every (sensor, window) sample of `data` that is entirely NaN
# must be recorded in `null_set`. This is the contrapositive of "any sample
# not in null_set is not all-NaN".
# NOTE(review): this inspects `data`, not the interpolated array, so it only
# re-verifies how `null_set` was built -- confirm that is the intent.
all_nan_samples = np.isnan(data).all(axis=(2, 3))  # indexed [sensor_idx, time_idx]
for sensor_idx, time_idx in np.argwhere(all_nan_samples).tolist():
    assert (time_idx, sensor_idx) in null_set