In [2]:
import numpy as np
import pandas as pd

In [38]:
from sklearn.model_selection import train_test_split

# Directory that holds the raw CSV inputs; the prepared outputs are written here too.
path = 'Data/AirQualityComplete/'

# Identifier columns are read as strings so they act as text join keys
# instead of being inferred as numbers.
air_quality = pd.read_csv(path + 'airquality.csv', dtype={'station_id': str})
station = pd.read_csv(path + 'station.csv', dtype={'station_id': str, 'district_id': str})
# The meteorology file names its key `id`; rename it so it lines up with the
# `district_id` column of the station table.
meteorology = pd.read_csv(path + 'meteorology.csv', dtype={'id': str}).rename(columns={'id': 'district_id'})
# Keep only meteorology rows whose id is at least 5 characters long.
# NOTE(review): presumably shorter ids denote a coarser (non-district) level — confirm.
meteorology = meteorology[meteorology['district_id'].str.len() >= 5]

# Attach station metadata to every reading, then the meteorology record that
# shares the reading's `time` and the station's `district_id`.
df = pd.merge(air_quality, station, how='left', on='station_id')
df = pd.merge(df, meteorology, how='left', on=['time', 'district_id'])

# A left merge repeats a left-hand row once per matching right-hand row, so
# duplicate keys in station.csv or meteorology.csv would silently multiply
# readings (and later leak copies across the train/val/test split). Fail loudly.
if len(df) != len(air_quality):
    raise ValueError(
        f"Merging changed the row count from {len(air_quality)} to {len(df)}; "
        "check station.csv / meteorology.csv for duplicate join keys."
    )

# Readable label for each numeric `weather` code; the list position is the code.
weather_map = dict(enumerate([
    "Sunny",
    "Cloudy",
    "Overcast",
    "Rainy",
    "Sprinkle",
    "Moderate rain",
    "Heavier rain",
    "Rain storm",
    "Thunder storm",
    "Freezing rain",
    "Snowy",
    "Light snow",
    "Moderate snow",
    "Heavy snow",
    "Foggy",
    "Sand storm",
    "Dusty",
]))

# One 0/1 indicator column per weather label, named by the bare label (no prefix).
# Rows whose code is missing or not in the map get 0 in every indicator.
# NOTE(review): only labels that actually occur in the data produce a column,
# so the output schema depends on the input — confirm this is acceptable.
weather_dummies = pd.get_dummies(
    df['weather'].map(weather_map), prefix='', prefix_sep=''
).astype(int)

# Replace the raw code and the no-longer-needed join/name columns with the
# indicator columns, which are appended on the right.
df = pd.concat(
    [
        df.drop(columns=['weather', 'district_id', 'name_chinese', 'name_english']),
        weather_dummies,
    ],
    axis=1,
)

# Break the timestamp into calendar components, appended as the last four
# columns, and drop the original `time` column.
timestamps = pd.to_datetime(df['time'])
df = df.assign(
    year=timestamps.dt.year,
    month=timestamps.dt.month,
    day=timestamps.dt.day,
    hour=timestamps.dt.hour,
).drop(columns='time')

# Every remaining missing value, in any column, becomes the sentinel -1.
# NOTE(review): this also applies to the pollutant concentration columns —
# confirm downstream code treats -1 as "missing" rather than a real reading.
df = df.fillna(-1)

# Persist the full prepared table before splitting.
df.to_csv(path + 'AirQualityComplete.csv', index=False)

# Seeded random split: 20% is held out, then halved into validation and test,
# giving roughly 80/10/10.
train_df, temp_df = train_test_split(df, random_state=42, test_size=0.2)
val_df, test_df = train_test_split(temp_df, random_state=42, test_size=0.5)

# Write each subset next to the source data, without the index column.
for filename, subset in (
    ('train.csv', train_df),
    ('val.csv', val_df),
    ('test.csv', test_df),
):
    subset.to_csv(path + filename, index=False)

# Displayed as the cell output: (rows, columns) of each subset.
train_df.shape, val_df.shape, test_df.shape

((2313114, 34), (289139, 34), (289140, 34))

In [37]:
# Display the column names of the prepared frame as a quick schema check.
df.columns

Index(['station_id', 'PM25_Concentration', 'PM10_Concentration',
       'NO2_Concentration', 'CO_Concentration', 'O3_Concentration',
       'SO2_Concentration', 'latitude', 'longitude', 'temperature', 'pressure',
       'humidity', 'wind_speed', 'wind_direction', 'Cloudy', 'Dusty', 'Foggy',
       'Freezing rain', 'Heavier rain', 'Heavy snow', 'Light snow',
       'Moderate rain', 'Moderate snow', 'Overcast', 'Rain storm', 'Rainy',
       'Sand storm', 'Sprinkle', 'Sunny', 'Thunder storm', 'year', 'month',
       'day', 'hour'],
      dtype='object')