In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler


In [2]:

def minmaxScale(data, columns):
    """Rescale the selected columns of ``data`` with a newly fitted MinMaxScaler.

    The scaler (default settings) is fitted on ``data[columns]`` itself and the
    transformed values are written back into the same frame, so the object
    passed in by the caller is modified.

    Args:
        data: DataFrame that holds the columns to rescale.
        columns: list of column labels to rescale.

    Returns:
        The same ``data`` object with ``columns`` overwritten.
    """
    rescaled = MinMaxScaler().fit_transform(data[columns])
    data[columns] = rescaled
    return data

def stdScale(data, columns):
    """Standardise the selected columns of ``data`` with a newly fitted StandardScaler.

    The scaler (default settings) is fitted on ``data[columns]`` itself and the
    transformed values replace the originals in the same frame, so the object
    passed in by the caller is modified.

    Args:
        data: DataFrame that holds the columns to standardise.
        columns: list of column labels to standardise.

    Returns:
        The same ``data`` object with ``columns`` overwritten.
    """
    standardised = StandardScaler().fit_transform(data[columns])
    data[columns] = standardised
    return data

def _count_rows(condition):
    """Return the number of True entries in a boolean Series as a plain int."""
    return int(condition.sum())


def _clean_columns(csv_data, columns, shift):
    """Repair placeholder and out-of-range values, printing counts before and after.

    ``shift`` is added to the train-layout positions (7, 9, 11:13, 13) to find
    the same columns in a layout that has fewer leading columns.
    """
    zero_col = columns[7 + shift]
    capped_col = columns[9 + shift]
    zero_pair = columns[11 + shift:13 + shift]
    outlier_col = columns[13 + shift]

    # Remove missing values: 0 is replaced by the column mean. The mean is taken
    # from that single column so non-numeric columns elsewhere in the frame
    # cannot make the computation fail.
    zero_msg = "{0} == 0 data num: {1}"
    print(zero_msg.format(zero_col, _count_rows(csv_data[zero_col] == 0)))
    csv_data = csv_data.replace({zero_col: 0}, csv_data[zero_col].mean())
    print(zero_msg.format(zero_col, _count_rows(csv_data[zero_col] == 0)))
    print()

    # Only the exact value 9 is rewritten to 8; the printed "> 8" count after
    # the replacement shows whether any larger value is still present.
    over_msg = "{0} > 8 data num: {1}"
    print(over_msg.format(capped_col, _count_rows(csv_data[capped_col] > 8)))
    csv_data = csv_data.replace({capped_col: 9}, 8)
    print(over_msg.format(capped_col, _count_rows(csv_data[capped_col] > 8)))
    print()

    # Two adjacent columns: 0 is replaced by each column's own mean.
    print(zero_msg.format(zero_pair, sum(_count_rows(csv_data[c] == 0) for c in zero_pair)))
    csv_data = csv_data.replace({c: 0 for c in zero_pair}, csv_data[list(zero_pair)].mean())
    print(zero_msg.format(zero_pair, sum(_count_rows(csv_data[c] == 0) for c in zero_pair)))
    print()

    # Values above 30 are replaced by the mean of the *distinct* values <= 30.
    high_msg = "{0} > 30 data num: {1}"
    too_high = csv_data[outlier_col] > 30
    print(high_msg.format(outlier_col, _count_rows(too_high)))
    normal_mean = np.mean(csv_data.loc[csv_data[outlier_col] <= 30, outlier_col].unique())
    csv_data.loc[too_high, outlier_col] = normal_mean
    print(high_msg.format(outlier_col, _count_rows(csv_data[outlier_col] > 30)))
    print()

    return csv_data


def get_data(path, mode):
    """Read a CSV file, keep the working columns and repair placeholder values.

    The last five columns of the file and the columns named
    'Wind Direction (deg)' and 'Cloud Height 3rd Layer (FT)' are discarded.
    For a recognised ``mode`` the frame is restricted to the remaining columns
    and cleaned by position:

    * zeros in one column are replaced by that column's mean,
    * the value 9 in one column is replaced by 8,
    * zeros in two adjacent columns are replaced by each column's mean,
    * values above 30 in one column are replaced by the mean of the distinct
      values that are <= 30.

    The positions are 7, 9, 11:13 and 13 for ``'train'`` and one less for
    ``'valid'``/``'validation'``. In the recorded runs these positions held
    'Visibility (m)', 'Cloud Cover 1st Layer (1/8)', the 1st/2nd layer cloud
    heights and 'Dew Point Temperature'. Before/after counts are printed.

    Args:
        path: anything accepted by ``pandas.read_csv``.
        mode: ``'train'``, ``'valid'`` or ``'validation'``. Any other value
            skips both the column restriction and the cleaning.

    Returns:
        ``(csv_data, columns)``: the (possibly cleaned) DataFrame and the array
        of kept column labels.
    """
    csv_data = pd.read_csv(path)
    columns = csv_data.columns.values[:-5]
    for unused in ('Wind Direction (deg)', 'Cloud Height 3rd Layer (FT)'):
        columns = np.delete(columns, np.where(columns == unused))

    if mode == 'train':
        shift = 0
    elif mode == 'valid' or mode == 'validation':
        # This layout has one leading column fewer, so every position moves down by one.
        shift = -1
    else:
        return csv_data, columns

    csv_data = _clean_columns(csv_data[columns], columns, shift)
    return csv_data, columns

def interpol(data, columns):
    """Place ``data`` on a gap-free hourly time axis and fill the inserted rows.

    The 'ID' column is dropped, a 'Datetime' index is assembled from the
    'Year', 'Month', 'Day' and 'Hour' columns, and the frame is resampled to
    hourly frequency. Rows created by the resampling are filled as follows:

    * 'Weather Phenomenon (null)', 'Cloud Type 1st Layer (null)' and
      'Total Cloud Cover (1/8)' are forward-filled,
    * every other label in ``columns`` (except the calendar fields and 'ID')
      is interpolated with ``method='time'``,
    * the four calendar columns are rewritten from the new index.

    Args:
        data: DataFrame containing 'ID', the calendar columns, the three
            forward-filled columns and every other label in ``columns``.
        columns: iterable of column labels considered for interpolation.

    Returns:
        A new DataFrame indexed by 'Datetime'; the input frame is not modified.
    """
    calendar_fields = ['Year', 'Month', 'Day', 'Hour']
    held_columns = ['Weather Phenomenon (null)',
                    'Cloud Type 1st Layer (null)',
                    'Total Cloud Cover (1/8)']
    excluded = calendar_fields + ['ID'] + held_columns
    smooth_columns = [label for label in columns if label not in excluded]

    frame = data.drop(columns=['ID'])
    frame = frame.assign(Datetime=pd.to_datetime(frame[calendar_fields]))
    frame = frame.set_index('Datetime')

    hourly = frame.resample('h').asfreq()
    hourly[smooth_columns] = hourly[smooth_columns].interpolate(method='time')
    hourly[held_columns] = hourly[held_columns].ffill()

    # Inserted rows have empty calendar fields; rebuild all four from the index.
    stamps = hourly.index
    hourly['Year'] = stamps.year
    hourly['Month'] = stamps.month
    hourly['Day'] = stamps.day
    hourly['Hour'] = stamps.hour

    return hourly

def windowed(data, mode, window, hop):
    """Cut ``data`` into fixed-length sliding windows.

    Windows start at rows ``0, hop, 2*hop, ...`` and every start that leaves a
    complete window of ``window`` rows is used, including the one ending on the
    last row.

    Args:
        data: DataFrame of features; in ``'train'`` mode it must also contain
            a 'Temperature' column, which is split off as the target.
        mode: ``'train'`` returns features and targets; ``'valid'`` or
            ``'validation'`` returns features only.
        window: number of consecutive rows per window.
        hop: distance in rows between the starts of consecutive windows.
            ``None`` or ``0`` is treated as 1.

    Returns:
        ``'train'``: ``(X, y)`` where ``X`` stacks the feature windows and
        ``y`` stacks the matching 'Temperature' windows.
        ``'valid'``/``'validation'``: ``X`` only.
        Any other mode: ``None``.

    Raises:
        ValueError: if ``hop`` is negative.
    """
    stride = int(hop) if hop else 1
    if stride < 0:
        raise ValueError("hop must be a positive integer, got {0!r}".format(hop))

    if mode == 'train':
        target = data['Temperature'].to_numpy()
        features = data.drop(columns=['Temperature']).to_numpy()
    elif mode == 'valid' or mode == 'validation':
        target = None
        features = data.to_numpy()
    else:
        return None

    # Convert once and slice the array: far cheaper than slicing the DataFrame
    # for every window. "+ 1" keeps the final complete window.
    starts = range(0, len(features) - window + 1, stride)
    X = np.array([features[start:start + window] for start in starts])
    if target is None:
        return X
    y = np.array([target[start:start + window] for start in starts])
    return X, y

def prepare(data, columns, mode, window, hop):
    """Scale, resample and window a cleaned frame.

    Columns are selected by position in ``columns``, counted from a base
    position: 6 for ``'train'`` and 5 for ``'valid'``/``'validation'``. In the
    recorded runs the train frame has a 'Temperature' column at position 5
    that the test frame lacks.

    * min-max scaled: base+3 and base+11
    * standardised afterwards: base+0..base+2 and base+4..base+11

    NOTE(review): base+11 is min-max scaled and then standardised as well;
    confirm that this double scaling is intended.

    The frame is then passed through ``interpol`` and ``windowed``.

    Args:
        data: cleaned DataFrame.
        columns: sequence of its column labels.
        mode: ``'train'``, ``'valid'`` or ``'validation'``.
        window: forwarded to ``windowed``.
        hop: forwarded to ``windowed``.

    Returns:
        Whatever ``windowed`` returns for ``mode``.

    Raises:
        ValueError: if ``mode`` is not one of the supported values.
    """
    if mode == 'train':
        base = 6
    elif mode == 'valid' or mode == 'validation':
        base = 5
    else:
        raise ValueError(
            "mode must be 'train', 'valid' or 'validation', got {0!r}".format(mode))

    minmax_column = [columns[base + 3], columns[base + 11]]
    std_column = [columns[base + step] for step in (0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11)]

    data = minmaxScale(data, minmax_column)
    data = stdScale(data, std_column)
    data = interpol(data, columns)
    data = windowed(data, mode, window, hop)
    return data

In [3]:
# Load the training CSV and apply the train-mode cleaning done by get_data.
# `mode`, `data` and `columns` are notebook-level names read by the next cell.
path = 'train.csv'
mode ='train'
data, columns = get_data(path, mode)
print(data)

Visibility (m) == 0 data num: 569
Visibility (m) == 0 data num: 0

Cloud Cover 1st Layer (1/8) > 8 data num: 846
Cloud Cover 1st Layer (1/8) > 8 data num: 0

['Cloud Height 1st Layer (FT)' 'Cloud Height 2nd Layer (FT)'] == 0 data num: 192517
['Cloud Height 1st Layer (FT)' 'Cloud Height 2nd Layer (FT)'] == 0 data num: 0

Dew Point Temperature > 30 data num: 22649
Dew Point Temperature > 30 data num: 0

            ID  Year  Month  Day  Hour  Temperature  Wind Speed (KT)  \
0            6  1983      1    1     5         -3.0                0   
1            7  1983      1    1     6         -4.0                4   
2            8  1983      1    1     7         -4.3                2   
3            9  1983      1    1     8         -4.5                4   
4           11  1983      1    1    10         -3.0                2   
...        ...   ...    ...  ...   ...          ...              ...   
226478  323543  2019     12   31    19         -6.8                2   
226479  323544  201

In [4]:
# Rows per window and the hop value handed to the windowing step.
window = 24
hop  = 6

# Scale, resample to hourly frequency and window the loaded frame. This is the
# same sequence prepare() implements, so call it instead of repeating its body.
# `data` is rebound to the windowed result, so rerun the loading cell before
# running this cell again.
data = prepare(data, columns, mode, window, hop)

In [5]:
# Load the test CSV and apply the valid-mode cleaning done by get_data.
# This rebinds `path`, `mode`, `data` and `columns`, replacing the values left
# by the training cells.
path = 'test.csv'
mode ='valid'
data, columns = get_data(path, mode)
print(data)

Visibility (m) == 0 data num: 0
Visibility (m) == 0 data num: 0

Cloud Cover 1st Layer (1/8) > 8 data num: 31
Cloud Cover 1st Layer (1/8) > 8 data num: 0

['Cloud Height 1st Layer (FT)' 'Cloud Height 2nd Layer (FT)'] == 0 data num: 26695
['Cloud Height 1st Layer (FT)' 'Cloud Height 2nd Layer (FT)'] == 0 data num: 0

Dew Point Temperature > 30 data num: 0
Dew Point Temperature > 30 data num: 0

          ID  Year  Month  Day  Hour  Wind Speed (KT)  Visibility (m)  \
0          1  2020      1    1     1                2           10000   
1          2  2020      1    1     2                4           10000   
2          3  2020      1    1     3                6           10000   
3          4  2020      1    1     4                6           10000   
4          5  2020      1    1     5                6           10000   
...      ...   ...    ...  ...   ...              ...             ...   
27168  27169  2023     12   31    19                1           10000   
27169  27170  2023 