## Load data

In [5]:
import pandas as pd
import numpy as np

df_train = pd.read_csv('https://www.dropbox.com/s/5cgteeozyccaf6n/weatherHistory_training.csv?dl=1')
print(df_train.shape)
df_train

(70000, 2)


Unnamed: 0,Formatted Date,Temperature (C)
0,2006-01-01 00:00:00.000 +0100,0.577778
1,2006-01-01 01:00:00.000 +0100,1.161111
2,2006-01-01 02:00:00.000 +0100,1.666667
3,2006-01-01 03:00:00.000 +0100,1.711111
4,2006-01-01 04:00:00.000 +0100,1.183333
...,...,...
69995,2013-12-25 11:00:00.000 +0100,7.133333
69996,2013-12-25 12:00:00.000 +0100,8.800000
69997,2013-12-25 13:00:00.000 +0100,9.883333
69998,2013-12-25 14:00:00.000 +0100,9.883333


In [6]:
def slicing_window(df, df_start_idx, df_end_idx, input_size, label_size, label_name):
    features = [] # Khai báo list dùng để lưu trữ các X
    labels = []   # Khai báo list dùng để lưu trữ các y

    window_size = input_size + label_size # Tính kích thước của window

    # Nếu df_end_idx = chỉ mục cuối cùng bảng dữ liệu, cần phải dời xuống 1 khoảng = window size 
    if df_end_idx == None:
        df_end_idx = len(df) - window_size 
    
    # Duyệt qua từng mẫu dữ liệu
    for idx in range(df_start_idx, df_end_idx):
        feature_end_idx = idx + input_size # Tính chỉ mục kết thúc của X
        label_start_idx = idx + window_size - label_size # Tính chỉ mục bắt đầu của y

        feature = df[idx:feature_end_idx] # Lấy X
        label = df[label_name][label_start_idx:(idx+window_size)] # Lấy y

        features.append(feature) 
        labels.append(label)

    # Chuyển list thành np.ndarrray
    features = np.array(features)
    labels = np.array(labels)

    return features, labels

In [7]:
INPUT_SIZE = 48 # Dùng `input_size` giờ trước
LABEL_SIZE = 6  # Để dự đoán nhiệt độ trong `label_size` giờ
BATCH_SIZE = 32
FEATURE_FINAL = ['Temperature (C)']
label_name = 'Temperature (C)'

In [8]:
X_train, y_train = slicing_window(df_train[FEATURE_FINAL], 
                                  df_start_idx=0,
                                  df_end_idx=None,
                                  input_size=INPUT_SIZE,
                                  label_size=LABEL_SIZE,
                                  label_name = label_name)

print(X_train.shape)
print(y_train.shape)

(69946, 48, 1)
(69946, 6)


In [9]:
df_test = pd.read_csv('https://www.dropbox.com/s/cq8fypkq0w6728o/weatherHistory_testing.csv?dl=1')
df_test.head(10)

Unnamed: 0,Formatted Date,Temperature (C)
0,2013-12-25 16:00:00.000 +0100,7.777778
1,2013-12-25 17:00:00.000 +0100,7.194444
2,2013-12-25 18:00:00.000 +0100,7.127778
3,2013-12-25 19:00:00.000 +0100,7.111111
4,2013-12-25 20:00:00.000 +0100,7.066667
5,2013-12-25 21:00:00.000 +0100,6.6
6,2013-12-25 22:00:00.000 +0100,6.438889
7,2013-12-25 23:00:00.000 +0100,5.972222
8,2013-12-26 00:00:00.000 +0100,5.416667
9,2013-12-26 01:00:00.000 +0100,5.333333


In [10]:
X_test, y_test = slicing_window(df_test[FEATURE_FINAL], 
                                  df_start_idx=0,
                                  df_end_idx=None,
                                  input_size=INPUT_SIZE,
                                  label_size=LABEL_SIZE,
                                  label_name = label_name)

print(X_test.shape)
print(y_test.shape)

(26399, 48, 1)
(26399, 6)


In [11]:
import tensorflow as tf

# Khởi tạo tf.data.Dataset
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(BATCH_SIZE)
test_ds  = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(BATCH_SIZE)

# Cấu hình các tham số tối ưu cho việc đọc dữ liệu
AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds  = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

## Add NaN

In [17]:
import pandas as pd
import numpy as np

df_train = pd.read_csv('https://www.dropbox.com/s/5cgteeozyccaf6n/weatherHistory_training.csv?dl=1')
print(df_train.shape)

(70000, 2)


In [18]:
df_train.head(4)

Unnamed: 0,Formatted Date,Temperature (C)
0,2006-01-01 00:00:00.000 +0100,0.577778
1,2006-01-01 01:00:00.000 +0100,1.161111
2,2006-01-01 02:00:00.000 +0100,1.666667
3,2006-01-01 03:00:00.000 +0100,1.711111


In [7]:
df_train['Temperature (C)'].shape

(70000,)

In [8]:
mask_train = np.random.binomial(1, 0.85, (70000, ))
print(mask_train.shape)

(70000,)


In [15]:
mask_train_flip = 1 - mask_train

In [16]:
mask_train_flip = mask_train_flip*1000

In [19]:
df_train['Temperature (C)'] = df_train['Temperature (C)']*mask_train + mask_train_flip

In [20]:
df_train.head(10)

Unnamed: 0,Formatted Date,Temperature (C)
0,2006-01-01 00:00:00.000 +0100,0.577778
1,2006-01-01 01:00:00.000 +0100,1.161111
2,2006-01-01 02:00:00.000 +0100,1.666667
3,2006-01-01 03:00:00.000 +0100,1.711111
4,2006-01-01 04:00:00.000 +0100,1.183333
5,2006-01-01 05:00:00.000 +0100,1.205556
6,2006-01-01 06:00:00.000 +0100,2.222222
7,2006-01-01 07:00:00.000 +0100,2.072222
8,2006-01-01 08:00:00.000 +0100,2.2
9,2006-01-01 09:00:00.000 +0100,1000.0


In [21]:
df_train.to_csv('weatherHistory_training_nan.csv', index=False)

In [23]:
df_train = pd.read_csv('weatherHistory_training_nan.csv')
df_train.head(20)

Unnamed: 0,Formatted Date,Temperature (C)
0,2006-01-01 00:00:00.000 +0100,0.577778
1,2006-01-01 01:00:00.000 +0100,1.161111
2,2006-01-01 02:00:00.000 +0100,1.666667
3,2006-01-01 03:00:00.000 +0100,1.711111
4,2006-01-01 04:00:00.000 +0100,1.183333
5,2006-01-01 05:00:00.000 +0100,1.205556
6,2006-01-01 06:00:00.000 +0100,2.222222
7,2006-01-01 07:00:00.000 +0100,2.072222
8,2006-01-01 08:00:00.000 +0100,2.2
9,2006-01-01 09:00:00.000 +0100,


In [26]:
df_test = pd.read_csv('weatherHistory_testing.csv')
df_test.shape

(26453, 2)

In [28]:
mask_test = np.random.binomial(1, 0.85, (26453, ))
print(mask_test.shape)

(26453,)


In [31]:
mask_test_flip = 1 - mask_test
mask_test_flip = mask_test_flip*1000
df_test['Temperature (C)'] = df_test['Temperature (C)']*mask_test + mask_test_flip
df_test.head(20)

Unnamed: 0,Formatted Date,Temperature (C)
0,2013-12-25 16:00:00.000 +0100,7.777778
1,2013-12-25 17:00:00.000 +0100,7.194444
2,2013-12-25 18:00:00.000 +0100,7.127778
3,2013-12-25 19:00:00.000 +0100,7.111111
4,2013-12-25 20:00:00.000 +0100,7.066667
5,2013-12-25 21:00:00.000 +0100,6.6
6,2013-12-25 22:00:00.000 +0100,6.438889
7,2013-12-25 23:00:00.000 +0100,5.972222
8,2013-12-26 00:00:00.000 +0100,5.416667
9,2013-12-26 01:00:00.000 +0100,5.333333


In [32]:
df_test.to_csv('weatherHistory_testing_nan.csv', index=False)

In [34]:
df_test = pd.read_csv('weatherHistory_testing_nan.csv')
df_test.head(20)

Unnamed: 0,Formatted Date,Temperature (C)
0,2013-12-25 16:00:00.000 +0100,7.777778
1,2013-12-25 17:00:00.000 +0100,7.194444
2,2013-12-25 18:00:00.000 +0100,7.127778
3,2013-12-25 19:00:00.000 +0100,7.111111
4,2013-12-25 20:00:00.000 +0100,7.066667
5,2013-12-25 21:00:00.000 +0100,6.6
6,2013-12-25 22:00:00.000 +0100,6.438889
7,2013-12-25 23:00:00.000 +0100,5.972222
8,2013-12-26 00:00:00.000 +0100,5.416667
9,2013-12-26 01:00:00.000 +0100,5.333333


## Split data

In [1]:
import pandas as pd
import numpy as np

df_train = pd.read_csv('https://www.dropbox.com/s/5cgteeozyccaf6n/weatherHistory_training.csv?dl=1')
print(df_train.shape)
df_train.head(5)

(70000, 2)


Unnamed: 0,Formatted Date,Temperature (C)
0,2006-01-01 00:00:00.000 +0100,0.577778
1,2006-01-01 01:00:00.000 +0100,1.161111
2,2006-01-01 02:00:00.000 +0100,1.666667
3,2006-01-01 03:00:00.000 +0100,1.711111
4,2006-01-01 04:00:00.000 +0100,1.183333


In [21]:
def split_data(df, input_size, FEATURE_FINAL):
    segments = []
    for idx in range(0, 70000, input_size):
        begin = idx
        end = begin + input_size        
        segment = df[FEATURE_FINAL][begin:end].to_numpy()
        segments.append(segment)

    # Chuyển list thành np.ndarrray
    segments = np.array(segments)
    return segments

In [23]:
input_size = 500
FEATURE_FINAL = ['Temperature (C)']

(140, 500, 1)


In [25]:
segments_train_x = split_data(df_train, input_size, FEATURE_FINAL)
print(segments_train_x.shape)
print(segments_train_x[0,:20,0])

(140, 500, 1)
[0.57777778 1.16111111 1.66666667 1.71111111 1.18333333 1.20555556
 2.22222222 2.07222222 2.2        2.73333333 2.78888889 3.82222222
 4.91111111 6.20555556 7.43888889 6.95       5.97222222 5.90555556
 4.95555556 5.47222222]


In [26]:
df_train_nan = pd.read_csv('weatherHistory_training_nan.csv')
segments_train_x_nan = split_data(df_train_nan, input_size, FEATURE_FINAL)
print(segments_train_x_nan.shape)
print(segments_train_x_nan[0,:20,0])

(140, 500, 1)
[0.57777778 1.16111111 1.66666667 1.71111111 1.18333333 1.20555556
 2.22222222 2.07222222 2.2               nan 2.78888889        nan
 4.91111111 6.20555556        nan 6.95       5.97222222 5.90555556
 4.95555556 5.47222222]
