In [14]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [15]:
def read_data_csv(data_path):
    """Load the CSV file at ``data_path`` and return it as a pandas DataFrame."""
    return pd.read_csv(data_path)

In [16]:
def train_test_split_by_column(df, column_split, m1, train_ratio=0.8):
    """Split ``df`` into train and test parts, group by group.

    For every distinct value of ``df[column_split]`` the matching rows are
    taken in their existing order: the first ``int(n_rows * train_ratio)``
    rows go to the training set and the remaining rows to the test set. The
    per-group pieces are then stacked in order of first appearance of each
    group value. No shuffling is performed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    column_split : str
        Column whose distinct values define the groups.
    m1 : str
        Column whose values are returned as the ``y`` arrays.
    train_ratio : float, default 0.8
        Fraction of each group's rows assigned to the training set.

    Returns
    -------
    X_train_df, y_train_df, X_test_df, y_test_df
        ``X_*`` are DataFrames with a fresh ``0..n-1`` index; they still
        contain column ``m1``. ``y_*`` are NumPy arrays holding that column.

    Raises
    ------
    ValueError
        If ``train_ratio`` is not within ``[0, 1]``.
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio!r}")

    # Collect the pieces and concatenate once; growing a frame with
    # pd.concat inside the loop re-copies everything on each iteration.
    train_parts = []
    test_parts = []
    for group_value in df[column_split].unique():
        group_rows = df.loc[df[column_split] == group_value]
        threshold = int(group_rows.shape[0] * train_ratio)
        train_parts.append(group_rows.iloc[:threshold])
        test_parts.append(group_rows.iloc[threshold:])

    if train_parts:
        X_train_df = pd.concat(train_parts, axis=0)
        X_test_df = pd.concat(test_parts, axis=0)
    else:
        # No groups at all (empty input): keep the columns, return no rows.
        X_train_df = df.iloc[0:0]
        X_test_df = df.iloc[0:0]

    y_train_df = X_train_df[m1].values
    y_test_df = X_test_df[m1].values

    # Replace the inherited row labels with a clean 0..n-1 index.
    X_train_df = X_train_df.reset_index(drop=True)
    X_test_df = X_test_df.reset_index(drop=True)

    return X_train_df, y_train_df, X_test_df, y_test_df

In [17]:
def window_slide(train, window_size=24):
    """Build sliding-window samples per city from a multi-city frame.

    Rows are grouped by the ``"City"`` column. Within each city, every run of
    ``window_size`` consecutive rows (all columns except ``"AQI"``) becomes
    one sample, and the ``"AQI"`` value of the row immediately after the
    window becomes its target. Windows never span two cities.

    Only ``"AQI"`` is removed from the features, so the ``"City"`` column is
    still part of every window.
    NOTE(review): callers presumably want numeric-only windows; confirm
    whether ``"City"`` should be dropped before calling.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``"City"`` and ``"AQI"`` columns. It is not modified.
    window_size : int, default 24
        Number of consecutive rows per sample.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Samples stacked as ``(n_samples, window_size, n_columns)`` and targets
        as ``(n_samples, 1)``. If no city has more than ``window_size`` rows,
        both arrays are empty.
    """
    X = []
    Y = []
    for city in train["City"].unique():
        df_city = train[train["City"] == city]
        # Take the labels as a plain column vector; going through to_numpy()
        # avoids relying on how np.reshape treats a pandas Series.
        label = df_city["AQI"].to_numpy().reshape(-1, 1)
        # drop() without inplace returns a new frame, so no
        # SettingWithCopyWarning is raised on the filtered slice.
        features = df_city.drop(columns=["AQI"])
        for i in range(window_size, len(features)):
            X.append(features.iloc[i - window_size : i, :].values)
            Y.append(label[i, :])
    return np.array(X), np.array(Y)


def window_slide_one_city(train, window_size=24):
    """Build sliding-window samples from a single-city frame.

    Every run of ``window_size`` consecutive rows (all columns except
    ``"AQI"``) becomes one sample, and the ``"AQI"`` value of the row
    immediately after the window becomes its target.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain an ``"AQI"`` column. The frame is not modified, so the
        function can be called repeatedly on the same input.
    window_size : int, default 24
        Number of consecutive rows per sample.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Samples stacked as ``(n_samples, window_size, n_features)`` and
        targets as ``(n_samples, 1)``. If the frame has no more than
        ``window_size`` rows, both arrays are empty.
    """
    # Column vector of targets, taken before the feature frame is derived.
    label = train["AQI"].to_numpy().reshape(-1, 1)
    # Work on a new frame instead of dropping in place: mutating the caller's
    # DataFrame made a second call fail with KeyError: 'AQI'.
    features = train.drop(columns=["AQI"])

    X = []
    Y = []
    for i in range(window_size, len(features)):
        X.append(features.iloc[i - window_size : i, :].values)
        Y.append(label[i, :])
    return np.array(X), np.array(Y)

In [18]:
# Scale the data to the [0, 1] range
def scale_data(df_train, df_test, list_scale_features):
    """Min-max scale the selected columns of a train/test pair.

    The scaler is fitted on the training values only and then applied to the
    test values. Both frames are updated in place (the listed columns are
    overwritten with their scaled values) and returned together with the
    fitted scaler.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames containing every column named in ``list_scale_features``.
    list_scale_features : list of str
        Columns to scale.

    Returns
    -------
    (df_train, df_test, scaler)
        The same two frame objects, plus the fitted ``MinMaxScaler``.
    """
    scaler = MinMaxScaler(feature_range=(0, 1))

    # Fit on the training partition and overwrite its columns.
    df_train[list_scale_features] = scaler.fit_transform(
        df_train[list_scale_features].values
    )

    # Re-use the training statistics for the test partition.
    df_test[list_scale_features] = scaler.transform(
        df_test[list_scale_features].values
    )

    return df_train, df_test, scaler

Sử dụng bộ dữ liệu Taiwan đã được xử lý


In [19]:
# Load the processed Taiwan dataset. The path is a raw string so the Windows
# backslashes stay literal instead of relying on unrecognised escape
# sequences (\A, \d, \T), which newer Python versions warn about.
df_Taiwan = read_data_csv(
    r"D:\AQI-Forecasting\data\data processed\Taiwan_data_processed.csv"
)
df_Taiwan.head()

Unnamed: 0,time,station,CO,NO,NO2,NOx,O3,PM10,PM2.5,SO2,AQI
0,1/1/2015 0:00,Banqiao,0.79,1.2,16.0,17.0,37.0,177.0,25.0,12.0,151.333333
1,1/1/2015 1:00,Banqiao,0.8,1.3,16.0,17.0,36.0,178.0,25.0,11.0,151.666667
2,1/1/2015 2:00,Banqiao,0.71,1.0,13.0,14.0,38.0,163.0,25.0,8.0,148.444444
3,1/1/2015 3:00,Banqiao,0.66,0.8,11.0,12.0,39.0,147.0,25.0,6.5,144.166667
4,1/1/2015 4:00,Banqiao,0.53,0.6,10.0,11.0,38.0,131.0,25.0,5.5,139.466667


In [20]:
# Split the data into train and test sets, station by station
X_train_Taiwan, y_train_Taiwan, X_test_Taiwan, y_test_Taiwan = train_test_split_by_column(
    df_Taiwan, "station", "AQI"
)

# Both partitions should contain every station.
print("Number of X train city: ", X_train_Taiwan["station"].nunique())
print("Number of X test city: ", X_test_Taiwan["station"].nunique())

# Sizes of the resulting frames and target arrays.
print("X train: ", X_train_Taiwan.shape)
print("y train: ", y_train_Taiwan.shape)
print("X test: ", X_test_Taiwan.shape)
print("y test: ", y_test_Taiwan.shape)

Number of X train city:  24
Number of X test city:  24
X train:  (167899, 11)
y train:  (167899,)
X test:  (41981, 11)
y test:  (41981,)


In [21]:
# Pick three stations (labelled m1..m3) out of each partition and remove the
# `station` column, which is constant within each selection.
X_train_m1_Taiwan = X_train_Taiwan.loc[X_train_Taiwan["station"] == "Banqiao"].drop(columns=["station"])
X_train_m2_Taiwan = X_train_Taiwan.loc[X_train_Taiwan["station"] == "Tucheng"].drop(columns=["station"])
X_train_m3_Taiwan = X_train_Taiwan.loc[X_train_Taiwan["station"] == "Xinzhuang"].drop(columns=["station"])

X_test_m1_Taiwan = X_test_Taiwan.loc[X_test_Taiwan["station"] == "Banqiao"].drop(columns=["station"])
X_test_m2_Taiwan = X_test_Taiwan.loc[X_test_Taiwan["station"] == "Tucheng"].drop(columns=["station"])
X_test_m3_Taiwan = X_test_Taiwan.loc[X_test_Taiwan["station"] == "Xinzhuang"].drop(columns=["station"])

In [22]:
# Remove the `time` column from every per-station frame.
X_train_m1_Taiwan = X_train_m1_Taiwan.drop(columns=["time"])
X_train_m2_Taiwan = X_train_m2_Taiwan.drop(columns=["time"])
X_train_m3_Taiwan = X_train_m3_Taiwan.drop(columns=["time"])
X_test_m1_Taiwan = X_test_m1_Taiwan.drop(columns=["time"])
X_test_m2_Taiwan = X_test_m2_Taiwan.drop(columns=["time"])
X_test_m3_Taiwan = X_test_m3_Taiwan.drop(columns=["time"])

# Report the resulting shapes, one per line (train m1..m3, then test m1..m3).
print(
    X_train_m1_Taiwan.shape,
    X_train_m2_Taiwan.shape,
    X_train_m3_Taiwan.shape,
    X_test_m1_Taiwan.shape,
    X_test_m2_Taiwan.shape,
    X_test_m3_Taiwan.shape,
    sep="\n",
)

(7008, 9)
(7008, 9)
(7008, 9)
(1752, 9)
(1752, 9)
(1752, 9)


In [23]:
# Columns to min-max scale; "AQI" is not listed, so the target keeps its
# original units.
scaled_features = ["PM2.5", "PM10", "SO2", "CO", "O3", "NO2", "NOx", "NO"]

# Each station is scaled with its own scaler, fitted on that station's
# training rows. Keep all three fitted scalers under distinct names so that
# later data for a given station can be transformed consistently; rebinding a
# single `scaler` name three times would discard the first two.
df_train_m1_Taiwan, df_test_m1_Taiwan, scaler_m1_Taiwan = scale_data(X_train_m1_Taiwan, X_test_m1_Taiwan, scaled_features)
df_train_m2_Taiwan, df_test_m2_Taiwan, scaler_m2_Taiwan = scale_data(X_train_m2_Taiwan, X_test_m2_Taiwan, scaled_features)
df_train_m3_Taiwan, df_test_m3_Taiwan, scaler_m3_Taiwan = scale_data(X_train_m3_Taiwan, X_test_m3_Taiwan, scaled_features)

# Keep `scaler` bound to the last fitted scaler for any cell that refers to it.
scaler = scaler_m3_Taiwan
df_train_m1_Taiwan.head()

Unnamed: 0,CO,NO,NO2,NOx,O3,PM10,PM2.5,SO2,AQI
0,0.125,0.007533,0.182879,0.05283,0.267054,0.994253,0.277778,0.378882,151.333333
1,0.126712,0.008004,0.182879,0.05283,0.259797,1.0,0.277778,0.347826,151.666667
2,0.111301,0.006591,0.143969,0.041509,0.274311,0.913793,0.277778,0.254658,148.444444
3,0.10274,0.00565,0.118029,0.033962,0.281567,0.821839,0.277778,0.208075,144.166667
4,0.080479,0.004708,0.105058,0.030189,0.274311,0.729885,0.277778,0.177019,139.466667


In [24]:
# Turn each station's scaled frame into sliding-window samples and the AQI
# target that follows each window, for both the train and test partitions.

# Station m1
X_train_m1_final_Taiwan, y_train_m1_final_Taiwan = window_slide_one_city(
    df_train_m1_Taiwan
)
X_test_m1_final_Taiwan, y_test_m1_final_Taiwan = window_slide_one_city(
    df_test_m1_Taiwan
)
print(X_train_m1_final_Taiwan.shape)

# Station m2
X_train_m2_final_Taiwan, y_train_m2_final_Taiwan = window_slide_one_city(
    df_train_m2_Taiwan
)
X_test_m2_final_Taiwan, y_test_m2_final_Taiwan = window_slide_one_city(
    df_test_m2_Taiwan
)
print(X_train_m2_final_Taiwan.shape)

# Station m3
X_train_m3_final_Taiwan, y_train_m3_final_Taiwan = window_slide_one_city(
    df_train_m3_Taiwan
)
X_test_m3_final_Taiwan, y_test_m3_final_Taiwan = window_slide_one_city(
    df_test_m3_Taiwan
)
print(X_train_m3_final_Taiwan.shape)

(6984, 24, 8)
(6984, 24, 8)
(6984, 24, 8)


In [25]:
# Stack the three stations' training windows along the sample axis.
X_train_final_Taiwan = np.concatenate(
    (X_train_m1_final_Taiwan, X_train_m2_final_Taiwan, X_train_m3_final_Taiwan),
    axis=0,
)
print(X_train_final_Taiwan.shape)

# Stack the matching training targets in the same station order.
y_train_final_Taiwan = np.concatenate(
    (y_train_m1_final_Taiwan, y_train_m2_final_Taiwan, y_train_m3_final_Taiwan),
    axis=0,
)
print(y_train_final_Taiwan.shape)

(20952, 24, 8)
(20952, 1)


Sử dụng bộ dữ liệu India đã được xử lý

In [26]:
# Load the processed India dataset. The path is a raw string so the Windows
# backslashes stay literal instead of relying on unrecognised escape
# sequences (\A, \d, \I), which newer Python versions warn about.
df_India = read_data_csv(r"D:\AQI-Forecasting\data\data processed\India_data_processed.csv")
df_India.head()

Unnamed: 0,City,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,AQI
0,Aizawl,42.0,51.28,4.27,0.97,6.66,19.88,0.37,3.35,6.95,54.0
1,Aizawl,41.17,49.96,4.51,1.27,7.24,21.55,0.38,3.44,6.95,54.0
2,Aizawl,24.97,42.04,7.25,5.45,14.56,20.25,0.5,3.93,6.95,54.0
3,Aizawl,26.95,38.86,7.31,2.52,12.13,21.94,0.52,3.93,5.7975,54.0
4,Aizawl,17.42,37.15,7.25,1.58,11.14,25.71,0.49,4.36,4.645,54.0


In [27]:
# Split the data into train and test sets, city by city
X_train_India, y_train_India, X_test_India, y_test_India = train_test_split_by_column(
    df_India, "City", "AQI"
)

# Both partitions should contain every city.
print("Number of X train city: ", X_train_India["City"].nunique())
print("Number of X test city: ", X_test_India["City"].nunique())

# Sizes of the resulting frames and target arrays.
print("X train: ", X_train_India.shape)
print("y train: ", y_train_India.shape)
print("X test: ", X_test_India.shape)
print("y test: ", y_test_India.shape)

Number of X train city:  20
Number of X test city:  20
X train:  (360361, 11)
y train:  (360361,)
X test:  (90101, 11)
y test:  (90101,)


In [28]:
# Pick three cities (labelled m1..m3) out of each partition and remove the
# `City` column, which is constant within each selection.
X_train_m1_India = X_train_India.loc[X_train_India["City"] == "Bengaluru"].drop(columns=["City"])
X_train_m2_India = X_train_India.loc[X_train_India["City"] == "Mumbai"].drop(columns=["City"])
X_train_m3_India = X_train_India.loc[X_train_India["City"] == "Delhi"].drop(columns=["City"])

X_test_m1_India = X_test_India.loc[X_test_India["City"] == "Bengaluru"].drop(columns=["City"])
X_test_m2_India = X_test_India.loc[X_test_India["City"] == "Mumbai"].drop(columns=["City"])
X_test_m3_India = X_test_India.loc[X_test_India["City"] == "Delhi"].drop(columns=["City"])

In [29]:
# Columns to min-max scale; "AQI" is not listed, so the target keeps its
# original units.
scaled_features = ["PM2.5", "PM10", "SO2", "CO", "O3", "NO2", "NOx", "NO", "NH3"]

# Each city is scaled with its own scaler, fitted on that city's training
# rows. Keep all three fitted scalers under distinct names so that later data
# for a given city can be transformed consistently; rebinding a single
# `scaler` name three times would discard the first two.
df_train_m1_India, df_test_m1_India, scaler_m1_India = scale_data(X_train_m1_India, X_test_m1_India, scaled_features)
df_train_m2_India, df_test_m2_India, scaler_m2_India = scale_data(X_train_m2_India, X_test_m2_India, scaled_features)
df_train_m3_India, df_test_m3_India, scaler_m3_India = scale_data(X_train_m3_India, X_test_m3_India, scaled_features)

# Keep `scaler` bound to the last fitted scaler for any cell that refers to it.
scaler = scaler_m3_India
df_train_m1_India.head()

Unnamed: 0,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,AQI
43786,0.057638,0.100978,0.007003,0.034083,0.023223,0.039275,0.003803,0.020374,0.061693,78.002106
43787,0.057615,0.100974,0.007561,0.033282,0.0231,0.03662,0.004203,0.021282,0.060292,78.004213
43788,0.057592,0.10097,0.005679,0.029719,0.019975,0.035817,0.006005,0.021534,0.065296,78.006319
43789,0.057569,0.100966,0.006585,0.028338,0.019792,0.034664,0.002602,0.020828,0.058741,78.008425
43790,0.057546,0.100962,0.006655,0.026957,0.019056,0.033615,0.003002,0.021433,0.051186,78.010532


In [30]:
# Turn each city's scaled frame into sliding-window samples and the AQI
# target that follows each window, for both the train and test partitions.

# City m1
X_train_m1_final_India, y_train_m1_final_India = window_slide_one_city(
    df_train_m1_India
)
X_test_m1_final_India, y_test_m1_final_India = window_slide_one_city(
    df_test_m1_India
)
print(X_train_m1_final_India.shape)

# City m2
X_train_m2_final_India, y_train_m2_final_India = window_slide_one_city(
    df_train_m2_India
)
X_test_m2_final_India, y_test_m2_final_India = window_slide_one_city(
    df_test_m2_India
)
print(X_train_m2_final_India.shape)

# City m3
X_train_m3_final_India, y_train_m3_final_India = window_slide_one_city(
    df_train_m3_India
)
X_test_m3_final_India, y_test_m3_final_India = window_slide_one_city(
    df_test_m3_India
)
print(X_train_m3_final_India.shape)

(38529, 24, 9)
(38529, 24, 9)
(38529, 24, 9)


In [31]:
# Stack the three cities' windows and targets along the sample axis, first
# for the training partition and then for the test partition.
X_train_final_India = np.concatenate(
    (X_train_m1_final_India, X_train_m2_final_India, X_train_m3_final_India),
    axis=0,
)
print(X_train_final_India.shape)

y_train_final_India = np.concatenate(
    (y_train_m1_final_India, y_train_m2_final_India, y_train_m3_final_India),
    axis=0,
)
print(y_train_final_India.shape)

X_test_final_India = np.concatenate(
    (X_test_m1_final_India, X_test_m2_final_India, X_test_m3_final_India),
    axis=0,
)
print(X_test_final_India.shape)

y_test_final_India = np.concatenate(
    (y_test_m1_final_India, y_test_m2_final_India, y_test_m3_final_India),
    axis=0,
)
print(y_test_final_India.shape)

(115587, 24, 9)
(115587, 1)
(28845, 24, 9)
(28845, 1)


In [35]:
# Preview the first five windows of the stacked India training array.
print("First 5 rows of X_train_final_India:", X_train_final_India[:5], sep="\n")

First 5 rows of X_train_final_India:
[[[0.05763804 0.10097793 0.00700324 ... 0.00380304 0.0203742  0.06169319]
  [0.05761492 0.10097383 0.00756071 ... 0.00420336 0.02128196 0.0602922 ]
  [0.0575918  0.10096974 0.00567924 ... 0.0060048  0.02153412 0.06529571]
  ...
  [0.0571526  0.10089195 0.02034772 ... 0.01261009 0.02143325 0.17204901]
  [0.05712948 0.10088785 0.00853629 ... 0.00560448 0.02375309 0.18180441]
  [0.05710637 0.10088376 0.00686387 ... 0.00320256 0.02365223 0.19155981]]

 [[0.05761492 0.10097383 0.00756071 ... 0.00420336 0.02128196 0.0602922 ]
  [0.0575918  0.10096974 0.00567924 ... 0.0060048  0.02153412 0.06529571]
  [0.05756869 0.10096564 0.00658514 ... 0.00260208 0.02082808 0.05874112]
  ...
  [0.05712948 0.10088785 0.00853629 ... 0.00560448 0.02375309 0.18180441]
  [0.05710637 0.10088376 0.00686387 ... 0.00320256 0.02365223 0.19155981]
  [0.05708325 0.10087966 0.00693356 ... 0.45796637 0.02415654 0.20131521]]

 [[0.0575918  0.10096974 0.00567924 ... 0.0060048  0.021534

In [None]:
# `y_train_final_India` is a NumPy array (built with np.concatenate), which
# has no `.head` attribute; slice it to preview the first five targets.
print(y_train_final_India[:5])