In [1]:
import numpy as np
import pandas as pd
# Widen the notebook display so the reshaped frames can be inspected in full.
# The fully-qualified option names are required: the short patterns
# 'max_columns' / 'max_row' also match the 'styler.render.*' options in
# pandas >= 1.4, which makes set_option raise OptionError.
pd.set_option('display.max_columns', 165)
pd.set_option('display.max_rows', 100)

In [2]:
def load_data(train_path='D:/WorkSpace/HomeWork/hw1/train.csv',
              test_path='D:/WorkSpace/HomeWork/hw1/test.csv'):
    """Read the raw training and test CSV files.

    Parameters
    ----------
    train_path : str or path-like
        Location of the training CSV. The first row is used as the header.
    test_path : str or path-like
        Location of the test CSV. The header is inferred by pandas.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        ``train`` has its second column (position 1) removed so that the
        remaining layout is: column 0, then the original columns 2 onwards.
        ``test`` is returned exactly as read.
    """
    train = pd.read_csv(train_path, encoding='utf-8', header=0)
    # Drop the column at position 1 by selecting every other column by
    # position; this keeps the original column order and a clean RangeIndex.
    kept_positions = [0] + list(range(2, train.shape[1]))
    train = train.iloc[:, kept_positions]
    test = pd.read_csv(test_path, encoding='utf-8', header='infer')
    return train, test

In [3]:
def data_reshape(data, n_features=18):
    """Re-stack the raw table so rows are time steps and columns are features.

    The input is expected to be laid out in consecutive blocks of
    ``n_features`` rows. Within a block, each row holds one feature: the
    first two columns are identifiers (the second one is the feature name)
    and every remaining column is one time step. Each block is transposed
    and the blocks are stacked vertically in their original order.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table as described above.
    n_features : int, default 18
        Number of rows (features) per block.

    Returns
    -------
    pandas.DataFrame
        Shape ``(n_blocks * n_time_steps, n_features)``, dtype float32,
        a fresh RangeIndex, and columns named after the feature names found
        in the first block. The rainfall marker ``'NR'`` is converted to 0.

    Raises
    ------
    ValueError
        If ``data`` is empty or its row count is not a multiple of
        ``n_features`` (the block layout would be misaligned).
    """
    n_rows = data.shape[0]
    if n_rows == 0 or n_rows % n_features != 0:
        raise ValueError(
            'expected a non-empty table whose row count is a multiple of '
            '%d, got %d rows' % (n_features, n_rows)
        )
    # Derive the number of blocks from the data instead of hard-coding 240.
    n_blocks = n_rows // n_features

    # Everything after the two identifier columns is a reading.
    readings = data.iloc[:, 2:].to_numpy(dtype=object)
    n_steps = readings.shape[1]

    # (block, feature, step) -> (block, step, feature) -> one row per step.
    # This is the vectorised equivalent of transposing each block and
    # concatenating the results, without the quadratic concat-in-a-loop.
    stacked = (
        readings.reshape(n_blocks, n_features, n_steps)
        .transpose(0, 2, 1)
        .reshape(n_blocks * n_steps, n_features)
    )
    trans_data = pd.DataFrame(stacked, columns=data.iloc[0:n_features, 1].values)

    # Treat "no rain" ('NR') as a rainfall amount of 0. A single .loc
    # assignment is used because chained indexing may write to a temporary
    # copy and silently leave the frame unchanged.
    if 'RAINFALL' in trans_data.columns:
        no_rain = trans_data['RAINFALL'] == 'NR'
        if no_rain.any():
            trans_data.loc[no_rain, 'RAINFALL'] = '0'

    return trans_data.astype(np.float32)

In [4]:
def train_transform(data, month_size=480, window=9, label_col=9):
    """Build the supervised training set from the raw training table.

    The test set predicts the current PM2.5 value from the previous
    ``window`` time steps, so every training sample uses all features of the
    ``window`` preceding time steps as its feature vector.

    The training data only covers the first 20 days of each month, so row
    ``month_size`` and row ``month_size + 1`` are not consecutive in time.
    Windows are therefore built inside each month separately and never span
    a month boundary.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw training table accepted by ``data_reshape``. It is not modified;
        the date column is not parsed because ``data_reshape`` discards it.
    month_size : int, default 480
        Number of consecutive time steps per month (20 days * 24 hours).
    window : int, default 9
        Number of preceding time steps used as features.
    label_col : int, default 9
        Position of the label column among the reshaped feature columns.
        NOTE(review): position 9 is presumably PM2.5 -- confirm against the
        feature order of the input file.

    Returns
    -------
    (train_x, train_y) : tuple of pandas.DataFrame
        ``train_x`` has ``window * n_features`` columns ordered from the most
        recent step (t-1) to the oldest (t-window); column names are the
        feature names repeated ``window`` times. ``train_y`` has one column
        holding the label at time t. Both use a fresh RangeIndex.

    Raises
    ------
    ValueError
        If no month contains more than ``window`` time steps.
    """
    trans_data = data_reshape(data)
    values = trans_data.to_numpy()
    feature_names = list(trans_data.columns)

    feature_blocks = []
    label_blocks = []
    for start in range(0, values.shape[0], month_size):
        month = values[start:start + month_size]  # one month of contiguous steps
        n_samples = month.shape[0] - window
        if n_samples <= 0:
            # A trailing partial month too short to hold a single window.
            continue
        # For lag k the sample at time t uses row t-k; slicing the month at
        # offset (window - k) lines those rows up for every sample at once.
        lagged = [
            month[window - lag:window - lag + n_samples]
            for lag in range(1, window + 1)
        ]
        feature_blocks.append(np.hstack(lagged))
        label_blocks.append(month[window:, label_col])

    if not feature_blocks:
        raise ValueError(
            'not enough rows to build a single sample with window=%d' % window
        )

    # The value at the current time step is the label; everything else about
    # the current step is dropped.
    train_x = pd.DataFrame(np.vstack(feature_blocks), columns=feature_names * window)
    train_y = pd.DataFrame(
        np.concatenate(label_blocks), columns=[feature_names[label_col]]
    )
    print('feature.shape = ' + str(train_x.shape))
    print('label.shape = ' + str(train_y.shape))
    return train_x, train_y

In [5]:
def test_transform(data, window=9):
    """Build the test feature matrix from the raw test table.

    After ``data_reshape`` every ``window`` consecutive rows belong to one
    test sample: the first row is time step n-window, the second is
    n-window+1, and so on up to n-1. Each sample is flattened into a single
    row ordered from the most recent step (n-1) to the oldest (n-window),
    which matches the column order produced by ``train_transform``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw test table accepted by ``data_reshape``.
    window : int, default 9
        Number of time steps per test sample.

    Returns
    -------
    pandas.DataFrame
        One row per sample with ``window * n_features`` columns, a fresh
        RangeIndex, and the feature names repeated ``window`` times as
        column labels. Trailing rows that do not fill a whole window are
        ignored.
    """
    trans_data = data_reshape(data)
    values = trans_data.to_numpy()
    n_features = values.shape[1]
    # Plain integer division; np.int was removed from NumPy in 1.24.
    n_samples = values.shape[0] // window

    windows = values[:n_samples * window].reshape(n_samples, window, n_features)
    # Reverse the time axis so the most recent step comes first, then flatten
    # each window into a single feature row.
    flattened = windows[:, ::-1, :].reshape(n_samples, window * n_features)

    test_x = pd.DataFrame(flattened, columns=list(trans_data.columns) * window)
    print(test_x.shape)
    return test_x

In [6]:
# Run the whole preprocessing pipeline: read the raw files, then build the
# training features/labels and the test features.
train, test = load_data()
train_x, train_y = train_transform(train)
test_x = test_transform(test)

# Persist each prepared frame without its index column.
for frame, target in (
    (train_x, 'D:/WorkSpace/HomeWork/hw1/train_x.csv'),
    (train_y, 'D:/WorkSpace/HomeWork/hw1/train_y.csv'),
    (test_x, 'D:/WorkSpace/HomeWork/hw1/test_x.csv'),
):
    frame.to_csv(target, index=False)

feature.shape = (5652, 162)
label.shape = (5652, 1)
