In [1]:
import os
import sys

import numpy as np
import urllib.request
import tensorflow as tf

from pdb import set_trace
import zipfile
import csv

import copy

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

SOURCE_URL = "https://cloud.tsinghua.edu.cn/d/f519a587a6d943fa9aa0/files/?p=%2F%E5%8C%97%E4%BA%AC%E7%A9%BA%E6%B0%94%E8%B4%A8%E9%87%8F.zip&dl=1"

Preprocessing 

In [2]:
def maybe_download(filename, work_directory):
    """Download the data from website, unless it's already here."""
    if not tf.io.gfile.exists(work_directory):
        tf.io.gfile.makedirs(work_directory)
    filepath = os.path.join(work_directory, filename)
    # set_trace()
    if not tf.io.gfile.exists(filepath):
        filepath, _ = urllib.request.urlretrieve(SOURCE_URL, filepath)
    with tf.io.gfile.GFile(filepath) as f:
        print('Successfully downloaded', filename)
    return filepath

def extract_files(filepath):
    """Extract the images into a 4D uint8 numpy array [index, y, x, depth]."""
    # filepath = Path(filepath)
    print('Extracting', filepath)
    #if not tf.io.gfile.exists(filepath):
    #    tf.io.gfile.makedirs(filepath)
        
    with zipfile.ZipFile(filepath+".zip", 'r') as zip_ref:
        #zip_ref.extractall("../air_quality/")
        
        for member in zip_ref.infolist():
            # set_trace()
            
            member.filename = member.filename.encode("cp437").decode("utf8")

            zip_ref.extract(member, "../air_quality/")
#             set_trace()
# #             if member.filename == "北京空气质量"：
# #                 os.rename(member.filename, "pm_2_5_data")
#             set_trace()
    #os.chdir(filepath) # change directory from working dir to dir with files

    for item in os.listdir(filepath): # loop through items in dir
        #set_trace()
        if item.endswith(".zip"): # check for ".zip" extension
            #file_name = os.path.abspath(item) # get full path of files
            zipfile_path = os.path.join(filepath, item)
            zip_ref = zipfile.ZipFile(zipfile_path) # create zipfile object
            zip_ref.extractall(filepath) # extract file to dir
            zip_ref.close() # close file
            #os.remove(file_name) # delete zipped file

def download_and_extract():
    dest = "../air_quality"
    file_name = "北京空气质量.zip"
    filepath = maybe_download(file_name, dest)
    filepath = filepath[:-4]
    extract_files(filepath)
# download_and_extract()

In [3]:

def normalize_data_pm2_5(data):
    data_new = copy.deepcopy(data)
#     set_trace()
    data_new[data_new > 250] = -5
    data_new[data_new > 150] = -4
    data_new[data_new > 115] = -3
    data_new[data_new > 75] = -2
    data_new[data_new > 35] = -1
    data_new[data_new > 0] = 0
    data_new = abs(data_new).astype(int)
    
    return data_new

def process_data(data, pm2_5=0):
#     data = list(map(int, data))
    N, d = data.shape
    
    new_data = np.zeros((N, d))
#     new_data = []
   
    m = []
    for j, row in enumerate(data):
        row = [float(i) if i != "" else 0 for i in row]
        row = np.array(row)
       
        if np.any(row==0):
#             set_trace()
            index = np.where(row==0)
#             set_trace()
            if len(index[0]) == len(row):
                m.append(j)
                continue
            #for ind in index[0]:
                
            mu = np.sum(row)/(len(row)-len(index[0]))
            row[index[0]] = mu
#             set_trace()
            if mu <= 0:
                set_trace()
#         set_trace()
        new_data[j, :] = row
        
        
    if m:
        for ind in m:    
            new_data[ind] = np.sum(new_data, axis=0)/(len(new_data)-len(m))
    
    if pm2_5:
        new_data = normalize_data_pm2_5(new_data)
    
    return new_data



def check_data(row, item, year):
    check = [True if i=="" else False for i in row[3:]]
    check_true = True
    if np.all(np.array(check)):
        if item not in REMOVED_CSV[year]:
           
            check_true = False
    return check_true

def add_removed(item, year):
    split = item.split("_")
    split[-2] = "extra"
    item_1 = "_"
    item_1 = item_1.join(split)
    REMOVED_CSV[year].add(item_1)
    split[-2] = "all"
    item_2 = "_"
    item_2 = item_2.join(split)
    REMOVED_CSV[year].add(item_2)
    

def restructure_data(data):
    num_doc = len(data)

    N, d = data[0].shape
    
    new_data = np.zeros((num_doc, N, d))
    
    for i, dat in enumerate(data):
        new_data[i, :, :] = dat
    return new_data

def clean(folderpath, consistent=True):

    print("start reading csv in folder: ", folderpath)
    pm2_5s = []
    SO2s = []
    NO2s = []
    COs = []
    O3s = []
    year = folderpath.split("-")[1][:4]
    
    for item in os.listdir(folderpath):
        
        split = item.split("_")
        if "beijing" not in item:
            REMOVED_CSV[year].add(item)
            continue
#         set_trace()
        if consistent:
            if split[-2] == "extra":
                split_1 = split
                split_1[-2] = "all"
                item_1 = "_"
                item_1 = item_1.join(split_1)

                if item_1 not in os.listdir(folderpath):
#                     set_trace()
                    REMOVED_CSV[year].add(item)
            else:
                split_2 = split
                split_2[-2] = "extra"
                item_2 = "_"
                item_2 = item_2.join(split_2)
                if item_2 not in os.listdir(folderpath):
#                     set_trace()
                    REMOVED_CSV[year].add(item)
        
        
        if item in REMOVED_CSV[year]:
            continue
        
        filepath_csv = folderpath + "/" + item
#         print(filepath_csv)
        
        pm2_5 = []
        SO2 = []
        NO2 = []
        CO = []
        O3 = []
        d = 0
        
        with open(filepath_csv, encoding="utf8") as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=",")
    #         set_trace()
            prev_header = []
            for i, row in enumerate(csv_reader):
#                 set_trace()
                try:
                    row[2]=="SO2"
                        
                except:
                    if item not in REMOVED_CSV[year]:
                        add_removed(item, year)
#                         REMOVED_CSV[year].add(item)
                    break
                
                if len(row[3:]) != 35:
                    if item not in REMOVED_CSV[year]:
                        add_removed(item, year)
#                         REMOVED_CSV[year].add(item)
                    break

               
                    
                if i==0:
                    header = row[3:]
                    if prev_header:
                        if header != prev_header:
                            set_trace()
                            break
                    prev_header = header
                    continue
                else:
                    if row[2] in ["PM2.5", "SO2", "NO2", "CO", "O3"]:
                        if not check_data(row[3:], item, year):
                            add_removed(item, year)
#                             REMOVED_CSV[year].add(item)
                            break
                        else:
                            if row[2]=="PM2.5":
                                pm2_5.append(row[3:])
                            elif row[2]=="SO2":
                                SO2.append(row[3:])
                            elif row[2]=="NO2":
                                NO2.append(row[3:])
                            elif row[2]=="CO":
                                CO.append(row[3:])
                            elif row[2]=="O3":
                                O3.append(row[3:])

        split = filepath_csv.split("_")
        if split[-2] == "all":
            if len(pm2_5) != 24:
                add_removed(item, year)
                continue
        elif split[-2] == "extra":
            if len(SO2) != 24:
                add_removed(item, year)
                continue
            if len(NO2) != 24:
                add_removed(item, year)
                continue
            if len(CO) != 24:
                add_removed(item, year)
                continue
            if len(O3) != 24:
                add_removed(item, year)
                continue
    print("complete read folder")


In [4]:
REMOVED_CSV = {}
REMOVED_CSV["2014"] = {"beijing_all_20141231.csv", "beijing_extra_20141231.csv"}
REMOVED_CSV["2015"] = set()
REMOVED_CSV["2016"] = set()
REMOVED_CSV["2017"] = set()
REMOVED_CSV["2018"] = set()
REMOVED_CSV["2019"] = set()
REMOVED_CSV["2020"] = set()

folderpath = "../air_quality/北京空气质量"
for key, values in REMOVED_CSV.items():
    print("length of REMOVED_CSV in year %s: %d" %(key, len(values)))
print("")  


if tf.io.gfile.exists(folderpath):
    for item in os.listdir(folderpath):
        if ".zip" not in item and "beijing" in item:
            clean(folderpath+"/"+item)

for key, values in REMOVED_CSV.items():
    print("length of REMOVED_CSV in year %s: %d" %(key, len(values)))
print("")            

# print("Start checking REMOVED_CSV pairs")
# for key, values in REMOVED_CSV.items():
#     for value_a in values:
#         if "beijing" in value_a:
#             two = 1
#             split = value_a.split("_")
#             for value_b in values:
#                 if value_a != value_b:
#                     if split[-1] in value_b:
#                         set_trace()
#                         two += 1
#                         break                    
#             if two != 2:
#                 set_trace()
# print("Complete check")

length of REMOVED_CSV in year 2014: 2
length of REMOVED_CSV in year 2015: 0
length of REMOVED_CSV in year 2016: 0
length of REMOVED_CSV in year 2017: 0
length of REMOVED_CSV in year 2018: 0
length of REMOVED_CSV in year 2019: 0
length of REMOVED_CSV in year 2020: 0

start reading csv in folder:  ../air_quality/北京空气质量/beijing_20140101-20141231
complete read folder
start reading csv in folder:  ../air_quality/北京空气质量/beijing_20150101-20151231
complete read folder
start reading csv in folder:  ../air_quality/北京空气质量/beijing_20160101-20161231
complete read folder
start reading csv in folder:  ../air_quality/北京空气质量/beijing_20170101-20171231
complete read folder
start reading csv in folder:  ../air_quality/北京空气质量/beijing_20180101-20181231
complete read folder
start reading csv in folder:  ../air_quality/北京空气质量/beijing_20190101-20191231
complete read folder
start reading csv in folder:  ../air_quality/北京空气质量/beijing_20200101-20200509
complete read folder
length of REMOVED_CSV in year 2014: 154


In [8]:


def retrieve_data_alls(folderpath):

    print("start reading csv in folder: ", folderpath)
    pm2_5s = []
    year = folderpath.split("-")[1][:4]

    items = []
    alls = []
    extras = []
    
    for item in os.listdir(folderpath):
        if "beijing" not in item:
            continue
        if item not in REMOVED_CSV[year]:
            items.append(item)
            split = item.split("_")
            if split[-2] == "extra":
                extras.append(item)
            else:
                alls.append(item)

    alls_pairs = []
    for i in range(len(alls)-1):
        split_1 = alls[i].split("_")
        num_csv_1 = (int)(split_1[-1][:8])
        split_2 = alls[i+1].split("_")
        num_csv_2 = (int)(split_2[-1][:8])

        if num_csv_2 - 1 == num_csv_1:
            alls_pairs.append((alls[i], alls[i+1]))
        

    for (item_1, item_2) in alls_pairs:

        
        filepath_csv_1 = folderpath + "/" + item_1
        filepath_csv_2 = folderpath + "/" + item_2
        
        pm2_5 = []
        d = 0
        
        with open(filepath_csv_1, encoding="utf8") as csv_file_1:
           
        
            csv_reader_1 = csv.reader(csv_file_1, delimiter=",")

            for i, row in enumerate(csv_reader_1):

                if i==0:
                    header = row[3:]
                    continue
                else:
                    if row[2] in ["PM2.5", "SO2", "NO2", "CO", "O3"]:
                        if not check_data(row[3:], item, year):
                            add_removed(item, year)
                            break
                        else:
                            if row[2]=="PM2.5":
                                pm2_5.append(row[3:])
        with open(filepath_csv_2, encoding="utf8") as csv_file_2:
            csv_reader_2 = csv.reader(csv_file_2, delimiter=",")

            for i, row in enumerate(csv_reader_2):

                if i==0:
                    header = row[3:]
                    continue
                else:
                    if row[2] in ["PM2.5", "SO2", "NO2", "CO", "O3"]:
                        if not check_data(row[3:], item, year):
                            add_removed(item, year)
                            break
                        else:
                            if row[2]=="PM2.5":
                                pm2_5.append(row[3:])
                        

        if pm2_5:
            pm2_5 = np.array(pm2_5)
            pm2_5 = process_data(pm2_5, 1)
            pm2_5s.append(pm2_5)

    
    pm2_5s = restructure_data(pm2_5s)

    
    print("complete read folder")
    return pm2_5s

def retrieve_data_extras(folderpath):

    print("start reading csv in folder: ", folderpath)
    pm2_5s = []
    SO2s = []
    NO2s = []
    COs = []
    O3s = []
    
    year = folderpath.split("-")[1][:4]
    items = []
    alls = []
    extras = []
    
    for item in os.listdir(folderpath):
        if "beijing" not in item:
            continue
        if item not in REMOVED_CSV[year]:
            items.append(item)

    for item in items:
        split = item.split("_")
        if split[-2] == "all":
            split_1 = split
            split_1[-2] = "extra"
            item_1 = "_"
            item_1 = item_1.join(split_1)
            if item_1 in items:
                extras.append(item_1)
                alls.append(item)
                
    assert(len(extras)==len(alls))
    extra_all_pairs = []
    
    for i in range(len(alls)-1):
        split_1 = extras[i].split("_")
        num_csv_1 = (int)(split_1[-1][:8])
        split_2 = alls[i+1].split("_")
        num_csv_2 = (int)(split_2[-1][:8])
        
        if num_csv_2 - 1 == num_csv_1:
            extra_all_pairs.append((extras[i], alls[i+1]))
        
#     set_trace()
    for (item_1, item_2) in extra_all_pairs:

        
        filepath_csv_1 = folderpath + "/" + item_1
        filepath_csv_2 = folderpath + "/" + item_2
        
        
        pm2_5 = []
        SO2 = []
        NO2 = []
        CO = []
        O3 = []
        d = 0
        
        with open(filepath_csv_1, encoding="utf8") as csv_file_1:
           
        
            csv_reader_1 = csv.reader(csv_file_1, delimiter=",")

            for i, row in enumerate(csv_reader_1):

                if i==0:
                    header = row[3:]
                    continue
                else:
                    if row[2] in ["PM2.5", "SO2", "NO2", "CO", "O3"]:
                        if not check_data(row[3:], item, year):
                            add_removed(item, year)
                            break
                        else:
                            if row[2]=="PM2.5":
                                pm2_5.append(row[3:])
                            elif row[2]=="SO2":
                                SO2.append(row[3:])
                            elif row[2]=="NO2":
                                NO2.append(row[3:])
                            elif row[2]=="CO":
                                CO.append(row[3:])
                            elif row[2]=="O3":
                                O3.append(row[3:])
        with open(filepath_csv_2, encoding="utf8") as csv_file_2:
            csv_reader_2 = csv.reader(csv_file_2, delimiter=",")

            for i, row in enumerate(csv_reader_2):

                if i==0:
                    header = row[3:]
                    continue
                else:
                    if row[2] in ["PM2.5", "SO2", "NO2", "CO", "O3"]:
                        if not check_data(row[3:], item, year):
                            add_removed(item, year)
                            break
                        else:
                            if row[2]=="PM2.5":
                                pm2_5.append(row[3:])
                            elif row[2]=="SO2":
                                SO2.append(row[3:])
                            elif row[2]=="NO2":
                                NO2.append(row[3:])
                            elif row[2]=="CO":
                                CO.append(row[3:])
                            elif row[2]=="O3":
                                O3.append(row[3:])
                        

        if pm2_5:
            pm2_5 = np.array(pm2_5)
            pm2_5 = process_data(pm2_5, 1)
            pm2_5s.append(pm2_5)

        if SO2:
            SO2 = np.array(SO2)
#             set_trace()
            SO2 = process_data(SO2)
#             set_trace()
            SO2s.append(SO2)

        if NO2:
            NO2 = np.array(NO2)
            NO2 = process_data(NO2)
            NO2s.append(NO2)

        if CO:
            CO = np.array(CO)
            CO = process_data(CO)
            COs.append(CO)

        if O3:
            O3 = np.array(O3)
            O3 = process_data(O3)
            O3s.append(O3)
    
    pm2_5s = restructure_data(pm2_5s)
    SO2s = restructure_data(SO2s)
    NO2s = restructure_data(NO2s)
    COs = restructure_data(COs)
    O3s = restructure_data(O3s)
    
    print("complete read folder")
    return pm2_5s, SO2s, NO2s, COs, O3s
#     return pm2_5s

pm2_5s_1_years = []
pm2_5s_2_years = []
SO2s_years = []
NO2s_years = []
COs_years = []
O3s_years = []

if tf.io.gfile.exists(folderpath):
    for item in os.listdir(folderpath):
        if ".zip" not in item and "beijing" in item:
            pm2_5s_1 = retrieve_data_alls(folderpath+"/"+item)
            pm2_5s_1_years.append(pm2_5s_1)
            
            pm2_5s_2, SO2s, NO2s, COs, O3s = retrieve_data_extras(folderpath+"/"+item)  
            pm2_5s_2_years.append(pm2_5s_2)
            SO2s_years.append(SO2s)
            NO2s_years.append(NO2s)
            COs_years.append(COs)
            O3s_years.append(O3s)

start reading csv in folder:  ../air_quality/北京空气质量/beijing_20140101-20141231
complete read folder
start reading csv in folder:  ../air_quality/北京空气质量/beijing_20140101-20141231
complete read folder
start reading csv in folder:  ../air_quality/北京空气质量/beijing_20150101-20151231
complete read folder
start reading csv in folder:  ../air_quality/北京空气质量/beijing_20150101-20151231
complete read folder
start reading csv in folder:  ../air_quality/北京空气质量/beijing_20160101-20161231
complete read folder
start reading csv in folder:  ../air_quality/北京空气质量/beijing_20160101-20161231
complete read folder
start reading csv in folder:  ../air_quality/北京空气质量/beijing_20170101-20171231
complete read folder
start reading csv in folder:  ../air_quality/北京空气质量/beijing_20170101-20171231
complete read folder
start reading csv in folder:  ../air_quality/北京空气质量/beijing_20180101-20181231
complete read folder
start reading csv in folder:  ../air_quality/北京空气质量/beijing_20180101-20181231
complete read folder
start read

In [9]:
# pm2_5s_2_years[0]
O3s_years[0]

array([[[46.        , 46.        , 48.        , ..., 30.        ,
         19.        , 11.        ],
        [41.        , 40.        , 44.        , ..., 27.        ,
         36.79411765, 31.        ],
        [29.        , 32.        , 35.        , ..., 25.        ,
         32.09090909,  2.        ],
        ...,
        [50.        , 32.        , 47.        , ..., 18.        ,
          2.        ,  3.        ],
        [34.        , 20.        , 26.        , ...,  3.        ,
         24.08823529,  2.        ],
        [30.        , 33.        , 12.        , ...,  2.        ,
         13.        , 10.        ]],

       [[44.        , 52.        , 34.        , ..., 20.        ,
         27.        , 31.        ],
        [45.        , 48.        , 44.        , ..., 30.        ,
         33.78787879, 40.        ],
        [51.        , 57.        , 55.        , ..., 39.        ,
         28.        , 47.        ],
        ...,
        [78.        , 69.        , 84.        , ..., 6

In [10]:
def normalize_concat(arr_data):
    norm_arr_data = []
    for data in arr_data:
        norm_arr_data.append(normalize(data))
#     set_trace()
    return concat(norm_arr_data)

def normalize(data):
    new_data = []
    for dat in data:
#         set_trace()
        temp = (dat-np.min(dat))/(np.max(dat)-np.min(dat))
        new_data.append(temp)
    
#     set_trace()
    return new_data

def concat(arr_data):
    years = len(arr_data[0])
    new_data = []
    for year in range(years):
        temp = []
        for data in arr_data:
            temp.append(data[year])
        
        new_data.append(np.concatenate(temp, 0))
#         set_trace()
    return new_data

feature_data = normalize_concat((SO2s_years, NO2s_years, O3s_years, COs_years))


In [11]:
print("years:", len(feature_data))
print("data for 2015:", feature_data[0].shape)

years: 7
data for 2015: (836, 24, 35)


In [21]:
def concat_years(arr_data):
    years = len(arr_data)
#     new_data = 
    sum_doc = 0
    N, h, f = arr_data[0].shape
    for year in range(years):
#         arr_data =
        sum_doc += arr_data[year].shape[0]
#     set_trace()
    new_data = np.zeros((sum_doc, h, f))
    ind = 0
    for year in range(years):
        ind_last = arr_data[year].shape[0]
#         set_trace()
        new_data[ind:ind+ind_last, :, :] = arr_data[year]
        ind = ind + ind_last
#     set_trace()
    return new_data    
train_data = concat_years(pm2_5s_1_years[1:])

Training Model: seq2seq <br>
Encoder

In [None]:

# class EncoderRNN(nn.Module):
#     def __init__(self, input_size, hidden_size):
#         super(EncoderRNN, self).__init__()
#         self.hidden_size = hidden_size

# #         self.embedding = nn.Embedding(input_size, hidden_size)
#         self.lstm = nn.LSTM(input_size, hidden_size)

#     def forward(self, input, hn, cn):
# #         embedded = self.embedding(input).view(1, 1, -1)
# #         output = embedded
#         output = input.view(24,1,6)
# #         set_trace()
#         output, hidden = self.lstm(output.float(), (hn.detach(), cn.detach()))
#         return output, hidden

#     def initHidden(self):
# #         h_hidden = torch.zeros(1, 1, self.hidden_size)
# #         c_hidden = torch.zeros(1, 1, self.hidden_size)
# #         if torch.cuda.is_available():
# #             return h_hidden.cuda(), c_hidden.cuda()
# #         else:
# #             return h_hidden, c_hidden
#         return torch.zeros(1, 1, self.hidden_size, device=device)

Decoder

In [None]:
# class DecoderRNN(nn.Module):
#     def __init__(self, hidden_size, output_size):
#         super(DecoderRNN, self).__init__()
#         self.hidden_size = hidden_size

# #         self.embedding = nn.Embedding(output_size, hidden_size)
#         self.lstm = nn.LSTM(hidden_size, hidden_size)
#         self.out = nn.Linear(hidden_size, output_size)
#         self.softmax = nn.LogSoftmax(dim=1)

#     def forward(self, input, hn, cn):
#         output = input.view(24, self.hidden_size)
#         output = F.relu(output)
#         output, hidden = self.lstm(output, (hn, cn))
#         output = self.softmax(self.out(output[0]))
#         return output, hidden

#     def initHidden(self):
        
#         return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
# class DecoderAttn(nn.Module):
#     def __init__(self, hidden_size, output_size, layers=1, dropout=0.1, bidirectional=True):
#         super(DecoderAttn, self).__init__()

#         if bidirectional:
#             self.directions = 2
#         else:
#             self.directions = 1
#         self.output_size = output_size
#         self.hidden_size = hidden_size
#         self.num_layers = layers
#         self.dropout = dropout
# #         self.embedder = nn.Embedding(output_size,hidden_size)
#         self.dropout = nn.Dropout(dropout)
#         self.score_learner = nn.Linear(hidden_size*self.directions, 
#                                    hidden_size*self.directions)
#         self.lstm = nn.LSTM(input_size=hidden_size,hidden_size=hidden_size,
#                         num_layers=layers,dropout=dropout,
#                         bidirectional=bidirectional,batch_first=False)
#         self.context_combiner = nn.Linear((hidden_size*self.directions)
#                                       +(hidden_size*self.directions), hidden_size)
#         self.tanh = nn.Tanh()
#         self.output = nn.Linear(hidden_size, output_size)
#         self.soft = nn.Softmax(dim=1)
#         self.log_soft = nn.LogSoftmax(dim=1)


#     def forward(self, input_data, h_hidden, c_hidden, encoder_hiddens):

# #         embedded_data = self.embedder(input_data)
# #         embedded_data = self.dropout(embedded_data)
# #         batch_size = embedded_data.shape[1]
#         hiddens, outputs = self.lstm(input_data, (h_hidden, c_hidden))
#         top_hidden = outputs[0].view(self.num_layers,self.directions,
#                                  hiddens.shape[1],
#                                  self.hidden_size)[self.num_layers-1]
#         top_hidden = top_hidden.permute(1,2,0).contiguous().view(batch_size,-1, 1)

#         prep_scores = self.score_learner(encoder_hiddens.permute(1,0,2))
#         scores = torch.bmm(prep_scores, top_hidden)
#         attn_scores = self.soft(scores)
#         con_mat = torch.bmm(encoder_hiddens.permute(1,2,0),attn_scores)
#         h_tilde = self.tanh(self.context_combiner(torch.cat((con_mat,
#                                                          top_hidden),dim=1)
#                                               .view(batch_size,-1)))
#         pred = self.output(h_tilde)
#         pred = self.log_soft(pred)


#         return pred, outputs

Train Function

In [22]:
import random

teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    hn = encoder.initHidden()
    cn = encoder.initHidden()
    
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

#     encoder_outputs = torch.zeros(input_length, 24, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, (hn, cn) = encoder(
            input_tensor[ei], hn, cn)
        
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    decoder_input = target_tensor[0, :]
           
    for di in range(target_length):
        decoder_output, (hn, cn) = decoder(decoder_input, hn, cn) 
        loss += criterion(decoder_output, target_tensor[di, :].view(-1,).long())

        if use_teacher_forcing:
            decoder_input = target_tensor[di, :]
        else:
            decoder_output = decoder_output.view(6, 35 ,6)
            topv, topi = decoder_output.topk(1, dim=2)
            decoder_input = topi.squeeze().detach()
#         print(decoder_output.shape)
#         print(decoder_input.shape)
#         set_trace()
        
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [23]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
#     set_trace()
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [24]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [25]:
def reshape_data(data, i):
    data1 = data[:, :, :i]
    data2 = data[:, :, i+1:]
    new_data = np.concatenate((data1, data2), axis=2)
#     set_trace()
#     new_data = new_data.reshape((-1, 24, 34))
    return new_data

def save_checkpoint(epoch, encoder, decoder, encoder_optimizer, decoder_optimizer):
    """
    Save model checkpoint.

    :param epoch: epoch number
    :param model: model
    :param optimizer: optimizer
    """
    state = {'epoch': epoch,
             'encoder': encoder.state_dict(),
             'decoder': decoder.state_dict(),
             'encoder_optimizer': encoder_optimizer,
             'decoder_optimizer': decoder_optimizer}
    filename = '../checkpoint_lstm.pth.tar'
    torch.save(state, filename)

def trainIters(year_data, encoder, decoder, encoder_optimizer, decoder_optimizer, 
               start_epoch, epochs, print_every=1, plot_every=1, learning_rate=0.01):
    encoder.train()
    decoder.train()
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

#     training_pairs = [tensorsFromPair(random.choice(pairs))
#                       for i in range(n_iters)]
    criterion = nn.NLLLoss()
    g, t, f= year_data.shape
    
    for epoch in range(start_epoch, epochs+1):
        
#         for i in range(f):
#             print("training for pm2.5: %d" %(i+1))
        input_tensor = year_data[:,:24,:]
        input_tensor = torch.from_numpy(input_tensor).to(device)
        target_tensor = year_data[:, 24:24+6, :]
        target_tensor = torch.from_numpy(target_tensor).to(device)

        loss = train(input_tensor, target_tensor, encoder,
                 decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss
        
        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / epochs),
                                         epoch, epoch / epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0
            
        save_checkpoint(epoch, encoder, decoder, encoder_optimizer, decoder_optimizer)


    showPlot(plot_losses)

In [27]:
import os

class DecoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super(DecoderRNN, self).__init__()
        
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.input_size = input_size
        self.seq_len = 6
        self.output_size = output_size
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.out_1 = nn.Linear(1, 6)
        self.softmax = nn.LogSoftmax(dim=2)

    def forward(self, input, hn, cn):
        output = input.view(self.seq_len, 1, self.input_size)
        output = F.relu(output)
        output, hidden = self.lstm(output.float(), (hn, cn))
        output = output.view(self.seq_len, self.hidden_size)
        
        output = self.out(output).view(self.seq_len, self.output_size, 1)
        output = self.out_1(output)
        output = self.softmax(output)
        output = output.view(-1, 6)
#         set_trace()
#         output = output.view(self.seq_len, self.input_size)
        return output, hidden

    def initHidden(self):
        
        return torch.zeros(self.n_layers, 1, self.hidden_size, device=device)

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1, dropout=0.1):
        super(EncoderRNN, self).__init__()
        
        self.input_size = input_size
        self.seq_len = 24
        self.n_layers = n_layers
        
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size)

    def forward(self, input, hn, cn):
        
        output = input.view(self.seq_len, 1, self.input_size)
        output, hidden = self.lstm(output.float(), (hn, cn))
        
#         set_trace()
        return output, hidden

    def initHidden(self):

        return torch.zeros(self.n_layers, 1, self.hidden_size, device=device)


hidden_size = 256
input_size_enc = 35
input_size_dec = 35
output_size = 35
epochs = 50
learning_rate = 0.001

checkpoint = '../checkpoint_lstm.pth.tar'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

encoder = EncoderRNN(input_size_enc, hidden_size).to(device)
decoder = DecoderRNN(input_size_dec, hidden_size, output_size).to(device)

if os.path.exists(checkpoint):
    checkpoint = torch.load(checkpoint)    
    encoder.load_state_dict(checkpoint["encoder"])
    decoder.load_state_dict(checkpoint["decoder"])
    
    epoch = checkpoint["epoch"]+1
    encoder_optimizer = checkpoint["encoder_optimizer"]
    decoder_optimizer = checkpoint["decoder_optimizer"]
else:
    epoch = 1
#     encoder = EncoderRNN(input_size_enc, hidden_size).to(device)
    # attn_decoder1 = AttnDecoderRNN(hidden_size, output_size, dropout_p=0.1).to(device)
#     decoder = DecoderRNN(input_size_dec, hidden_size, output_size).to(device)
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

if epoch <= epochs:
    print("start training from epoch %d" %epoch)
    trainIters(train_data, encoder, decoder, encoder_optimizer, decoder_optimizer, epoch, epochs)
else:
    print("epoch trained (%d) exceeds maximum epochs (%d)" %(epoch, epochs))

print("finish training")

start training from epoch 31
0m 12s (- 0m 7s) (31 62%) 1.7976
0m 24s (- 0m 13s) (32 64%) 1.7977
0m 36s (- 0m 18s) (33 66%) 1.7976
0m 48s (- 0m 22s) (34 68%) 1.7976
1m 0s (- 0m 25s) (35 70%) 1.8207
1m 12s (- 0m 28s) (36 72%) 1.8206
1m 24s (- 0m 29s) (37 74%) 1.7975
1m 36s (- 0m 30s) (38 76%) 1.8207
1m 47s (- 0m 30s) (39 78%) 1.8206
1m 59s (- 0m 29s) (40 80%) 1.8206
2m 11s (- 0m 28s) (41 82%) 1.8206
2m 22s (- 0m 27s) (42 84%) 1.8206
2m 34s (- 0m 25s) (43 86%) 1.7975
2m 46s (- 0m 22s) (44 88%) 1.7976
2m 57s (- 0m 19s) (45 90%) 1.8207
3m 9s (- 0m 16s) (46 92%) 1.7975
3m 21s (- 0m 12s) (47 94%) 1.7976
3m 32s (- 0m 8s) (48 96%) 1.7976
3m 44s (- 0m 4s) (49 98%) 1.8207
3m 55s (- 0m 0s) (50 100%) 1.8206
finish training


Evaluate

In [28]:
def accuracy(pred, target):
    set_trace()
    return pred

def evaluate(input_tensor, target_tensor, encoder, decoder):
    encoder.eval()
    decoder.eval()
    
    accu_all = []
    
    loss = 0
    criterion = nn.NLLLoss()
    
    with torch.no_grad():
        hn = encoder.initHidden()
        cn = encoder.initHidden()

        input_length = input_tensor.size(0)
        target_length = target_tensor.size(0)

    #     encoder_outputs = torch.zeros(input_length, 24, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, (hn, cn) = encoder(
                input_tensor[ei], hn, cn)


        decoder_input = target_tensor[0, :]

        for di in range(target_length):
            decoder_output, (hn, cn) = decoder(decoder_input, hn, cn) 
            loss += criterion(decoder_output, target_tensor[di, :].view(-1,).long())

            decoder_output = decoder_output.view(6, 35 ,6)
            topv, topi = decoder_output.topk(1, dim=2)
            
            accu_all.append(accuracy(topi, target_tensor[di, :]))
            decoder_input = topi.squeeze().detach()
    
        loss.backward()


    return loss.item() / target_length

In [None]:
input_tensor = torch.from_numpy(pm2_5s_1_years[0][:,:24,:]).to(device)
target_tensor = torch.from_numpy(pm2_5s_1_years[0][:,24:24+6,:]).to(device)
evaluate(input_tensor, target_tensor, encoder, decoder)

> <ipython-input-28-6c65b19567e6>(3)accuracy()
-> return pred
(Pdb) pred.shape
torch.Size([6, 35, 1])
(Pdb) target.shape
torch.Size([6, 35])
(Pdb) pred
tensor([[[0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [1],
         [0],
         [0],
         [0],
         [0],
         [1],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [1],
         [0],
         [0]],

        [[0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [1],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
      

In [None]:
pm2_5s_1_years[0]