In [2]:
######## >>>>>>>>>>> For the google colab to be able to access the meta files you must add the APS360 team shared folder to your drive by right clicking on it <<<<<<<<<<< ##############
#mount googledrive
from google.colab import drive
drive.mount('/content/gdrive')


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
import pandas as pd
# Prints how many nulls there are in max and min temp
def test_null_csv_daily(station, start_year, end_year, out_name =  None):
    """
      Args:
          station (string): eg. "ON_6158355"
          start_year (int): Starting year
          end_year (int)  : Ending year
          start_date (int): start_date = 0 is day 1 of starting year (Where we want the sampling to start)
          out_name (optional string) : change the name of the output file
    """
    if out_name == None:
      out_name = station + '_' + str(start_year) + '-' + str(end_year)
    master_path = '/content/gdrive/My Drive/APS360 Team/milestone 1/'
    src_path = master_path + 'datasets/'
    newdf = pd.read_csv(src_path + out_name + ".csv")
    print("Max temp nulls: " + str(newdf['Max Temp (°C)'].isnull().sum()))
    print("Min temp nulls: " + str(newdf['Min Temp (°C)'].isnull().sum()))

In [0]:
import pandas as pd
# Interpolates null values in min and max temp cols
def inter_nulls_csv_daily(station, start_year, end_year, out_name =  None):
    """
      Args:
          station (string): eg. "ON_6158355"
          start_year (int): Starting year
          end_year (int)  : Ending year
          start_date (int): start_date = 0 is day 1 of starting year (Where we want the sampling to start)
          out_name (optional string) : change the name of the output file
    """
    if out_name == None:
      out_name = station + '_' + str(start_year) + '-' + str(end_year)
    master_path = '/content/gdrive/My Drive/APS360 Team/milestone 1/'
    src_path = master_path + 'datasets/'
    newdf = pd.read_csv(src_path + out_name + ".csv")
    newdf['Max Temp (°C)'] = newdf['Max Temp (°C)'].interpolate()
    newdf['Min Temp (°C)']= newdf['Min Temp (°C)'].interpolate()
    newdf.to_csv( src_path +  out_name + ".csv")

In [0]:
#Raw csv downloaded must be place in /raw folder
#Merged csv will be stored at /datasets folder
# Also interpolates the null max and min temp
def make_csv_daily(station, start_year, end_year, out_name =  None):
  """
    Args:
        station (string): eg. "ON_6158355"
        start_year (int): Starting year
        end_year (int)  : Ending year
        start_date (int): start_date = 0 is day 1 of starting year (Where we want the sampling to start)
        out_name (optional string) : change the name of the output file
    """
  if out_name == None:
    out_name = station + '_' + str(start_year) + '-' + str(end_year)
  master_path = '/content/gdrive/My Drive/APS360 Team/milestone 1/'
  src_path = master_path + 'raw/'
  dest_path = master_path + 'datasets/'
  fout = open(dest_path + out_name + ".csv","w+")
  in_base = "en_climate_daily_" + station + '_' #eg: 'en_climate_daily_ON_6158355_'
  in_end = '_P1D.csv'
  # first file:
  for line in open(src_path + in_base + str(start_year) + in_end):
      fout.write(line)
  # now the rest:    
  for num in range(start_year + 1, end_year + 1):
      f = open(src_path + in_base + str(num) + in_end)
      f.__next__() # skip the header
      for line in f:
          fout.write(line)
      f.close() # not really needed
  fout.close()
  inter_nulls_csv_daily(station, start_year, end_year, out_name)

In [6]:
####### run once #######
make_csv_daily("ON_6158355", 2007, 2016) # run once
make_csv_daily("ON_6158355", 2017, 2018) # run once
test_null_csv_daily("ON_6158355", 2007, 2016)
test_null_csv_daily("ON_6158355", 2017, 2018)

Max temp nulls: 0
Min temp nulls: 0
Max temp nulls: 0
Min temp nulls: 0


In [0]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
class WeatherDataset(Dataset):
    """Weather dataset."""

    def __init__(self, station, start_year, end_year, start_date = 0, end_date = None, num_days = 7, make_csv = False, out_name = None):
        """
        Args:
            station (string): eg. "ON_6158355"
            start_year (int): Starting year
            end_year (int)  : Ending year
            start_date (int): start_date = 0 is day 1 of starting year (Where we want the sampling to start)
            end_date (optional int) : end_date = 7 is day 8 of starting year(min = 7 because of LABEL!). If provided changes the end date from last day of last year.
            num_days (optional int) : num_days is the interval of days before the label.
            make_csv (optional bool): If true it will call make_csv_daily function to create the csv from /raw datasets into /datasets
            out_name (optional string) : change the name of the output file which it reads from
        """
        self.num_days = num_days
        if(out_name == None):
          self.out_name = station + '_' + str(start_year) + '-' + str(end_year)
        else:
          self.out_name = out_name
        master_path = '/content/gdrive/My Drive/APS360 Team/milestone 1/'
        dest_path = master_path + 'datasets/'
        if (make_csv):
          make_csv_daily(station, start_year, end_year, out_name =  out_name)

        self.cur_csv = pd.read_csv(dest_path + self.out_name +'.csv')

        self.start_date = start_date
        if( end_date == None):
          self.end_date = len(self.cur_csv) - start_date
        else:
          self.end_date = end_date

    def __len__(self):
        return self.end_date - self.start_date + 1 - self.num_days - 1

    def __getitem__(self, idx):
        data = self.cur_csv.loc[ idx + self.start_date : idx + self.start_date + self.num_days - 1 , ['Max Temp (°C)', 'Min Temp (°C)'] ]
        data = np.asarray(data)
        data = data.astype('float')

        label = self.cur_csv.loc[ idx + self.start_date + self.num_days, ['Max Temp (°C)', 'Min Temp (°C)'] ]
        label = np.asarray(label)
        label = label.astype('float')

        data = data.flatten()

        #print('Data: {}'.format(data))
        #print('Data shape: {}'.format(data.shape))
        #print('Labels shape: {}'.format(label.shape))
        #print('Labels: {}'.format(label[:2]))
        sample = [data, label]

        return sample

In [0]:
start_year_train = 2007
end_year_train = 2016 #changed from 2019 because in our proposal we said 07-16 is train
start_date_train = 0
end_date_train = None
num_days_train = 7
station = "ON_6158355"
trainingSet = WeatherDataset(station, start_year_train, end_year_train, start_date_train, end_date_train, num_days=num_days_train, make_csv = False)

In [9]:
trainingSet[2]

[array([ 8.1,  2.4, 10.1,  4.8, 11.9,  7.7, 10. ,  4.1,  4.8,  3.3,  5.4,
         0.5,  2.5, -3.7]), array([-1.9, -5.7])]

In [0]:
start_year_val = 2017
end_year_val = 2018
start_date_val = 0
end_date_val = None
num_days_val = 7
# station stays the same
validationSet = WeatherDataset(station, start_year_val, end_year_val, start_date_val, end_date_val, num_days_val, make_csv = False)

In [11]:
import torch
train_loader = torch.utils.data.DataLoader(trainingSet, batch_size=2, 
                                            num_workers=1, shuffle=True)
for i, data in enumerate(train_loader, 0):
  input, label = data
  print("input:")
  print(input)
  print("label:")
  print(label)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
       dtype=torch.float64)
label:
tensor([[14.1000,  6.4000],
        [ 5.4000, -1.3000]], dtype=torch.float64)
input:
tensor([[ 8.6000,  3.7000, 13.1000,  3.8000,  6.3000,  0.0000,  0.0000, -5.9000,
          3.4000, -5.2000,  7.5000,  0.9000,  2.0000, -3.5000],
        [ 6.1000, -1.1000,  1.6000, -4.2000,  4.7000, -1.1000,  6.9000,  0.0000,
          7.8000,  1.3000,  3.0000,  1.1000, 10.0000,  2.3000]],
       dtype=torch.float64)
label:
tensor([[6.7000, 1.8000],
        [9.3000, 6.4000]], dtype=torch.float64)
input:
tensor([[11.1000,  3.6000, 10.8000,  8.2000, 10.7000,  6.1000, 11.9000,  6.1000,
         10.4000,  6.3000,  9.7000,  6.4000,  8.4000,  7.1000],
        [27.1000, 18.7000, 27.5000, 19.4000, 24.3000, 18.3000, 26.5000, 17.9000,
         28.8000, 19.8000, 25.6000, 18.4000, 21.9000, 14.5000]],
       dtype=torch.float64)
label:
tensor([[10.6000,  8.2000],
        [20.4000, 11.8000]], dtype=torch.float64)
inpu

# RNN TRAINING

In [0]:
import os 
from torch.utils.data.sampler import SubsetRandomSampler 
from torchvision import datasets, models, transforms 
import torch.optim as optim
import numpy as np 
import matplotlib.pyplot as plt
import time 
import torch
import torch.nn as nn
import torch.nn.functional as F 
import torch.optim 
import torchvision
from torch.utils.data.sampler import SubsetRandomSampler
import torchvision.transforms as transforms 
use_cuda = True

#we prolly need cuda? to speed things up?

### Helper functions

In [0]:
def get_model_name(name, batch_size, learning_rate, epoch):
    """ Generate a name for the model consisting of all the hyperparameter values

    Args:
        config: Configuration object containing the hyperparameters
    Returns:
        path: A string with the hyperparameter name and value concatenated
    """
    path = "/content/gdrive/My Drive/APS360 Team/milestone 1/Checkpoints/weatherANNmodel_{0}_bs{1}_lr{2}_epoch{3}".format(name,
                                                   batch_size,
                                                   learning_rate,
                                                   epoch)
    return path

In [0]:
# not sure if this is needed

def normalize_label(labels):
    """
    Given a tensor containing 2 possible values, normalize this to 0/1

    Args:
        labels: a 1D tensor containing two possible scalar values
    Returns:
        A tensor normalize to 0/1 value
    """
    max_val = torch.max(labels)
    min_val = torch.min(labels)
    norm_labels = (labels - min_val)/(max_val - min_val)
    return norm_labels

In [0]:
def plot_training_curve(path):
    """ Plots the training curve for a model run, given the csv files
    containing the train/validation error/loss.

    Args:
        path: The base path of the csv files produced during training
    """
    import matplotlib.pyplot as plt
    train_err = np.loadtxt("{}_train_err.csv".format(path))
    val_err = np.loadtxt("{}_val_err.csv".format(path))
    train_loss = np.loadtxt("{}_train_loss.csv".format(path))
    val_loss = np.loadtxt("{}_val_loss.csv".format(path))
    plt.title("Train vs Validation Error")
    n = len(train_err) # number of epochs
    plt.plot(range(1,n+1), train_err, label="Train")
    plt.plot(range(1,n+1), val_err, label="Validation")
    plt.xlabel("Epoch")
    plt.ylabel("Error")
    plt.legend(loc='best')
    plt.show()
    plt.title("Train vs Validation Loss")
    plt.plot(range(1,n+1), train_loss, label="Train")
    plt.plot(range(1,n+1), val_loss, label="Validation")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend(loc='best')
    plt.show()

### Visualization of data

In [0]:
# laterrrrrrr

### Architecture

In [0]:
import logging
import datetime

import pandas as pd
import torch
import torch.nn as nn
from torch.autograd import Variable
import matplotlib.pyplot as plt
import matplotlib
# matplotlib.style.use('ggplot')

# # idk what line 12 does but i guess I am keeping it
# log = logging.getLogger(__name__)


In [0]:
# Figure out RNN architecture
# potentially helpful links:
# https://chandlerzuo.github.io/blog/2017/11/darnn
# https://gist.github.com/lizadaly/2b19fd6f758ffd2e8ab9ec7d1f3f4b2c             <- USING THIS AS A TEMPLATE FOR TIME BASED RNN
# 

In [0]:
# BATCH_SIZE = 12
# HIDDEN_SIZE = 512
# NUM_LAYERS = 2
# NUM_EPOCHS = 100
# LEARNING_RATE = 0.005
# DROPOUT = 0.2

In [0]:
# # hmmmm -> should seqlen be the number of inputs we put in? tbh a lil confused
# # input size will always be 14, output size will always be 2 
# # not sure what n_layers does but thats fine
# # dont think we utilize dropout either
# class weatherRNN(nn.Module):

#     def __init__(self, input_size = 14, hidden_size = 30, output_size = 2, n_layers=1, dropout=DROPOUT): #,dropout=DROPOUT
#         super(weatherRNN, self).__init__()
#         self.input_size = input_size
#         self.hidden_size = hidden_size
#         self.output_size = output_size
#         self.n_layers = n_layers

#         self.encoder = nn.Embedding(input_size, hidden_size)
#         self.m = nn.Sequential(nn.ReLU(),
#                                nn.Dropout(p=0.2),
#                                nn.ReLU())
#         self.decoder = nn.Linear(hidden_size, output_size)

#     def forward(self, inp, hidden = None):
#         inp = self.encoder(inp)
#         # output = BATCH_SIZE, SEQLEN, HIDDEN_SIZE
#         # ex. torch.Size([30, 50, 512])
#         output = self.m(inp)

#         # Now BATCHSIZE * SEQLEN, HIDDEN_SIZE
#         # torch.Size([1500, 512])
#         output = output.contiguous().view(-1, hidden.size(2))

#         # Should now be BATCH_SIZE * SEQLEN, VOCAB_SIZE
#         # torch.Size([1500, 154])
#         logits = self.decoder(output)

#         return logits, hidden

#     def init_hidden(self):

#         return Variable(torch.zeros(self.n_layers, BATCH_SIZE, self.hidden_size).cuda())

In [0]:
class weatherRNN(nn.Module):

    def __init__(self, hidden_size = 10, input_size = 2, output_size = 2, n_layers = 1): #,dropout=DROPOUT
        super(weatherRNN, self).__init__()
        self.name = "weatherRNN"
        self.rnn = nn.RNN(input_size, hidden_size, n_layers, batch_first=True) # there is RNN, GRU, LSTM
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, inp,  hidden = None):
        if(hidden == None):
          out, _ = self.rnn(inp)
        else:
          out, _ = self.rnn(inp, hidden)
        out = self.decoder(out[:, -1, :])
        return out

### Training

In [0]:
import os 
from torch.utils.data.sampler import SubsetRandomSampler 
from torchvision import datasets, models, transforms 
import torch.optim as optim
import numpy as np 
import matplotlib.pyplot as plt
import time 
import torch
import torch.nn as nn
import torch.nn.functional as F 
import torch.optim 
import torchvision
from torch.utils.data.sampler import SubsetRandomSampler
import torchvision.transforms as transforms 

In [0]:
# Error defined as prediction being off more than 5°C
def compute_error(outputs, labels):
  err = 0.0
  number_items = 0
  for j, out in enumerate(outputs, 0):
      err += (abs(labels[j][0] - out[0]) > 5)
      err += (abs(labels[j][1] - out[1]) > 5)
      number_items += 2

  return err, number_items

In [0]:
# Error defined as mean difference of prediction and reallity
def compute_error(outputs, labels):
  err = 0.0
  number_items = 0
  for j, out in enumerate(outputs, 0):
      err += abs(labels[j][0] - out[0])
      err += abs(labels[j][1] - out[1])
      number_items += 2

  return err, number_items

In [0]:
def get_accuracy(net, loader, criterion):

    total_loss = 0.0
    total_err = 0.0
    number_items = 0

    for i, data in enumerate(loader, 0):
      inputs, labels = data
      inputs = inputs.view(inputs.shape[0], int(inputs.shape[1]/2), 2) #########
      #############################################
      #To Enable GPU Usage
      if use_cuda and torch.cuda.is_available():
        inputs = inputs.cuda()
        labels = labels.cuda()
      #############################################
      outputs = net(inputs.float())
      loss = criterion(outputs, labels.float()) 
      total_loss += loss.item()
      
      cur_err, cur_num = compute_error(outputs, labels)
      total_err += cur_err
      number_items += cur_num
    err = float(total_err) / number_items
    loss = float(total_loss) / (i + 1)
    return err, loss

In [0]:
use_cuda = True
def train_rnn_network(net, trainingSet, validationSet, num_epochs, batch_size, learning_rate):
    criterion = nn.MSELoss() #criterion = nn.CrossEntropyLoss() We are doing regression not classification
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

    train_loader = torch.utils.data.DataLoader(trainingSet, batch_size=batch_size, 
                                            num_workers=1, shuffle=True)
    val_loader = torch.utils.data.DataLoader(validationSet, batch_size=batch_size, 
                                            num_workers=1, shuffle=True)

     # Set up some numpy arrays to store the training/test loss/erruracy
    train_err = np.zeros(num_epochs)
    train_loss = np.zeros(num_epochs)
    val_err = np.zeros(num_epochs)
    val_loss = np.zeros(num_epochs)
    ########################################################################
    # Train the network
    # Loop over the data iterator and sample a new batch of training data
    # Get the output from the network, and optimize our loss function.
    start_time = time.time()
    print ("Training Started...")
    for epoch in range(num_epochs):  # loop over the dataset multiple times
        total_train_loss = 0.0
        total_train_err = 0.0
        number_items = 0
        for i, data in enumerate(train_loader, 0): # Itterate through each batch
            #print("Training", i, "out of", len(train_loader))
            # Get the inputs
            inputs, labels = data
            inputs = inputs.view(inputs.shape[0], int(inputs.shape[1]/2), 2) ########### batch_size, number of mini RNNs, input to each mini RNN

            #############################################
            #To Enable GPU Usage
            if use_cuda and torch.cuda.is_available():
              inputs = inputs.cuda()
              labels = labels.cuda()
            #############################################
            
              
            outputs = net(inputs.float())             # forward pass
            loss = criterion(outputs, labels.float()) # compute the total loss
            loss.backward()               # backward pass (compute parameter updates)
            optimizer.step()              # make the updates for each parameter
            optimizer.zero_grad()         # a clean up step for PyTorch

            # Calculate the statistics
            total_train_loss += loss.item()

            cur_err, cur_num = compute_error_rnn(outputs, labels)
            total_train_err += cur_err
            number_items += cur_num
             

        train_err[epoch] = float(total_train_err) / number_items
        train_loss[epoch] = float(total_train_loss) / (i+1)
        val_err[epoch], val_loss[epoch] = get_accuracy(net, val_loader, criterion)
        
        print(("Epoch {}: Train err: {}, Train loss: {} |"
              + "Validation err: {}, Validation loss: {}"
                ).format(
                    epoch + 1,
                    train_err[epoch],
                    train_loss[epoch],
                    val_err[epoch],
                    val_loss[epoch]))
        # Save the current model (checkpoint) to a file
        model_path = get_model_name(net.name, batch_size, learning_rate, epoch + 1)
        #print(model_path) #for debugging the path smh
        #torch.save(net.state_dict(), model_path)
    print('Finished Training')
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("Total time elapsed: {:.2f} seconds".format(elapsed_time))

    # Write the train/test loss/err into CSV file for plotting later
    
    epochs = np.arange(1, num_epochs + 1)
    np.savetxt("{}_train_err.csv".format(model_path), train_err)
    np.savetxt("{}_train_loss.csv".format(model_path), train_loss)
    np.savetxt("{}_val_err.csv".format(model_path), val_err)
    np.savetxt("{}_val_loss.csv".format(model_path), val_loss)

In [46]:
weather_rnn = weatherRNN(hidden_size=10)
if use_cuda:
  weather_rnn = weather_rnn.cuda()
train_rnn_network(weather_rnn, trainingSet=trainingSet, validationSet=validationSet, batch_size=40, learning_rate=0.004, num_epochs=45)

Training Started...
Epoch 1: Train err: 0.6749862863411958, Train loss: 181.6884369228197 |Validation err: 9.449716690823664, Validation loss: 151.2521041067023
Epoch 2: Train err: 0.5888645090510148, Train loss: 120.06661796569824 |Validation err: 7.732204232357678, Validation loss: 103.08221917403371
Epoch 3: Train err: 0.5156335710367526, Train loss: 83.8901487848033 |Validation err: 6.421285908066386, Validation loss: 72.86955462004009
Epoch 4: Train err: 0.451865057597367, Train loss: 59.49939342167067 |Validation err: 5.465867088807239, Validation loss: 50.877789547568874
Epoch 5: Train err: 0.3908392759188151, Train loss: 43.92477334063986 |Validation err: 4.715055119227352, Validation loss: 36.76949099490517
Epoch 6: Train err: 0.33872737246297313, Train loss: 33.13766421442447 |Validation err: 4.171839598501372, Validation loss: 28.17048634980854
Epoch 7: Train err: 0.2918266593527153, Train loss: 25.748851931613423 |Validation err: 3.7518252111349635, Validation loss: 22.9443

In [1]:
model_path = get_model_name("weatherRNN", batch_size=30, learning_rate=0.01, epoch=30)

plot_training_curve(model_path)


NameError: ignored

### Tuning

best model???

