<a href="https://colab.research.google.com/github/Shopping-Yuan/ML2021HW/blob/Shopping_branch/ML2021HW1_modified.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Pytorch Setting:

In [116]:
#import pytorch
import torch

# torch.backends.cudnn: set CNN algorithmtorch.backends.cudnn
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# get the current available device ('cpu' or 'cuda')
def get_device():
    return 'cuda' if torch.cuda.is_available() else 'cpu'
device = get_device()
print(torch.cuda.is_available())
device = get_device()

#set random variable
import numpy as np
myseed = 1
np.random.seed(myseed)
torch.manual_seed(myseed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(myseed)

False


##Data Set

In [107]:
#write data_set_function
#normalize data if needed
def normalize(df):
    return ((df - df.mean(axis = 0)) / (df.std(axis = 0)))
# generate index_list for spliting train & validation set
def got_index(df,mode):
    index_list = []
    first_day = 0
    for i in range(df.shape[1]):
      #count total data in that catetory
      days = df.iloc[:,i].sum()

      last_day = int(first_day + days-1)
      split_day = int(first_day + days*0.9)

      if mode == "train":
        index_list += list(range(first_day,split_day+1))
      elif mode == "val":
        index_list += list(range(split_day+1,last_day+1))

      first_day = last_day+1
    return index_list

#split train & validation set by index
def covid19_train_val_f(mode,d_l = "both"):
    train__val_df = pd.read_csv(data_info[mode]["path"])
    print('Size of training data: {}'.format(train__val_df.shape))

    train__val_df.iloc[:,41:] = normalize(train__val_df.iloc[:,41:])

    index_list = got_index(train__val_df.iloc[:,1:41],mode)
    train__val_df = train__val_df.iloc[ index_list,1:]
    label = torch.FloatTensor(train__val_df.iloc[:, -1].to_numpy())
    data = torch.FloatTensor(train__val_df.iloc[:, 0:-1].to_numpy())
    return({"data":data , "label":label})

def covid19_test_f(mode,d_l = "data"):
    test_df = pd.read_csv(data_info[mode]["path"])
    print('Size of testing data: {}'.format(test_df.shape))
    test_df.iloc[:,40:] = normalize(test_df.iloc[:,40:])
    data = torch.FloatTensor(test_df.to_numpy())
    return(data)
#create a dict of functions and path w.r.t. different mode
data_info = {
    "train":{"function":covid19_train_val_f,"path":"./covid.train.csv","data_or_label":"both"},
    "val":{"function":covid19_train_val_f,"path":"./covid.train.csv","data_or_label":"both"},
    "test":{"function":covid19_test_f,"path":"./covid.test.csv","data_or_label":"data"}
}

In [108]:
#rewrite class Dataset
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader

class Dataset_preparation(Dataset):
  def __init__(self,mode,data_info):
      self.mode = mode
      self.data_or_label = data_info[mode]["data_or_label"]

      if self.data_or_label == "data":
        self.data = data_info[self.mode]["function"](self.mode,"data")
      elif self.data_or_label == "both":
        self.data = data_info[self.mode]["function"](self.mode,"both")["data"]
        self.label = data_info[self.mode]["function"](self.mode,"both")["label"]
      else :
        self.label = data_info[self.mode]["function"](self.mode,"label")["label"]

      self.dim = self.data.shape[0]
      print('Finished reading the {mode} set of Dataset ({len} samples found, each dim = {dim})'
              .format(mode = self.mode, len =len(self.data), dim=self.dim))

  def __getitem__(self, index):
      if self.data_or_label == "data":
        return self.data[index]
      elif self.data_or_label == "both":
        return self.data[index], self.label[index]
      else :
        return self.label[index]
  def __len__(self):
      # Returns the size of the dataset
      return len(self.data)

In [109]:
#decide how to load data
def prep_dataloader(mode,d_info,batch_size,n_jobs=0):
    dataset = Dataset_preparation(mode,d_info)
    dataloader = DataLoader(
        dataset, batch_size,
        shuffle=(mode == 'train'), drop_last=False,
        num_workers=n_jobs, pin_memory=False)
    return dataloader

In [119]:
from torch.nn.modules.activation import Sigmoid
import torch.nn as nn
class NeuralNet(nn.Module):
    def __init__(self, input_dim):
        super(NeuralNet, self).__init__()

        # Define your neural network here
        # TODO: How to modify this model to achieve better performance?
        model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            )

        self.criterion = nn.MSELoss(reduction='mean')

    def forward(self, x):
        x = model(x)
        return x

    def cal_loss(self, pred, target):

        return self.criterion(pred, target)

In [140]:
h_paras = {
    # maximum number of epochs
    'n_epochs': 3000,
    # mini-batch size for dataloader
    'batch_size': 270,
    # optimization algorithm (optimizer in torch.optim)
    'optimizer': 'SGD',
    # hyper-parameters for the optimizer (depends on which optimizer you are using)
    'optim_hparas': {
        # learning rate of SGD
        'lr': 0.001,
        # momentum for SGD
        'momentum': 0.9
    },
    # early stopping epochs (the number epochs since your model's last improvement)
    'early_stop': 200,
    # if loss small than this , early-stopping counter will reset(may update in validation process)
    'early_stop_loss_init': 100.0,
    # your model will be saved here
    'save_path': './model.pth'
}

In [123]:
def train(train_set,mode_d,device):
    # set model to training mode
    mode_d.train()
    # iterate through the dataloader
    for data , label in train_set:
      # move data to device (cpu/cuda)
      data , label = data.to(device), label.to(device)
      # forward pass (compute output tensor)
      pred = 100 * Sigmoid(mode_d(data))
      # compute loss
      mse_loss = mode_d.cal_loss(pred , label)
      # set optimizer gradient to zero
      optimizer.zero_grad()
      # compute gradient (backpropagation)
      mse_loss.backward()
      # update model with optimizer
      optimizer.step()
      return mse_loss.item()

In [124]:
def val(val_set,mode_d,device):
    # set model to evalutation mode
    mode_d.eval()

    total_loss = 0
    # iterate through the dataloader
    for data , label in val_set:
    # move data to device (cpu/cuda)
      data_d, label_d = data.to(device), label.to(device)
      # disable gradient calculation
      with torch.no_grad():
        # forward pass (compute output)
        pred = 100 * Sigmoid(mode_d(data_d))
        # compute loss
        mse_loss = mode_d.cal_loss(pred, label_d)
      # accumulate loss
      batch_size = len(data_d)
      total_loss += mse_loss.item() * batch_size
    # compute averaged loss
    totol_size = len(val_set.dataset)
    avg_loss =  total_loss/totol_size

    return avg_loss

In [125]:
def test(test_set, mode_d, device):
    # set model to evalutation mode
    mode_d.eval()
    preds = []
    # iterate through the dataloader
    for data in test_set:
      # move data to device (cpu/cuda)
      data_d = data.to(device)
      # disable gradient calculation
      with torch.no_grad():
        # forward pass (compute output)
        pred = mode_d(data_d)
        # collect prediction
        preds.append(pred.detach().cpu())
    # concatenate all predictions and convert to a numpy array
    preds = torch.cat(preds, dim=0).numpy()
    return preds

In [127]:
def train_val_process(training_set, validation_set, model, h_paras, device):
    #1: move model to device (cpu/cuda)
    model_d = model.to(device)
    #2: set optimizer = torch.optim.SGD(model_d.parameters(),lr=0.001,momentum=0.9)
    optimizer = getattr(torch.optim, h_paras['optimizer'])(
        model.parameters(), **h_paras['optim_hparas'])

    #3: set epoch = h_paras['n_epochs'] = 3000
    n_epochs = h_paras['n_epochs']

    #4: record training loss
    loss_record = {'train': [], "val": []}

    #5: setting paras
    # epoch para
    epoch = 0
    # early-stoping paras
    early_stop_cnt = 0
    min_loss = h_paras["early_stop_loss_init"]

    #start training
    while epoch < n_epochs:

      # Part1 : training process
      # updata model weight and compute model loss
      train_loss = train(training_set,model_d,device)
      # save loss to loss_record['train']
      loss_record['train'].append(train_loss)
      # renew epoch para
      epoch += 1

      # Part2 validation process
      # After each epoch, test your model
      # on the validation (development) set.
      val_loss = val(validation_set, model_d, device)
      # save loss to loss_record["val"]
      loss_record["val"].append(val_loss)

      # Part3 early stopping
      # Compute early-stopping counter
      if val_loss < min_loss:
        min_loss = val_loss
        print('Saving model (epoch = {:4d}, loss = {:.4f})'\
        .format(epoch+1 , min_loss))
      # Save model to specified path if your model improved
        torch.save(model_d.state_dict(), h_paras['save_path'])
        early_stop_cnt = 0
      else:
        early_stop_cnt += 1
      # Check early stop criteria
      if early_stop_cnt > h_paras['early_stop']:
          # Stop training if your model stops improving
          # for "h_paras['early_stop']" epochs.
          break

    print('Finished training after {} epochs'.format(epoch))
    return min_loss, loss_record

In [122]:
train_set = prep_dataloader('train',data_info,config['batch_size'])
val_set = prep_dataloader('val',data_info,config['batch_size'])
test_set = prep_dataloader('test',data_info,config['batch_size'])

Size of training data: (2700, 95)
Size of training data: (2700, 95)
Finished reading the train set of Dataset (2460 samples found, each dim = 2460)
Size of training data: (2700, 95)
Size of training data: (2700, 95)
Finished reading the val set of Dataset (240 samples found, each dim = 240)
Size of testing data: (893, 94)
Finished reading the test set of Dataset (893 samples found, each dim = 893)


In [138]:
# Construct model and move to device
model = NeuralNet(train_set.dataset.dim)
model_d = model.to(device)
optimizer = torch.optim.SGD(model_d.parameters(),lr = 0.001,momentum=0.9)
print(optimizer)

ValueError: ignored

In [135]:
model = NeuralNet(train_set.dataset.dim)  # Construct model and move to device
model_loss, model_loss_record = train_val_process(train_set, val_set, model, h_paras, device)

ValueError: ignored

In [None]:
def save_pred(preds, file):
    print('Saving results to {}'.format(file))
    # with open(file, 'w') as fp:
    #     writer = csv.writer(fp)
    #     writer.writerow(['id', 'tested_positive'])
    #     for i, p in enumerate(preds):
    #         writer.writerow([i, p])

preds = test(test_set, model, device)  # predict COVID-19 cases with your model
save_pred(preds, 'pred.csv')         # save prediction file to pred.csv

In [None]:
model = NeuralNet(tr_set.dataset.dim).to(device)  # Construct model and move to device
model_loading = torch.load(h_paras['save_path'], map_location=device)  # Load your best model
model.load_state_dict(model_loading)