# Data preparation

Create a customized dataset.

In [449]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from torch.utils.data import Dataset
import os

# Names of the CSV columns fed to the model as input features.
# The selection was obtained with SelectKBest.
feature_cols = [
    'tested_positive.1',
    'tested_positive',
    'hh_cmnty_cli',
    'hh_cmnty_cli.1',
    'hh_cmnty_cli.2',
    'nohh_cmnty_cli',
    'nohh_cmnty_cli.1',
    'nohh_cmnty_cli.2',
    'cli',
    'cli.1',
    'cli.2',
    'ili',
    'ili.1',
    'ili.2',
    'worried_finances.2',
    'worried_finances.1',
    'worried_finances',
    'public_transit.2',
    'public_transit.1',
    'public_transit',
]

class CovidDataset(Dataset):
    """Dataset over the ``feature_cols`` columns of a CSV file.

    In ``'train'`` mode each item is a ``(features, target)`` pair; in any
    other mode each item is the feature vector alone.
    """

    # Positional index of the regression target column in the training CSV.
    TARGET_COLUMN_INDEX = 94

    @staticmethod
    def _min_max_scale(frame: pd.DataFrame) -> pd.DataFrame:
        """Rescale every column of *frame* to [0, 1] using its own min/max.

        A constant column has a zero range; dividing by it would fill the
        column with NaN, so a zero range is replaced by 1 and the column
        becomes all zeros instead.
        """
        low = frame.min()
        span = (frame.max() - low).replace(0, 1)
        return (frame - low) / span

    def abstract_features(self, data: pd.DataFrame, k=5) -> 'tuple[np.ndarray, np.ndarray]':
        """Split *data* into scaled features and the raw target.

        Args:
            data: Frame containing every column named in ``feature_cols`` and
                a target column at position ``TARGET_COLUMN_INDEX``.
            k: Unused; kept so existing callers keep working.

        Returns:
            ``(x, y)`` where ``x`` holds the min-max scaled feature columns
            and ``y`` the unscaled target column, both as NumPy arrays.
        """
        y = data[data.columns[self.TARGET_COLUMN_INDEX]]
        x = self._min_max_scale(data[feature_cols])
        return x.to_numpy(), y.to_numpy()

    def __init__(self, data_path: str, mode: str = 'train'):
        """Read the CSV at *data_path* and prepare the arrays for *mode*.

        Args:
            data_path: Path of the CSV file to load.
            mode: ``'train'`` loads features and target; any other value
                loads features only.
        """
        data = pd.read_csv(data_path)
        self.mode = mode
        if mode == 'train':
            self.X, self.Y = self.abstract_features(data, 20)
        else:
            # NOTE(review): features are scaled with the min/max of the file
            # being loaded, not with the statistics of the training file, so
            # train and test inputs may end up on different scales - confirm
            # this is intended.
            self.X = self._min_max_scale(data[feature_cols]).to_numpy()

    def __getitem__(self, index: int):
        """Return ``(features, target)`` in train mode, else ``features``."""
        if self.mode == 'train':
            return self.X[index], self.Y[index]
        return self.X[index]

    def __len__(self):
        """Return the number of samples (rows of ``X``)."""
        return self.X.shape[0]
    
    
# ds = CovidDataset('../input/ml2021spring-hw1/covid.train.csv')
# for x, y in ds:
#     pass

# Define my model

In [450]:
import torch
import torch.nn as nn

class CovidModel(nn.Module):
    """Fully connected regressor: input_dim -> 64 -> 8 -> 4 -> 1 with ReLU between layers."""

    def __init__(self, input_dim: int):
        super().__init__()
        # Build Linear+ReLU pairs for each hidden width, then the output layer.
        # The resulting module order (and therefore the state-dict keys under
        # ``layers``) is Linear, ReLU, Linear, ReLU, Linear, ReLU, Linear.
        stack = []
        fan_in = input_dim
        for width in (64, 8, 4):
            stack.append(nn.Linear(fan_in, width))
            stack.append(nn.ReLU())
            fan_in = width
        stack.append(nn.Linear(fan_in, 1))
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Run the stack on *x* and drop dimension 1 of the result."""
        out = self.layers(x)
        return out.squeeze(1)
    
# model = CovidModel(input_dim=20)
# print(model)

# Time to train it

In [451]:
# Global configuration shared by the cells below.

# Input files.
train_data_path = '../input/ml2021spring-hw1/covid.train.csv'
test_data_path = '../input/ml2021spring-hw1/covid.test.csv'

# Model input width: one input per selected feature column.
num_features = len(feature_cols)

# Training hyper-parameters.
epochs = 10000
batch_size = 2
lr = 0.1
val_percent = 0.1

# When True, a checkpoint is written every 50 epochs during training.
need_ckpt = False

In [452]:
from torch.utils.data import DataLoader, random_split
def generate_loader(data_path: str, batch_size, val_percent=0.1) -> (DataLoader, DataLoader):
    """Build shuffled training and validation loaders from one CSV file.

    The file at *data_path* is loaded as a ``CovidDataset`` and split at
    random: ``int(len(dataset) * val_percent)`` samples form the validation
    subset and the remaining samples form the training subset.

    Returns:
        ``(train_loader, val_loader)``; both use *batch_size* and shuffle.
    """
    full_set = CovidDataset(data_path)
    val_size = int(len(full_set) * val_percent)
    train_size = len(full_set) - val_size

    subsets = random_split(full_set, [train_size, val_size])
    train_loader, val_loader = (
        DataLoader(dataset=subset, batch_size=batch_size, shuffle=True)
        for subset in subsets
    )
    return train_loader, val_loader

# _, _ = generate_loader(train_data_path, 1)

In [453]:
def validate(model: nn.Module, device: torch.device, loader: DataLoader, n_val: int, loss_fn) -> float:
    """Evaluate *model* on *loader* and return the summed batch loss divided by *n_val*.

    The model is run in eval mode with gradients disabled. Its previous
    train/eval mode is restored before returning, so calling this in the
    middle of a training loop does not leave the model in eval mode.

    Args:
        model: Network to evaluate.
        device: Device the input and target batches are moved to.
        loader: Iterable yielding ``(x, y)`` batches.
        n_val: Divisor for the summed loss; pass the number of batches to
            get the mean per-batch loss.
        loss_fn: Callable ``loss_fn(y_pred, y_true)`` returning a scalar tensor.

    Raises:
        ZeroDivisionError: If *n_val* is 0.
    """
    was_training = model.training
    model.eval()
    total_loss = 0.0
    try:
        with torch.no_grad():
            for x, y in loader:
                x = x.to(device=device, dtype=torch.float32)
                y_true = y.to(device=device, dtype=torch.float32)
                total_loss += loss_fn(model(x), y_true).item()
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)
    return total_loss / n_val

In [454]:
def save_ckpt(model: nn.Module, epoch:int, loss:float):
    """Announce and write a checkpoint of *model*'s state dict.

    The file is created in the current working directory and its name
    embeds *epoch* and *loss*.
    """
    print(f'save ckpt, epoch = {epoch}, loss = {loss}')
    ckpt_name = f'ckpt_epoch_{epoch}_loss_{loss}.pth'
    torch.save(model.state_dict(), ckpt_name)

In [455]:
def train_model(model: nn.Module, device:torch.device, data_path:str, batch_size:int, epoch:int, lr: float):
    """Train *model* on the CSV at *data_path*, keeping the best weights on disk.

    Every time the running sample counter reaches a multiple of 1000 the
    model is validated; when the validation loss beats the best loss seen so
    far (persisted in ``best_loss.txt`` across runs) the weights are written
    to ``best_model.pth`` and ``best_loss.txt`` is updated.

    Args:
        model: Network to optimise; expected to already live on *device*.
        device: Device the batches are moved to.
        data_path: Training CSV, split into train/validation loaders.
        batch_size: Batch size for both loaders.
        epoch: Number of epochs to run.
        lr: Initial learning rate for Adam.
    """
    train_loader, val_loader = generate_loader(data_path, batch_size)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # NOTE(review): scheduler.step() runs once per batch, so with T_max=1e10
    # the cosine schedule barely moves the learning rate - confirm intended.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, 1e10, eta_min=1e-5)
    criterion = nn.MSELoss()

    best_loss = float('inf')
    if os.path.exists('best_loss.txt'):
        with open('best_loss.txt', mode='r') as f:
            first_line = f.readline()
        try:
            best_loss = float(first_line)
        except ValueError:
            # Empty or corrupt file: start over instead of crashing.
            print('ignoring unreadable best_loss.txt')
        else:
            print("read best loss:", best_loss)

    n_train_set = len(train_loader)
    n_val_set = len(val_loader)
    trained_cnt = 0
    # Iterate over the *epoch* argument (the global ``epochs`` used to be read
    # here, silently ignoring the parameter).
    for epoch_idx in range(epoch):
        model.train()
        if epoch_idx % 50 == 0:
            print('trainning...epoch = %d' % epoch_idx)
        train_tot_loss = 0
        for x, y in train_loader:
            optimizer.zero_grad()

            x = x.to(device=device, dtype=torch.float32)
            y_true = y.to(device=device, dtype=torch.float32)
            y_pred = model(x)
            # compute loss
            loss = criterion(y_pred, y_true)
            train_tot_loss += loss.item()

            trained_cnt += batch_size
            if trained_cnt % 1000 == 0:
                val_loss = validate(model, device, val_loader, n_val_set, criterion)
                # Validation runs the model in eval mode; make sure training
                # mode is active again for the rest of the epoch.
                model.train()
                if val_loss < best_loss:
                    print('Avg Loss/Validate:', val_loss)
                    with open('best_loss.txt', mode='w') as f:
                        f.write(str(val_loss))
                    best_loss = val_loss
                    torch.save(model.state_dict(), 'best_model.pth')

            # update
            loss.backward()
            optimizer.step()
            scheduler.step()
        if need_ckpt and epoch_idx % 50 == 0:
            save_ckpt(model, epoch_idx, train_tot_loss / n_train_set)

In [456]:
def test_model(model:nn.Module, device: torch.device, test_data_path:str):
    """Predict on the test CSV with the best saved weights and write ``submission.csv``.

    The weights in ``best_model.pth`` are loaded into *model* first, so that
    file must exist. The output CSV has an ``id`` index and one
    ``tested_positive`` column with one prediction per test row.

    Args:
        model: Network whose architecture matches the saved state dict.
        device: Device used for loading the weights and running inference.
        test_data_path: CSV file read through ``CovidDataset`` in test mode.
    """
    model.load_state_dict(torch.load('best_model.pth', map_location=device))
    model.eval()
    dataset = CovidDataset(test_data_path, mode='test')
    test_data = torch.utils.data.DataLoader(dataset=dataset, batch_size=1)
    positives = []
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for x in test_data:
            x = x.to(device=device, dtype=torch.float32)
            # Flatten so every sample of the batch contributes one value,
            # whatever batch size the loader uses.
            positives.extend(model(x).reshape(-1).to('cpu').tolist())
    df = pd.DataFrame({
        'tested_positive': positives
    })
    df.index.name = 'id'
    df.to_csv('submission.csv')
        

In [457]:
# Script entry point: build the model, optionally restore saved weights, and
# write predictions for the test set.
if __name__ == '__main__':
    # Use the GPU when CUDA is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = CovidModel(num_features)
    model.to(device)
    print(model)
    
    # Start from the best weights of a previous run when they are on disk.
    if os.path.exists("best_model.pth"):
        model.load_state_dict(torch.load('best_model.pth', map_location=device))
        
    # Training is currently disabled; uncomment the call below to (re)train.
    # test_model() loads 'best_model.pth' itself, so that file must exist.
#     train_model(model, device, train_data_path, batch_size, epochs, lr)
    test_model(model, device, test_data_path)

CovidModel(
  (layers): Sequential(
    (0): Linear(in_features=20, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=64, out_features=8, bias=True)
    (4): ReLU()
    (5): Linear(in_features=8, out_features=4, bias=True)
    (6): ReLU()
    (7): Linear(in_features=4, out_features=1, bias=True)
  )
)
