In [7]:
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

In [34]:
def same_seed(seed):
    '''
    Fix every random number generator used in this notebook for reproducibility.

    seed: integer seed applied to Python's `random`, NumPy and PyTorch (CPU and CUDA).
    '''
    # Make cuDNN pick deterministic kernels and stop it from auto-tuning,
    # which could otherwise select different algorithms between runs.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # The stdlib generator must be seeded too: the train/valid split shuffles with `random`.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Seed all visible GPUs, not only the current device.
        torch.cuda.manual_seed_all(seed)


class COVID19Dataset(Dataset):
    '''
    Dataset wrapper that stores features (and optionally labels) as float tensors.

    x: features
    y: labels, or None when no labels are available (e.g. for prediction)
    '''

    def __init__(self, x, y=None):
        # Labels are optional; keep None as-is so __getitem__ can branch on it.
        self.y = None if y is None else torch.FloatTensor(y)
        self.x = torch.FloatTensor(x)

    def __len__(self):
        # Number of samples equals the number of feature rows.
        return len(self.x)

    def __getitem__(self, idx):
        features = self.x[idx]
        if self.y is not None:
            return features, self.y[idx]
        # Unlabelled data: return the features only.
        return features
        

def select_features(train_data, valid_data, test_data, select_all=True, k=11, train_path=None):
    '''
    Select useful features for regression.

    train_data, valid_data: 2-D arrays whose last column is the target.
    test_data: 2-D array of test rows. If it has exactly as many columns as the
        training features it is used as-is (no target column present); otherwise
        its last column is dropped, as for the training data.
    select_all: when True, every feature column is kept.
    k: number of best-scoring features to keep when select_all is False.
        An integer larger than the number of features is clamped so that all
        features are selected.
    train_path: unused; kept only so existing callers keep working. The selector
        is fitted on `train_data` itself, so validation rows never influence
        which features are chosen.

    Returns (x_train, x_valid, x_test, y_train, y_valid).
    '''
    y_train, y_valid = train_data[:, -1], valid_data[:, -1]
    raw_x_train, raw_x_valid = train_data[:, :-1], valid_data[:, :-1]
    n_features = raw_x_train.shape[1]

    # Only strip the last column of the test set when it actually carries an
    # extra (target) column; otherwise a real feature would be thrown away.
    if test_data.shape[1] == n_features:
        raw_x_test = test_data
    else:
        raw_x_test = test_data[:, :-1]

    if select_all:
        return raw_x_train, raw_x_valid, raw_x_test, y_train, y_valid

    # Honour the caller's k instead of a hard-coded value.
    if isinstance(k, (int, np.integer)):
        k = min(int(k), n_features)
    skb = SelectKBest(score_func=f_regression, k=k)
    # Fit on the training split only to avoid leaking validation data.
    skb.fit(raw_x_train, y_train)
    index = skb.get_support(indices=True)
    print('selected feature index: ', index)
    return raw_x_train[:, index], raw_x_valid[:, index], raw_x_test[:, index], y_train, y_valid
def train_valid_split(data_set, ratio, seed):
    '''
    Randomly split a 2-D data set into training and validation parts.

    data_set: 2-D array-like of samples (one row per sample).
    ratio: fraction of rows (0..1) assigned to the validation set.
    seed: seed for the shuffle; the same seed always yields the same split.

    Returns (train_set, valid_set) as numpy arrays.
    '''
    data_set = np.asarray(data_set)
    n_rows = len(data_set)
    valid_set_size = int(ratio * n_rows)
    indices = list(range(n_rows))
    # Use a private generator so the split depends only on `seed` and the
    # global `random` state is left untouched.
    random.Random(seed).shuffle(indices)
    valid_set = data_set[indices[:valid_set_size], :]
    train_set = data_set[indices[valid_set_size:], :]
    return train_set, valid_set

In [12]:
class Regressioner(nn.Module):
    '''
    Small fully connected network mapping a feature vector to a single value.

    input_dim: number of input features per sample.
    '''
    def __init__(self, input_dim):
        super().__init__()

        # input_dim -> 64 -> 16 -> 1 with ReLU between the linear layers.
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 16),
            nn.ReLU(),
            nn.Linear(16, 1)
        )

    def forward(self, x):
        '''
        Run the network on a batch of shape (B, input_dim); returns shape (B,).
        '''
        x = self.layers(x)
        # Drop only the trailing size-1 output dimension. A bare squeeze()
        # would also remove the batch dimension when the batch has one sample.
        return x.squeeze(-1)

In [39]:
def trainer(model, train_loader, valid_loader, configs):
    '''
    Placeholder for the training loop.

    The function currently accepts its arguments and does nothing:
    no training is performed and None is returned.
    TODO: implement the training / validation procedure.
    '''
    return None

In [None]:
if __name__ == '__main__':
    # Hyper-parameters and file locations for the whole run.
    configs = {
        'seed': 1103,
        'ratio': 0.3,
        'train_path': './hw1_dataset/covid_train.csv',
        'test_path': './hw1_dataset/covid_test.csv',
        'batch_size': 256,
        'epochs': 10000,
        'lr': 1e-4,
        'select_all': False,
        'save_path': './models/model.ckpt',
        'k_features': 20,
    }

    same_seed(configs['seed'])

    # Load the raw tables and carve a validation split out of the training data.
    train_data = pd.read_csv(configs['train_path']).values
    test_data = pd.read_csv(configs['test_path']).values
    train_data, valid_data = train_valid_split(train_data, configs['ratio'], configs['seed'])
    print(f'train_data size: {train_data.shape}\nvalid_data size: {valid_data.shape}\ntest_data size: {test_data.shape}')

    x_train, x_valid, x_test, y_train, y_valid = select_features(
        train_data, valid_data, test_data,
        select_all=configs['select_all'],
        k=configs['k_features'],
        train_path=configs['train_path'],
    )

    train_dataset = COVID19Dataset(x_train, y_train)
    valid_dataset = COVID19Dataset(x_valid, y_valid)
    test_dataset = COVID19Dataset(x_test)

    # Only the training loader needs shuffling; evaluation order does not matter.
    train_loader = DataLoader(train_dataset, batch_size=configs['batch_size'], shuffle=True, pin_memory=True)
    valid_loader = DataLoader(valid_dataset, batch_size=configs['batch_size'], shuffle=False, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=configs['batch_size'], shuffle=False, pin_memory=True)

    # Size the network from the training features it will actually be fitted on.
    model = Regressioner(x_train.shape[1])
    # Validate on the labelled validation split; the test loader has no labels.
    trainer(model, train_loader, valid_loader, configs)
