In [132]:
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
import torchvision
import torchvision.transforms as transforms
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [185]:
RANDOM_SEED = 42
EPOCHS = 100
EPOCHS_TRAIN = 1000
BATCH_SIZE = 256
BATCH_SIZE_TRAIN = 20
PRETRAIN_PATH = './pretrainmodel8.pth'
TRAIN_PATH = './model8.pth'
SAVE_PATH ='./prediction8.csv'

np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [134]:
df_pretrain_features = pd.read_csv('./data/pretrain_features.csv')
df_pretrain_labels = pd.read_csv('./data/pretrain_labels.csv')

In [135]:
pretrain_features = df_pretrain_features.iloc[:,2:].to_numpy(dtype=np.dtype(np.float32))
pretrain_labels = df_pretrain_labels.iloc[:,1:].to_numpy(dtype=np.dtype(np.float32))
# print(np.shape(pretrain_features))
# pretrain_features

In [136]:
print(np.shape(pretrain_labels))

(50000, 1)


In [137]:
pretrian_data = np.hstack((pretrain_features, pretrain_labels))
print(pretrian_data[:, -1:].shape[0])

50000


In [138]:
pre_train_set, pre_valid_set = train_test_split(pretrian_data, test_size=0.1, random_state=RANDOM_SEED)

In [139]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, data, training=False):
        self.training = training
        if self.training:
            self.features = data[:, :-1]
            self.labels = data[:, -1:]
        else:
            self.features = data[:]
        
    
    def __getitem__(self, index):
        feature_i = self.features[index] 
        if self.training:
            label_i = self.labels[index]
            return feature_i, label_i
        else:
            return feature_i
    def __len__(self):
        return self.features.shape[0]

pre_train_dataset = MyDataset(pre_train_set, training=True)
pre_valid_dataset = MyDataset(pre_valid_set, training=True)

In [140]:
from torch.utils.data import DataLoader

pre_train_loader = DataLoader(dataset=pre_train_dataset, batch_size=BATCH_SIZE)
pre_valid_loader = DataLoader(dataset=pre_valid_dataset, batch_size=BATCH_SIZE)

In [187]:
import torch.nn as nn
import torchvision.models as models

class AutoEncoder(nn.Module):
    def __init__(self):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(1000, 512),
            nn.Tanh(),
            nn.Linear(512, 256),
            nn.Tanh(),
            nn.Linear(256,64),
            nn.Tanh(),
            nn.Linear(64, 32),
            nn.Tanh()
        )

        self.decoder = nn.Sequential(
            nn.Linear(32, 16),
            nn.Tanh(),
            nn.Linear(16, 1)
        )
    
    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = self.encoder(x)
        x = self.decoder(x)
        return x

pretrain_model = AutoEncoder().to(device)

In [142]:
import torch.optim as optim

criterion = nn.MSELoss()
# optimizer = optim.SGD(pretrain_model.parameters(), lr=0.001, momentum=0.9, weight_decay=1e-5, nesterov=True)
optimizer = optim.Adam(pretrain_model.parameters(), lr=0.0001, weight_decay=1e-5)

In [143]:
def pretrain(model):
    min_loss = 100.0
    for epoch in range(EPOCHS):
        running_loss = 0.0
        valid_loss = 0.0

        print('Training')
        model.train()
        for i, data in enumerate(pre_train_loader):
            feature = data[0].to(device)
            label = data[1].to(device)
               
            optimizer.zero_grad()

            result = model(feature)
            loss = criterion(result, label)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
        print(f'[{epoch + 1}] average loss per epoch: {running_loss / len(pre_train_loader):.8f}')

        print('Validation')
        model.eval()
        with torch.no_grad():
            for i, data in enumerate(pre_valid_loader):
                feature = data[0].to(device)
                label = data[1].to(device)
                result = model(feature)
                loss = criterion(result, label)
                valid_loss += loss.item()
        print(f'Average Validation loss per epoch: {valid_loss / len(pre_valid_loader):.8f}')
        
        if valid_loss <= min_loss:
            min_loss = valid_loss
            torch.save(model.state_dict(), PRETRAIN_PATH)

    print('Finished Training')

In [144]:
pretrain(pretrain_model)

Training
[1] average loss per epoch: 4.10738450
Validation
Average Validation loss per epoch: 1.78243719
Training
[2] average loss per epoch: 1.18345387
Validation
Average Validation loss per epoch: 0.75413523
Training
[3] average loss per epoch: 0.54901719
Validation
Average Validation loss per epoch: 0.38696706
Training
[4] average loss per epoch: 0.30421613
Validation
Average Validation loss per epoch: 0.23807619
Training
[5] average loss per epoch: 0.20502085
Validation
Average Validation loss per epoch: 0.18007276
Training
[6] average loss per epoch: 0.16756095
Validation
Average Validation loss per epoch: 0.16001157
Training
[7] average loss per epoch: 0.15508236
Validation
Average Validation loss per epoch: 0.15416200
Training
[8] average loss per epoch: 0.15152386
Validation
Average Validation loss per epoch: 0.15280847
Training
[9] average loss per epoch: 0.15066928
Validation
Average Validation loss per epoch: 0.15259444
Training
[10] average loss per epoch: 0.15049591
Valida

In [188]:
df_train_features = pd.read_csv('./data/train_features.csv')
df_train_labels = pd.read_csv('./data/train_labels.csv')
df_test_features = pd.read_csv('./data/test_features.csv')

In [189]:
train_features = df_train_features.iloc[:,2:].to_numpy(dtype=np.dtype(np.float32))
train_labels = df_train_labels.iloc[:,1:].to_numpy(dtype=np.dtype(np.float32))
test_id = df_test_features.iloc[:,0:1].to_numpy(dtype='str')
test_features = df_test_features.iloc[:,2:].to_numpy(dtype=np.dtype(np.float32))
print(np.shape(test_features[:]))
print(np.shape(train_features[:]))

(10000, 1000)
(100, 1000)


In [190]:
trian_data = np.hstack((train_features, train_labels))
print(trian_data[:, :-1].shape)

(100, 1000)


In [191]:
train_set, valid_set = train_test_split(trian_data, test_size=0.1, random_state=RANDOM_SEED)

In [192]:
train_dataset = MyDataset(train_set, training=True)
valid_dataset = MyDataset(valid_set, training=True)
test_dataset = MyDataset(test_features, training=False)

In [193]:
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE_TRAIN)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=BATCH_SIZE_TRAIN)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE)

In [194]:
class myLayer(nn.Module):
    def __init__(self):
        super(myLayer, self).__init__()
        self.fc1 = nn.Linear(32, 16)
        self.fc2 = nn.Linear(16, 1)
        # self.drop = nn.Dropout(p=0.5)
        self.activation_fn = nn.Tanh()
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.activation_fn(x)
        # x = self.drop(x)
        x = self.fc2(x)
        return x

pretrain_model.load_state_dict(torch.load(PRETRAIN_PATH))

for param in pretrain_model.parameters():
    param.requires_grad = False

pretrain_model.decoder = myLayer()

model = pretrain_model.to(device)

In [195]:
criterion2 = nn.MSELoss()
# optimizer2 = optim.SGD(pretrain_model.parameters(), lr=0.001, momentum=0.9, weight_decay=1e-5, nesterov=True)
optimizer2 = optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)

In [196]:
def adjust_lr(optimizer, epoch):
    if epoch >= 500:
        for param_group in optimizer.param_groups:
            param_group['lr'] = 0.005
    if epoch >= 800:
        for param_group in optimizer.param_groups:
            param_group['lr'] = 0.001


In [197]:
def train(model):
    min_loss = 100
    for epoch in range(EPOCHS_TRAIN):
        running_loss = 0.0
        valid_loss = 0.0

        print('Training')
        model.train()
        for i, data in enumerate(train_loader):
            feature = data[0].to(device)
            label = data[1].to(device)
            
            # adjust_lr(optimizer2, epoch)
            optimizer2.zero_grad()

            result = model(feature)
            loss = criterion2(result, label)
            loss.backward()
            optimizer2.step()

            running_loss += loss.item()
        print(f'[{epoch + 1}] average loss per epoch: {running_loss / len(train_loader):.8f}')

        print('Validation')
        model.eval()
        with torch.no_grad():
            for i, data in enumerate(valid_loader):
                feature = data[0].to(device)
                label = data[1].to(device)
                result = model(feature)
                loss = criterion2(result, label)
                valid_loss += loss.item()
        print(f'Average Validation loss per epoch: {valid_loss / len(valid_loader):.8f}')
        
        if valid_loss <= min_loss:
            min_loss = valid_loss
            torch.save(model.state_dict(), TRAIN_PATH)

    print('Finished Training')

In [198]:
train(model)

Training
[1] average loss per epoch: 1.00968401
Validation
Average Validation loss per epoch: 0.10504127
Training
[2] average loss per epoch: 0.23625587
Validation
Average Validation loss per epoch: 0.22053717
Training
[3] average loss per epoch: 0.21190385
Validation
Average Validation loss per epoch: 0.10263451
Training
[4] average loss per epoch: 0.11788520
Validation
Average Validation loss per epoch: 0.16368912
Training
[5] average loss per epoch: 0.16121083
Validation
Average Validation loss per epoch: 0.15336441
Training
[6] average loss per epoch: 0.13508664
Validation
Average Validation loss per epoch: 0.09889194
Training
[7] average loss per epoch: 0.12262123
Validation
Average Validation loss per epoch: 0.09876897
Training
[8] average loss per epoch: 0.11371921
Validation
Average Validation loss per epoch: 0.09488750
Training
[9] average loss per epoch: 0.10013146
Validation
Average Validation loss per epoch: 0.10562985
Training
[10] average loss per epoch: 0.10150630
Valida

In [199]:
model.load_state_dict(torch.load(TRAIN_PATH))
model.to(device)

AutoEncoder(
  (encoder): Sequential(
    (0): Linear(in_features=1000, out_features=512, bias=True)
    (1): Tanh()
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): Tanh()
    (4): Linear(in_features=256, out_features=64, bias=True)
    (5): Tanh()
    (6): Linear(in_features=64, out_features=32, bias=True)
    (7): Tanh()
  )
  (decoder): myLayer(
    (fc1): Linear(in_features=32, out_features=16, bias=True)
    (fc2): Linear(in_features=16, out_features=1, bias=True)
    (activation_fn): Tanh()
  )
)

In [200]:
def test(model):
    predictions = []
    model.eval()
    with torch.no_grad():
        for i, data in tqdm(enumerate(test_loader)):
            feature = data.to(device)
            predict = model(feature)
            predict = predict.cpu().numpy()
            predictions.append(predict)
    return predictions

In [201]:
predictions = test(model)
output = []
for i in range(len(predictions)):
    for j in range(len(predictions[i])):
        output.append(predictions[i][j])


output = np.hstack((test_id, output))
df_output = pd.DataFrame(output)
df_output.columns = ['Id', 'y']
df_output.to_csv(SAVE_PATH, index=False)

40it [00:00, 444.49it/s]
