In [5]:
# !kaggle competitions download -c spaceship-titanic
# !unzip spaceship-titanic.zip -d dataset

spaceship-titanic.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  spaceship-titanic.zip
replace dataset/sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader, BatchSampler, random_split, WeightedRandomSampler, Subset
import pandas as pd
import os
import re
from collections import Counter
from cosine_annealing_warmup import CosineAnnealingWarmupRestarts
import pickle
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
import gc
from tqdm import tqdm

In [10]:
# device = torch.device("mps")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


In [3]:
current_path = os.getcwd()
train_path = os.path.join(current_path, "./dataset/train.csv")
test_path = os.path.join(current_path, "./dataset/test.csv")
submission_path = os.path.join(current_path, "./dataset/sample_submission.csv")

In [4]:
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
submission_df = pd.read_csv(submission_path)
len(train_df)

8693

In [38]:
from enum import Enum


class Filter_type(Enum):
    HIGH = 0
    MID = 1
    LOW = 2


class Dataset_type(Enum):
    TRAIN = 0
    TEST = 1


class Space_dataset(Dataset):
    def __init__(self, df_path, dataset_type: Dataset_type, mode: Filter_type, nor_mean=0, nor_std=1):
        self.df_path = df_path
        self.df = pd.read_csv(df_path)
        self.nor_mean = nor_mean
        self.nor_std = nor_std
        self.dataset_type = dataset_type
        self.mode = mode
        self.input = self._transform_df2input(
            self.df, self.mode, self.nor_mean, self.nor_std)
        self.trg_mean, self.trg_std = self._trg_mean_std(self.input[1])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        src = self.input[0][idx] if self.dataset_type == Dataset_type.TRAIN else self.input[idx]
        if self.dataset_type == Dataset_type.TRAIN:
            trg = self.input[1][idx]
            return src, trg
        else:
            return src

    def _trg_mean_std(self, trg_list):
        mean = np.nanmean(trg_list, axis=0)
        std = np.nanstd(trg_list, axis=0)
        return mean, std

    def _transform_df2input(self, df, mode: Filter_type, dataset_type: Dataset_type, nor_mean=0, nor_std=1):
        if mode == Filter_type.HIGH:
            feature_list = ['Age', 'RoomService',
                            'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
        elif mode == Filter_type.MID:
            feature_list = ['OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'TotalBsmtSF',
                            '1stFlrSF', 'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'Fireplaces',
                            'GarageYrBlt', 'GarageCars']

        elif mode == Filter_type.LOW:
            feature_list = ['OverallQual',
                            'TotalBsmtSF', 'GrLivArea', 'GarageCars']

        df_selected = df[feature_list]
        if self.dataset_type == Dataset_type.TRAIN:
            df_label = df["Transported"]
            numpy_label = df_label.values
        numpy_data = df_selected.values

        mean = np.nanmean(numpy_data, axis=0)
        std = np.nanstd(numpy_data, axis=0)

        # 정규화 수행
        normalized_array = (((numpy_data - mean) / (std) * nor_std)) + nor_mean
        normalized_array = np.nan_to_num(normalized_array)
        if self.dataset_type == Dataset_type.TRAIN:
            return (normalized_array, numpy_label)
        return normalized_array


In [39]:
train_set = Space_dataset(
    df_path=train_path, dataset_type=Dataset_type.TRAIN, mode=Filter_type.HIGH, nor_mean=0.5)

In [40]:
class Linear(nn.Module):
    def __init__(self, input_dim):
        super(Linear, self).__init__()
        self.input_dim = input_dim
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)

    def forward(self, x):
        x = nn.ReLU()(self.fc1(x))
        x = nn.ReLU()(self.fc2(x))
        x = nn.ReLU()(self.fc3(x))
        x = self.fc4(x)
        x = torch.sigmoid(x)
        return x

In [45]:
import sys
input_dim = len(train_set[0][0])
print(input_dim)
model = Linear(input_dim=input_dim)


def train(dataset, num_of_epoch, valid_rate, batch_size,
          optimizer, scheduler, criterion, valid_random_seed, patience=3):
    torch.manual_seed(valid_random_seed)

    optimizer = optimizer
    criterion = criterion

    train_set_count = int(len(dataset) * valid_rate)
    val_set_count = len(dataset) - train_set_count

    train_set, val_set = random_split(
        dataset, [train_set_count, val_set_count])

    train_loader = DataLoader(train_set, batch_size=batch_size,
                              shuffle=True)
    val_loader = DataLoader(val_set, batch_size=batch_size,
                            shuffle=True)
    scheduler = scheduler

    train_losses = []
    train_accs = []
    val_losses = []
    val_accs = []
    lrs = []
    best_val_acc = 0
    current_patience = patience
    is_early_stopping = False

    for epoch in range(num_of_epoch):
        model.train()
        correct = 0
        for batch_idx, (data, target) in enumerate(train_loader):
            data = data.to(device)
            optimizer.zero_grad()
            # print(data.shape)
            # print(data.dtype)
            # print(target.shape)
            data = data.type(torch.float32)
            output = model(data).float()
            output = output.squeeze(1)
            target = target.float()
            # print(output.shape)
            # print(target.shape)
            loss = criterion(output, target).float()
            loss.backward()
            optimizer.step()
            lrs.append(optimizer.param_groups[0]["lr"])
            scheduler.step()
            pred = torch.tensor(
                [1 if output_data >= 0.5 else 0 for output_data in output]).to(device)
            correct += pred.eq(target.view_as(pred)).sum().item()

            if batch_idx % 100 == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}, lrs: {}'.format(
                    epoch + 1, batch_idx *
                    len(data[0]), len(train_loader.dataset),
                    100. * batch_idx / len(train_loader), loss.item(), lrs[-1]))

        train_accuracy = 100. * (correct / len(train_set))
        train_losses.append(loss.item())
        train_accs.append(train_accuracy)
        print(
            f"Epoch: {epoch + 1} - train Accs: {train_accuracy}")

        gc.collect()

        model.eval()
        preds = []
        targets = []
        correct = 0

        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(val_loader):
                data = data.to(device)
                data = data.type(torch.float32)
                output = model(data).float()
                output = output.squeeze(1)
                target = target.float()
                loss = criterion(output, target).float()
                pred = torch.tensor(
                    [1 if output_data >= 0.5 else 0 for output_data in output]).to(device)
                correct += pred.eq(target.view_as(pred)).sum().item()
            val_accuracy = 100. * (correct / len(val_set))
            val_losses.append(loss.item())
            val_accs.append(val_accuracy)

            if val_accuracy > best_val_acc:
                current_patience = patience
                best_val_acc = val_accuracy
                best_model = model.state_dict()
                torch.save(best_model, 'model.pt')
            else:
                current_patience -= 1
                if current_patience < 0:
                    # print("Early Stopping!")
                    is_early_stopping = False

            print('Epoch {} finished: train loss = {}, val loss = {}'.format(epoch + 1,
                                                                             train_losses[-1], val_losses[-1]))
            print(
                f"Epoch: {epoch + 1} - Validation acc: {val_accuracy}")
            print("\n\n\n")

            if is_early_stopping:
                break

6


In [46]:
valid_rate = 0.9
batch_size = 64
learning_rate = 0.0005
num_of_epoch = 100
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = CosineAnnealingWarmupRestarts(
    optimizer, first_cycle_steps=4000, cycle_mult=1.0, max_lr=0.001, min_lr=0.0001, warmup_steps=500, gamma=1)

criterion = nn.BCELoss()


valid_random_seed = 5
train(dataset=train_set, valid_rate=valid_rate, num_of_epoch=num_of_epoch, batch_size=64, optimizer=optimizer,
      scheduler=scheduler, criterion=criterion, valid_random_seed=valid_random_seed)


Epoch: 1 - train Accs: 65.52473475648728
Epoch 1 finished: train loss = 0.6390478610992432, val loss = 0.6112125515937805
Epoch: 1 - Validation acc: 72.8735632183908




Epoch: 2 - train Accs: 75.53368273041032
Epoch 2 finished: train loss = 0.4052242934703827, val loss = 0.5405288338661194
Epoch: 2 - Validation acc: 76.32183908045977




Epoch: 3 - train Accs: 77.11875239677873
Epoch 3 finished: train loss = 0.5570458769798279, val loss = 0.5032473206520081
Epoch: 3 - Validation acc: 75.97701149425288




Epoch: 4 - train Accs: 77.98798414930333
Epoch 4 finished: train loss = 0.6124874353408813, val loss = 0.4821784496307373
Epoch: 4 - Validation acc: 75.86206896551724




Epoch: 5 - train Accs: 78.48651412501599
Epoch 5 finished: train loss = 0.5227768421173096, val loss = 0.4735974371433258
Epoch: 5 - Validation acc: 77.01149425287356




Epoch: 6 - train Accs: 78.88278154160808
Epoch 6 finished: train loss = 0.4070718586444855, val loss = 0.6319120526313782
Epoch: 6 - Validation ac

In [55]:
def test(dataset, batch_size):

    test_loader = DataLoader(dataset, batch_size=batch_size,
                             shuffle=False)
    model.eval()
    pred_list = []

    with torch.no_grad():
        for batch_idx, data in enumerate(test_loader):
            data = data.to(device)
            data = data.type(torch.float32)
            output = model(data).float()
            pred = torch.tensor(
                [True if output_data >= 0.5 else False for output_data in output]).to(device)
            pred_list.extend(pred)
    return pred_list


In [56]:
test_set = Space_dataset(
    df_path=test_path, dataset_type=Dataset_type.TEST, mode=Filter_type.HIGH, nor_mean=0.5)

result = test(dataset=test_set, batch_size=batch_size)
result

[tensor(True),
 tensor(False),
 tensor(True),
 tensor(True),
 tensor(True),
 tensor(True),
 tensor(True),
 tensor(True),
 tensor(True),
 tensor(True),
 tensor(False),
 tensor(False),
 tensor(True),
 tensor(True),
 tensor(False),
 tensor(False),
 tensor(False),
 tensor(True),
 tensor(True),
 tensor(False),
 tensor(True),
 tensor(False),
 tensor(True),
 tensor(True),
 tensor(True),
 tensor(False),
 tensor(True),
 tensor(True),
 tensor(True),
 tensor(True),
 tensor(True),
 tensor(False),
 tensor(True),
 tensor(True),
 tensor(False),
 tensor(False),
 tensor(True),
 tensor(True),
 tensor(True),
 tensor(False),
 tensor(True),
 tensor(False),
 tensor(True),
 tensor(True),
 tensor(False),
 tensor(True),
 tensor(False),
 tensor(False),
 tensor(False),
 tensor(False),
 tensor(True),
 tensor(True),
 tensor(False),
 tensor(True),
 tensor(True),
 tensor(True),
 tensor(False),
 tensor(True),
 tensor(True),
 tensor(False),
 tensor(True),
 tensor(True),
 tensor(False),
 tensor(True),
 tensor(True),
 t

In [54]:
submission_df.head(5)

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False


In [57]:
submission_df["Transported"] = [element.item() for element in result]
submission_df.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True


In [58]:
submission_df.to_csv("submission_result.csv", index=False)