In [4]:
import torch
from torch import nn
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Train model trên máy local với bộ dữ liệu train:
##    Chia tập dữ liệu train thành 2 tập:
        train_local: 17000 dòng dữ liệu
        test_local: hơn 4000 dòng
--> Đánh giá hiệu quả kiến trúc model và các hyperparameter.

In [5]:
# Đọc dữ liệu local
x_train_local = pd.read_csv("local/train/data_local_train.csv")
y_train_local = pd.read_csv("local/train/target_local_train.csv")

x_test_local = pd.read_csv("local/test/data_local_test.csv")
y_test_local = pd.read_csv("local/test/target_local_test.csv")

In [6]:
# Tách cột id user ở tập train và test
id_user_train = x_train_local["id"]
id_user_test = x_test_local["id"]

x_train_local = x_train_local.drop(["id"], axis = 1)
x_test_local = x_test_local.drop(["id"], axis = 1)

# Transform thanh torch tensor
transform_x_train_local = torch.tensor(x_train_local.values, dtype = torch.float)
transform_x_test_local = torch.tensor(x_test_local.values, dtype = torch.float)

transform_y_train_local = torch.tensor(y_train_local.values, dtype = torch.float)
transform_y_test_local = torch.tensor(y_test_local.values, dtype = torch.float)

In [12]:
# Xây dựng model
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Using {} device.".format(device))

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.linear_chain = nn.Sequential(
        nn.Linear(138, 256),
        nn.ReLU(),
        nn.LayerNorm(256),
        nn.Linear(256, 128),
        nn.ReLU(),
        nn.LayerNorm(128),
        nn.Linear(128, 12),
        nn.ReLU()
        )
    def forward(self, x):
        result = self.linear_chain(x)
        return result
    
model = NeuralNetwork().to(device)
print(model)

Using cpu device.
NeuralNetwork(
  (linear_chain): Sequential(
    (0): Linear(in_features=138, out_features=256, bias=True)
    (1): ReLU()
    (2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (3): Linear(in_features=256, out_features=128, bias=True)
    (4): ReLU()
    (5): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (6): Linear(in_features=128, out_features=12, bias=True)
    (7): ReLU()
  )
)


In [13]:
loss_fn = nn.CrossEntropyLoss()

optimizer = torch.optim.SGD(model.parameters(), lr = 1e-3)
batch_size = 64

def train(x_train, y_train, model, loss_fn, optimizer):
    size = x_train.shape[0]
    
    for t in range(size // batch_size):
        X = x_train[t*batch_size: min((t+1) * batch_size, size)]
        y = y_train[t*batch_size: min((t+1) * batch_size, size)]
        y = torch.argsort(y, dim = 1, descending=True)[:, 0]
        
        pred = model(X)
        loss = loss_fn(pred, y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if t% 100 == 0:
            current = min((t+1) * batch_size, size)
            loss = loss.item()
            print(f"loss: {loss:>7f}, [current: {current} / {size}]")

In [14]:
y_true = torch.argsort(transform_y_test_local, dim = 1, descending = True)[:, 0]
history_accuracy = [0]
t = 0

while True:
    print(f"Epochs {t+1}\n............................................")
    t+=1
    train(transform_x_train_local, transform_y_train_local, model, loss_fn, optimizer)
    pred = model(transform_x_test_local)
    pred = torch.argsort(pred, dim = 1, descending = True)[:, 0]
    accuracy = (pred == y_true).sum() / y_true.shape[0]
    history_accuracy.append(accuracy.item())
    print(f"Accuracy on test set: {accuracy.item():>7f}")
    
    if(history_accuracy[-1] - history_accuracy[-2] < 0.0001 or t > 100):
        break
print("Done!")

Epochs 1
............................................
loss: 2.595920, [current: 64 / 170000]
loss: 1.694276, [current: 6464 / 170000]
loss: 1.865755, [current: 12864 / 170000]
loss: 1.815368, [current: 19264 / 170000]
loss: 1.914757, [current: 25664 / 170000]
loss: 1.860120, [current: 32064 / 170000]
loss: 1.572561, [current: 38464 / 170000]
loss: 1.693285, [current: 44864 / 170000]
loss: 1.998703, [current: 51264 / 170000]
loss: 1.821946, [current: 57664 / 170000]
loss: 1.735141, [current: 64064 / 170000]
loss: 1.605835, [current: 70464 / 170000]
loss: 1.394637, [current: 76864 / 170000]
loss: 1.735441, [current: 83264 / 170000]
loss: 1.692627, [current: 89664 / 170000]
loss: 1.608309, [current: 96064 / 170000]
loss: 1.866316, [current: 102464 / 170000]
loss: 1.608755, [current: 108864 / 170000]
loss: 1.692507, [current: 115264 / 170000]
loss: 1.776703, [current: 121664 / 170000]
loss: 1.611760, [current: 128064 / 170000]
loss: 1.945163, [current: 134464 / 170000]
loss: 1.476694, [cur

In [15]:
# Đọc data test
x_test = pd.read_csv("for_kaggle/not_normalize/test/x_test_not_normalize.csv")

id_test_user = list(x_test["id"])

x_test = x_test.drop(["id"], axis = 1)

x_test = torch.tensor(x_test.values, dtype = torch.float)

In [16]:
y_pred = model(x_test)
y_pred = torch.argsort(y_pred, dim = 1, descending = True)[:, :5].tolist()

In [17]:
label = ['country_AU',
 'country_CA',
 'country_DE',
 'country_ES',
 'country_FR',
 'country_GB',
 'country_IT',
 'country_NDF',
 'country_NL',
 'country_PT',
 'country_US',
 'country_other']

ids = []
result = []

for i in range(len(y_pred)):
    ids += [id_test_user[i]]*5
    for item in y_pred[i]:
        result.append(label[item][8:])

In [18]:
result_ = {"id": ids, "country": result}

submission = pd.DataFrame(result_)

submission.to_csv("submission.csv", index=False)

In [19]:
for item in history_accuracy:
    print("Accuracy: " + str(item) + "\n")

Accuracy: 0

Accuracy: 0.5875813961029053

Accuracy: 0.5875813961029053

