In [1]:
import torch
from torch import nn
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Train model trên máy local với bộ dữ liệu train:
##    Chia tập dữ liệu train thành 2 tập:
        train_local: 17000 dòng dữ liệu
        test_local: hơn 4000 dòng
--> Đánh giá hiệu quả kiến trúc model và các hyperparameter.

In [2]:
# Đọc dữ liệu local
x_train_local = pd.read_csv("local/train/data_local_train.csv")
y_train_local = pd.read_csv("local/train/target_local_train.csv")

x_test_local = pd.read_csv("local/test/data_local_test.csv")
y_test_local = pd.read_csv("local/test/target_local_test.csv")

In [3]:
# Tách cột id user ở tập train và test
id_user_train = x_train_local["id"]
id_user_test = x_test_local["id"]

x_train_local = x_train_local.drop(["id"], axis = 1)
x_test_local = x_test_local.drop(["id"], axis = 1)

# Transform thanh torch tensor
transform_x_train_local = torch.tensor(x_train_local.values, dtype = torch.float)
transform_x_test_local = torch.tensor(x_test_local.values, dtype = torch.float)

transform_y_train_local = torch.tensor(y_train_local.values, dtype = torch.float)
transform_y_test_local = torch.tensor(y_test_local.values, dtype = torch.float)

In [4]:
weights = np.array(y_test_local)
weights = weights.sum(axis = 0)
weights = weights.sum() / weights
weights = torch.tensor(weights / max(weights), dtype=torch.float)
weights

tensor([0.3684, 0.1489, 0.2019, 0.0888, 0.0401, 0.0832, 0.0668, 0.0016, 0.2781,
        1.0000, 0.0034, 0.0206])

In [5]:
# Xây dựng model
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Using {} device.".format(device))

class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.linear_chain = nn.Sequential(
        nn.Linear(138, 256),
        nn.ReLU(),
        nn.LayerNorm(256),
        nn.Linear(256, 128),
        nn.ReLU(),
        nn.LayerNorm(128),
        nn.Linear(128, 12),
        nn.ReLU()
        )
    def forward(self, x):
        result = self.linear_chain(x)
        return result
    
model = NeuralNetwork().to(device)
print(model)

Using cpu device.
NeuralNetwork(
  (linear_chain): Sequential(
    (0): Linear(in_features=138, out_features=256, bias=True)
    (1): ReLU()
    (2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (3): Linear(in_features=256, out_features=128, bias=True)
    (4): ReLU()
    (5): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (6): Linear(in_features=128, out_features=12, bias=True)
    (7): ReLU()
  )
)


In [6]:
loss_fn = nn.CrossEntropyLoss(weight = weights)

optimizer = torch.optim.SGD(model.parameters(), lr = 1e-3)
batch_size = 64

def train(x_train, y_train, model, loss_fn, optimizer):
    size = x_train.shape[0]
    
    for t in range(size // batch_size):
        X = x_train[t*batch_size: min((t+1) * batch_size, size)]
        y = y_train[t*batch_size: min((t+1) * batch_size, size)]
        y = torch.argsort(y, dim = 1, descending=True)[:, 0]
        
        pred = model(X)
        loss = loss_fn(pred, y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if t% 100 == 0:
            current = min((t+1) * batch_size, size)
            loss = loss.item()
            print(f"loss: {loss:>7f}, [current: {current} / {size}]")

In [7]:
y_true = torch.argsort(transform_y_test_local, dim = 1, descending = True)[:, 0]
history_accuracy = [0]
t = 0

while True:
    print(f"Epochs {t+1}\n............................................")
    t+=1
    train(transform_x_train_local, transform_y_train_local, model, loss_fn, optimizer)
    pred = model(transform_x_test_local)
    pred = torch.argsort(pred, dim = 1, descending = True)[:, 0]
    accuracy = (pred == y_true).sum() / y_true.shape[0]
    history_accuracy.append(accuracy.item())
    print(f"Accuracy on test set: {accuracy.item():>7f}")
    
    if(history_accuracy[-1] - history_accuracy[-2] < 0.0001 or t > 100):
        break
print("Done!")

Epochs 1
............................................
loss: 2.570660, [current: 64 / 170000]
loss: 2.459688, [current: 6464 / 170000]
loss: 2.462295, [current: 12864 / 170000]
loss: 2.397453, [current: 19264 / 170000]
loss: 2.494218, [current: 25664 / 170000]
loss: 2.486502, [current: 32064 / 170000]
loss: 2.491349, [current: 38464 / 170000]
loss: 2.484542, [current: 44864 / 170000]
loss: 2.512975, [current: 51264 / 170000]
loss: 2.496047, [current: 57664 / 170000]
loss: 2.442175, [current: 64064 / 170000]
loss: 2.488690, [current: 70464 / 170000]
loss: 2.498415, [current: 76864 / 170000]
loss: 2.479093, [current: 83264 / 170000]
loss: 2.468226, [current: 89664 / 170000]
loss: 2.496162, [current: 96064 / 170000]
loss: 2.472875, [current: 102464 / 170000]
loss: 2.487516, [current: 108864 / 170000]
loss: 2.451059, [current: 115264 / 170000]
loss: 2.488273, [current: 121664 / 170000]
loss: 2.482559, [current: 128064 / 170000]
loss: 2.474267, [current: 134464 / 170000]
loss: 2.502118, [cur

In [8]:
# Đọc data test
x_test = pd.read_csv("for_kaggle/not_normalize/test/x_test_not_normalize.csv")

id_test_user = list(x_test["id"])

x_test = x_test.drop(["id"], axis = 1)

x_test = torch.tensor(x_test.values, dtype = torch.float)

In [9]:
y_pred = model(x_test)
y_pred = torch.argsort(y_pred, dim = 1, descending = True)[:, :5].tolist()

In [10]:
label = ['country_AU',
 'country_CA',
 'country_DE',
 'country_ES',
 'country_FR',
 'country_GB',
 'country_IT',
 'country_NDF',
 'country_NL',
 'country_PT',
 'country_US',
 'country_other']

ids = []
result = []

for i in range(len(y_pred)):
    ids += [id_test_user[i]]*5
    for item in y_pred[i]:
        result.append(label[item][8:])

In [11]:
result_ = {"id": ids, "country": result}

submission = pd.DataFrame(result_)

submission.to_csv("submission.csv", index=False)

In [12]:
for item in history_accuracy:
    print("Accuracy: " + str(item) + "\n")

Accuracy: 0

Accuracy: 0.2861154079437256

Accuracy: 0.2861154079437256

