# NBME project : Su Han Cho

In [2]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import datasets
from torchtext.legacy import data

In [3]:
LABEL = data.Field(sequential=False,batch_first=True)
ANNOTATION = data.Field(sequential=True,batch_first=True,lower=True)

In [4]:
field = [('a',ANNOTATION),('l',LABEL)]

In [5]:
train_data= data.TabularDataset.splits(path = '/Users/suhancho/data/kaggle/nbme/',
                                       train = 'train.tsv',
                                       format = 'tsv',
                                       fields = field,
                                       skip_header = True)[0]

In [6]:
ANNOTATION.build_vocab(train_data)
LABEL.build_vocab(train_data)

In [7]:
# 하이퍼파라미터
BATCH_SIZE = 64
lr = 0.005
EPOCHS = 15
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
print("다음 기기로 학습합니다:", DEVICE)


다음 기기로 학습합니다: cpu


In [8]:
train_data, val_data = train_data.split(split_ratio=0.8)
train_iter, val_iter= data.BucketIterator.splits(
        (train_data, val_data), batch_size=BATCH_SIZE,
        shuffle=True, repeat=False,sort=False)
vocab_size = len(ANNOTATION.vocab)
n_classes = 144

# Model building

In [9]:
class BasicGRU(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2):
        super(BasicGRU, self).__init__()
        print("Building Basic GRU model...")
        self.n_layers = n_layers
        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(embed_dim, self.hidden_dim,
                          num_layers=self.n_layers,
                          batch_first=True)
        self.out = nn.Linear(self.hidden_dim, n_classes)
    
    def _init_state(self, batch_size=1):
        weight = next(self.parameters()).data
        return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()
    
    def forward(self, x):
        x = self.embed(x)
        h_0 = self._init_state(batch_size=x.size(0))
        x, _ = self.gru(x, h_0)  # [i, b, h]
        h_t = x[:,-1,:]
        self.dropout(h_t)
        logit = self.out(h_t)  # [b, h] -> [b, o]
        return logit


In [10]:
def train(model, optimizer, train_iter):
    model.train()
    for b, batch in enumerate(train_iter):
        x, y = batch.a.to(DEVICE), batch.l.to(DEVICE)
        optimizer.zero_grad()
        logit = model(x)
        loss = F.cross_entropy(logit, y)
        loss.backward()
        optimizer.step()

In [11]:
def evaluate(model, val_iter):
    """evaluate model"""
    model.eval()
    corrects, total_loss = 0, 0
    for batch in val_iter:
        x, y = batch.a.to(DEVICE), batch.l.to(DEVICE)
        logit = model(x)
        loss = F.cross_entropy(logit, y, reduction='sum')
        total_loss += loss.item()
        corrects += (logit.max(1)[1].view(y.size()).data == y.data).sum()
    size = len(val_iter.dataset)
    avg_loss = total_loss / size
    avg_accuracy = 100.0 * corrects / size
    return avg_loss, avg_accuracy

In [12]:

# n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_p=0.2)
model = BasicGRU(1, 128, 70000, 256, n_classes, 0.2).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

Building Basic GRU model...


In [13]:
best_val_loss = None
for e in range(1, EPOCHS+1):
    train(model, optimizer, train_iter)
    val_loss, val_accuracy = evaluate(model, val_iter)

    print("[Epoch: %d] Validation Error : %5.2f | Validation Accuracy : %5.2f" % (e, val_loss, val_accuracy))
    
    # 검증 오차가 가장 적은 최적의 모델을 저장
    if not best_val_loss or val_loss < best_val_loss:
        if not os.path.isdir("snapshot"):
            os.makedirs("snapshot")
        torch.save(model.state_dict(), './snapshot/txtclassification.pt')
        best_val_loss = val_loss

[Epoch: 1] Validation Error :  0.99 | Validation Accuracy : 72.53
[Epoch: 2] Validation Error :  0.71 | Validation Accuracy : 77.37
[Epoch: 3] Validation Error :  0.68 | Validation Accuracy : 77.68
[Epoch: 4] Validation Error :  0.67 | Validation Accuracy : 77.83
[Epoch: 5] Validation Error :  0.66 | Validation Accuracy : 77.78
[Epoch: 6] Validation Error :  0.66 | Validation Accuracy : 79.34
[Epoch: 7] Validation Error :  0.67 | Validation Accuracy : 78.18
[Epoch: 8] Validation Error :  0.67 | Validation Accuracy : 78.69
[Epoch: 9] Validation Error :  0.66 | Validation Accuracy : 79.65
[Epoch: 10] Validation Error :  0.67 | Validation Accuracy : 79.14
[Epoch: 11] Validation Error :  0.67 | Validation Accuracy : 79.49
[Epoch: 12] Validation Error :  0.69 | Validation Accuracy : 77.88
[Epoch: 13] Validation Error :  0.70 | Validation Accuracy : 78.48
[Epoch: 14] Validation Error :  0.70 | Validation Accuracy : 78.03
[Epoch: 15] Validation Error :  0.69 | Validation Accuracy : 78.08


In [14]:
pn_history = data.Field(sequential=True,batch_first=True,lower=True)
field_test = [('a',pn_history)]

# Predicting with whole-text data and retrieve top 3 annotation (predicted) labels

In [15]:
test_data= data.TabularDataset.splits(path = '/Users/suhancho/data/kaggle/nbme/',
                                       train = 'notes.tsv',
                                       format = 'tsv',
                                       fields = field_test,
                                       skip_header = True)[0]

In [16]:
pn_history.build_vocab(test_data)

In [49]:
test_iter= data.BucketIterator(
        test_data, batch_size=1,
        shuffle=True, repeat=False,sort=False)

In [50]:
prediction_df=[]
for batch in test_iter:
    x= batch.a.to(DEVICE)
    logit = model(x)
    prediction_df.append([batch.a,logit.max(1)[1][0:3]])

KeyboardInterrupt: 

In [60]:
logit.max(1)

torch.return_types.max(
values=tensor([8.3162], grad_fn=<MaxBackward0>),
indices=tensor([89]))

In [51]:
import pandas as pd

In [52]:
prediction_df = pd.DataFrame(prediction_df)

In [53]:
prediction_df[0][0]

tensor([[  393,   325,    12,     8,    84,    41,   164,   175,     4,  5946,
           186,    11,    10,  1364, 19491,    25,    91,    27,    89,   594,
             9,    55,   723,    10,   371,    70,   309,    13,   456,    83,
           109,  3246,    14,  4149,   382,    12,   424,    25,    74,    21,
           450,    27,    38,   760,   394,    49,   764,     7,    11,    12,
            10,     7,  1388,   169,   488,     2,    12,   723,    10,   798,
            25,   223,    38,    26,     8,  1779,   714,  2303,    90,    25,
             9,   172,  1091,   143,   101,   411, 48577,    25,   931,     7,
            11,    13,   120,     8,   167,    25,    87,     8,   687,  2872,
            73,   456,    23,     2,   600,    27,    89,     9,  1355,   310,
           264,   871,    25,     9,     8,   211,     6,  1418,   217,    10,
           109,   609,  2378,    25,   179,    15,   358,    20,    76,     8,
            47,    22,   728,   505,    33,    83,  

In [43]:
prediction_df[1][0]

tensor([109,  91,  94])

In [168]:
model.load_state_dict(torch.load('./snapshot/txtclassification.pt'))
test_loss, test_acc = evaluate(model, test_iter)
print('테스트 오차: %5.2f | 테스트 정확도: %5.2f' % (test_loss, test_acc))

AttributeError: 'Batch' object has no attribute 'l'