In [1]:
from argparse import Namespace
from collections import Counter
import json
import os
import string

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import tqdm

In [2]:
class Vocabulary():
  def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
    if token_to_idx is None:
      token_to_idx = {}
    self._token_to_idx = token_to_idx

    self._idx_to_token = {idx: token
                          for token, idx in self._token_to_idx.items()}
    self._add_unk = add_unk
    self._unk_token = unk_token

    self.unk_index = -1
    if add_unk:
      self.unk_index = self.add_token(unk_token)

  def to_serializable(self):
    return {
        "token_to_idx": self._token_to_idx,
        "add_unk": self._add_unk,
        "unk_token": self._unk_token
    }

  @classmethod
  def from_serializable(cls, contents):
    return cls(**contents)

  def add_token(self, token):
    try:
      index = self._token_to_idx[token]
    except KeyError:
      index = len(self._token_to_idx)
      self._token_to_idx[token] = index
      self._idx_to_token[index] = token
    return index

  def add_many(self, tokens):
    return [self.add_token(token) for token in tokens]

  def lookup_token(self, token):
    if self.unk_index >= 0: #unk token을 사용하는 경우
      return self._token_to_idx.get(token, self.unk_index)
    else:
      return self._token_to_idx[token]

  def lookup_index(self, index):
    if index not in self._idx_to_token:
      raise KeyError("Vocabulary에 인덱스 %d가 없습니다." % index)
    return self._idx_to_token[index]

  def __str__(self):
    return "<Vocabulary(size=%d)>" % len(self)

  def __len__(self):
    return len(self._token_to_idx)

In [3]:
class SurnameVectorizer():
  def __init__(self, surname_vocab, nationality_vocab):
    self.surname_vocab = surname_vocab #문자 -> 정수 mapping - Vocabulary 객체 저장
    self.nationality_vocab = nationality_vocab #국적 -> 정수 mapping

  def vectorize(self, surname):
    # 성씨 -> 원핫 벡터로 만듬.
    vocab = self.surname_vocab
    one_hot = np.zeros(len(vocab), dtype=np.float32)
    for token in surname:
      #원핫 벡터의 타입 -> numpy
      one_hot[vocab.lookup_token(token)] = 1
    return one_hot

  @classmethod
  def from_dataframe(cls, surname_df):
    surname_vocab = Vocabulary(unk_token="@")
    nationality_vocab = Vocabulary(add_unk=False)

    for index, row in surname_df.iterrows():
      for letter in row.surname:
        #스펠링 하나마다 파이프에 집어넣음. 위치정보는 무시됨.
        surname_vocab.add_token(letter)
      nationality_vocab.add_token(row.nationality)
    return cls(surname_vocab, nationality_vocab)

  @classmethod
  def from_serializable(cls, contents):
    surname_vocab = Vocabulary.from_serializable(contents["surname_vocab"])
    nationality_vocab = Vocabulary.from_serializable(contents["nationality_vocab"])
    return cls(surname_vocab=surname_vocab, nationality_vocab=nationality_vocab)

  def to_serializable(self):
    return {'surname_vocab': self.surname_vocab.to_serializable(),
      'nationality_vocab': self.nationality_vocab.to_serializable()}

In [4]:
from types import ClassMethodDescriptorType
class SurnameDataset(Dataset): #torch.utils.data.Dataset을 상속함.
  def __init__(self, surname_df, vectorizer):
    self.surname_df = surname_df
    self._vectorizer = vectorizer

    self.train_df = self.surname_df[self.surname_df.split == "train"]
    self.train_size = len(self.train_df)

    self.val_df = self.surname_df[self.surname_df.split == "val"]
    self.validation_size = len(self.val_df)

    self.test_df = self.surname_df[self.surname_df.split == "test"]
    self.test_size = len(self.test_df)

    self._lookup_dict = {
        "train": (self.train_df, self.train_size),
        "val": (self.val_df, self.validation_size),
        "test": (self.test_df, self.test_size)
    }
    self.set_split("train")

    class_counts = surname_df.nationality.value_counts().to_dict()
    def sort_key(item):
      return self._vectorizer.nationality_vocab.lookup_token(item[0])
    sorted_counts = sorted(class_counts.items(), key=sort_key)
    frequencies = [count for _, count in sorted_counts]
    self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)

  @classmethod
  def load_dataset_and_make_vectorizer(cls, surname_csv):
    surname_df = pd.read_csv(surname_csv)
    train_surname_df = surname_df[surname_df.split=="train"]
    print(surname_df.head())
    return cls(surname_df, SurnameVectorizer.from_dataframe(train_surname_df))

  @classmethod
  def load_dataset_and_load_vectorizer(cls, surname_csv, vectorizer_filepath):
    surname_df = pd.read_csv(surname_csv)
    vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
    return cls(surname_df, vectorizer)

  @staticmethod
  def load_vectorizer_only(vectorizer_filepath):
    with open(vectorizer_filepath) as fp:
      return SurnameVectorizer.from_serializable(json.load(fp))

  def save_vectorizer(self, vectorizer_filepath):
    with open(vectorizer_filepath, "w") as fp:
      json.dump(self._vectorizer.to_serializable(), fp)

  def get_vectorizer(self):
    return self._vectorizer

  def set_split(self, split="train"):
    self._target_split = split
    self._target_df, self._target_size = self._lookup_dict[split]

  def __len__(self):
    return self._target_size

  def __getitem__(self, index):

    row = self._target_df.iloc[index]

    surname_vector = \
      self._vectorizer.vectorize(row.surname)

    nationality_index = \
      self._vectorizer.nationality_vocab.lookup_token(row.nationality)

    return {
        "x_surname": surname_vector,
        "y_nationality": nationality_index
    }

  def get_num_batches(self, batch_size):
    return len(self) // batch_size

In [5]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [6]:
class SurnameClassifier(nn.Module):
  def __init__(self, input_dim, hidden_dim, output_dim):
    super(SurnameClassifier, self).__init__()
    self.fc1 = nn.Linear(input_dim, hidden_dim)
    self.fc2 = nn.Linear(hidden_dim, output_dim)

  def forward(self, x_in, apply_softmax=False):
    intermediate_vector = F.relu(self.fc1(x_in))
    #dropout 추가
    prediction_vector = self.fc2(F.dropout(intermediate_vector, p=0.5))

    if apply_softmax:
      prediction_vector = F.softmax(prediction_vector, dim=1)
    return prediction_vector

In [7]:
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        if loss_t >= train_state['early_stopping_best_val']:
            train_state['early_stopping_step'] += 1
        else:
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])
            train_state['early_stopping_step'] = 0

        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [8]:
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

In [9]:
base_path = "/content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter4_FFNN_for_NLP/data/"
args = Namespace(
    # 날짜와 경로 정보
    surname_csv= base_path + "surnames/surnames_with_splits.csv",
    vectorizer_file= "vectorizer.json",
    model_state_file= "model.pth",
    save_dir= base_path + "model_storage/ch4/surname_mlp/with_drop_out",
    # 모델 하이퍼파라미터
    hidden_dim=300,
    # 훈련 하이퍼파라미터
    seed=1337,
    num_epochs=100,
    early_stopping_criteria=5,
    learning_rate=0.001,
    batch_size=64,
    # 실행 옵션
    cuda=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True,
)

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)

    print("파일 경로: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))

# CUDA 체크
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")

print("CUDA 사용여부: {}".format(args.cuda))

# 재현성을 위해 시드 설정
set_seed_everywhere(args.seed, args.cuda)

# 디렉토리 처리
handle_dirs(args.save_dir)

파일 경로: 
	/content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter4_FFNN_for_NLP/data/model_storage/ch4/surname_mlp/with_drop_out/vectorizer.json
	/content/drive/MyDrive/Github_NLP/NLP_with_PyTorch/Chapter4_FFNN_for_NLP/data/model_storage/ch4/surname_mlp/with_drop_out/model.pth
CUDA 사용여부: True


In [10]:
if args.reload_from_files:
    # 체크포인트에서 훈련을 다시 시작
    print("로딩!")
    dataset = SurnameDataset.load_dataset_and_load_vectorizer(args.surname_csv,
                                                              args.vectorizer_file)
else:
    # 데이터셋과 Vectorizer 만들기
    print("새로 만들기!")
    dataset = SurnameDataset.load_dataset_and_make_vectorizer(args.surname_csv)
    dataset.save_vectorizer(args.vectorizer_file)

vectorizer = dataset.get_vectorizer()
classifier = SurnameClassifier(input_dim=len(vectorizer.surname_vocab),
                               hidden_dim=args.hidden_dim,
                               output_dim=len(vectorizer.nationality_vocab))

새로 만들기!
    surname nationality  split
0     Totah      Arabic  train
1    Abboud      Arabic  train
2  Fakhoury      Arabic  train
3     Srour      Arabic  train
4    Sayegh      Arabic  train


In [11]:
classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)


loss_func = nn.CrossEntropyLoss(dataset.class_weights)
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 mode='min', factor=0.5,
                                                 patience=1)

train_state = make_train_state(args)

In [12]:
dataset.train_df.shape

(7680, 3)

In [13]:
epoch_bar = tqdm.notebook.tqdm(desc='training routine',
                               total=args.num_epochs,
                               position=0)

dataset.set_split('train')
train_bar = tqdm.notebook.tqdm(desc='split=train',
                               total=dataset.get_num_batches(args.batch_size),
                               position=1,
                               leave=True)
dataset.set_split('val')
val_bar = tqdm.notebook.tqdm(desc='split=val',
                             total=dataset.get_num_batches(args.batch_size),
                             position=1,
                             leave=True)
import time
now = time.perf_counter()
try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # 훈련 세트에 대한 순회

        # 훈련 세트와 배치 제너레이터 준비, 손실과 정확도를 0으로 설정
        dataset.set_split('train')
        batch_generator = generate_batches(dataset,
                                           batch_size=args.batch_size,
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()
        for batch_index, batch_dict in enumerate(batch_generator):
            # 훈련 과정은 5단계로 이루어집니다

            # --------------------------------------
            # 단계 1. 그레이디언트를 0으로 초기화합니다
            optimizer.zero_grad()

            # 단계 2. 출력을 계산합니다
            y_pred = classifier(batch_dict['x_surname'])

            # 단계 3. 손실을 계산합니다
            loss = loss_func(y_pred, batch_dict['y_nationality'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # 단계 4. 손실을 사용해 그레이디언트를 계산합니다
            loss.backward()

            # 단계 5. 옵티마이저로 가중치를 업데이트합니다
            optimizer.step()
            # -----------------------------------------

            # 정확도를 계산합니다
            acc_t = compute_accuracy(y_pred, batch_dict['y_nationality'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # 진행 바 업데이트
            train_bar.set_postfix(loss=running_loss, acc=running_acc,
                            epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # 검증 세트에 대한 순회

        # 검증 세트와 배치 제너레이터 준비, 손실과 정확도를 0으로 설정
        dataset.set_split('val')
        batch_generator = generate_batches(dataset,
                                           batch_size=args.batch_size,
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):

            # 단계 1. 출력을 계산합니다
            y_pred =  classifier(batch_dict['x_surname'])

            # 단계 2. 손실을 계산합니다
            loss = loss_func(y_pred, batch_dict['y_nationality'])
            loss_t = loss.to("cpu").item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # 단계 3. 정확도를 계산합니다
            acc_t = compute_accuracy(y_pred, batch_dict['y_nationality'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            val_bar.set_postfix(loss=running_loss, acc=running_acc,
                            epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop")

print("걸린시간: ", (time.perf_counter() - now))
#with-dropout version
#epochs = 100돌림
#걸린시간 183.054초
#사용한 GPU: A100GPU

training routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/120 [00:00<?, ?it/s]

split=val:   0%|          | 0/25 [00:00<?, ?it/s]

걸린시간:  187.77355801800002


In [14]:
# 가장 좋은 모델을 사용해 테스트 세트의 손실과 정확도를 계산합니다
classifier.load_state_dict(torch.load(train_state['model_filename']))

classifier = classifier.to(args.device)
dataset.class_weights = dataset.class_weights.to(args.device)
loss_func = nn.CrossEntropyLoss(dataset.class_weights)

dataset.set_split('test')
batch_generator = generate_batches(dataset,
                                   batch_size=args.batch_size,
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # 출력을 계산합니다
    y_pred =  classifier(batch_dict['x_surname'])

    # 손실을 계산합니다
    loss = loss_func(y_pred, batch_dict['y_nationality'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # 정확도를 계산합니다
    acc_t = compute_accuracy(y_pred, batch_dict['y_nationality'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc


In [15]:
print("테스트 손실: {};".format(train_state['test_loss']))
print("테스트 정확도: {}".format(train_state['test_acc']))

테스트 손실: 1.8876551723480226;
테스트 정확도: 42.8125


In [16]:
def predict_nationality(surname, classifier, vectorizer):
    """새로운 성씨로 국적 예측하기

    매개변수:
        surname (str): 분류할 성씨
        classifier (SurnameClassifer): 분류기 객체
        vectorizer (SurnameVectorizer): SurnameVectorizer 객체
    반환값:
        가장 가능성이 높은 국적과 확률로 구성된 딕셔너리
    """
    vectorized_surname = vectorizer.vectorize(surname)
    vectorized_surname = torch.tensor(vectorized_surname).view(1, -1)
    result = classifier(vectorized_surname, apply_softmax=True)

    probability_values, indices = result.max(dim=1)
    index = indices.item()

    predicted_nationality = vectorizer.nationality_vocab.lookup_index(index)
    probability_value = probability_values.item()

    return {'nationality': predicted_nationality, 'probability': probability_value}

In [17]:
for i in range(10):
  new_surname = input("분류하려는 성씨를 입력하세요: ")
  classifier = classifier.to("cpu")
  prediction = predict_nationality(new_surname, classifier, vectorizer)
  print("{} -> {} (p={:0.2f})".format(new_surname,
                                      prediction['nationality'],
                                      prediction['probability']))

#정확도가 그렇게 높지는 않은 것으로 보임.

분류하려는 성씨를 입력하세요: Park
Park -> Czech (p=0.57)
분류하려는 성씨를 입력하세요: Kim
Kim -> Korean (p=0.25)
분류하려는 성씨를 입력하세요: Nagasawa
Nagasawa -> Japanese (p=0.32)
분류하려는 성씨를 입력하세요: Irish
Irish -> Japanese (p=0.88)
분류하려는 성씨를 입력하세요: abcdefg
abcdefg -> German (p=0.48)
분류하려는 성씨를 입력하세요: Irish
Irish -> Japanese (p=0.71)
분류하려는 성씨를 입력하세요: Richaard
Richaard -> Irish (p=0.61)
분류하려는 성씨를 입력하세요: Comico
Comico -> Italian (p=0.57)
분류하려는 성씨를 입력하세요: David
David -> Russian (p=0.36)
분류하려는 성씨를 입력하세요: Salma
Salma -> Arabic (p=0.46)


In [18]:
vectorizer.nationality_vocab.lookup_index(8)

'Irish'

In [19]:
def predict_topk_nationality(name, classifier, vectorizer, k=5):
    """새로운 성씨에 대한 최상위 K개 국적을 예측합니다

    매개변수:
        surname (str): 분류하려는 성씨
        classifier (SurnameClassifer): 분류기 객체
        vectorizer (SurnameVectorizer): SurnameVectorizer 객체
        k (int): the number of top nationalities to return
    반환값:
        딕셔너리 리스트, 각 딕셔너리는 국적과 확률로 구성됩니다.
    """
    vectorized_name = vectorizer.vectorize(name)
    vectorized_name = torch.tensor(vectorized_name).view(1, -1)
    prediction_vector = classifier(vectorized_name, apply_softmax=True)
    probability_values, indices = torch.topk(prediction_vector, k=k)

    # 반환되는 크기는 (1,k)입니다
    probability_values = probability_values.detach().numpy()[0]
    indices = indices.detach().numpy()[0]

    results = []
    for prob_value, index in zip(probability_values, indices):
        nationality = vectorizer.nationality_vocab.lookup_index(index)
        results.append({'nationality': nationality,
                        'probability': prob_value})

    return results


new_surname = input("분류하려는 성씨를 입력하세요: ")
classifier = classifier.to("cpu")

k = int(input("얼마나 많은 예측을 보고 싶나요? "))
if k > len(vectorizer.nationality_vocab):
    print("앗! 전체 국적 개수보다 큰 값을 입력했습니다. 모든 국적에 대한 예측을 반환합니다. :)")
    k = len(vectorizer.nationality_vocab)

predictions = predict_topk_nationality(new_surname, classifier, vectorizer, k=k)

print("최상위 {}개 예측:".format(k))
print("===================")
for prediction in predictions:
    print("{} -> {} (p={:0.2f})".format(new_surname,
                                        prediction['nationality'],
                                        prediction['probability']))

분류하려는 성씨를 입력하세요: Comico
얼마나 많은 예측을 보고 싶나요? 5
최상위 5개 예측:
Comico -> Italian (p=0.63)
Comico -> Portuguese (p=0.07)
Comico -> Scottish (p=0.06)
Comico -> Irish (p=0.05)
Comico -> French (p=0.05)


In [20]:
for key, value in vectorizer.surname_vocab._token_to_idx.items():
  print(key, value)
#vocabulary가 존재하는 모든철자로 구성되기 때문에, size가 크지않음.
# @, 0부터 시작해서 총 76개의 letter로 구성되어있음.

@ 0
T 1
o 2
t 3
a 4
h 5
A 6
b 7
u 8
d 9
F 10
k 11
r 12
y 13
S 14
e 15
g 16
C 17
m 18
H 19
i 20
K 21
n 22
W 23
s 24
f 25
G 26
M 27
l 28
B 29
z 30
N 31
I 32
w 33
D 34
Q 35
j 36
E 37
R 38
Z 39
c 40
Y 41
J 42
L 43
O 44
- 45
P 46
X 47
p 48
: 49
v 50
U 51
1 52
V 53
x 54
q 55
é 56
É 57
' 58
ß 59
ö 60
ä 61
ü 62
ú 63
à 64
ò 65
è 66
ó 67
Ś 68
ą 69
ń 70
á 71
ż 72
õ 73
í 74
ñ 75
Á 76
