# 실습: 성(姓) 만들기

- 18개 언어(국가)별로 수천 개의 성(姓)을 학습
- 언어(국가)와 시작 글자를 넣으면 그 조건에 맞는 성(姓)을 생성

## 데이터

In [13]:
import glob
import unicodedata
import string
import random
import time
import math
import torch
import torch.nn as nn

from torch.autograd import Variable
from pprint import pprint
from io import open
from __future__ import unicode_literals, print_function, division

## helper functions

In [14]:
def randomChoice(l):
    """Return one uniformly chosen element of the sequence ``l``.

    Raises ValueError (from the ``random`` module) when ``l`` is empty.
    """
    index = random.randrange(len(l))
    return l[index]

def randomTrainingPair():
    """Pick a random category, then a random line belonging to it.

    Returns:
        A ``(category, line)`` tuple, where ``category`` comes from
        ``all_categories`` and ``line`` from ``category_lines[category]``.
    """
    chosen_category = randomChoice(all_categories)
    candidates = category_lines[chosen_category]
    return chosen_category, randomChoice(candidates)

def randomTrainingSet():
    """Draw a random (category, line) pair and encode it for training.

    Returns:
        A 3-tuple of ``Variable``-wrapped tensors, in this order: the category
        one-hot from ``categoryTensor``, the per-letter input from
        ``inputTensor`` and the target indexes from ``targetTensor``.
    """
    category, line = randomTrainingPair()
    return (
        Variable(categoryTensor(category)),
        Variable(inputTensor(line)),
        Variable(targetTensor(line)),
    )

def timeSince(since):
    """Format the wall-clock time elapsed since ``since`` as ``'<m>m <s>s'``.

    Args:
        since: A timestamp in seconds, as returned by ``time.time()``.

    Returns:
        A string with whole minutes and the remaining seconds truncated to an
        integer.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

In [15]:
# Root directory of the data set (machine-specific absolute path); the
# per-category name files are looked up under its "names/" sub-directory.
DATA_PATH = "/home/ubuntu/practical-pytorch/data/"

In [16]:
import glob

# Every *.txt file under DATA_PATH/names/; each file becomes one category,
# named after the file. glob does not guarantee any particular order.
all_filenames = glob.glob(DATA_PATH + "names/*.txt")

In [17]:
# Show the discovered data files, one path per line.
pprint(all_filenames)

['/home/ubuntu/practical-pytorch/data/names/French.txt',
 '/home/ubuntu/practical-pytorch/data/names/Greek.txt',
 '/home/ubuntu/practical-pytorch/data/names/Dutch.txt',
 '/home/ubuntu/practical-pytorch/data/names/Japanese.txt',
 '/home/ubuntu/practical-pytorch/data/names/Chinese.txt',
 '/home/ubuntu/practical-pytorch/data/names/English.txt',
 '/home/ubuntu/practical-pytorch/data/names/German.txt',
 '/home/ubuntu/practical-pytorch/data/names/Spanish.txt',
 '/home/ubuntu/practical-pytorch/data/names/Scottish.txt',
 '/home/ubuntu/practical-pytorch/data/names/Russian.txt',
 '/home/ubuntu/practical-pytorch/data/names/Italian.txt',
 '/home/ubuntu/practical-pytorch/data/names/Polish.txt',
 '/home/ubuntu/practical-pytorch/data/names/Arabic.txt',
 '/home/ubuntu/practical-pytorch/data/names/Irish.txt',
 '/home/ubuntu/practical-pytorch/data/names/Vietnamese.txt',
 '/home/ubuntu/practical-pytorch/data/names/Korean.txt',
 '/home/ubuntu/practical-pytorch/data/names/Portuguese.txt',
 '/home/ubuntu/practical-pytorch/data/names/Czech.txt']

In [18]:
import unicodedata
import string

# Alphabet the model works with: ASCII letters plus a few punctuation marks.
all_letters = string.ascii_letters + " .,;'"
# Reserve one extra slot beyond the alphabet for the end-of-sequence (EOS)
# marker. The rest of the file uses index ``n_letters - 1`` as EOS; without the
# extra slot that index would be the last character of all_letters (the
# apostrophe), making an apostrophe in a name indistinguishable from EOS.
n_letters = len(all_letters) + 1


# Convert a Unicode string to plain ASCII
def unicode_to_ascii(s):
    """Return ``s`` reduced to the characters of ``all_letters``.

    The string is NFD-normalised so accented letters split into a base letter
    plus combining marks; combining marks (category 'Mn') and any character
    not in ``all_letters`` are then dropped.

    Args:
        s: The Unicode string to convert.

    Returns:
        The filtered string; it may be empty if no character survives.
    """
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

print(unicode_to_ascii('Ślusàrski'))

Slusarski


In [19]:
import unicodedata
import string

# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []


def readLines(filename):
    """Read a UTF-8 text file and return its lines converted to ASCII.

    The file content is stripped of leading/trailing whitespace, split on
    newlines, and each line is passed through ``unicode_to_ascii``. Lines that
    are empty after conversion are dropped, because an empty name yields no
    training steps.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of non-empty ASCII strings, in file order.
    """
    # Specify the encoding explicitly; the context manager closes the file
    # handle even when reading fails.
    with open(filename, encoding="utf-8") as f:
        lines = f.read().strip().split('\n')
    ascii_lines = (unicode_to_ascii(line) for line in lines)
    return [name for name in ascii_lines if name]

In [20]:
# Register each data file as a category and load the names it contains.
for filename in all_filenames:
    # "<dir>/French.txt" -> "French": last path component, text before its first dot.
    category = filename.rsplit('/', 1)[-1].split('.', 1)[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)

print('# categories:', n_categories, all_categories)
print(unicode_to_ascii("O'Néàl"))

# categories: 18 ['French', 'Greek', 'Dutch', 'Japanese', 'Chinese', 'English', 'German', 'Spanish', 'Scottish', 'Russian', 'Italian', 'Polish', 'Arabic', 'Irish', 'Vietnamese', 'Korean', 'Portuguese', 'Czech']
O'Neal


## 데이터 전처리(categories into Tensors)

In [21]:
def categoryTensor(category):
    """Encode ``category`` as a 1 x ``n_categories`` one-hot tensor.

    The hot position is the index of ``category`` in ``all_categories``;
    ``list.index`` raises ValueError if the category is unknown.
    """
    position = all_categories.index(category)
    one_hot = torch.zeros(1, n_categories)
    one_hot[0, position] = 1
    return one_hot

def inputTensor(line):
    """Encode every letter of ``line`` as a one-hot row (no EOS step).

    Returns:
        A ``len(line) x 1 x n_letters`` tensor of zeros with a single 1 per
        letter, at that letter's index in ``all_letters``. ``str.find`` yields
        -1 for a letter outside ``all_letters``, which sets the last slot.
    """
    one_hot = torch.zeros(len(line), 1, n_letters)
    for position, letter in enumerate(line):
        one_hot[position, 0, all_letters.find(letter)] = 1
    return one_hot

def targetTensor(line):
    """Build the next-letter targets for ``line``.

    Returns:
        A ``LongTensor`` holding the ``all_letters`` index of each letter from
        the second one onward, followed by ``n_letters - 1`` as the
        end-of-sequence (EOS) marker, so it has one entry per letter of
        ``line``.
    """
    eos_index = n_letters - 1
    targets = [all_letters.find(letter) for letter in line[1:]]
    targets.append(eos_index)  # EOS
    return torch.LongTensor(targets)

## 네트워크 구성(Creating the Network)

![image.png](https://i.imgur.com/jzVrf7f.png)

In [22]:
class RNN(nn.Module):
    """Category-conditioned recurrent cell that scores the next letter.

    At each step the category, the current input and the previous hidden state
    are concatenated and fed to two linear layers (``i2h`` for the new hidden
    state, ``i2o`` for a first output). Hidden state and output are then
    concatenated and passed through ``o2o``, dropout and log-softmax.

    Args:
        input_size: Width of the per-step input tensor.
        hidden_size: Width of the hidden state.
        output_size: Number of output classes.
        category_size: Width of the category tensor. Defaults to the
            module-level ``n_categories`` so that existing three-argument
            constructions keep working.
    """

    def __init__(self, input_size, hidden_size, output_size, category_size=None):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size

        if category_size is None:
            category_size = n_categories
        combined_size = category_size + input_size + hidden_size

        self.i2h = nn.Linear(combined_size, hidden_size)
        self.i2o = nn.Linear(combined_size, output_size)
        self.o2o = nn.Linear(hidden_size + output_size, output_size)
        # NOTE(review): nothing in this file switches the module to eval mode,
        # so this dropout is also active while sampling -- presumably
        # intentional, to add variety to greedy decoding; confirm.
        self.dropout = nn.Dropout(0.1)
        # forward() works on 2-D tensors (it concatenates along dim 1), so
        # normalise over dim 1 explicitly instead of relying on the deprecated
        # implicit dimension choice.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, category, input, hidden):
        """Run one step.

        All three arguments are concatenated along dim 1, so they must be 2-D
        with the same first dimension.

        Returns:
            ``(output, hidden)``: log-probabilities over the output classes
            and the new hidden state.
        """
        input_combined = torch.cat((category, input, hidden), 1)
        hidden = self.i2h(input_combined)
        output = self.i2o(input_combined)
        output_combined = torch.cat((hidden, output), 1)
        output = self.o2o(output_combined)
        output = self.dropout(output)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        """Return a fresh all-zero hidden state of shape 1 x hidden_size."""
        return Variable(torch.zeros(1, self.hidden_size))

### 학습을 위한 준비

In [23]:
criterion = nn.NLLLoss()

learning_rate = 0.0005


def train(category_tensor, input_line_tensor, target_line_tensor):
    """Run one plain-SGD training step on a single line.

    The network ``rnn`` is stepped once per entry along the first dimension of
    ``input_line_tensor``; the per-step NLL losses are summed, backpropagated,
    and every parameter is moved by ``-learning_rate * grad``.

    Args:
        category_tensor: Category encoding passed unchanged to every step.
        input_line_tensor: Per-step inputs, indexed along dim 0.
        target_line_tensor: 1-D tensor of target class indexes, one per step.

    Returns:
        ``(output, mean_loss)``: the network output of the last step and the
        summed loss divided by the number of steps, as a Python number.

    Raises:
        ValueError: If ``input_line_tensor`` has no steps. Previously this
            case failed later with an unrelated error, because no loss tensor
            and no output had been produced.
    """
    n_steps = input_line_tensor.size()[0]
    if n_steps == 0:
        raise ValueError("cannot train on an empty line")

    hidden = rnn.initHidden()

    rnn.zero_grad()

    loss = 0

    for i in range(n_steps):
        output, hidden = rnn(category_tensor, input_line_tensor[i], hidden)
        # Slice rather than index so the target always has shape (1,), matching
        # the output's batch of 1; on current PyTorch plain indexing yields a
        # 0-dim tensor instead.
        loss += criterion(output, target_line_tensor[i:i + 1])

    loss.backward()

    # SGD update written without the deprecated add_(alpha, tensor) overload.
    for p in rnn.parameters():
        p.data.add_(p.grad.data * -learning_rate)

    # .item() is the scalar accessor on current PyTorch, where indexing a
    # 0-dim tensor with [0] fails; releases without .item() use .data[0].
    loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]
    return output, loss_value / n_steps

### 학습

In [24]:
# Network whose input and output widths are both n_letters, with a hidden
# state of width 128.
rnn = RNN(n_letters, 128, n_letters)

# An "epoch" here is a single training step on one randomly drawn line.
n_epochs = 10000
# Print a progress line every print_every steps.
print_every = 1000
# Average the loss over windows of plot_every steps.
plot_every = 100
# One averaged loss per completed plot_every window.
all_losses = []
total_loss = 0 # Reset every plot_every epochs

start = time.time()

for epoch in range(1, n_epochs + 1):
    # train() returns the last step's output and the per-letter mean loss.
    output, loss = train(*randomTrainingSet())
    total_loss += loss

    # Progress line: elapsed time, step number, percent done, latest loss.
    if epoch % print_every == 0:
        print('%s (%d %d%%) %.4f' % (timeSince(start), epoch, epoch / n_epochs * 100, loss))

    # Store the mean loss of the window just finished and start a new window.
    if epoch % plot_every == 0:
        all_losses.append(total_loss / plot_every)
        total_loss = 0

0m 7s (1000 10%) 3.3561
0m 15s (2000 20%) 2.8805
0m 22s (3000 30%) 3.1061
0m 30s (4000 40%) 2.7348
0m 37s (5000 50%) 3.7178
0m 45s (6000 60%) 2.6870
0m 52s (7000 70%) 2.4110
0m 59s (8000 80%) 2.9221
1m 7s (9000 90%) 3.3541
1m 15s (10000 100%) 2.7953


In [13]:
max_length = 20


def sample(category, start_letter='A'):
    """Generate one name for ``category`` that begins with ``start_letter``.

    Decoding is greedy: at each step the highest-scoring letter is appended
    and fed back as the next input. Generation stops when the network predicts
    index ``n_letters - 1`` (EOS) or after ``max_length`` steps.

    Args:
        category: A category name known to ``categoryTensor``.
        start_letter: Prefix of the returned name; its first character is the
            first input given to the network.

    Returns:
        ``start_letter`` followed by the generated letters.
    """
    category_tensor = Variable(categoryTensor(category))
    step_input = Variable(inputTensor(start_letter))
    hidden = rnn.initHidden()

    eos_index = n_letters - 1
    output_name = start_letter

    for _step in range(max_length):
        output, hidden = rnn(category_tensor, step_input[0], hidden)
        _scores, best = output.data.topk(1)
        letter_index = best[0][0]
        if letter_index == eos_index:
            break
        letter = all_letters[letter_index]
        output_name += letter
        # The letter just produced becomes the input of the next step.
        step_input = Variable(inputTensor(letter))

    return output_name

def samples(category, start_letters='ABC'):
    """Print one generated name per character of ``start_letters``.

    Each character is passed to ``sample`` as the start letter for the given
    ``category``; nothing is returned.
    """
    for letter in start_letters:
        generated = sample(category, letter)
        print(generated)

In [14]:
# Print one generated 'Korean' name for each of the start letters K and Y.
samples('Korean', 'KY')

Kin
Yan


In [15]:
# Print one generated 'German' name for each of the start letters G, E and R.
samples('German', 'GER')

Ganer
Eaner
Raner


In [16]:
# Print one generated 'Spanish' name for each of the start letters S, P and A.
samples('Spanish', 'SPA')

Sana
Pana
Aoan
