In [227]:
from io import open
import glob
import os
import matplotlib.pyplot as plt

### 1. 利用通配符打印出所有的文件名

In [228]:
# glob.glob()返回所有匹配的文件路径列表。它只有一个参数pathname，定义了文件路径匹配规则，这里可以是绝对路径，也可以是相对路径
def find_files(path): 
    return glob.glob(path)

print(find_files('data/names/*.txt'))

['data/names/Czech.txt', 'data/names/German.txt', 'data/names/Arabic.txt', 'data/names/Japanese.txt', 'data/names/Chinese.txt', 'data/names/Vietnamese.txt', 'data/names/Russian.txt', 'data/names/French.txt', 'data/names/Irish.txt', 'data/names/English.txt', 'data/names/Spanish.txt', 'data/names/Greek.txt', 'data/names/Italian.txt', 'data/names/Portuguese.txt', 'data/names/Scottish.txt', 'data/names/Dutch.txt', 'data/names/Korean.txt', 'data/names/Polish.txt']


### 2. 名字中包含不少非Ascii码，将他们转换成Ascii码

In [229]:
import unicodedata
import string

In [230]:
# string.ascii_letters 生成a-zA-Z所有的字母
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

In [231]:
n_letters

57

In [232]:
def unicode_2_Ascii(s):
    '''
    unicodedata.normalize('NFD',s) 表示字符s应该分解成多个组合字符表示 例如:Ś 分解成 S 和 上标符号
    unicodedata.category(s) 表示字符s在unicode里面的分类类型 Mn:标记 非间距
    '''
    ret = ''.join([c for c in unicodedata.normalize('NFD',s) 
                   if unicodedata.category(c) != 'Mn' and c in all_letters])
    return ret

### 3.建立类别与对应的名字的dictionary

In [233]:
category_lines = {}
all_categories = []

In [234]:
def read_lines(filename):
    lines = open(filename,encoding='utf-8').read().strip().split('\n')
    return [unicode_2_Ascii(line) for line in lines]

In [235]:
'''
os.path.basename(path) 返回path最后的文件名 如 path最后以 \ 或者 / 结尾 则返回空
os.path.splitext(filename) 分离文件名与扩展名 如 输入china.txt 返回 china 和 .txt
'''
for filename in find_files('data/names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = read_lines(filename)
    category_lines[category] = lines

In [236]:
'''总共有多少个类别的数量'''
n_categories = len(all_categories)

### 4.使用pytorch进行向量化

In [237]:
import torch

In [238]:
'''查找字符在all_letters中的索引'''
def letter_to_index(letter):
    return all_letters.find(letter)

In [239]:
def letter_to_tensor(letter):
    '''将字符letter向量化'''
    tensor = torch.zeros(1,n_letters)
    tensor[0][letter_to_index(letter)] = 1
    return tensor

In [240]:
def line_to_tensor(line):
    '''将字符串line向量化'''
    tensor = torch.zeros(len(line),1,n_letters)
    for li,letter in enumerate(line):
        tensor[li][0][letter_to_index(letter)] = 1
    return tensor

### 5.构建RNN网络

In [241]:
import torch.nn as nn

In [242]:
class RNN(nn.Module):
    
    def __init__(self,input_size,hidden_size,output_size):
        super(RNN,self).__init__()
        
        self.hidden_size = hidden_size
        
        self.i2h = nn.Linear(input_size + hidden_size , hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size , output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self,inputs,hidden):
        
        combined = torch.cat((inputs,hidden),1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output,hidden
    
    def initHidden(self):
        return torch.zeros(1,self.hidden_size)

In [243]:
# inputs维度
n_letters
# hidden维度
n_hidden = 128
# output维度
n_categories

18

In [244]:
rnn = RNN(n_letters, n_hidden, n_categories)

In [245]:
rnn

RNN(
  (i2h): Linear(in_features=185, out_features=128, bias=True)
  (i2o): Linear(in_features=185, out_features=18, bias=True)
  (softmax): LogSoftmax()
)

### 6.运行RNN

In [246]:
inputs = letter_to_tensor('L')
hidden = torch.zeros(1,n_hidden)

In [249]:
inputs.shape

torch.Size([1, 57])

In [247]:
'''
对于对象rnn直接在后面加()传参数相当于调用了RNN类中的forward方法,因为在RNN的父类 nn.Module中定义了__call__()方法
'''
output,next_hidden = rnn(inputs,hidden)

In [248]:
next_hidden.shape

torch.Size([1, 128])

### 7.将output变成人们方便识别的类别

In [193]:
def category_from_output(output):
    # topk(n)返回最大的n个数据 top_n 为 value top_i 为 index
    top_n,top_i = output.topk(1)
    category_i = top_i.item()
    return all_categories[category_i],category_i

In [194]:
category_from_output(output)

('Chinese', 4)

### 8.随机生成训练数据

In [195]:
import random

In [196]:
def sample(l):
    return l[random.randint(0,len(l)-1)]

def get_fact_sample(category):
    facts = category_lines[category]
    fact_sample = sample(facts)
    return fact_sample

def sample_trainning():
    category = sample(all_categories)
    line = get_fact_sample(category)
    category_tensor = torch.tensor([all_categories.index(category)],dtype=torch.long)
    line_tensor = line_to_tensor(line)
    return category,line,category_tensor,line_tensor

In [197]:
for i in range(10):
    category,line,category_tensor,line_tensor = sample_trainning()
    print('category=',category,'/ line=',line)

category= Portuguese / line= Torres
category= Arabic / line= Salib
category= Czech / line= Alt
category= Czech / line= Klemper
category= Spanish / line= Barros
category= Scottish / line= Lindsay
category= Spanish / line= Petit
category= Spanish / line= Gutierrez
category= Russian / line= Astrakhankin
category= Korean / line= Bang


### 9.使用交叉熵损失函数

In [198]:
criterion = nn.CrossEntropyLoss()
learning_rate = 0.005

def train(category_tensor,line_tensor):
    hidden = rnn.initHidden()
    # 将rnn中所有模型的参数梯度设置为0
    rnn.zero_grad()
    
    for i in range(line_tensor.size()[0]):
        output , hidden = rnn(line_tensor[i] , hidden)
        
    loss = criterion(output , category_tensor)
    loss.backward()
    
    for p in rnn.parameters():
        # add_表示张量的相加 以下相当于 -learing_rate * p.grad.data + p.data
        p.data.add_(-learning_rate,p.grad.data)
        
    return output , loss.item()

In [199]:
import math
import time

In [203]:
n_iters = 1000

print_every = 500
plot_every = 100

current_loss = 0
all_losses = []

def time_since(since):
    now = time.time()
    seconds = now - since
    minute = math.floor(seconds / 60)
    seconds -= minute * 60
    
    return '%dm %ds' % (minute,seconds)

start = time.time()

for iters in range(1,n_iters + 1):
    category , line , category_tensor , line_tensor = sample_trainning()
    output,loss = train(category_tensor,line_tensor)
    current_loss += loss
    
    if iters % print_every == 0 :
        guess , guess_i = category_from_output(output)
        correct = '✓' if guess == category else '✗ (%s)' % category
        print('%d %d%% (%s) %.4f %s / %s %s' % (iters, iters / n_iters * 100, time_since(start), loss, line, guess, correct))
        
    if iters % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0

500 50% (0m 1s) 2.1553 Koulaxizis / Greek ✓
1000 100% (0m 2s) 2.8088 Hakimi / Japanese ✗ (Arabic)


### 10.观察Loss的变化

In [204]:
all_losses

[2.7297172284126283,
 2.7527003359794615,
 2.7185774564743044,
 2.738234317302704,
 2.707736620903015,
 2.7611072516441344,
 2.758996150493622,
 2.6786904203891755,
 2.7377470314502714,
 2.6890264534950257]

In [205]:
%matplotlib inline

In [206]:
def evaluate(line_tensor):
    hidden = rnn.initHidden()
    
    for i in range(line_tensor.size()[0]):
        output,hidden = rnn(line_tensor[i],hidden)
        
    return output

In [207]:
def predict(input_line,n_prediction=3):
    print('\n> %s'% input_line)
    
    with torch.no_grad():
        output = evaluate(line_to_tensor(input_line))
        
        topv,topi = output.topk(n_prediction,1,True)
        predictions = []
        
        for i in range(n_prediction):
            value = topv[0][i].item()
            category_index = topi[0][i].item()
            print('(%.2f) %s' % (value,all_categories[category_index]))
            predictions.append([value,all_categories[category_index]])

In [208]:
predict('Mai')
predict('Stink')
predict('Yuki')


> Mai
(-2.63) Italian
(-2.64) Chinese
(-2.68) Vietnamese

> Stink
(-2.71) Polish
(-2.74) English
(-2.74) Japanese

> Yuki
(-2.52) Japanese
(-2.60) Polish
(-2.60) Italian


### 1. 尝试在我们的RNN模型中添加更多layers，然后观察Loss变化

In [209]:
import torch.nn as nn

In [210]:
class RNN_V1(nn.Module):
    
    def __init__(self,input_size,hidden_size1,hidden_size2,output_size):
        super(RNN_V1,self).__init__()
        
        self.hidden_size1 = hidden_size1
        self.hidden_size2 = hidden_size2
        
        self.i2h1 = nn.Linear(input_size + hidden_size1 , hidden_size1)
        self.i2h2 = nn.Linear(hidden_size1 + hidden_size2 , hidden_size2)
        self.i2o = nn.Linear(hidden_size1 + hidden_size2 , output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.relu = nn.ReLU()
        
    def forward(self,inputs,hidden1,hidden2):
        
        combined1 = torch.cat((inputs,hidden1),1)
        hidden1 = self.i2h1(combined1)
        output1 = self.relu(hidden1)
        combined2 = torch.cat((output1,hidden2),1)
        hidden2 = self.i2h2(combined2)
        output = self.i2o(combined2)
        output = self.softmax(output)
        
        return output,hidden2
    
    def initHidden(self,hidden_size1,hidden_size2):
        hidden1 = torch.zeros(1,hidden_size1)
        hidden2 = torch.zeros(1,hidden_size2)
        return hidden1 , hidden2

In [211]:
# inputs维度
n_letters
# hidden1 维度
n_hidden1 = 128
#hidden2 维度
n_hidden2 = 64
# output维度
n_categories

18

In [212]:
rnn_v1 = RNN_V1(n_letters,n_hidden1,n_hidden2,n_categories)

In [213]:
rnn_v1

RNN_V1(
  (i2h1): Linear(in_features=185, out_features=128, bias=True)
  (i2h2): Linear(in_features=192, out_features=64, bias=True)
  (i2o): Linear(in_features=192, out_features=18, bias=True)
  (softmax): LogSoftmax()
  (relu): ReLU()
)

In [214]:
criterion = nn.CrossEntropyLoss()
learning_rate = 0.005

def train_v1(category_tensor,line_tensor):
    hidden1,hidden2 = rnn_v1.initHidden(n_hidden1,n_hidden2)
    # 将rnn中所有模型的参数梯度设置为0
    rnn_v1.zero_grad()
    
    for i in range(line_tensor.size()[0]):
        output ,hidden2 = rnn_v1(line_tensor[i] , hidden1,hidden2)
        
    loss = criterion(output , category_tensor)
    loss.backward()
    
    for p in rnn_v1.parameters():
        # add_表示张量的相加 以下相当于 -learing_rate * p.grad.data + p.data
        p.data.add_(-learning_rate,p.grad.data)
        
    return output , loss.item()

In [215]:
n_iters = 2000

print_every = 500
plot_every = 100

# 多layers RNN
current_loss1 = 0
all_losses1 = []
# 单leyers RNN
current_loss2 = 0
all_losses2 = []

start = time.time()

for iters in range(1,n_iters + 1):
    category , line , category_tensor , line_tensor = sample_trainning()
    # 多layers RNN
    output1,loss1 = train_v1(category_tensor,line_tensor)
    current_loss1 += loss1
    # 单layers RNN
    output2,loss2 = train(category_tensor,line_tensor)
    current_loss2 += loss2
    
    if iters % print_every == 0 :
        guess1 , guess_i1 = category_from_output(output1)
        correct = '✓' if guess1 == category else '✗ (%s)' % category
        print('***现在正在输出多layers RNN的结果***')
        print('%d %d%% (%s) %.4f %s / %s %s' % (iters, iters / n_iters * 100, time_since(start), loss1, line, guess1, correct))
        
    if iters % print_every == 0 :
        guess2 , guess_i2 = category_from_output(output2)
        correct = '✓' if guess2 == category else '✗ (%s)' % category
        print('***现在正在输出单layers RNN的结果***')
        print('%d %d%% (%s) %.4f %s / %s %s' % (iters, iters / n_iters * 100, time_since(start), loss2, line, guess2, correct))
        
    if iters % plot_every == 0:
        all_losses1.append(current_loss1 / plot_every)
        all_losses2.append(current_loss2 / plot_every)
        current_loss1 = 0
        current_loss2 = 0

***现在正在输出多layers RNN的结果***
500 25% (0m 2s) 3.0019 Rutten / Spanish ✗ (Dutch)
***现在正在输出单layers RNN的结果***
500 25% (0m 2s) 2.7401 Rutten / German ✗ (Dutch)
***现在正在输出多layers RNN的结果***
1000 50% (0m 5s) 2.9124 Luo / Scottish ✗ (Chinese)
***现在正在输出单layers RNN的结果***
1000 50% (0m 5s) 2.5574 Luo / Korean ✗ (Chinese)
***现在正在输出多layers RNN的结果***
1500 75% (0m 7s) 2.8061 Corti / Italian ✓
***现在正在输出单layers RNN的结果***
1500 75% (0m 7s) 2.4139 Corti / Italian ✓
***现在正在输出多layers RNN的结果***
2000 100% (0m 10s) 2.8150 Sokal / Italian ✗ (Polish)
***现在正在输出单layers RNN的结果***
2000 100% (0m 10s) 2.9171 Sokal / Arabic ✗ (Polish)


In [216]:
print('多layers RNN:')
print(all_losses1)
print('单layers RNN:')
print(all_losses2)

多layers RNN:
[2.885951325893402, 2.8919425296783445, 2.880792078971863, 2.8943641281127928, 2.887309763431549, 2.8920133018493654, 2.8980507373809816, 2.8875694108009338, 2.8883215618133544, 2.882408928871155, 2.8792552161216736, 2.8868811440467836, 2.8839226508140565, 2.8816251826286314, 2.873859176635742, 2.871556842327118, 2.8744288897514343, 2.8726945543289184, 2.877608821392059, 2.8697048830986023]
单layers RNN:
[2.7213080859184267, 2.6502015471458433, 2.720110001564026, 2.671740838289261, 2.629564433097839, 2.662118192911148, 2.7003481471538544, 2.5614106261730196, 2.6042903447151184, 2.5923542284965517, 2.5200770235061647, 2.5601474404335023, 2.5654919707775115, 2.5232647252082825, 2.5174214839935303, 2.4091359400749206, 2.503758072257042, 2.385744194984436, 2.5450039571523666, 2.451318130493164]


In [217]:
'''疑问  为什么多layers RNN loss反而比单layers loss 大呢？
我检查了好久 没发现问题 希望老师在改作业后在评语里指点一下  谢谢啦'''

'疑问  为什么多layers RNN loss反而比单layers loss 大呢？\n我检查了好久 没发现问题 希望老师在改作业后在评语里指点一下  谢谢啦'

### 2. 将原始的RNN模型改成nn.LSTM和nn.GRU， 并且改变 n_iters = 1000 这个值，观察其变化

## LSTM

In [323]:
class LSTM(nn.Module):
    
    def __init__(self,input_size,hidden_size,output_size):
        super(LSTM,self).__init__()
        
        self.hidden_size = hidden_size
        
        self.i2h = nn.Linear(input_size + hidden_size , hidden_size)
        self.i2o = nn.Linear(hidden_size , output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.tanh = nn.Tanh()
        self.sigmod = nn.Sigmoid()
        
    def forward(self,inputs,hidden,control):
        
        combined = torch.cat((inputs,hidden),1)
        a = self.i2h(combined)
        
        z = self.tanh(a)
        zi = self.sigmod(a)
        zf = self.sigmod(a)
        zo = self.sigmod(a)
        
        control = zf.mul(control) + zi.mul(z)
        hidden = zo.mul(self.tanh(control))
        output = self.i2o(hidden)
        output = self.softmax(output)
        
        return output,hidden,control
    
    def initHidden(self):
        return torch.zeros(1,self.hidden_size)

In [358]:
lstm = LSTM(n_letters,n_hidden,n_categories)

In [359]:
lstm

LSTM(
  (i2h): Linear(in_features=185, out_features=128, bias=True)
  (i2o): Linear(in_features=128, out_features=18, bias=True)
  (softmax): LogSoftmax()
  (tanh): Tanh()
  (sigmod): Sigmoid()
)

In [360]:
criterion = nn.CrossEntropyLoss()
learning_rate = 0.005

def train_lstm(category_tensor,line_tensor):
    hidden = lstm.initHidden()
    control = lstm.initHidden()
    # 将rnn中所有模型的参数梯度设置为0
    lstm.zero_grad()
    
    for i in range(line_tensor.size()[0]):
        output , hidden ,control = lstm(line_tensor[i] , hidden,control)
        
    loss = criterion(output , category_tensor)
    loss.backward()
    
    for p in lstm.parameters():
        # add_表示张量的相加 以下相当于 -learing_rate * p.grad.data + p.data
        p.data.add_(-learning_rate,p.grad.data)
        
    return output , loss.item()

In [364]:
n_iters = 1000
print_every = 500
plot_every = 100
# LSTM
current_loss1 = 0   
all_losses1 = []
# RNN
current_loss2 = 0
all_losses2 = []

start = time.time()

for iters in range(1,n_iters + 1):
    category , line , category_tensor , line_tensor = sample_trainning()
    # LSTM
    output1,loss1 = train_lstm(category_tensor,line_tensor)
    current_loss1 += loss1
    # RNN
    output2,loss2 = train(category_tensor,line_tensor)
    current_loss2 += loss2
    
    if iters % print_every == 0 :
        guess1 , guess_i1 = category_from_output(output1)
        correct = '✓' if guess1 == category else '✗ (%s)' % category
        print('***现在正在输出LSTM的结果***')
        print('%d %d%% (%s) %.4f %s / %s %s' % (iters, iters / n_iters * 100, time_since(start), loss1, line, guess1, correct))
        
    if iters % print_every == 0 :
        guess2 , guess_i2 = category_from_output(output2)
        correct = '✓' if guess2 == category else '✗ (%s)' % category
        print('***现在正在输出RNN的结果***')
        print('%d %d%% (%s) %.4f %s / %s %s' % (iters, iters / n_iters * 100, time_since(start), loss2, line, guess2, correct))
        
    if iters % plot_every == 0:
        all_losses1.append(current_loss1 / plot_every)
        all_losses2.append(current_loss2 / plot_every)
        current_loss1 = 0
        current_loss2 = 0

***现在正在输出LSTM的结果***
500 50% (0m 2s) 2.9531 Zhang / Japanese ✗ (Chinese)
***现在正在输出RNN的结果***
500 50% (0m 2s) 0.9672 Zhang / Chinese ✓
***现在正在输出LSTM的结果***
1000 100% (0m 5s) 2.8784 Pantelas / German ✗ (Greek)
***现在正在输出RNN的结果***
1000 100% (0m 5s) 0.8430 Pantelas / Greek ✓


In [344]:
# LSTM Loss
all_losses1

[2.8746918559074404,
 2.874691483974457,
 2.869926104545593,
 2.888092164993286,
 2.875089776515961,
 2.871559782028198,
 2.877913990020752,
 2.866274833679199,
 2.873765397071838,
 2.869029121398926,
 2.868867003917694,
 2.8674361085891724,
 2.875820209980011,
 2.8756304502487184,
 2.8702869272232054,
 2.8669332814216615,
 2.866185784339905,
 2.878901560306549,
 2.866950137615204,
 2.8700921154022216,
 2.8705575942993162,
 2.862789945602417,
 2.864740924835205,
 2.874198396205902,
 2.8708564949035646,
 2.858320565223694,
 2.8755458307266237,
 2.8719739937782287,
 2.872731921672821,
 2.86985143661499,
 2.8627507948875426,
 2.85831524848938,
 2.8577174878120424,
 2.8526731610298155,
 2.863429026603699,
 2.8750904273986815,
 2.8583113312721253,
 2.855660719871521,
 2.852945771217346,
 2.8661801195144654,
 2.846739752292633,
 2.8481362676620483,
 2.869725930690765,
 2.8535204410552977,
 2.8539528274536132,
 2.862327363491058,
 2.8580382871627807,
 2.849465093612671,
 2.858138976097107,
 2

In [345]:
# RNN Loss
all_losses2

[2.368366609811783,
 2.3800265204906466,
 2.26980160176754,
 2.1485344287753105,
 2.310079976320267,
 2.309486210346222,
 2.198187758922577,
 2.223685482889414,
 2.2997890892624855,
 2.1631899851560594,
 2.274751192331314,
 2.213223315179348,
 2.301973067522049,
 2.2916450840234757,
 2.2561871933937074,
 2.312960294485092,
 2.2485823914408685,
 2.017235503345728,
 2.3089068818092344,
 2.23375236004591,
 2.3194860780239104,
 2.3144896566867827,
 2.116139702796936,
 2.1691104716062544,
 2.0746486197412013,
 2.1496936348080635,
 2.1594113618135453,
 2.213214085102081,
 2.28453542560339,
 1.98828871935606,
 2.1903842318058016,
 2.1506259302794932,
 2.2134738764166833,
 2.0398297011852264,
 2.1299920573830606,
 2.2089452774822713,
 2.0084380255639553,
 2.104708690345287,
 2.0012562365829947,
 2.184931677877903,
 2.1167761573195456,
 2.0497565034031866,
 2.1721402701735495,
 2.0642763060331344,
 2.035502164512873,
 1.983254341483116,
 2.1099649310112,
 2.0824568292498586,
 2.2400856107473373

## GRU

In [411]:
class GRU(nn.Module):
    
    def __init__(self,input_size,hidden_size,output_size):
        super(GRU,self).__init__()
        
        self.hidden_size = hidden_size
        
        self.i2h = nn.Linear(input_size + hidden_size , hidden_size)
        self.i2o = nn.Linear(hidden_size , output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.tanh = nn.Tanh()
        self.sigmod = nn.Sigmoid()
        
    def forward(self,inputs,hidden):
        
        combined = torch.cat((inputs,hidden),1)
        a = self.i2h(combined)
        
        z = self.sigmod(a)
        r = self.sigmod(a)
        
        hidden_1 = hidden.mul(r)
        combined2 = torch.cat((inputs,hidden_1),1)
        a1 = self.i2h(combined2)
        h = self.tanh(a1)
        
        hidden = z.mul(hidden) + (1 - z).mul(h)
        
        output = self.i2o(hidden)
        output = self.softmax(output)
        
        return output,hidden
    
    def initHidden(self):
        return torch.zeros(1,self.hidden_size)

In [412]:
gru = GRU(n_letters,n_hidden,n_categories)

In [413]:
criterion = nn.CrossEntropyLoss()
learning_rate = 0.005

def train_gru(category_tensor,line_tensor):
    hidden = gru.initHidden()
    # 将rnn中所有模型的参数梯度设置为0
    gru.zero_grad()
    
    for i in range(line_tensor.size()[0]):
        output , hidden = gru(line_tensor[i] , hidden)
        
    loss = criterion(output , category_tensor)
    loss.backward()
    
    for p in gru.parameters():
        # add_表示张量的相加 以下相当于 -learing_rate * p.grad.data + p.data
        p.data.add_(-learning_rate,p.grad.data)
        
    return output , loss.item()

In [415]:
n_iters = 5000
print_every = 500
plot_every = 100
# GRU
current_loss1 = 0   
all_losses1 = []
# RNN
current_loss2 = 0
all_losses2 = []

start = time.time()

for iters in range(1,n_iters + 1):
    category , line , category_tensor , line_tensor = sample_trainning()
    # GRU
    output1,loss1 = train_gru(category_tensor,line_tensor)
    current_loss1 += loss1
    # RNN
    output2,loss2 = train(category_tensor,line_tensor)
    current_loss2 += loss2
    
    if iters % print_every == 0 :
        guess1 , guess_i1 = category_from_output(output1)
        correct = '✓' if guess1 == category else '✗ (%s)' % category
        print('***现在正在输出GRU的结果***')
        print('%d %d%% (%s) %.4f %s / %s %s' % (iters, iters / n_iters * 100, time_since(start), loss1, line, guess1, correct))
        
    if iters % print_every == 0 :
        guess2 , guess_i2 = category_from_output(output2)
        correct = '✓' if guess2 == category else '✗ (%s)' % category
        print('***现在正在输出RNN的结果***')
        print('%d %d%% (%s) %.4f %s / %s %s' % (iters, iters / n_iters * 100, time_since(start), loss2, line, guess2, correct))
        
    if iters % plot_every == 0:
        all_losses1.append(current_loss1 / plot_every)
        all_losses2.append(current_loss2 / plot_every)
        current_loss1 = 0
        current_loss2 = 0

***现在正在输出GRU的结果***
500 10% (0m 3s) 2.8469 Mata / Spanish ✗ (Portuguese)
***现在正在输出RNN的结果***
500 10% (0m 3s) 3.0739 Mata / Japanese ✗ (Portuguese)
***现在正在输出GRU的结果***
1000 20% (0m 7s) 2.8579 Tochikura / Portuguese ✗ (Japanese)
***现在正在输出RNN的结果***
1000 20% (0m 7s) 0.2611 Tochikura / Japanese ✓
***现在正在输出GRU的结果***
1500 30% (0m 10s) 2.7759 Rios / Chinese ✗ (Portuguese)
***现在正在输出RNN的结果***
1500 30% (0m 10s) 2.4378 Rios / Greek ✗ (Portuguese)
***现在正在输出GRU的结果***
2000 40% (0m 14s) 2.8074 Nieves / Portuguese ✗ (Spanish)
***现在正在输出RNN的结果***
2000 40% (0m 14s) 2.2065 Nieves / Portuguese ✗ (Spanish)
***现在正在输出GRU的结果***
2500 50% (0m 17s) 2.8728 Donoghue / English ✗ (Irish)
***现在正在输出RNN的结果***
2500 50% (0m 17s) 2.6515 Donoghue / French ✗ (Irish)
***现在正在输出GRU的结果***
3000 60% (0m 21s) 2.8435 Tsumemasa / Spanish ✗ (Japanese)
***现在正在输出RNN的结果***
3000 60% (0m 21s) 0.5246 Tsumemasa / Japanese ✓
***现在正在输出GRU的结果***
3500 70% (0m 24s) 2.7480 Riagan / Irish ✓
***现在正在输出RNN的结果***
3500 70% (0m 24s) 1.1044 Riagan / Irish ✓
*

In [417]:
print("*****GRU*****")
all_losses1

*****GRU*****


[2.893720610141754,
 2.8847940802574157,
 2.885782995223999,
 2.882409162521362,
 2.881149389743805,
 2.8807849621772768,
 2.878885102272034,
 2.8676226806640623,
 2.860492997169495,
 2.880382194519043,
 2.883038890361786,
 2.876764600276947,
 2.8755289483070374,
 2.896203107833862,
 2.8749061107635496,
 2.8844662857055665,
 2.876378688812256,
 2.8725864577293394,
 2.8686391282081605,
 2.871350691318512,
 2.8767373538017273,
 2.874485261440277,
 2.8593267512321474,
 2.874884297847748,
 2.8663968300819396,
 2.870559823513031,
 2.8720271468162535,
 2.859490547180176,
 2.8692965006828306,
 2.8452204775810244,
 2.87095388174057,
 2.8631205654144285,
 2.876673855781555,
 2.8731401324272157,
 2.8457133436203,
 2.858735284805298,
 2.859727690219879,
 2.86003940820694,
 2.8603555965423584,
 2.8496090888977053,
 2.8372186279296874,
 2.853773331642151,
 2.8574480605125427,
 2.8557692885398867,
 2.841560192108154,
 2.859740414619446,
 2.8326065039634702,
 2.8455194354057314,
 2.828581783771515,
 

In [419]:
print("*****RNN*****")
all_losses2

*****RNN*****


[1.6506175993010401,
 1.90040491938591,
 1.784052850306034,
 1.797691224887967,
 1.8138391511887313,
 1.7869735908508302,
 1.9064816419035195,
 1.8194462862610816,
 1.8746739821881055,
 1.8324728465080262,
 1.7360979357361794,
 1.5592568418383599,
 1.8262093536555768,
 1.8589673418551684,
 1.850891173928976,
 1.6741281850636005,
 1.878282853513956,
 1.7375369933247566,
 1.7985009685903788,
 1.7369938434660435,
 1.6666024950146676,
 1.9613409201800822,
 1.861621048077941,
 1.8835010581463576,
 1.7810922824963926,
 1.6397405238449574,
 1.933578432686627,
 1.9376843455433845,
 1.7132708989828824,
 1.5891053189337254,
 1.6950543866679073,
 1.6917134291678666,
 1.8855361287295818,
 1.5863108824193477,
 1.6864173837564886,
 1.6311909140273928,
 1.7016473835706711,
 1.7971281175315381,
 1.8035540939867496,
 1.6947978019528092,
 1.6223999182879925,
 2.0280141976475714,
 1.7371028871834278,
 1.6882592931389808,
 1.6783495077490806,
 1.789519346728921,
 1.8175171756744384,
 1.6227541592158377,
 

### 3. 把该RNN模型变成多层RNN模型，观察Loss的变化

In [453]:
class RNN2(nn.Module):
    
    def __init__(self,input_size,hidden_size,output_size):
        super(RNN2,self).__init__()
        
        self.hidden_size = hidden_size
        
        self.i2h = nn.Linear(input_size + hidden_size , input_size)
        self.i2o = nn.Linear(input_size + hidden_size , output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self,inputs,hidden):
        
        combined = torch.cat((inputs,hidden),1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output,hidden
    
    def initHidden(self):
        return torch.zeros(1,self.hidden_size)

In [454]:
rnn2 = RNN2(n_letters,n_letters,n_categories)

In [457]:
rnn2

RNN2(
  (i2h): Linear(in_features=114, out_features=57, bias=True)
  (i2o): Linear(in_features=114, out_features=18, bias=True)
  (softmax): LogSoftmax()
)

In [458]:
criterion = nn.CrossEntropyLoss()
learning_rate = 0.005

def train2(category_tensor,line_tensor):
    hidden1 = rnn2.initHidden()
    hidden2 = rnn2.initHidden()
    # 将rnn中所有模型的参数梯度设置为0
    rnn2.zero_grad()
    
    sencond_inputs = []
    for i in range(line_tensor.size()[0]):
        output , hidden1 = rnn2(line_tensor[i] , hidden1)
        sencond_inputs.append(hidden1)
    for h in range(len(sencond_inputs)):
        output , hidden2 = rnn2(sencond_inputs[h],hidden2)
        
    loss = criterion(output , category_tensor)
    loss.backward()
    
    for p in rnn2.parameters():
        # add_表示张量的相加 以下相当于 -learing_rate * p.grad.data + p.data
        p.data.add_(-learning_rate,p.grad.data)
        
    return output , loss.item()

In [466]:
n_iters = 5000
print_every = 500
plot_every = 100
# 多层RNN
current_loss1 = 0   
all_losses1 = []
# 单层RNN
current_loss2 = 0
all_losses2 = []

start = time.time()

for iters in range(1,n_iters + 1):
    category , line , category_tensor , line_tensor = sample_trainning()
    # 多层RNN
    output1,loss1 = train2(category_tensor,line_tensor)
    current_loss1 += loss1
    # 单层RNN
    output2,loss2 = train(category_tensor,line_tensor)
    current_loss2 += loss2
    
    if iters % print_every == 0 :
        guess1 , guess_i1 = category_from_output(output1)
        correct = '✓' if guess1 == category else '✗ (%s)' % category
        print('***现在正在输出多层RNN的结果***')
        print('%d %d%% (%s) %.4f %s / %s %s' % (iters, iters / n_iters * 100, time_since(start), loss1, line, guess1, correct))
        
    if iters % print_every == 0 :
        guess2 , guess_i2 = category_from_output(output2)
        correct = '✓' if guess2 == category else '✗ (%s)' % category
        print('***现在正在输出单层RNN的结果***')
        print('%d %d%% (%s) %.4f %s / %s %s' % (iters, iters / n_iters * 100, time_since(start), loss2, line, guess2, correct))
        
    if iters % plot_every == 0:
        all_losses1.append(current_loss1 / plot_every)
        all_losses2.append(current_loss2 / plot_every)
        current_loss1 = 0
        current_loss2 = 0

***现在正在输出多层RNN的结果***
500 10% (0m 2s) 2.2804 Yakhnenko / Russian ✓
***现在正在输出单层RNN的结果***
500 10% (0m 2s) 0.6690 Yakhnenko / Russian ✓
***现在正在输出多层RNN的结果***
1000 20% (0m 5s) 2.8682 Gutierrez / Czech ✗ (Spanish)
***现在正在输出单层RNN的结果***
1000 20% (0m 5s) 1.2153 Gutierrez / Spanish ✓
***现在正在输出多层RNN的结果***
1500 30% (0m 7s) 2.5020 Kasimor / Greek ✗ (Czech)
***现在正在输出单层RNN的结果***
1500 30% (0m 7s) 2.2821 Kasimor / Arabic ✗ (Czech)
***现在正在输出多层RNN的结果***
2000 40% (0m 10s) 2.2565 Attard / Japanese ✗ (English)
***现在正在输出单层RNN的结果***
2000 40% (0m 10s) 1.9437 Attard / French ✗ (English)
***现在正在输出多层RNN的结果***
2500 50% (0m 12s) 2.7598 Freitas / Greek ✗ (Portuguese)
***现在正在输出单层RNN的结果***
2500 50% (0m 12s) 1.2936 Freitas / Portuguese ✓
***现在正在输出多层RNN的结果***
3000 60% (0m 15s) 1.6951 Awad / Arabic ✓
***现在正在输出单层RNN的结果***
3000 60% (0m 15s) 1.2004 Awad / Arabic ✓
***现在正在输出多层RNN的结果***
3500 70% (0m 17s) 1.4490 Shuo / Korean ✗ (Chinese)
***现在正在输出单层RNN的结果***
3500 70% (0m 17s) 1.6235 Shuo / Korean ✗ (Chinese)
***现在正在输出多层RNN的结果**

In [467]:
print("*****多层RNN*****")
all_losses1

*****多层RNN*****


[2.6914182925224304,
 2.6663350439071656,
 2.629610325098038,
 2.6034675860404968,
 2.545738945007324,
 2.5613265240192415,
 2.4636453652381896,
 2.4468847790360453,
 2.4790514588356016,
 2.391013212800026,
 2.4410859644412994,
 2.4378731799125672,
 2.3516276919841768,
 2.35990096449852,
 2.355634834468365,
 2.32068509221077,
 2.363510847091675,
 2.230501587241888,
 2.2534689250588418,
 2.229762721657753,
 2.275754176080227,
 2.129536775946617,
 2.194841649532318,
 2.183096173405647,
 2.314688563644886,
 2.0682940724492074,
 2.1791704535484313,
 2.002256373241544,
 2.1270493584871293,
 2.3663136833906173,
 2.125438638627529,
 2.070333033800125,
 2.1414993649721143,
 2.0872655564546587,
 2.1606117691099644,
 2.1327601355314254,
 2.224880239367485,
 2.111905972510576,
 2.1590923546254635,
 2.123886049389839,
 2.1612658050656317,
 2.257351344525814,
 1.972505216896534,
 2.1200771646201613,
 2.049088230133057,
 2.118472330570221,
 2.1095726814866067,
 2.1327398672699927,
 2.00404365375638,

In [468]:
print("*****单层RNN*****")
all_losses2

*****单层RNN*****


[1.720785503089428,
 1.5889805012056604,
 1.5779910457506776,
 1.4663517324067652,
 1.5641002970188855,
 1.6835120202600955,
 1.6494250504672527,
 1.6245457633212208,
 1.6242474947869778,
 1.5332370697706939,
 1.5669024156592786,
 1.7303452488780022,
 1.492285533361137,
 1.5786627519130707,
 1.6716720640659333,
 1.5110215187445284,
 1.6863150171376764,
 1.4880665805190803,
 1.599388481726637,
 1.5342576113343238,
 1.740769415833056,
 1.337001416236162,
 1.7211052253842354,
 1.6127065877616404,
 1.658262789696455,
 1.6163846035674214,
 1.606808689981699,
 1.5299408080708234,
 1.4790775046730413,
 1.9366396598517894,
 1.570744334757328,
 1.5212892780080438,
 1.7288183481246233,
 1.5592882024496795,
 1.5434037662670017,
 1.511865881551057,
 1.675853467658162,
 1.523870654590428,
 1.5942396591417491,
 1.6292838803865015,
 1.6915586812794208,
 1.8894137078523636,
 1.4519678528048099,
 1.5857489686831832,
 1.5374054498970509,
 1.620724435225129,
 1.5933929305151104,
 1.4983191972598433,
 1.5

### 4. Pytorch里边常用nn.NLLoss来代替crossentropy，将criterion改为nn.NLLoss，观察变化

In [219]:
criterion4 = nn.NLLLoss()
learning_rate = 0.005

def train_v4(category_tensor,line_tensor):
    hidden = rnn.initHidden()
    # 将rnn中所有模型的参数梯度设置为0
    rnn.zero_grad()
    
    for i in range(line_tensor.size()[0]):
        output , hidden = rnn(line_tensor[i] , hidden)
        
    loss = criterion4(output , category_tensor)
    loss.backward()
    
    for p in rnn.parameters():
        # add_表示张量的相加 以下相当于 -learing_rate * p.grad.data + p.data
        p.data.add_(-learning_rate,p.grad.data)
        
    return output , loss.item()

In [220]:
n_iters = 2000

print_every = 500
plot_every = 100

# crossentropy
current_loss1 = 0
all_losses1 = []
# NLLLoss
current_loss2 = 0
all_losses2 = []

def time_since(since):
    now = time.time()
    seconds = now - since
    minute = math.floor(seconds / 60)
    seconds -= minute * 60
    
    return '%dm %ds' % (minute,seconds)

start = time.time()

for iters in range(1,n_iters + 1):
    category , line , category_tensor , line_tensor = sample_trainning()
    # crossentropy
    output1,loss1 = train(category_tensor,line_tensor)
    current_loss1 += loss1
    # NLLLoss
    output2,loss2 = train_v4(category_tensor,line_tensor)
    current_loss2 += loss2
    
    if iters % print_every == 0 :
        guess1 , guess_i1 = category_from_output(output1)
        correct = '✓' if guess1 == category else '✗ (%s)' % category
        print('***使用crossentropy损失函数的结果***')
        print('%d %d%% (%s) %.4f %s / %s %s' % (iters, iters / n_iters * 100, time_since(start), loss1, line, guess1, correct))
        
    if iters % print_every == 0 :
        guess2 , guess_i2 = category_from_output(output2)
        correct = '✓' if guess2 == category else '✗ (%s)' % category
        print('***使用NLLLoss的结果***')
        print('%d %d%% (%s) %.4f %s / %s %s' % (iters, iters / n_iters * 100, time_since(start), loss2, line, guess2, correct))
        
    if iters % plot_every == 0:
        all_losses1.append(current_loss1 / plot_every)
        all_losses2.append(current_loss2 / plot_every)
        current_loss1 = 0
        current_loss2 = 0

***使用crossentropy损失函数的结果***
500 25% (0m 2s) 1.4325 Kunisada / Japanese ✓
***使用NLLLoss的结果***
500 25% (0m 2s) 1.3258 Kunisada / Japanese ✓
***使用crossentropy损失函数的结果***
1000 50% (0m 4s) 1.2978 Pho / Vietnamese ✓
***使用NLLLoss的结果***
1000 50% (0m 4s) 1.2657 Pho / Vietnamese ✓
***使用crossentropy损失函数的结果***
1500 75% (0m 5s) 1.5661 Gorecki / Polish ✓
***使用NLLLoss的结果***
1500 75% (0m 5s) 1.4245 Gorecki / Polish ✓
***使用crossentropy损失函数的结果***
2000 100% (0m 8s) 1.3200 Saller / German ✓
***使用NLLLoss的结果***
2000 100% (0m 8s) 1.2177 Saller / German ✓


In [221]:
print('crossentropy:')
print(all_losses1)
print('NLLLoss:')
print(all_losses2)

crossentropy:
[2.445645269751549, 2.532129064798355, 2.4525664973258974, 2.4334159362316132, 2.326144289970398, 2.3374868005514147, 2.383486284613609, 2.306963896751404, 2.3138293850421907, 2.329093291759491, 2.2931398260593414, 2.3485636255145073, 2.3733429205417633, 2.175671352148056, 2.3958961790800095, 2.1925224813818933, 2.1804625084996223, 2.212094279527664, 2.2730108308792114, 2.231453881561756]
NLLLoss:
[2.2589722231030462, 2.3939952838420866, 2.3388511389493942, 2.313135607242584, 2.136678504347801, 2.1386983948946, 2.2003092336654664, 2.1212553709745405, 2.149138662815094, 2.156963657736778, 2.1287244445085527, 2.1765494625270367, 2.194986729621887, 1.9816330878436565, 2.176300084888935, 1.9641265647113324, 1.9950997203588485, 1.9688079485297203, 2.054890798330307, 2.0253399986028673]


In [222]:
'''对比发现使用NLLLoss作为损失函数得到的loss更小 效果更好'''

'对比发现使用NLLLoss作为损失函数得到的loss更小 效果更好'