In [1]:
import os
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

In [2]:
class EnglishNames(Dataset):
    def __init__(self, file_path):
        
        self.samples = list(pd.read_table(file_path, header=None)[0])
        self.max_len = max([len(x) for x in self.samples])
        self.start_token = '$'
        self.pad_token = '#'
        #self.samples = [self.start_token + x + self.pad_token * (self.max_len - len(x)) 
        #                for x in self.samples]
        self.samples = [self.start_token + x + self.pad_token for x in self.samples]
        
        self.dictionary = sorted(list(set(list(''.join(self.samples)))))
        self.dict_size = len(self.dictionary)
        self.dict_mapping = dict(zip(self.dictionary, np.arange(self.dict_size)))   
        self.inv_mapping = dict(zip(np.arange(self.dict_size), self.dictionary))  
        
    def __getitem__(self, idx):
        sample_string = self.samples[idx]
        sample = [self.dict_mapping[x] for x in sample_string]
        one_hot = np.zeros((len(sample), self.dict_size))
        one_hot[np.arange(len(sample)), sample] = 1
        sample_one_hot = torch.tensor(one_hot[:-1], dtype=torch.float32)
        target = torch.tensor(sample[1:], dtype=torch.long)
        #target_one_hot = torch.tensor(one_hot[1:], dtype=torch.float32)
        
        return {'name': sample_string,
                'name_encoded': sample,
                'sample': sample_one_hot,
                'target': target
               }
    
    def __len__(self):
        return len(self.samples)


In [3]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
    
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)

    def forward(self, input, hidden):
        input_combined = torch.cat((input, hidden), -1)
        hidden = self.i2h(input_combined)
        output = self.i2o(input_combined)

        return output, hidden

    def initHidden(self):
        return torch.zeros(self.hidden_size)

In [4]:
def train(sample):
    hidden = rnn.initHidden()
    rnn.zero_grad()
    loss = 0

    for i in range(len(sample['sample'])):
        output, hidden = rnn(sample['sample'][i], hidden)
        l = criterion(output.unsqueeze(0), sample['target'][i].unsqueeze(0))
        loss += l
    
    loss /= (i+1)
    loss.backward()

    for p in rnn.parameters():
        p.data.add_(-learning_rate, p.grad.data)

    return output, loss.item() / len(sample['name'])

In [5]:
def generate():
    hidden = rnn.initHidden()
    softmax = nn.Softmax(dim=0)
    input_token = ' '
    name = ''
    
    while input_token!='#':
        input_token_id = mapping[input_token]
        one_hot = np.zeros(dict_size)
        one_hot[input_token_id] = 1
        one_hot = torch.tensor(one_hot, dtype=torch.float32)
    
        output, hidden = rnn(one_hot, hidden)
        output = softmax(output)
        probs = output.detach().numpy()
        input_token_id = np.random.choice(np.arange(len(probs)), p=probs)
        input_token = inv_mapping[input_token_id]
        name += input_token
        
    return name[:-1]

In [6]:
def check():
    # probability for capital letters being first
    
    hidden = rnn.initHidden()
    softmax = nn.Softmax(dim=0)
    input_token = ' '
    name = ''
    
    input_token_id = mapping[input_token]
    one_hot = np.zeros(dict_size)
    one_hot[input_token_id] = 1
    one_hot = torch.tensor(one_hot, dtype=torch.float32)
    
    output, hidden = rnn(one_hot, hidden)
    output = softmax(output)
    probs = output.detach().numpy()
    
    capital_letter_probs = probs[3:28].sum()
    
    # most probable name
    
    while input_token!='#':
        input_token_id = mapping[input_token]
        one_hot = np.zeros(dict_size)
        one_hot[input_token_id] = 1
        one_hot = torch.tensor(one_hot, dtype=torch.float32)
    
        output, hidden = rnn(one_hot, hidden)
        output = softmax(output)
        _, input_token_id = output.topk(1)
        input_token_id = input_token_id.item()
        input_token = inv_mapping[input_token_id]
        name += input_token
        
    return capital_letter_probs, name[:-1]
    

In [8]:
config = {'data_path': 'data/English.txt',
          'l_rate': 0.01,
          'n_epochs': 200,
          'n_per_epoch': 3000,
          'n_hidden_features': 128}

dataset = EnglishNames(file_path=config['data_path'])
dict_size = dataset.dict_size
inv_mapping = dataset.inv_mapping
mapping = dataset.dict_mapping

rnn = RNN(dict_size, config['n_hidden_features'], dict_size)
criterion = nn.CrossEntropyLoss()
learning_rate = config['l_rate']

for epoch in range(config['n_epochs']):
    epoch_loss = 0
    indices = np.random.choice(np.arange(dataset.__len__()), 
                               config['n_per_epoch'], replace=False)
    
    for i in indices:
        sample = dataset.__getitem__(i)
        output, loss = train(sample)
        epoch_loss += loss
    
    if (epoch+1) % 10 == 0:
        epoch_loss /= config['n_per_epoch']
        checking = check()
        print('Epoch {}: Train loss = {:.4f}'.format(epoch+1, epoch_loss))
        print('Prob Capital Letter Being First = {:.3f}, Most likely name = {}'
              .format(checking[0], checking[1]))
        print('Name samples: {}, {}, {}, {}, {}\n'.format(generate(), generate(), generate(), 
                                                  generate(), generate()))

Epoch 10: Train loss = 0.2917
Prob Capital Letter Being First = 0.626, Most likely name = arder
Name samples: yiytsm,  dgliy, Dlrnflly, Ezwgem, tenson

Epoch 20: Train loss = 0.2793
Prob Capital Letter Being First = 0.693, Most likely name = arey
Name samples: Bacr, Siavder, Hiocurus, Galg, Jodeal

Epoch 30: Train loss = 0.2711
Prob Capital Letter Being First = 0.725, Most likely name = aner
Name samples: wbotte, Bank, Ularoes, pailot, Oedsrn

Epoch 40: Train loss = 0.2667
Prob Capital Letter Being First = 0.744, Most likely name = and
Name samples: $Edckedy, saor, BVeerry, cCuhton, icgersyla

Epoch 50: Train loss = 0.2647
Prob Capital Letter Being First = 0.756, Most likely name = ane
Name samples: qelles, Dverl, Waith, mor, g

Epoch 60: Train loss = 0.2620
Prob Capital Letter Being First = 0.764, Most likely name = ers
Name samples: Fens, Qvirles, Yies, Roder, Corain

Epoch 70: Train loss = 0.2596
Prob Capital Letter Being First = 0.771, Most likely name = ard
Name samples: manes, Gy