<a href="https://colab.research.google.com/github/ShivaKondapalli/NLPColabNotebooks/blob/master/Classyfying_names_to_Gender_with_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deep Learning for Natural Language Processing - III

## Introduction

In this notebook, we classify people's names by gender using an LSTM network. The theory behind LSTM networks and how they work is described in the following two notebooks.

[Vanilla Rnn and GRU](https://colab.research.google.com/drive/1SyElHeyoRAY9MtalzeyBMSCvu28bk53x)

[Long Short Term memory networks](https://colab.research.google.com/drive/1w2tK7_SCHdeiV6NEPohjkhru_O3j1W1m)

The interesting part of this notebook is that the data was scraped from the web and then saved to a file. The code used to scrape the data is hosted on my GitHub.

[Web Scraper](https://github.com/ShivaKondapalli/NLPPyTorch/blob/master/Web_Scraper.py)



In [0]:
# All Imports

import torch
import torch.nn as nn
import unicodedata
import string
from io import open
import numpy as np
import time
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import os
import glob

In [15]:
# Directory that is searched for data files, and the glob pattern selecting them.
path = 'sample_data/'
ext = '*.txt'


def get_files(path):
    """Return the files directly inside ``path`` that match the module-level ``ext`` pattern.

    Args:
        path: directory to search.

    Returns:
        list[str]: matching paths in sorted order. ``glob.glob`` yields matches
        in arbitrary order, so sorting keeps the file order (and anything
        derived from it, such as category indices) stable between runs.
    """
    return sorted(glob.glob(os.path.join(path, ext)))


# Alphabet the model can represent: ASCII letters plus a few punctuation marks.
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)


def unicodetoascii(s):
    """Strip accents from ``s`` and drop every character that is not in ``all_letters``.

    The string is NFD-normalised so accented letters split into a base letter
    followed by combining marks (Unicode category ``Mn``); the marks are
    discarded, and so is anything else outside the alphabet.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        if ch in all_letters:
            kept.append(ch)
    return ''.join(kept)


# Containers filled while loading the data files: a mapping from each category
# label to its list of names, and the list of category labels (a label's
# position in this list is used as its class index).
sex_to_name = dict()
all_categories = list()


# Read a file and return its non-empty lines as ASCII names
def readfiles(filename):
    """Read ``filename`` as UTF-8 and return its lines converted by ``unicodetoascii``.

    Args:
        filename: path of a text file with one name per line.

    Returns:
        list[str]: the converted lines, in file order. Lines that are empty
        after conversion are skipped, because an empty name would later be
        encoded as a zero-length sequence tensor.
    """
    # The with-block closes the handle deterministically instead of relying on
    # garbage collection.
    with open(filename, encoding='utf-8') as fh:
        lines = fh.read().strip().split('\n')
    names = (unicodetoascii(line) for line in lines)
    return [name for name in names if name]


for f in get_files(path):
    # The category label is the file name up to its first dot. Taking the
    # basename first makes this independent of directory depth and of the path
    # separator, unlike indexing the result of split('/').
    category = os.path.basename(f).split('.')[0]
    all_categories.append(category)
    lines = readfiles(f)
    sex_to_name[category] = lines

n_categories = len(all_categories)
n_categories

2

In [0]:
def lettertoindex(l):
    """Return the position of character ``l`` within ``all_letters``.

    Raises:
        ValueError: propagated from ``str.index`` when ``l`` is not found.
    """
    position = all_letters.index(l)
    return position


def lettertotensor(l):
    """One-hot encode character ``l`` as a tensor of shape ``(1, len(all_letters))``."""
    one_hot = torch.zeros(1, len(all_letters))
    one_hot[0, lettertoindex(l)] = 1
    return one_hot


def nametotensor(name):
    """One-hot encode ``name`` character by character.

    Returns a tensor of shape ``(len(name), 1, len(all_letters))`` in which row
    ``t`` has a single 1 at the alphabet index of the ``t``-th character.
    """
    encoded = torch.zeros(len(name), 1, len(all_letters))
    for step, ch in enumerate(name):
        column = lettertoindex(ch)
        encoded[step, 0, column] = 1
    return encoded


def categoryfromoutput(output):
    """Return ``(label, index)`` for the highest-scoring entry of ``output``.

    The top-1 index is taken along the last dimension of ``output``; the first
    row of the result is read as the category index and looked up in
    ``all_categories``.
    """
    _, best = output.topk(1)
    index = best[0].item()
    label = all_categories[index]
    return label, index

In [17]:
class LSTM(nn.Module):
    """LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: size of each input vector.
        hidden_size: size of the LSTM hidden state.
        output_size: number of scores produced per sequence.
        num_layers: number of stacked LSTM layers (default 1).
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):

        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Score a batch of sequences.

        Args:
            x: tensor laid out as ``(seq_len, batch, input_size)``; the batch
               size is read from dimension 1.

        Returns:
            Tensor of shape ``(batch, output_size)`` computed from the LSTM
            output at the last time step.
        """
        batch_size = x.size(1)

        h0 = self.init_hidden(batch_size)
        c0 = self.cell_state(batch_size)

        # The final (h_n, c_n) pair is not needed; only the per-step outputs are used.
        output, _ = self.lstm(x, (h0, c0))

        last_output = output[-1]  # batch_size * hidden_size

        fc_out = self.fc(last_output)  # batch_size * output_size

        return fc_out

    def _zero_state(self, batch_size):
        """Zero tensor of shape ``(num_layers, batch_size, hidden_size)``.

        It is created on the same device and with the same dtype as the model's
        parameters, so the model keeps working after ``.to(device)`` or
        ``.double()``.
        """
        reference = self.fc.weight
        return torch.zeros(
            self.num_layers, batch_size, self.hidden_size,
            device=reference.device, dtype=reference.dtype,
        )

    def init_hidden(self, batch_size):
        """Initial hidden state (all zeros) for a batch of ``batch_size`` sequences."""
        return self._zero_state(batch_size)

    def cell_state(self, batch_size):
        """Initial cell state (all zeros) for a batch of ``batch_size`` sequences."""
        return self._zero_state(batch_size)
      
# Seed both random number generators before the model is built, so the initial
# weights and the NumPy-based sampling used later are reproducible across
# "Restart & Run All".
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

# Hidden-state width of the LSTM; the input size is the alphabet size and the
# output size is the number of categories.
n_hidden = 256
lstm = LSTM(n_letters, n_hidden, n_categories)
lstm

LSTM(
  (lstm): LSTM(57, 256)
  (fc): Linear(in_features=256, out_features=2, bias=True)
)

In [0]:
def genderfromoutput(output):
    """Return ``(label, index)`` for the top-scoring category in ``output``.

    This is a thin alias of ``categoryfromoutput``, whose body this function
    used to repeat line for line.
    """
    return categoryfromoutput(output)


def randomtrainningexample():
    """Draw one random (category, name) training pair together with its tensors.

    A category is picked uniformly from ``all_categories``, then a name is
    picked uniformly from that category's list in ``sex_to_name``.

    Returns:
        tuple: ``(sex, name, sex_tensor, name_tensor)`` where ``sex_tensor`` is
        a one-element ``torch.long`` tensor holding the category index and
        ``name_tensor`` is ``nametotensor(name)``.
    """
    # Sample by index: np.random.choice on a Python list first copies the whole
    # list into a NumPy array on every call and returns numpy.str_ values.
    sex_idx = np.random.randint(len(all_categories))
    sex = all_categories[sex_idx]
    names = sex_to_name[sex]
    name = names[np.random.randint(len(names))]
    sex_tensor = torch.tensor([sex_idx], dtype=torch.long)
    name_tensor = nametotensor(name)
    return sex, name, sex_tensor, name_tensor

In [0]:
# NOTE(review): this cell repeats the definitions from the cell directly above
# it; one of the two copies can be deleted.
def genderfromoutput(output):
    """Return ``(label, index)`` for the top-scoring category in ``output``.

    This is a thin alias of ``categoryfromoutput``, whose body this function
    used to repeat line for line.
    """
    return categoryfromoutput(output)


# NOTE(review): second copy of this function (the previous cell defines it as
# well); one of the two copies can be deleted.
def randomtrainningexample():
    """Draw one random (category, name) training pair together with its tensors.

    A category is picked uniformly from ``all_categories``, then a name is
    picked uniformly from that category's list in ``sex_to_name``.

    Returns:
        tuple: ``(sex, name, sex_tensor, name_tensor)`` where ``sex_tensor`` is
        a one-element ``torch.long`` tensor holding the category index and
        ``name_tensor`` is ``nametotensor(name)``.
    """
    # Sample by index: np.random.choice on a Python list first copies the whole
    # list into a NumPy array on every call and returns numpy.str_ values.
    sex_idx = np.random.randint(len(all_categories))
    sex = all_categories[sex_idx]
    names = sex_to_name[sex]
    name = names[np.random.randint(len(names))]
    sex_tensor = torch.tensor([sex_idx], dtype=torch.long)
    name_tensor = nametotensor(name)
    return sex, name, sex_tensor, name_tensor

## Training 

We train the network and plot losses. 

In [0]:
# Step size of the manual SGD update, and the loss applied to the model's scores.
learning_rate = 0.007
criterion_lstm = nn.CrossEntropyLoss()


def train_lstm(sex_tensor, name_tensor):
    """Run one gradient-descent step of the global ``lstm`` on a single example.

    Args:
        sex_tensor: target class index tensor passed to ``criterion_lstm``.
        name_tensor: input tensor passed to the model.

    Returns:
        tuple: ``(output, loss)`` — the model output for ``name_tensor`` and
        the loss as a Python float.
    """
    lstm.zero_grad()

    # Call the module rather than .forward() so registered hooks are run.
    output = lstm(name_tensor)

    loss = criterion_lstm(output.squeeze(1), sex_tensor)
    loss.backward()

    # Plain SGD: p <- p - learning_rate * grad. The update runs under no_grad
    # with the keyword form of add_; the positional add_(alpha, tensor)
    # overload is deprecated.
    with torch.no_grad():
        for p in lstm.parameters():
            if p.grad is not None:
                p.add_(p.grad, alpha=-learning_rate)

    return output, loss.item()


def evaluate(name_tensor, model):
    """Return ``model``'s output for ``name_tensor`` without tracking gradients.

    The model is invoked by calling it (not via ``.forward``) so that any
    registered hooks run, and the call is wrapped in ``torch.no_grad()``
    because no backward pass is performed on the result.
    """
    with torch.no_grad():
        return model(name_tensor)


def predict(name, model, n_predictions=3):
    """Print and return the highest-scoring categories for ``name``.

    Args:
        name: the name to classify; it is encoded with ``nametotensor``.
        model: model passed on to ``evaluate``.
        n_predictions: maximum number of categories to report. It is capped at
            the number of scores the model produces, because ``topk`` raises
            when asked for more entries than exist (the default of 3 exceeds a
            two-category output).

    Returns:
        list: ``[score, category]`` pairs, best first.
    """
    with torch.no_grad():
        output = evaluate(nametotensor(name), model)

        output = output.squeeze(1)

        k = min(n_predictions, output.size(1))
        top_n, top_i = output.topk(k, 1, True)
        predictions_lst = []

        for i in range(k):
            val = top_n[0][i].item()
            cat_idx = top_i[0][i].item()
            print(f'Value: {val}, gender: {all_categories[cat_idx]}')
            predictions_lst.append([val, all_categories[cat_idx]])

    return predictions_lst

In [0]:
def time_taken(start):
    """Format the time elapsed since ``start`` (a ``time.time()`` value) as ``'<m>m <s>s'``."""
    elapsed = time.time() - start
    minutes, seconds = divmod(elapsed, 60)
    return '%dm %ds' % (minutes, seconds)

In [0]:
# Training
n_iters = 100000
print_every = 5000
plot_every = 1000

current_loss = 0
all_losses = []

start = time.time()

for i in range(1, n_iters + 1):
    sex, name, sex_tensor, name_tensor = randomtrainningexample()
    output, loss = train_lstm(sex_tensor, name_tensor)
    current_loss += loss

    # Report progress once every `print_every` iterations. The check has to use
    # the loop counter `i`: `n_iters % print_every` is constant, so testing it
    # printed on every single iteration.
    if i % print_every == 0:
        pred, pred_i = genderfromoutput(output)
        prediction = 'True' if pred == sex else f'False, correct one is {sex}'
        print('%d %d%% (%s) %.4f %s / %s %s' % (i, i / n_iters * 100, time_taken(start), loss, name, pred, prediction))

    # Record the mean loss of the last `plot_every` iterations for plotting.
    if i % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0

confusion = torch.zeros(n_categories, n_categories)
n_confusion = 10000

# Add one to each row: the real category and each column: the predicted category.
# The darker the principal diagonal, the better the model.
for i in range(n_confusion):
    sex, name, sex_tensor, name_tensor = randomtrainningexample()
    # No gradients are needed while evaluating.
    with torch.no_grad():
        output = evaluate(name_tensor, lstm)
    guess, guess_i = genderfromoutput(output)
    # The true row is the sampled label `sex`, not the leftover `category`
    # variable from the data-loading loop.
    real_category_i = all_categories.index(sex)
    confusion[real_category_i][guess_i] += 1

# Normalise each row so it sums to 1.
for i in range(n_categories):
    confusion[i] = confusion[i] / confusion[i].sum()

# Set up fig, axes.
fig = plt.figure()
ax_conf = fig.add_subplot(121)
ax_conf.set_title('Confusion Matrix for two classes')
cax = ax_conf.matshow(confusion.numpy())
fig.colorbar(cax)

# Pin one tick per category before attaching the labels, so every label lines up
# with its row/column regardless of the default tick locator.
tick_positions = list(range(n_categories))
ax_conf.set_xticks(tick_positions)
ax_conf.set_yticks(tick_positions)
ax_conf.set_xticklabels(all_categories, rotation=90)
ax_conf.set_yticklabels(all_categories)

# Plot LSTM losses: each point is the mean loss over `plot_every` iterations.
ax_loss = fig.add_subplot(122)
ax_loss.set_title('LSTM Losses')
ax_loss.set_xlabel(f'Iterations (x{plot_every})')
ax_loss.set_ylabel('Losses')
ax_loss.plot(all_losses)
plt.show()