## Importing the modules

In [1]:
import numpy as np
import csv
import re
from numpy import genfromtxt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import keras
from keras.datasets import reuters
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
from keras import utils as np_utils
import pandas as pd
from __future__ import unicode_literals, print_function, division
from io import open
import glob


import unicodedata
import string
import torch
import random
import time
import math
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import torch.nn as nn
from torch.autograd import Variable

Using TensorFlow backend.


In [2]:
# Locate the data files that match a glob pattern
def findFiles(path):
    """Return the list of filesystem paths matching the glob pattern ``path``.

    The result is whatever ``glob.glob`` yields: an empty list when nothing
    matches, in no guaranteed order.
    """
    matching_paths = glob.glob(path)
    return matching_paths



# Strip accents and drop every character outside the allowed alphabet
def unicodeToAscii(s,all_letters):
    """Reduce ``s`` to the characters contained in ``all_letters``.

    The string is NFD-decomposed first, so an accented letter becomes a base
    letter followed by combining marks. Combining marks (Unicode category
    ``Mn``) are discarded, and so is any remaining character that is not in
    ``all_letters``.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        if ch not in all_letters:
            continue
        kept.append(ch)
    return ''.join(kept)


# Read a file and split it into lines
def readLines(filename,all_letters):
    """Return the lines of the UTF-8 text file ``filename`` as a list of str.

    Leading and trailing whitespace of the whole file is stripped before it is
    split on ``'\\n'``, so the file's final newline does not produce an empty
    trailing entry. Blank lines in the middle of the file are kept as ``''``.

    ``all_letters`` is accepted for call compatibility but is not used: the
    lines are returned as-is, without ASCII normalisation.
    """
    # The context manager closes the handle even if reading raises.
    with open(filename, encoding='utf-8') as handle:
        content = handle.read()
    return content.strip().split('\n')




# Position of a letter inside all_letters, e.g. "a" = 0
def letterToIndex(letter,all_letters):
    """Return the index of ``letter`` within ``all_letters``.

    Uses ``all_letters.find``, so the result is ``-1`` when ``letter`` does
    not occur in ``all_letters``.
    """
    position = all_letters.find(letter)
    return position



# Turn a line into a <line_length x 1 x n_letters> tensor,
# i.e. a sequence of one-hot letter vectors
def lineToTensor(line,n_letters,all_letters):
    """Encode ``line`` as a float tensor of shape ``(len(line), 1, n_letters)``.

    Row ``i`` is the one-hot vector of ``line[i]``, using the character's
    position in ``all_letters`` as the hot index.

    Characters that do not occur in ``all_letters`` are encoded as an all-zero
    row. Previously their lookup result of ``-1`` was used as a tensor index,
    which silently switched on the *last* slot and made every unknown
    character indistinguishable from the final alphabet entry.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for position, letter in enumerate(line):
        index = all_letters.find(letter)
        if index < 0:
            # Unknown character: leave its row as zeros instead of aliasing it.
            continue
        tensor[position][0][index] = 1
    return tensor




In [3]:
class RNN(nn.Module):
    """Minimal recurrent cell built from two linear layers.

    At every step the current input and the previous hidden state are
    concatenated; one linear layer (``i2h``) maps that to the next hidden
    state and another (``i2o``) maps it to class scores, which are turned
    into log-probabilities.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers.

        Args:
            input_size: width of one input step vector.
            hidden_size: width of the hidden state.
            output_size: number of output classes.
        """
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        # forward() produces (batch, output_size) scores, so normalise over
        # dim 1. Leaving dim unset relies on deprecated implicit selection.
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one time step.

        Args:
            input: tensor concatenable with ``hidden`` along dim 1.
            hidden: previous hidden state, as returned by ``initHidden`` or
                by an earlier call.

        Returns:
            ``(output, hidden)``: log-probabilities over the classes and the
            next hidden state.
        """
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape ``(1, hidden_size)``."""
        return Variable(torch.zeros(1, self.hidden_size))


In [4]:
# Interpret the network output, which holds one score per category
def categoryFromOutput(output,all_categories):
    """Return the best-scoring category for a single network output.

    Args:
        output: 2-D tensor of scores; only row 0 is inspected.
        all_categories: sequence of category names, indexed like the columns
            of ``output``.

    Returns:
        ``(name, index)`` where ``index`` is a plain Python ``int`` for the
        highest-scoring column of row 0 and ``name`` is
        ``all_categories[index]``.
    """
    _, top_i = output.data.topk(1)  # plain tensor out of a Variable via .data
    # Convert explicitly: depending on the PyTorch version, indexing yields
    # either an int or a 0-dim tensor, and callers use this as a list index.
    category_i = int(top_i[0][0])
    return all_categories[category_i], category_i


def randomChoice(l):
    """Return one element of the sequence ``l``, chosen uniformly at random."""
    last_index = len(l) - 1
    position = random.randint(0, last_index)
    return l[position]
## Draw one random training sample
def randomTrainingExample(all_categories,category_lines,n_letters,all_letters):
    """Pick a random category and a random line belonging to it.

    Args:
        all_categories: list of category names to choose from.
        category_lines: mapping from category name to its list of lines.
        n_letters: width of the one-hot letter encoding.
        all_letters: alphabet used for the encoding.

    Returns:
        ``(category, line, category_tensor, line_tensor)`` where
        ``category_tensor`` holds the index of ``category`` within
        ``all_categories`` and ``line_tensor`` is ``lineToTensor`` applied to
        ``line``.
    """
    chosen_category = randomChoice(all_categories)
    chosen_line = randomChoice(category_lines[chosen_category])
    label_index = all_categories.index(chosen_category)
    category_tensor = Variable(torch.LongTensor([label_index]))
    encoded_line = lineToTensor(chosen_line, n_letters, all_letters)
    line_tensor = Variable(encoded_line)
    return chosen_category, chosen_line, category_tensor, line_tensor
def randomTestingExample(all_categories,category_lines,n_letters,all_letters):
    """Draw one random sample from a held-out set.

    Sampling is identical to ``randomTrainingExample``; only the data passed
    in differs. Delegating keeps the two from drifting apart.

    Args:
        all_categories: list of category names to choose from.
        category_lines: mapping from category name to its list of lines.
        n_letters: width of the one-hot letter encoding.
        all_letters: alphabet used for the encoding.

    Returns:
        ``(category, line, category_tensor, line_tensor)``, exactly as
        returned by ``randomTrainingExample``.
    """
    return randomTrainingExample(all_categories, category_lines, n_letters, all_letters)

#for i in range(10):
    #category, line, category_tensor, line_tensor = randomTrainingExample()
    

In [5]:
# Run one training step on a single example
def train(rnn,criterion,category_tensor, line_tensor,learning_rate=0.0001):
    """Perform one plain-SGD update of ``rnn`` on a single encoded line.

    The line is fed to the network one step at a time; the loss is computed
    from the output of the last step only.

    Args:
        rnn: module offering ``initHidden()``, ``zero_grad()``,
            ``parameters()`` and a ``(step, hidden) -> (output, hidden)`` call.
        criterion: loss callable taking ``(output, category_tensor)``.
        category_tensor: target passed straight to ``criterion``.
        line_tensor: sequence of step inputs, indexed along dim 0.
        learning_rate: step size of the gradient update.

    Returns:
        ``(output, loss)``: the network output of the last step and the loss
        as a Python ``float``.

    Raises:
        ValueError: if ``line_tensor`` has no steps. Previously this case
            printed a message and then failed on the unbound ``output``.
    """
    if len(line_tensor) == 0:
        raise ValueError("Zero line occured: cannot train on an empty line_tensor")

    hidden = rnn.initHidden()

    rnn.zero_grad()

    for i in range(len(line_tensor)):
        output, hidden = rnn(line_tensor[i], hidden)

    loss = criterion(output, category_tensor)
    loss.backward()

    # Add parameters' gradients to their values, multiplied by learning rate
    for p in rnn.parameters():
        # A parameter that did not contribute to the loss has no gradient,
        # e.g. the hidden-state layer when the line is a single step long.
        if p.grad is None:
            continue
        p.data.add_(-learning_rate * p.grad.data)

    # .item() extracts the scalar; indexing a 0-dim loss with [0] fails on
    # current PyTorch.
    return output, loss.item()

         

In [6]:
def timeSince(since):
    """Format the wall-clock time elapsed since ``since`` as ``'<m>m <s>s'``.

    ``since`` is a timestamp as returned by ``time.time()``.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

In [7]:
# Just return an output given a line
def evaluate(rnn,line_tensor):
    """Run ``rnn`` over an encoded line and return the last step's output.

    No gradients are recorded, since the result is only used for prediction.

    Args:
        rnn: module offering ``initHidden()`` and a
            ``(step, hidden) -> (output, hidden)`` call.
        line_tensor: tensor of step inputs, indexed along dim 0.

    Returns:
        The output produced for the final step of ``line_tensor``.

    Raises:
        ValueError: if ``line_tensor`` has no steps. Previously this failed
            on the unbound ``output``.
    """
    n_steps = line_tensor.size()[0]
    if n_steps == 0:
        raise ValueError("evaluate() needs a line_tensor with at least one step")

    hidden = rnn.initHidden()

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for i in range(n_steps):
            output, hidden = rnn(line_tensor[i], hidden)

    return output


In [None]:
def _loadCategoryLines(pattern, all_letters):
    """Read every file matching ``pattern`` into ``(categories, lines_by_category)``."""
    import os  # local import: the notebook's import cell does not provide os

    categories = []
    lines_by_category = {}
    # glob gives no ordering guarantee; sort so category indices are stable
    # from run to run.
    for filename in sorted(findFiles(pattern)):
        # basename() handles both '/' and the platform's own separator.
        category = os.path.basename(filename).split('.')[0]
        # Empty lines would become zero-length tensors that cannot be fed
        # through the network.
        lines = [line for line in readLines(filename, all_letters) if line]
        if not lines:
            continue
        categories.append(category)
        lines_by_category[category] = lines
    return categories, lines_by_category


def main(n_iters=100000, print_every=5000, plot_every=300, n_confusion=10000,
         n_hidden=100, train_pattern='langdata_test/*.txt',
         test_pattern='langdata/*.txt'):
    """Train the character-level RNN classifier and plot its results.

    The category of a file is its base name up to the first ``'.'``. The
    network is trained on random lines from the training files, the averaged
    training loss is plotted, and a row-normalised confusion matrix is
    computed on random lines from the held-out files.

    Args:
        n_iters: number of single-example training steps.
        print_every: print a progress line every this many steps.
        plot_every: average the loss over this many steps per curve point.
        n_confusion: number of held-out samples for the confusion matrix.
        n_hidden: hidden-state width of the RNN.
        train_pattern: glob pattern of the training files.
        test_pattern: glob pattern of the held-out files.

    Returns:
        List of averaged training losses, one entry per ``plot_every`` steps.

    Raises:
        ValueError: if no usable training file is found, or if no held-out
            category is also a training category.
    """
    all_letters = string.ascii_letters + " .,;'"
    n_letters = len(all_letters)
    criterion = nn.NLLLoss()

    # NOTE(review): training data is read from 'langdata_test' and held-out
    # data from 'langdata' by default -- the directory names look swapped.
    # Kept as in the original; confirm which split is intended.
    all_categories, category_lines = _loadCategoryLines(train_pattern, all_letters)
    test_categories, category_lines_test = _loadCategoryLines(test_pattern, all_letters)

    if not all_categories:
        raise ValueError("No non-empty training files match %r" % train_pattern)

    # The network can only predict categories it was trained on, so held-out
    # categories unknown to it cannot be scored.
    eval_categories = [c for c in test_categories if c in category_lines]
    if not eval_categories:
        raise ValueError("No test category in %r is also a training category" % test_pattern)

    n_categories = len(all_categories)
    # Rows: true category, columns: guessed category, both indexed by
    # all_categories -- the same index space as the network output.
    confusion = torch.zeros(n_categories, n_categories)

    # Construct the RNN
    rnn = RNN(n_letters, n_hidden, n_categories)

    current_loss = 0
    all_losses = []
    start = time.time()

    for iteration in range(1, n_iters + 1):
        # Labels must be drawn from all_categories: both the size of the
        # output layer and the decoding of guesses below are based on it.
        category, line, category_tensor, line_tensor = randomTrainingExample(all_categories, category_lines, n_letters, all_letters)
        output, loss = train(rnn, criterion, category_tensor, line_tensor)
        current_loss += loss

        if iteration % print_every == 0:
            guess, guess_i = categoryFromOutput(output, all_categories)

            correct = '✓' if guess == category else '✗ (%s)' % category

            print('%d %d%% (%s) %.4f %s / %s %s' % (iteration, iteration / n_iters * 100, timeSince(start), loss, line, guess, correct))

        # Add current loss avg to list of losses
        if iteration % plot_every == 0:
            all_losses.append(current_loss / plot_every)
            current_loss = 0

    plt.figure()
    plt.plot(all_losses)

    # Go through a bunch of held-out examples and record which are correctly guessed
    for _ in range(n_confusion):
        category, line, category_tensor, line_tensor = randomTestingExample(eval_categories, category_lines_test, n_letters, all_letters)

        output = evaluate(rnn, line_tensor)
        guess, guess_i = categoryFromOutput(output, all_categories)
        category_i = all_categories.index(category)
        confusion[category_i][int(guess_i)] += 1

    # Normalize by dividing every row by its sum; rows of categories that
    # were never sampled stay zero instead of turning into NaN.
    for i in range(n_categories):
        row_total = confusion[i].sum()
        if row_total > 0:
            confusion[i] = confusion[i] / row_total

    # Set up plot
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(confusion.numpy())
    fig.colorbar(cax)

    # One tick per category, placed explicitly so that every label lines up
    # with its row/column.
    tick_positions = list(range(n_categories))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(all_categories, rotation=90)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(all_categories)

    plt.show()
    return all_losses

In [None]:
# Run the full training/evaluation pipeline and keep the averaged loss curve
# so that it can be plotted again afterwards.
all_losses = main()

5000 5% (2m 16s) 1.6160 Hallo Es sind ca 2000 bilder und speicher 1644 gb / Polish ✗ (German)
10000 10% (4m 23s) 1.6113 Claro pero me resulto raro ya que 1451 sigue siendo / Polish ✗ (Spanish)
15000 15% (6m 26s) 1.6182 Monica estce que tu veux qu’on te ramène à la / Spanish ✗ (French)
20000 20% (8m 29s) 1.6401 da wäre ich mir nicht so sicher Die müssen doch / French ✗ (German)
25000 25% (10m 36s) 1.5010 Mnie jest gorąco / Polish ✓


In [None]:
# Re-plot the training loss with axis labels; each point of all_losses is an
# average over a window of training iterations.
fig, ax = plt.subplots()
ax.plot(all_losses)
ax.set_ylabel('Loss')
ax.set_xlabel('/300 iteration')
ax.set_title('Training Loss')
plt.show()