In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import re
import string
import random
import unicodedata


In [None]:
# Select the compute device once: CUDA when it is available, otherwise the CPU.
if torch.cuda.is_available():
  device=torch.device("cuda")
else:
  device=torch.device("cpu")
device

device(type='cuda')

In [None]:
# Vocabulary indices reserved for the sentence-boundary markers.
SOS_token=0
# NOTE(review): EOS_token shares index 0 with SOS_token, while Lang.index2word
# maps index 1 to "EOS" -- presumably this was meant to be 1. Changing it is only
# safe together with tensorFromSentence, which currently appends SOS_token as
# the sentence terminator.
EOS_token=0

class Lang:
  """Vocabulary of one language: word <-> index maps plus word frequencies.

  Indices 0 and 1 are pre-assigned to the "SOS" and "EOS" markers, so the
  first real word receives index 2.

  Attributes:
    name: label given to this vocabulary.
    word2index: dict mapping each known word to its integer index.
    word2count: dict mapping each known word to the number of times it was added.
    index2word: dict mapping indices back to words (includes the two markers).
    n_word: number of entries in index2word; also the next free index.
  """

  def __init__(self,name):
    self.name=name
    self.word2index={}
    self.word2count={}
    self.index2word={0:"SOS",1:"EOS"}
    self.n_word=2

  def addSentence(self,sentence):
    """Add every token of `sentence` (split on single spaces) to the vocabulary."""
    for word in sentence.split(" "):
      self.addWord(word)

  def addWord(self,word):
    """Give `word` a fresh index on first sight; otherwise increment its count."""
    if word not in self.word2index:
      self.word2index[word]=self.n_word
      # Counts are kept per word. Assigning to self.word2count itself would
      # replace the dict with an int and lose every per-word frequency.
      self.word2count[word]=1
      self.index2word[self.n_word]=word
      self.n_word+=1
    else:
      self.word2count[word]+=1



In [None]:
# Smoke test: build a vocabulary from a sentence containing repeated words
# ("my", "name", "and"), which exercises both branches of Lang.addWord.
sentence="Hey my name and your name sorry and make my this clear and"
lang=Lang("g")
lang.addSentence(sentence)

In [None]:
def unicodeToAscii(s):
  """Return `s` with combining marks removed.

  The string is decomposed with NFD normalization and every character whose
  Unicode category is 'Mn' is dropped; all other characters are kept, so
  non-Latin base letters survive.
  """
  kept=[]
  for ch in unicodedata.normalize('NFD', s):
    if unicodedata.category(ch) == 'Mn':
      continue
    kept.append(ch)
  return ''.join(kept)


In [None]:
# Probe string: a plain "A", an accented Latin letter, and two Greek letters
# (the second one carrying an accent). Used to compare the normalizers below.
s=u"A \u00c0 \u0394 \u038E"
unicodeToAscii(s)

'A A Δ Υ'

In [None]:
# Alternative approach: NFKD-decompose, then drop every character that cannot be
# encoded as ASCII (this also discards the Greek letters entirely).
my_var3 = unicodedata.normalize(u'NFKD', s).encode('ascii', 'ignore').decode('utf8')

In [None]:
" ".join(my_var3.replace(" ","").lower())

'a a'

In [None]:
def normalizeString(s):
    """Lowercase, de-accent and tokenize-by-spacing a raw sentence.

    Steps: lowercase and strip the input, remove combining marks through
    unicodeToAscii, put a space in front of each '.', '!' or '?', and replace
    every run of characters outside [a-zA-Z.!?] with a single space.

    The result is stripped before returning. Otherwise a sentence that starts
    or ends with a replaced character keeps a leading/trailing space, and the
    later `split(" ")` calls would produce empty-string tokens that pollute the
    vocabulary and inflate the length check.
    """
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s.strip()

In [None]:
# Try the regex-based normalizer on the probe string.
normalizeString(s)

'a a '

In [None]:
# def normalizeString(s):
#   my_var=unicodedata.normalize(u"NFKD",s).encode("ascii","ignore").decode("utf8").lower()
#   return " ".join(my_var.replace(" ",""))

In [None]:
# Same call as before; the alternative definition above is commented out, so the
# regex-based normalizeString is still the one in effect.
normalizeString(s)

'a a '

In [None]:
def readLangs(lang1,lang2,reverse=False):
  """Read the tab-separated "<lang1>-<lang2>.txt" corpus and normalize it.

  Args:
    lang1: name of the language in the first column of the file.
    lang2: name of the language in the second column of the file.
    reverse: when True, each pair is reversed and the two Lang objects swap
      roles (lang2 becomes the input language).

  Returns:
    (input_lang, output_lang, pairs), where the Lang objects are still empty
    and `pairs` is a list of lists of normalized sentences, one list per line.
  """
  print("Reading Lines..")

  # Read the file and split into lines. The context manager closes the handle
  # deterministically instead of leaving it to the garbage collector.
  path="/content/drive/MyDrive/dataset/eng-fra/data/%s-%s.txt"%(lang1,lang2)
  with open(path,encoding="utf-8") as corpus:
    lines=corpus.read().strip().split("\n")

  # Split every line into its tab-separated sentences and normalize each one.
  pairs=[[normalizeString(s) for s in l.split("\t")] for l in lines]

  if reverse:
    pairs=[list(reversed(p)) for p in pairs]
    input_lang=Lang(lang2)
    output_lang=Lang(lang1)
  else:
    input_lang=Lang(lang1)
    output_lang=Lang(lang2)

  return input_lang,output_lang,pairs




In [None]:
# strip() removes only leading/trailing whitespace; the inner run of spaces stays.
a="  Hello     Word"
a.strip()

'Hello     Word'

In [None]:
# NOTE: the backslash at the end of the first line is a line continuation inside
# the literal, so the string contains no newline character and split("\n")
# returns a single-element list.
b="this is new game.\
and i love to play"

b.split("\n")

['this is new game.and i love to play']

In [None]:
# A sentence is kept only if it has fewer than MAX_LENGTH space-separated tokens.
MAX_LENGTH=10

# The second sentence of a pair must start with one of these prefixes.
eng_prefixes=(
    "i am",
    "i m",
    "he is",
    "he s",
    "she is",
    "she s",
    "they are",
    "they re",
    "we are",
    "we re",
    "you are",
    "you re",
)


def filterPair(p):
  """Return True when both sentences of `p` are short and p[1] has a known prefix."""
  if len(p[0].split(" ")) >= MAX_LENGTH:
    return False
  if len(p[1].split(" ")) >= MAX_LENGTH:
    return False
  return p[1].startswith(eng_prefixes)


def filterPairs(pairs):
  """Return the sub-list of `pairs` accepted by filterPair, in original order."""
  accepted=[]
  for candidate in pairs:
    if filterPair(candidate):
      accepted.append(candidate)
  return accepted



In [None]:
def prepareData(lang1,lang2,reverse=False):
  """Load the corpus, filter the sentence pairs and populate both vocabularies.

  Delegates reading to readLangs and filtering to filterPairs, then feeds the
  first sentence of each surviving pair to the input vocabulary and the second
  to the output vocabulary. Progress is reported with print().

  Returns:
    (input_lang, output_lang, pairs) with `pairs` being the filtered list.
  """
  input_lang,output_lang,all_pairs=readLangs(lang1,lang2,reverse)
  print("Read Sentence Pairs " , len(all_pairs))

  kept_pairs=filterPairs(all_pairs)
  print("Trimmed to %s sentence pairs "%len(kept_pairs))

  print("Counting Words")
  for entry in kept_pairs:
    input_lang.addSentence(entry[0])
    output_lang.addSentence(entry[1])

  print("Counted Words: ")
  for vocab in (input_lang,output_lang):
    print(vocab.name,vocab.n_word)
  return input_lang,output_lang,kept_pairs


# reverse=True swaps every pair, so "fra" becomes the input language and "eng"
# the output language.
input_lang,output_lang,pairs=prepareData("eng","fra",True)
print(random.choice(pairs))

Reading Lines..
Read Sentence Pairs  135842
Trimmed to 11893 sentence pairs 
Counting Words
Counted Words: 
fra 4920
eng 3228
['nous sommes dans le petrin .', 'we re up a creek without a paddle .']


In [None]:
class EncoderRNN(nn.Module):
  """Single-step GRU encoder: embeds one token index and advances the GRU once."""

  def __init__(self,input_size,hidden_size):
    super().__init__()
    self.input_size=input_size
    self.hidden_size=hidden_size

    # The embedding width equals the GRU width, so vectors feed the GRU directly.
    self.embedding=nn.Embedding(input_size,hidden_size)
    self.gru=nn.GRU(hidden_size,hidden_size)

  def forward(self,x,hidden):
    """Embed `x`, reshape it to (1, 1, -1) and run one GRU step.

    Returns the (output, hidden) pair produced by the GRU.
    """
    step_input=self.embedding(x).view(1,1,-1)
    return self.gru(step_input,hidden)

  def initHidden(self):
    """Return an all-zero hidden state of shape (1, 1, hidden_size) on `device`."""
    return torch.zeros((1,1,self.hidden_size),device=device)

In [None]:
class DecoderRNN(nn.Module):
  """Plain (attention-free) single-step GRU decoder.

  Each call embeds one token index, applies ReLU, advances the GRU by one step
  and projects the GRU output to log-probabilities over the output vocabulary.
  """

  def __init__(self,hidden_size,output_size):
    # super(DecoderRNN) without `self` yields an unbound super object, so
    # nn.Module.__init__ never ran and the first submodule assignment raised.
    super(DecoderRNN,self).__init__()
    self.hidden_size=hidden_size
    self.output_size=output_size

    self.embedding=nn.Embedding(output_size,hidden_size)
    self.gru=nn.GRU(hidden_size,hidden_size)
    self.out=nn.Linear(hidden_size,output_size)
    self.softmax=nn.LogSoftmax(dim=1)

  def forward(self,x,hidden):
    """Run one decoding step.

    Args:
      x: index tensor for the current input token (reshaped to (1, 1, -1) after embedding).
      hidden: GRU hidden state from the previous step.

    Returns:
      (log_probs, hidden): log-softmax scores over the output vocabulary and
      the updated hidden state.
    """
    output=self.embedding(x).view(1,1,-1)
    output=F.relu(output)
    output,hidden=self.gru(output,hidden)
    output=self.softmax(self.out(output[0]))
    return output,hidden

  def initHidden(self):
    """Return an all-zero hidden state of shape (1, 1, hidden_size) on `device`."""
    return torch.zeros(1,1,self.hidden_size,device=device)

In [None]:
class AttnDecoderRNN(nn.Module):
  """Single-step GRU decoder with attention over a fixed number of encoder slots.

  The attention layer maps the concatenated (embedded token, hidden state) to
  `max_length` scores, so `encoder_outputs` is expected to provide exactly
  `max_length` rows.
  """

  def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
    super().__init__()
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.dropout_p = dropout_p
    self.max_length = max_length

    self.embedding = nn.Embedding(output_size, hidden_size)
    self.attn = nn.Linear(2 * hidden_size, max_length)
    self.attn_combine = nn.Linear(2 * hidden_size, hidden_size)
    self.dropout = nn.Dropout(dropout_p)
    self.gru = nn.GRU(hidden_size, hidden_size)
    self.out = nn.Linear(hidden_size, output_size)

  def forward(self, input, hidden, encoder_outputs):
    """Run one decoding step.

    Returns:
      (log_probs, hidden, attn_weights): log-softmax scores over the output
      vocabulary, the updated GRU hidden state, and the softmax-normalized
      attention weights used for this step.
    """
    token_vec = self.dropout(self.embedding(input).view(1, 1, -1))

    # Score every encoder slot from the current token and the previous hidden state.
    attn_scores = self.attn(torch.cat((token_vec[0], hidden[0]), 1))
    attn_weights = F.softmax(attn_scores, dim=1)

    # Weighted sum of the encoder outputs (batched matmul with batch size 1).
    context = torch.bmm(attn_weights.unsqueeze(0),
                        encoder_outputs.unsqueeze(0))

    merged = torch.cat((token_vec[0], context[0]), 1)
    gru_input = F.relu(self.attn_combine(merged).unsqueeze(0))
    output, hidden = self.gru(gru_input, hidden)

    log_probs = F.log_softmax(self.out(output[0]), dim=1)
    return log_probs, hidden, attn_weights

  def initHidden(self):
    """Return an all-zero hidden state of shape (1, 1, hidden_size) on `device`."""
    return torch.zeros((1, 1, self.hidden_size), device=device)

In [None]:
def indexFromSentence(lang,sentence):
  """Map each space-separated token of `sentence` to its index in `lang.word2index`.

  Raises KeyError for a token that is not in the vocabulary.
  """
  lookup=lang.word2index
  indices=[]
  for token in sentence.split(' '):
    indices.append(lookup[token])
  return indices

def tensorFromSentence(lang,sentence):
  """Convert `sentence` to a column tensor of word indices ending with EOS.

  Returns a torch.long tensor of shape (num_tokens + 1, 1) on `device`.
  """
  indexes=indexFromSentence(lang,sentence)
  # A sentence is terminated by the end-of-sentence marker. The start marker is
  # fed to the decoder separately and must not be used as the terminator.
  indexes.append(EOS_token)
  return torch.tensor(indexes,dtype=torch.long,device=device).view(-1,1)

def tensorFromPairs(pairs):
  """Turn one [input_sentence, output_sentence] pair into (input_tensor, target_tensor).

  pairs[0] is indexed with the global input_lang and pairs[1] with output_lang.
  """
  return (
      tensorFromSentence(input_lang,pairs[0]),
      tensorFromSentence(output_lang,pairs[1]),
  )

In [None]:
# Probability of feeding the ground-truth token (instead of the model's own
# prediction) as the next decoder input during a training step.
teacher_forcing_ratio=0.5

def train(input_tensor,target_tensor,encoder,decoder,encoder_optimizer,decoder_optimizer,criterion,max_length=MAX_LENGTH):
  """Run one optimization step on a single sentence pair.

  The input is encoded token by token, then the target is decoded step by step
  while the criterion is accumulated. Both optimizers are stepped once.

  Returns:
    The summed loss divided by the target length (a Python float).
  """
  encoder_hidden=encoder.initHidden()

  encoder_optimizer.zero_grad()
  decoder_optimizer.zero_grad()

  input_length=input_tensor.size(0)
  target_length=target_tensor.size(0)

  # One row per input position; rows past the sentence length stay zero.
  encoder_outputs=torch.zeros(max_length,encoder.hidden_size,device=device)
  for pos in range(input_length):
    step_output,encoder_hidden=encoder(input_tensor[pos],encoder_hidden)
    encoder_outputs[pos]=step_output[0,0]

  decoder_input=torch.tensor([[SOS_token]],device=device)
  decoder_hidden=encoder_hidden

  # Decided once per call, before any decoding step.
  use_teacher_forcing=random.random() < teacher_forcing_ratio

  loss=0
  for step in range(target_length):
    log_probs,decoder_hidden,_=decoder(decoder_input,decoder_hidden,encoder_outputs)
    loss+=criterion(log_probs,target_tensor[step])

    if use_teacher_forcing:
      # Next input is the ground-truth token.
      decoder_input=target_tensor[step]
    else:
      # Next input is the model's own best guess, detached from the graph.
      _,best=log_probs.topk(1)
      decoder_input=best.squeeze().detach()
      if decoder_input.item() == EOS_token:
        break

  loss.backward()

  encoder_optimizer.step()
  decoder_optimizer.step()

  return loss.item() / target_length



In [None]:
import math
import time

def asMinutes(s):
  """Format a duration given in seconds as "<minutes>m <seconds>s"."""
  minutes=math.floor(s/60)
  leftover=s-minutes*60
  return "%dm %ds"%(minutes,leftover)

def timeSince(since,percent):
  """Return "<elapsed> (- <remaining>)" for a job started at `since`.

  `percent` is the completed fraction; the remaining time is extrapolated
  linearly as elapsed / percent - elapsed.
  """
  elapsed=time.time()-since
  remaining=elapsed / (percent) - elapsed
  return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
def trainIters(encoder,decoder,n_iters,print_every=1000,plot_every=100,learning_rate=0.01):
  """Train encoder/decoder for `n_iters` randomly sampled sentence pairs.

  Args:
    encoder, decoder: the modules to optimize (each gets its own SGD optimizer).
    n_iters: number of single-pair training steps.
    print_every: a progress line with the mean loss is printed every this many steps.
    plot_every: the mean loss is recorded for the plot every this many steps.
    learning_rate: SGD learning rate for both optimizers.

  The recorded losses are passed to showPlot when training finishes.
  """
  start=time.time()

  plot_losses=[]
  print_loss_total=0  # reset every print_every steps
  plot_loss_total=0   # reset every plot_every steps

  encoder_optimizer=torch.optim.SGD(encoder.parameters(),lr=learning_rate)
  decoder_optimizer=torch.optim.SGD(decoder.parameters(),lr=learning_rate)

  training_pairs=[tensorFromPairs(random.choice(pairs)) for _ in range(n_iters)]

  criterion=nn.NLLLoss()

  # `step` rather than `iter`, so the builtin is not shadowed.
  for step in range(1,n_iters+1):
    input_tensor,target_tensor=training_pairs[step-1]

    loss=train(input_tensor,target_tensor,encoder,decoder,encoder_optimizer,decoder_optimizer,criterion)

    print_loss_total+=loss
    plot_loss_total+=loss

    # Gate on print_every, the same interval the average is divided by.
    # Gating on plot_every printed far too often and reported losses that were
    # too small by a factor of print_every / plot_every.
    if step % print_every==0:
      print_loss_avg=print_loss_total / print_every
      print_loss_total=0
      # "%.4f": four decimals (the previous "% 4f" was a space-flag/width typo).
      print("%s (%d %d%%) %.4f"%(timeSince(start,step/n_iters),step,step/n_iters*100,print_loss_avg))

    if step % plot_every==0:
      plot_loss_avg = plot_loss_total / plot_every
      plot_losses.append(plot_loss_avg)
      plot_loss_total = 0

  showPlot(plot_losses)

In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    """Plot the sequence of recorded losses with y-axis ticks every 0.2.

    plt.subplots() already creates the figure; the additional plt.figure()
    call that used to precede it left an extra empty figure open on every call.
    """
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

In [None]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
  """Greedily translate `sentence` and collect the attention weights.

  Args:
    encoder, decoder: trained modules; the decoder must return
      (output, hidden, attention) like AttnDecoderRNN.
    sentence: normalized input sentence whose words are all in input_lang.
    max_length: number of encoder slots and maximum number of decoding steps.

  Returns:
    (decoded_words, attentions): the list of output words (ending with '<EOS>'
    when the end marker was produced) and a tensor with one row of attention
    weights per decoding step that was executed.
  """
  with torch.no_grad():
    input_tensor = tensorFromSentence(input_lang, sentence)
    input_length = input_tensor.size()[0]
    encoder_hidden = encoder.initHidden()

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    # Encode the WHOLE sentence before decoding. The decoding code used to be
    # nested inside this loop, so the function returned after encoding only the
    # first input token.
    for ei in range(input_length):
      encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                               encoder_hidden)
      encoder_outputs[ei] += encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

    # The decoder starts from the encoder's final hidden state.
    decoder_hidden = encoder_hidden

    decoded_words = []
    decoder_attentions = torch.zeros(max_length, max_length)

    for di in range(max_length):
      decoder_output, decoder_hidden, decoder_attention = decoder(
          decoder_input, decoder_hidden, encoder_outputs)
      decoder_attentions[di] = decoder_attention.data
      topv, topi = decoder_output.data.topk(1)
      if topi.item() == EOS_token:
        decoded_words.append('<EOS>')
        break
      else:
        decoded_words.append(output_lang.index2word[topi.item()])

      # Greedy decoding: feed the most likely token back in.
      decoder_input = topi.squeeze().detach()

    # Keep only the attention rows of the steps that actually ran.
    return decoded_words, decoder_attentions[:di + 1]

In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    """Translate `n` randomly chosen pairs and print source, reference and output.

    Each sample prints three lines prefixed with '>', '=' and '<', followed by
    an empty line.
    """
    for _ in range(n):
        sample = random.choice(pairs)
        print('>', sample[0])
        print('=', sample[1])
        decoded, _attn = evaluate(encoder, decoder, sample[0])
        print('<', ' '.join(decoded))
        print('')

In [None]:
# Build the encoder and the attention decoder with a shared hidden width; the
# vocabulary sizes come from the Lang objects filled by prepareData.
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_word, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_word, dropout_p=0.1).to(device)

# Train on 100000 randomly sampled pairs, asking for a progress line every 5000 steps.
trainIters(encoder1, attn_decoder1, 100000, print_every=5000)

0m 6s (- 113m 25s) (100 0%)  0.092212
0m 8s (- 67m 33s) (200 0%)  0.073673
0m 9s (- 52m 35s) (300 0%)  0.075381
0m 10s (- 44m 54s) (400 0%)  0.070723
0m 12s (- 40m 31s) (500 0%)  0.074030
0m 13s (- 37m 34s) (600 0%)  0.072584
0m 15s (- 35m 31s) (700 0%)  0.071735
0m 16s (- 34m 2s) (800 0%)  0.072257
0m 17s (- 32m 52s) (900 0%)  0.068275
0m 19s (- 32m 2s) (1000 1%)  0.071043
0m 20s (- 31m 12s) (1100 1%)  0.064095
0m 22s (- 30m 31s) (1200 1%)  0.066370
0m 23s (- 29m 58s) (1300 1%)  0.063057
0m 25s (- 29m 25s) (1400 1%)  0.062651
0m 26s (- 29m 0s) (1500 1%)  0.061973
0m 27s (- 28m 40s) (1600 1%)  0.059055
0m 29s (- 28m 19s) (1700 1%)  0.060068
0m 30s (- 28m 0s) (1800 1%)  0.057395
0m 32s (- 27m 46s) (1900 1%)  0.062586
0m 33s (- 27m 31s) (2000 2%)  0.060321
0m 35s (- 27m 19s) (2100 2%)  0.057020
0m 36s (- 27m 8s) (2200 2%)  0.062423
0m 38s (- 26m 56s) (2300 2%)  0.060950
0m 39s (- 26m 45s) (2400 2%)  0.060469
0m 40s (- 26m 36s) (2500 2%)  0.062315
0m 42s (- 26m 27s) (2600 2%)  0.058360
0m

In [None]:
# Spot-check the trained models on random pairs (default n=10).
evaluateRandomly(encoder1, attn_decoder1)

> je suis votre et vous etes mien .
= i am yours and you are mine .
< i am . . <EOS>

> tu es fort craintive .
= you re very timid .
< you re welcome . <EOS>

> vous etes dement .
= you re demented .
< you re welcome . <EOS>

> il est prevu que je dejeune avec lui .
= i m scheduled to have lunch with him .
< he is doing . <EOS>

> elle est accoutumee a veiller toute la nuit .
= she is used to staying up all night .
< she is crazy about her . <EOS>

> il parle au telephone .
= he s talking on the telephone .
< he is doing . <EOS>

> j en ai marre qu il m engueule .
= i m tired of him bawling me out .
< you re welcome . <EOS>

> vous etes tres sages .
= you re very wise .
< you re welcome . <EOS>

> je me rends compte des difficultes .
= i m aware of the difficulties .
< you re welcome . <EOS>

> je demenage le mois prochain .
= i am moving next month .
< i m kidding . <EOS>



In [None]:
# Decode one sentence and display the raw attention matrix
# (one row per executed decoding step).
output_words, attentions = evaluate(
    encoder1, attn_decoder1, "je suis trop froid .")
plt.matshow(attentions.numpy())

<matplotlib.image.AxesImage at 0x7f6b9647ff90>

In [None]:
def showAttention(input_sentence, output_words, attentions):
    """Draw the attention matrix as a heat map labelled with input and output words.

    Args:
        input_sentence: space-separated source sentence; its tokens plus '<EOS>'
            label the x axis.
        output_words: list of decoded words; they label the y axis.
        attentions: object exposing .numpy() -- presumably the tensor returned
            by evaluate(); TODO confirm.
    """
    # Set up figure with colorbar
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(attentions.numpy(), cmap='bone')
    fig.colorbar(cax)

    # Set up axes
    # NOTE(review): the leading '' presumably pads the label list for a tick
    # that sits before the first cell -- confirm against the matplotlib version in use.
    ax.set_xticklabels([''] + input_sentence.split(' ') +
                       ['<EOS>'], rotation=90)
    ax.set_yticklabels([''] + output_words)

    # Show label at every tick
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    plt.show()


def evaluateAndShowAttention(input_sentence):
    """Translate `input_sentence` with the global models, print it and plot attention.

    Uses encoder1 and attn_decoder1; prints the input and the joined output
    words, then hands the attention weights to showAttention.
    """
    decoded_words, attn_matrix = evaluate(
        encoder1, attn_decoder1, input_sentence)
    translation = ' '.join(decoded_words)
    print('input =', input_sentence)
    print('output =', translation)
    showAttention(input_sentence, decoded_words, attn_matrix)


# Qualitative checks: translate a few hand-picked sentences and plot their attention.
evaluateAndShowAttention("elle a cinq ans de moins que moi .")

evaluateAndShowAttention("elle est trop petit .")

evaluateAndShowAttention("je ne crains pas de mourir .")

evaluateAndShowAttention("c est un jeune directeur plein de talent .")

input = elle a cinq ans de moins que moi .
output = she is wearing . <EOS>
input = elle est trop petit .
output = she is really to <EOS>
input = je ne crains pas de mourir .
output = i am . . <EOS>
input = c est un jeune directeur plein de talent .
output = i m just kidding . <EOS>
