<a href="https://colab.research.google.com/github/Rahul-dsml/Myprojects/blob/main/Eng_French_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [115]:
#from google.colab import drive
#drive.mount('/content/drive')

In [116]:
from io import open
import unicodedata
import string
import re
import random
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device= torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [117]:
SOS_token= 0
EOS_token = 1

In [118]:
class Lang:
  def __init__(self, name):
    self.name = name
    self.W2I = { 'SOS':SOS_token, 'EOS': EOS_token}
    self.I2W= {SOS_token: 'SOS', EOS_token: 'EOS'}
    self.W2C = {}
    self.n_words = 2
  
  def addSentence(self, s):
    for word in s.split(" "):
      self.addWord(word)

  def addWord(self, w):
    if w not in self.W2I:
      self.W2I[w]= self.n_words
      self.W2C[w]= 1
      self.I2W[self.n_words] = w
      self.n_words += 1
    else:
      self.W2C[w]+=1
  
  def printAllWords(self):
    words= list(self.W2I.keys())
    for word in words:
      print(word)

In [119]:
L= Lang('Eng')

In [120]:
L.addWord('NLP')
L.addSentence('I am learning Machine Translation')

In [121]:
L.printAllWords()

SOS
EOS
NLP
I
am
learning
Machine
Translation


In [122]:
def unicode2ascii(s):
  return "".join(
      c for c in unicodedata.normalize('NFD',s) if unicodedata.category(c) != 'Mn'
  )

In [123]:
def normalizeString(s):
  s= unicode2ascii(s.lower().strip())
  s= re.sub(r'([.!?])',r'\1',s)
  s= re.sub(r'[^a-zA-Z.!?]+',' ',s)
  return s

In [124]:
print(normalizeString('saudgfeyugxce7y978d6y9'))

saudgfeyugxce y d y 


## Read the data file

In [125]:
## Read the data file
def readLangs():
  lines= open('/content/drive/MyDrive/NLP/eng-fra.txt', encoding='utf-8').read().strip().split('\n')
  pairs= [[normalizeString(s) for s in l.split('\t')] for l in lines]
  input_lang= Lang('eng')
  output_lang= Lang('fra')
  return input_lang, output_lang, pairs

In [126]:
I,O,P= readLangs()

In [127]:
P[0:5]

[['go.', 'va !'],
 ['run!', 'cours !'],
 ['run!', 'courez !'],
 ['wow!', 'ca alors !'],
 ['fire!', 'au feu !']]

In [128]:
def prepareData(I,O,P):
  Max_Len= 0
  for pair in P:
    I.addSentence(pair[0])
    O.addSentence(pair[1])
    Max_Len = max(Max_Len, len(pair[0].split()), len(pair[1].split()))
  return I, O, Max_Len

In [129]:
input_lang, output_lang, Max_Len= prepareData(I,O,P)

In [130]:
Max_Len

59

In [131]:
input_lang.n_words

20753

In [132]:
output_lang.n_words

29481

In [None]:
output_lang.printAllWords()

In [134]:
print(random.choice(P))

['her belief in god is very firm.', 'c est une fervente croyante.']


In [135]:
pairs= P

## Encoder RNN

In [136]:
class EncoderRNN(nn.Module):
  def __init__(self, vocabSize, hidden_size):
    super(EncoderRNN, self).__init__()
    self.hidden_size = hidden_size
    self.E = nn.Embedding(vocabSize,hidden_size)
    self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, bidirectional=True)
  def forward(self, input, hidden):
    emb= self.E(input).view(1,1,-1)
    output,hidden = self.gru(emb, hidden)
    return output, hidden
  def initHidden(self):
    return torch.zeros(2,1,self.hidden_size, device= device)

## Decoder RNN

In [137]:
class DecoderRNN(nn.Module):
  def __init__(self, hidden_size, vocabSize, max_length = Max_Len):
    super(DecoderRNN, self).__init__()
    self.hidden_size = hidden_size
    self.output_size = vocabSize
    self.max_length = Max_Len
    self.E = nn.Embedding(self.output_size, self.hidden_size)
    self.attn = nn.Linear(self.hidden_size*2, self.max_length)
    self.attn_combine = nn.Linear(self.hidden_size*3, self.hidden_size)
    self.gru = nn.GRU(self.hidden_size, self.hidden_size)
    self.out = nn.Linear(self.hidden_size, self.output_size)

  def forward(self, input, hidden, encoder_outputs):
    emb = self.E(input).view(1,1,-1)
    attn_w = F.softmax(self.attn(torch.cat((emb[0], hidden[0]), 1)), dim=1)
    attn_A = torch.bmm(attn_w.unsqueeze(0), encoder_outputs.unsqueeze(0))
    output = torch.cat((emb[0], attn_A[0]),1)
    output = self.attn_combine(output.unsqueeze(0))
    output = F.relu(output)
    output, hidden = self.gru(output, hidden)
    output = F.log_softmax(self.out(output[0]), dim= 1)
    return output, hidden, attn_w

  def initHidden(self):
    return torch.zeros(1,1,self.hidden_size, device= device)


## Decoder RNN Helper Functions

In [138]:
def indexesFromSentence(lang, s):
  return [lang.W2I[w] for w in s.split()]

def tensorFromSentence(lang,s):
  idx = indexesFromSentence(lang, s)
  idx.append(EOS_token)
  return torch.tensor(idx, dtype = torch.long, device = device).view(-1,1)

def tensorsFromPair(pair):
  input_tensor = tensorFromSentence(input_lang, pair[0])
  output_tensor = tensorFromSentence(output_lang, pair[1])
  return (input_tensor, output_tensor)

## Training

In [139]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, loss_fn, max_length= Max_Len):
  encoder_hidden = encoder.initHidden()
  encoder_optimizer.zero_grad()
  decoder_optimizer.zero_grad()
  input_length = input_tensor.size(0)
  target_length = target_tensor.size(0)
  encoder_outputs = torch.zeros(max_length, 2*encoder.hidden_size, device=device)
  loss = 0
  for ei in range(input_length):
    encoder_output, encoder_hidden = encoder(
        input_tensor[ei], encoder_hidden)
    out_reshaped = encoder_output.view(1,1,2,encoder.hidden_size)
    out_forward = out_reshaped[:,:,0,:]
    out_backward = out_reshaped[:,:,1,:]
    encoder_outputs[ei] = torch.cat((out_forward[0,0], out_backward[0,0]),0)
  decoder_input = torch.tensor([[SOS_token]], device= device)
  h_reshaped = encoder_hidden.view(1,2,1,encoder.hidden_size)
  decoder_hidden = h_reshaped[:,0,:,:]

  for di in range(target_length):
    decoder_output, decoder_hidden, decoder_attention = decoder(
        decoder_input, decoder_hidden, encoder_outputs
    )
    topv, topi = decoder_output.topk(1)
    decoder_input = topi.squeeze().detach()
    loss+= loss_fn(decoder_output, target_tensor[di])
    if decoder_input.item() == EOS_token:
      break
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss.item()/target_length

### Stochastic Gradient Descent

In [140]:
def trainIter(encoder, decoder, n_iters, lr = 0.001):
  totalLoss= 0
  encoder_optimizer = optim.SGD(encoder.parameters(), lr=lr)
  decoder_optimizer = optim.SGD(decoder.parameters(), lr=lr)
  training_pairs = [tensorsFromPair(random.choice(pairs)) for i in range(n_iters)]
  loss_fn = nn.NLLLoss()
  for iter in range(n_iters):
    training_pair = training_pairs[iter]
    input_tensor = training_pair[0]
    target_tensor = training_pair[1]
    loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, loss_fn)
    totalLoss+= loss
    print(totalLoss/(iter+1))


## NMT Training

In [141]:
hidden_size = 128
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = DecoderRNN(hidden_size, output_lang.n_words).to(device)

In [None]:
trainIter(encoder, decoder, 100, lr = 0.001)

## Evaluation of Neural Machine Translation

In [143]:
def evaluate(encoder, decoder, s, max_length= Max_Len):
  with torch.no_grad():
    input_tensor = tensorFromSentence(input_lang, s)
    input_length = input_tensor.size()[0]
    encoder_hidden = encoder.initHidden()
    encoder_outputs = torch.zeros(max_length, 2*encoder.hidden_size, device=device)
    
    for ei in range(input_length):
      encoder_output, encoder_hidden = encoder(
          input_tensor[ei], encoder_hidden)
      out_reshaped = encoder_output.view(1,1,2,encoder.hidden_size)
      out_forward = out_reshaped[:,:,0,:]
      out_backward = out_reshaped[:,:,1,:]
      encoder_outputs[ei] = torch.cat((out_forward[0,0], out_backward[0,0]),0)
    decoder_input = torch.tensor([[SOS_token]], device= device)
    h_reshaped = encoder_hidden.view(1,2,1,encoder.hidden_size)
    decoder_hidden = h_reshaped[:,0,:,:]

    decoder_words = []
    decoder_att = torch.zeros(max_length, max_length)

    for di in range(max_length):
      decoder_output, decoder_hidden, decoder_attention = decoder(
        decoder_input, decoder_hidden, encoder_outputs)
      decoder_att[di] = decoder_attention.data
      topv, topi = decoder_output.topk(1)
      if topi.item()== EOS_token:
        decoder_words.append('<EOS>')
        break
      else:
        decoder_words.append(output_lang.I2W[topi.item()])
      decoder_input = topi.squeeze().detach()

    return decoder_words


In [144]:
print(evaluate(encoder, decoder, pairs[0][0]), pairs[0][1])

['corrompt', 'grandement', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.', 'atlanta', 'remerciement.'] va !
