<a href="https://colab.research.google.com/github/WibuSOS/live-chat-translator/blob/ai-branch/Grammarly_Check.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import unicodedata
import re
import math
import random
from random import shuffle
from io import open
import numpy as np
import matplotlib.pyplot as plt
import pickle

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
# import torch.cuda

# this line clears sys to allow for argparse to work as gradient clipper
import sys; sys.argv=['']; del sys

In [None]:
# This function converts a Unicode string to plain ASCII 
# from https://stackoverflow.com/a/518232/2809427
# def uniToAscii(sentence):
#     return ''.join(
#         c for c in unicodedata.normalize('NFD', sentence)
#         if unicodedata.category(c) != 'Mn'
#     )

# Lowercase, trim, and remove non-letter characters (from pytorch)
def normalizeString(s):
    """Normalize a raw sentence into lowercase, space-separated tokens.

    Steps, in order: replace the `` ##AT##-##AT## `` marker with a space,
    lowercase and trim, put a space in front of every ``.``, ``!`` and ``?``,
    and finally collapse each run of characters other than ASCII letters and
    those three punctuation marks into a single space.

    Note that trimming happens before the last step, so the result can still
    begin or end with a space (e.g. when the input starts with digits).
    """
    without_marker = re.sub(r" ##AT##-##AT## ", r" ", s)
    lowered = without_marker.lower().strip()
    spaced_punctuation = re.sub(r"([.!?])", r" \1", lowered)
    return re.sub(r"[^a-zA-Z.!?]+", r" ", spaced_punctuation)

# Denote patterns that sentences must start with to be kept in dataset. 
# Can be changed if desired (from pytorch)
# Contraction prefixes carry a trailing space so that, e.g., "she s " matches
# the normalized form of "she's ..." but not "she said ...".
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

# Filters each input-output pair, keeping a pair only when both of its sentences
# have fewer than max_length space-separated words.
# If start_filter is true, the pair is additionally dropped unless its second
# sentence starts with one of eng_prefixes.
def filterPair(p, max_length, start_filter):
    """Decide whether the sentence pair ``p`` stays in the dataset.

    A pair passes when both ``p[0]`` and ``p[1]`` split on single spaces into
    fewer than ``max_length`` pieces. When ``start_filter`` is true, ``p[1]``
    must also start with one of ``eng_prefixes``.
    """
    # `and` short-circuits: p[1] is only inspected when p[0] is short enough
    within_limit = (len(p[0].split(' ')) < max_length
                    and len(p[1].split(' ')) < max_length)
    if not start_filter:
        return within_limit
    return within_limit and p[1].startswith(eng_prefixes)

# Filters all of the input-output language pairs in the dataset using filterPair 
# for each pair (from pytorch)
def filterPairs(pairs, max_length, start_filter):
    """Return, in their original order, the pairs accepted by ``filterPair``."""
    def keep(pair):
        return filterPair(pair, max_length, start_filter)

    return list(filter(keep, pairs))

In [None]:
# Reserved vocabulary indices. Lang.__init__ seeds both of its lookup tables
# with these three entries, so regular words are numbered from 3 upwards.

# start of sentence tag
SOS_token = 0

# end of sentence tag
EOS_token = 1

# unknown word tag (this is used to handle words that are not in our Vocabulary)
UNK_token = 2

# Lang class, used to store the vocabulary of each language
class Lang:
    """Vocabulary of a single language.

    Holds the word <-> index lookup tables (pre-seeded with the SOS, EOS and
    <UNK> tags) and per-word occurrence counts. The methods are written to be
    used in this order: ``countSentence`` on every sentence, ``createCutoff``
    once, then ``addSentence`` on every sentence.
    """

    def __init__(self, language):
        self.language_name = language
        # word -> index, pre-seeded with the three reserved tags
        self.word_to_index = {"SOS": SOS_token, "EOS": EOS_token, "<UNK>": UNK_token}
        # word -> number of occurrences recorded by countWords
        self.word_to_count = {}
        # index -> word, the inverse of word_to_index
        self.index_to_word = {SOS_token: "SOS", EOS_token: "EOS", UNK_token: "<UNK>"}
        # number of entries in the vocabulary, also the next free index
        self.vocab_size = 3
        # a word must occur MORE often than this to enter the vocabulary;
        # -1 lets every counted word in
        self.cutoff_point = -1

    def countSentence(self, sentence):
        """Count every space-separated word of ``sentence``."""
        for word in sentence.split(' '):
            self.countWords(word)

    # counts the number of times each word appears in the dataset
    def countWords(self, word):
        """Increment the occurrence count of ``word``."""
        self.word_to_count[word] = self.word_to_count.get(word, 0) + 1

    # if the number of unique words in the dataset is larger than the
    # specified max_vocab_size, creates a cutoff point that is used to
    # leave infrequent words out of the vocabulary
    def createCutoff(self, max_vocab_size):
        """Set ``cutoff_point`` from the counted word frequencies.

        When more than ``max_vocab_size`` distinct words were counted, the
        cutoff becomes the frequency found at position ``max_vocab_size`` of
        the descending frequency list. Because ``addWord`` requires a count
        strictly greater than the cutoff, words tied at that frequency are all
        left out.
        """
        word_freqs = sorted(self.word_to_count.values(), reverse=True)
        if len(word_freqs) > max_vocab_size:
            self.cutoff_point = word_freqs[max_vocab_size]

    # assigns each unique word in a sentence a unique index
    def addSentence(self, sentence):
        """Register the words of ``sentence`` and return its filtered form.

        Returns the sentence with every word that ``addWord`` rejects replaced
        by the unknown tag. The output has exactly one token per input token
        (empty tokens included), joined by single spaces.
        """
        return ' '.join(self.addWord(word) for word in sentence.split(' '))

    # assigns a word a unique index if not already in vocabulary
    # and it appears often enough in the dataset
    # (self.word_to_count is larger than self.cutoff_point)
    def addWord(self, word):
        """Add ``word`` to the vocabulary if it is frequent enough.

        Returns ``word`` itself when it is (or already was) accepted, and the
        unknown tag otherwise.

        Raises:
            KeyError: if ``word`` was never counted.
        """
        if self.word_to_count[word] <= self.cutoff_point:
            # resolve the unknown tag through this instance's own tables
            # instead of a hardcoded index
            return self.index_to_word[self.word_to_index["<UNK>"]]
        if word not in self.word_to_index:
            self.word_to_index[word] = self.vocab_size
            self.index_to_word[self.vocab_size] = word
            self.vocab_size += 1
        return word

In [None]:
# converts a sentence to one hot encoding vectors - pytorch allows us to just
# use the number corresponding to the unique index for that word,
# rather than a complete one hot encoding vector for each word

def indexesFromSentence(lang, sentence):
    """Map each space-separated word of ``sentence`` to its vocabulary index.

    Args:
        lang: object exposing a ``word_to_index`` dict that contains "<UNK>".
        sentence: string of words separated by single spaces.

    Returns:
        A new list of indices, one per word; words missing from
        ``lang.word_to_index`` get the index of "<UNK>".
    """
    word_to_index = lang.word_to_index
    indexes = []
    for word in sentence.split(' '):
        index = word_to_index.get(word)
        if index is None:
            # only a missing word falls back to the unknown tag; any other
            # failure is no longer swallowed by a bare except
            index = word_to_index["<UNK>"]
        indexes.append(index)
    return indexes

def tensorFromSentence(lang, sentence):
    """Encode ``sentence`` as a 1-D LongTensor of word indices ending in EOS.

    The tensor is moved to the GPU when the module-level ``use_cuda`` flag is
    true.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    encoded = torch.LongTensor(token_ids).view(-1)
    return encoded.cuda() if use_cuda else encoded

# converts from tensor of one hot encoding vector indices to sentence
# def sentenceFromTensor(lang, tensor):
#     raw = tensor.data
#     words = []
#     for num in raw:
#         words.append(lang.index_to_word[num.item()])
#     return ' '.join(words)

In [None]:
# prepares both the input and output Lang classes from the passed dataset
def prepareLangs(file_path, reverse=False):
    """Read a parallel corpus and return its normalized sentence pairs.

    Args:
        file_path: list with one or two file paths. One path: each line of the
            file holds the sentences of a pair separated by tabs. Two paths:
            line-aligned files with one sentence per line.
        reverse: when true, the order of the sentences inside every pair is
            reversed.

    Returns:
        A list of pairs; each pair is a list of sentences passed through
        ``normalizeString``.

    Raises:
        ValueError: if ``file_path`` does not hold one or two paths, or if the
            two files do not have the same number of lines.
    """
    if len(file_path) not in (1, 2):
        raise ValueError(
            "file_path must hold 1 or 2 paths, got %s" % len(file_path))

    print("Reading lines...")
    if len(file_path) == 2:
        lang1_lines = _readLines(file_path[0])
        lang2_lines = _readLines(file_path[1])
        if len(lang1_lines) != len(lang2_lines):
            print("Input and output text sizes do not align")
            print("Number of lang1 lines: %s " %len(lang1_lines))
            print("Number of lang2 lines: %s " %len(lang2_lines))
            raise ValueError("Input and output text sizes do not align")
        pairs = [[normalizeString(first), normalizeString(second)]
                 for first, second in zip(lang1_lines, lang2_lines)]
        # same shape as a raw line of the single-file format
        sample = lang1_lines[0] + '\t' + lang2_lines[0]
    else:
        lines = _readLines(file_path[0])
        pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
        sample = lines[0]
    print("pairs=%s with file_path=%s has been created " % (len(pairs), len(file_path)))
    # show the first raw (un-normalized) entry as a sanity check
    print(sample)

    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        print("pair reverse has been created")
    else:
        print("pair reverse is not created")

    return pairs


def _readLines(path):
    """Return the stripped content of the UTF-8 text file ``path`` as lines."""
    with open(path, encoding='utf-8') as handle:
        return handle.read().strip().split('\n')

In [None]:
class EncoderRNN(nn.Module):
    """Encoder: embedding, dropout and a (optionally bidirectional) LSTM.

    Args:
        input_size: number of rows of the embedding table (vocabulary size).
        hidden_size: embedding width and LSTM hidden size.
        layers: number of stacked LSTM layers.
        dropout: probability used both for the dropout on the embeddings and
            for the LSTM's own ``dropout`` argument.
        bidirectional: whether the LSTM is bidirectional.
    """

    def __init__(self,input_size,hidden_size,layers=1,dropout=0.1,
                bidirectional=True):
        super().__init__()

        self.directions = 2 if bidirectional else 1
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = layers
        self.embedder = nn.Embedding(input_size, hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size,
                            num_layers=layers, dropout=dropout,
                            bidirectional=bidirectional, batch_first=False)
        # Not used by forward(), but it is a registered submodule: removing it
        # would change the keys of state_dict().
        self.fc = nn.Linear(hidden_size * self.directions, hidden_size)

    def forward(self, input_data, h_hidden, c_hidden):
        """Run the LSTM over ``input_data``.

        Args:
            input_data: LongTensor of word indices; the LSTM is built with
                ``batch_first=False``, so it is read as (seq_len, batch).
            h_hidden, c_hidden: initial hidden and cell states, e.g. from
                ``create_init_hiddens``.

        Returns:
            ``(output, (h_n, c_n))`` exactly as returned by ``nn.LSTM``.
        """
        embedded_data = self.dropout(self.embedder(input_data))
        output, state = self.lstm(embedded_data, (h_hidden, c_hidden))

        return output, state

    # creates initial hidden states for encoder corresponding to batch size
    def create_init_hiddens(self, batch_size):
        """Return zero ``(h_0, c_0)`` states for a batch of ``batch_size``.

        Each tensor has shape (num_layers * directions, batch_size,
        hidden_size) and lives on the same device as the embedding weights.
        """
        shape = (self.num_layers * self.directions, batch_size, self.hidden_size)
        device = self.embedder.weight.device
        h_hidden = torch.zeros(shape, device=device)
        c_hidden = torch.zeros(shape, device=device)

        return h_hidden, c_hidden

In [None]:
class DecoderAttn(nn.Module):
    """LSTM decoder with attention over the encoder outputs.

    Args:
        hidden_size: embedding width and LSTM hidden size.
        output_size: size of the output vocabulary.
        layers: number of stacked LSTM layers.
        dropout: probability used both for the dropout on the embeddings and
            for the LSTM's own ``dropout`` argument.
        bidirectional: whether the LSTM is bidirectional.
    """

    def __init__(self, hidden_size, output_size, layers=1, dropout=0.1, bidirectional=True):
        super().__init__()

        self.directions = 2 if bidirectional else 1
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.num_layers = layers
        self.embedder = nn.Embedding(output_size, hidden_size)
        self.dropout = nn.Dropout(dropout)
        # projects the encoder outputs before they are scored against the
        # decoder's top-layer hidden state
        self.score_learner = nn.Linear(hidden_size * self.directions,
                                       hidden_size * self.directions)
        self.lstm = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size,
                            num_layers=layers, dropout=dropout,
                            bidirectional=bidirectional, batch_first=False)
        # input is the context vector concatenated with the top hidden state
        self.context_combiner = nn.Linear((hidden_size * self.directions)
                                          + (hidden_size * self.directions), hidden_size)
        self.tanh = nn.Tanh()
        self.output = nn.Linear(hidden_size, output_size)
        self.soft = nn.Softmax(dim=1)
        self.log_soft = nn.LogSoftmax(dim=1)

    def forward(self, input_data, h_hidden, c_hidden, encoder_hiddens):
        """Run one decoding step.

        Args:
            input_data: LongTensor of word indices, read as (seq_len, batch)
                because the LSTM uses ``batch_first=False``.
            h_hidden, c_hidden: LSTM hidden and cell states to start from.
            encoder_hiddens: encoder LSTM output; the permutes and ``bmm``
                calls below require (src_len, batch, hidden_size * directions).

        Returns:
            ``(pred, state)``: ``pred`` holds log-probabilities of shape
            (batch, output_size); ``state`` is the ``(h_n, c_n)`` tuple
            returned by the LSTM.
        """
        embedded_data = self.dropout(self.embedder(input_data))
        batch_size = embedded_data.shape[1]
        lstm_output, state = self.lstm(embedded_data, (h_hidden, c_hidden))

        # h_n viewed as (layers, directions, batch, hidden); keep the top layer
        top_hidden = state[0].view(self.num_layers, self.directions,
                                   lstm_output.shape[1],
                                   self.hidden_size)[self.num_layers - 1]
        # -> (batch, hidden * directions, 1)
        top_hidden = top_hidden.permute(1, 2, 0).contiguous().view(batch_size, -1, 1)

        # one score per source position: (batch, src_len, 1)
        prep_scores = self.score_learner(encoder_hiddens.permute(1, 0, 2))
        scores = torch.bmm(prep_scores, top_hidden)
        # Softmax(dim=1) normalizes over the source positions
        attn_scores = self.soft(scores)
        # attention-weighted sum of the encoder outputs: (batch, hidden * directions, 1)
        con_mat = torch.bmm(encoder_hiddens.permute(1, 2, 0), attn_scores)
        combined = torch.cat((con_mat, top_hidden), dim=1).view(batch_size, -1)
        h_tilde = self.tanh(self.context_combiner(combined))
        pred = self.log_soft(self.output(h_tilde))

        return pred, state

In [None]:
# Returns the predicted translation of a given input sentence. Predicted
# translation is trimmed to length of cutoff_length argument

def evaluate(encoder, decoder, input_lang, output_lang, sentence, cutoff_length):
    """Greedily translate ``sentence`` and return the predicted words.

    Args:
        encoder, decoder: the encoder / attention-decoder pair to run.
        input_lang: vocabulary used to index ``sentence``.
        output_lang: vocabulary used to turn predicted indices into words;
            its ``word_to_index`` must contain "SOS" and "EOS".
        sentence: normalized input sentence.
        cutoff_length: maximum number of decoding steps.

    Returns:
        The predicted words joined by single spaces. Decoding stops at the
        first EOS prediction (EOS itself is not included) or after
        ``cutoff_length`` steps.

    Raises:
        KeyError: if "SOS" or "EOS" is missing from
            ``output_lang.word_to_index``, or a predicted index is missing
            from ``output_lang.index_to_word``.
    """
    # looked up once; a missing tag now fails here instead of turning into None
    sos_index = output_lang.word_to_index["SOS"]
    eos_index = output_lang.word_to_index["EOS"]

    with torch.no_grad():
        # column vector: one word index per row, batch of one
        input_variable = tensorFromSentence(input_lang, sentence).view(-1, 1)

        enc_h_hidden, enc_c_hidden = encoder.create_init_hiddens(1)
        enc_hiddens, enc_state = encoder(input_variable, enc_h_hidden, enc_c_hidden)

        # new_full keeps the dtype and device of the input tensor
        decoder_input = input_variable.new_full((1, 1), sos_index)
        # the decoder starts from the encoder's final (h_n, c_n)
        dec_h_hidden, dec_c_hidden = enc_state[0], enc_state[1]

        decoded_words = []
        for _ in range(cutoff_length):
            pred, dec_state = decoder(decoder_input, dec_h_hidden, dec_c_hidden, enc_hiddens)
            _, topi = pred.topk(1, dim=1)
            ni = topi.item()

            if ni == eos_index:
                break
            decoded_words.append(output_lang.index_to_word[ni])

            # feed the prediction back in as the next input
            decoder_input = input_variable.new_full((1, 1), ni)
            dec_h_hidden, dec_c_hidden = dec_state[0], dec_state[1]

    return ' '.join(decoded_words)

In [None]:
# HYPERPARAMETERS: FEEL FREE TO PLAY WITH THESE TO TRY TO ACHIEVE BETTER RESULTS

# whether the Encoder and Decoder LSTMs are bidirectional; `directions` is the
# matching number of directions (2 or 1)
bidirectional = True
directions = 2 if bidirectional else 1

# number of stacked LSTM layers in both the Encoder and Decoder
layers = 2

# hidden size of the Encoder and Decoder
hidden_size = 600

# dropout value for the Encoder and Decoder
dropout = 0.8

In [None]:
# LOAD CONFIGURATIONS

# Common name of the files to load
common_file_name = "testdata.tatoeba_trim.20_vocab.25000_directions.2_layers.4_hidden.100_dropout.0.5_learningrate.1_batch.10_epochs.100"
id_lang, en_lang = 'id', 'en'
dataset = 'tatoeba'
directory = ''

# Set load_from_drive to True to load everything from Google Drive; every
# file path below is then prefixed with the Drive experiment directory
load_from_drive = True
if load_from_drive:
    common_file_name = "testdata>tatoeba_trim>20_vocab>25000_directions>2_layers>4_hidden>100_dropout>0.5_learningrate>1_batch>10_epochs>100"
    super_directory = '/content/drive/'
    experiment_directory = 'MyDrive/Kuliah/Thesis/experiment/grammarly/tatoeba/'
    directory = f'{super_directory}{experiment_directory}'

# seed applied to both torch.random and Python's random module
seed_value = 10

# Dataset location as a list of paths. A single path means one file holding
# both sentences of each pair; two paths mean one file per language.
raw_data_file_path = [f'{directory}tatoeba_id_en.txt']

# True swaps the two sentences of every pair. Judging by the file name
# (tatoeba_id_en) the Indonesian sentence presumably comes first; leave this
# False to translate in that order, set it to True for the opposite direction.
reverse = False

# Pairs are kept only when both sentences have fewer than `trim` words;
# 0 disables the filter
trim = 30

# max number of words in the vocabulary for both languages
max_vocab_size = 30000

# If true, pairs whose OUTPUT-language sentence does not start with one of
# eng_prefixes are removed. Usually False; it exists to compare results with
# the PyTorch tutorial. eng_prefixes can be changed, as long as the prefixes
# belong to the language the model translates TO.
start_filter = False

# Fraction of the data used for training; the rest becomes dev/validation
# and test data. Typically 0.8-0.9.
perc_train_set = 0.8

# Number of test-set sentences that are translated for the Grammarly check
checkup_size = 100

# Names of the files to load. NOTE(review): the number in each vocabulary file
# name looks like the vocabulary size - confirm.
id_vocab_file = f'{directory}{id_lang}_4310_{dataset}_vocab.p'
en_vocab_file = f'{directory}{en_lang}_3572_{dataset}_vocab.p'
id_en_enc_file = f'{directory}{id_lang}_{en_lang}_enc_direction_{directions}_layer_{layers}_hidden_{hidden_size}_dropout_{dropout}.pth'
id_en_dec_file = f'{directory}{id_lang}_{en_lang}_dec_direction_{directions}_layer_{layers}_hidden_{hidden_size}_dropout_{dropout}.pth'
en_id_enc_file = f'{directory}{en_lang}_{id_lang}_enc_direction_{directions}_layer_{layers}_hidden_{hidden_size}_dropout_{dropout}.pth'
en_id_dec_file = f'{directory}{en_lang}_{id_lang}_dec_direction_{directions}_layer_{layers}_hidden_{hidden_size}_dropout_{dropout}.pth'

# File that receives the predicted sentences for the Grammarly test
print_to = f'{directory}grammarly_test.txt'

# Mandatory variables initialization.
# Everything runs on the CPU; use torch.cuda.is_available() for use_cuda to
# pick up a GPU instead.
device = torch.device('cpu')
use_cuda = False
id_vocab = en_vocab = None
id_en_encoder = id_en_decoder = None
en_id_encoder = en_id_decoder = None

In [None]:
# LOAD FROM DRIVE: OPTIONAL!!!
# execute this cell if you want to load models from google drive

if load_from_drive:
    # google.colab is imported only when Drive is actually needed; the package
    # is provided by the Colab runtime
    from google.colab import drive
    # mount Google Drive at super_directory so the experiment files are readable
    drive.mount(super_directory)

Mounted at /content/drive/


In [None]:
# from google.colab import files
# uploaded = files.upload()

In [None]:
# LOAD EVERYTHING

# seed torch and Python's random module so the shuffle/split is reproducible
torch.random.manual_seed(seed_value)
random.seed(seed_value)

pairs = prepareLangs(raw_data_file_path, reverse=reverse)

# NOTE(security): pickle.load can execute arbitrary code while unpickling -
# only load vocabulary files that come from a trusted source.
# `with` closes each file as soon as its vocabulary has been read.
with open(id_vocab_file, 'rb') as vocab_file:
    id_vocab = pickle.load(vocab_file)
with open(en_vocab_file, 'rb') as vocab_file:
    en_vocab = pickle.load(vocab_file)

# the models must be built with the same sizes as the saved weights
id_en_encoder = EncoderRNN(id_vocab.vocab_size, hidden_size, layers=layers,
                           dropout=dropout, bidirectional=bidirectional)
id_en_decoder = DecoderAttn(hidden_size, en_vocab.vocab_size, layers=layers,
                            dropout=dropout, bidirectional=bidirectional)

id_en_encoder.load_state_dict(torch.load(id_en_enc_file, map_location=device))
id_en_decoder.load_state_dict(torch.load(id_en_dec_file, map_location=device))

# inference only: eval() switches dropout off
id_en_encoder.eval()
id_en_decoder.eval()

Reading lines...
pairs=7141 with file_path=1 has been created 
Lari!	Run!
pair reverse is not created


DecoderAttn(
  (embedder): Embedding(3572, 600)
  (dropout): Dropout(p=0.8, inplace=False)
  (score_learner): Linear(in_features=1200, out_features=1200, bias=True)
  (lstm): LSTM(600, 600, num_layers=2, dropout=0.8, bidirectional=True)
  (context_combiner): Linear(in_features=2400, out_features=600, bias=True)
  (tanh): Tanh()
  (output): Linear(in_features=600, out_features=3572, bias=True)
  (soft): Softmax(dim=1)
  (log_soft): LogSoftmax(dim=1)
)

In [None]:
if trim != 0:
    pairs = filterPairs(pairs, trim, start_filter)
    print("Trimmed to %s sentence pairs" % len(pairs))

shuffle(pairs)

# First split: perc_train_set of the pairs go to train+dev, the rest to test.
# Second split: the train+dev part is divided again with the same ratio.
temp_pairs = pairs[:math.ceil(perc_train_set*len(pairs))]
test_pairs = pairs[math.ceil(perc_train_set*len(pairs)):]
train_pairs = temp_pairs[:math.ceil(perc_train_set*len(temp_pairs))]
dev_pairs = temp_pairs[math.ceil(perc_train_set*len(temp_pairs)):]

# Translate the first checkup_size test sentences (fewer if the test set is
# smaller) and append each prediction to print_to, one per line.
# The file is opened once for the whole loop instead of once per sentence.
with open(print_to, 'a+') as f:
    for index, pair in enumerate(test_pairs[:checkup_size]):
        prediction = evaluate(id_en_encoder, id_en_decoder, id_vocab, en_vocab, normalizeString(pair[0]), cutoff_length=trim)
        print(index+1, prediction)
        f.write(prediction + '\n')

Trimmed to 7140 sentence pairs
1 i am her for a . .
2 tom looks confused .
3 he was a of his .
4 for children to to years on the jam can can be to to . .
5 the the the the the the the . .
6 how this is this ? ?
7 i m sorry i didn t all you all . .
8 i m tired .
9 i think a m very . .
10 do you want to wait of us us you ? ?
11 we had a lot of our we we had .
12 tom will be in the .
13 i m sorry i i i m t . .
14 when will you leave home ?
15 i have a . .
16 tom is reading a book .
17 that is really true .
18 tom is still one one nothing who .
19 don t be too much .
20 when will you leave home ?
21 tom has to .
22 what do you want to do ?
23 i know it s understand is all .
24 tom left to get hour ago .
25 please turn the the . .
26 tom looks confused .
27 let s do it .
28 is there a highest on in ? ?
29 the island is a is .
30 i am tired of that .
31 tom went to the school .
32 this is the first time we had a had in in boston .
33 tom has three three children .
34 i ve just finished break