In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install --no-cache-dir transformers sentencepiece
#installing the hugging face library

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/db/98c3ea1a78190dac41c0127a063abf92bd01b4b0b6970a6db1c2f5b66fa0/transformers-4.0.1-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 5.5MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 15.7MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 27.6MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (88

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelWithLMHead
import numpy as np
import pandas as pd


#Data Collection

In [None]:
!pip install opustools-pkg

Collecting opustools-pkg
[?25l  Downloading https://files.pythonhosted.org/packages/6c/9f/e829a0cceccc603450cd18e1ff80807b6237a88d9a8df2c0bb320796e900/opustools_pkg-0.0.52-py3-none-any.whl (80kB)
[K     |████                            | 10kB 26.9MB/s eta 0:00:01[K     |████████                        | 20kB 17.1MB/s eta 0:00:01[K     |████████████▏                   | 30kB 14.4MB/s eta 0:00:01[K     |████████████████▏               | 40kB 13.5MB/s eta 0:00:01[K     |████████████████████▎           | 51kB 9.3MB/s eta 0:00:01[K     |████████████████████████▎       | 61kB 9.9MB/s eta 0:00:01[K     |████████████████████████████▎   | 71kB 9.8MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 6.3MB/s 
[?25hInstalling collected packages: opustools-pkg
Successfully installed opustools-pkg-0.0.52


In [None]:
! opus_read -d JW300 -s ach -t en -wm moses -w jw300.ach jw300.en -q


Alignment file /proj/nlpl/data/OPUS/JW300/latest/xml/ach-en.xml.gz not found. The following files are available for downloading:

 724 KB https://object.pouta.csc.fi/OPUS-JW300/v1/xml/ach-en.xml.gz
   8 MB https://object.pouta.csc.fi/OPUS-JW300/v1/xml/ach.zip
 263 MB https://object.pouta.csc.fi/OPUS-JW300/v1/xml/en.zip

 272 MB Total size
Unable to retrieve the data.
No alignment file "/proj/nlpl/data/OPUS/JW300/latest/xml/ach-en.xml.gz" or "./JW300_latest_xml_ach-en.xml.gz" found


In [None]:
#change these variables to train a model on a different set of languages
source_language="ach"
target_language="en"

In [None]:
# Load the two plain-text files written by opus_read into a DataFrame.
# Line i of the source file is paired with line i of the target file.
source_file = 'jw300.' + source_language
target_file = 'jw300.' + target_language

# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open(source_file, encoding='utf-8') as f:
    source = [line.strip() for line in f]

with open(target_file, encoding='utf-8') as f:
    target = [line.strip() for line in f]

# zip() silently truncates to the shorter input, which would hide a misaligned corpus.
assert len(source) == len(target), (
    f"{source_file} has {len(source)} lines but {target_file} has {len(target)}"
)

# list(...) keeps this working on pandas versions that reject an iterator as `data`.
df = pd.DataFrame(list(zip(source, target)), columns=['source_sentence', 'target_sentence'])
df.head(5)

Unnamed: 0,source_sentence,target_sentence
0,Lok ma Tye i Iye,Table of Contents
1,"Marci 1 , 2011","March 1 , 2011"
2,“ Kwena Maber me Ker ” Obedo Gin Ango ?,The “ Good News of the Kingdom ” ​ — What Is It ?
3,PWONY MA KWAKO LOK MA I POK NGEYE,FROM OUR COVER
4,3 Lok Mo ma Pire Tek pa Lanebi,3 A Prophecy of Enormous Importance


##Data Preprocessing and Preparation

In [None]:
seed=23

In [None]:
len(df)

81969

In [None]:
df.replace('', np.nan, inplace=True)

In [None]:
df.dropna(inplace=True)

In [None]:
df=df.applymap(str)

In [None]:
len(df)

79310

In [None]:
# Keep only rows in which both the source and the target cell contain at least
# one ASCII letter; cells made up solely of digits/punctuation are dropped.
letter_pattern = '[A-Za-z]'
for column in ('source_sentence', 'target_sentence'):
    df = df[df[column].str.contains(letter_pattern)]
len(df)

76402

In [None]:
# Build the preprocessed frame in one chain:
#   1. drop exact duplicate (source, target) pairs
#   2. drop conflicting translations (same source or same target seen again)
#   3. shuffle with a fixed seed to remove bias in dev set selection
df_pp = (
    df.copy()
    .drop_duplicates()
    .drop_duplicates(subset='source_sentence')
    .drop_duplicates(subset='target_sentence')
    .sample(frac=1, random_state=seed)
    .reset_index(drop=True)
)

In [None]:
df_pp.reset_index(drop=True, inplace=True)

In [None]:
df_pp.head()

Unnamed: 0,source_sentence,target_sentence
0,Wanyuto nining ni wacwako yub ma dul pa Jehova...,How do we demonstrate our support for the arra...
1,"( Niyabo 14 : 6 ) Lobo kibiloko doko Paradic ,...","Paradise will be restored on earth , and anyon..."
2,Mitte ni waket tek wek wajal cawa me kwano Bai...,We need self - discipline to devote time to re...
3,"Ma lubbe ki lok man , luot acel acel ma gitye ...",About this matter each Christian couple should...
4,Nen kong gin mutimme i kom lakwena Paulo .,Consider what happened to the apostle Paul .


In [None]:
df_pp.head()

Unnamed: 0,source_sentence,target_sentence
0,Wanyuto nining ni wacwako yub ma dul pa Jehova...,How do we demonstrate our support for the arra...
1,"( Niyabo 14 : 6 ) Lobo kibiloko doko Paradic ,...","Paradise will be restored on earth , and anyon..."
2,Mitte ni waket tek wek wajal cawa me kwano Bai...,We need self - discipline to devote time to re...
3,"Ma lubbe ki lok man , luot acel acel ma gitye ...",About this matter each Christian couple should...
4,Nen kong gin mutimme i kom lakwena Paulo .,Consider what happened to the apostle Paul .


In [None]:
size=len(df_pp)
size

73077

In [None]:
# Split the corpus into train/dev/test with a 7:2:1 ratio, then save every split as
# a pair of line-aligned text files: <split>.<source_language> / <split>.<target_language>.
import csv

lc=True
# Optional: lower case the corpora - this will make it easier to generalize, but without proper casing.
if lc:
    df_pp["source_sentence"] = df_pp["source_sentence"].str.lower()
    df_pp["target_sentence"] = df_pp["target_sentence"].str.lower()

train_size = int(size * 0.7)
dev_size = int(size * 0.2)
end = train_size + dev_size

train = df_pp.iloc[:train_size, 0:2]
dev = df_pp.iloc[train_size:end, 0:2]
test = df_pp.iloc[end:, 0:2]

# The test split takes every remaining row, so report its real length instead of
# int(size*0.1), which under-counts whenever the other two sizes were rounded down.
test_size = len(test)
assert len(train) + len(dev) + len(test) == len(df_pp)


def _write_parallel_split(split_name, split_df):
    """Write one split as two line-aligned UTF-8 files, one sentence per line."""
    src_path = split_name + "." + source_language
    trg_path = split_name + "." + target_language
    with open(src_path, "w", encoding="utf-8") as src_file, \
            open(trg_path, "w", encoding="utf-8") as trg_file:
        for src_sentence, trg_sentence in zip(split_df["source_sentence"], split_df["target_sentence"]):
            src_file.write(src_sentence + "\n")
            trg_file.write(trg_sentence + "\n")


for split_name, split_df in (("train", train), ("dev", dev), ("test", test)):
    _write_parallel_split(split_name, split_df)

# Full preprocessed corpus as CSV; the name follows the configured language pair
# ("ach-en" for the default settings).
df_pp.to_csv(source_language + "-" + target_language, header=False, index=False)


# Doublecheck the format below. There should be no extra quotation marks or weird characters.
! head train.*
! head dev.*

==> train.ach <==
wanyuto nining ni wacwako yub ma dul pa jehovah oketo ?
( niyabo 14 : 6 ) lobo kibiloko doko paradic , dok ngat mo keken ma mito timo miti pa lubanga twero bedo iye !
mitte ni waket tek wek wajal cawa me kwano baibul , kwan piwa kenwa , ki dong wot i cokkewa .
ma lubbe ki lok man , luot acel acel ma gitye lukricitayo myero gumok tam pigi kengi ma weko cwinygi pe ngoligi kop .
nen kong gin mutimme i kom lakwena paulo .
* inge kare manok , en obolo catan kacel ki lumalaikane piny i lobo .
ento lupwonye dini gunyweno pwony me baibul woko ; macalo adwogine , man oweko jo mapol gitamo ni pe giromo niang baibul wacel . ​ — tic pa lukwena 20 : 29 , 30 .
baibul tye ki lanen pa jo ma gubedo luwaka ki ma gubedo lumwolo .
( nen cal ma tye i pot karatac 17 . ) ( b ) ngo ma dong itimo wek lutinoni gubed agonya me lok kwedi ?
calo kabaka solomon , leg bot jehovah wek omini ryeko me tiyo ticwa me pwony - nyi kacel ki ticci me kacokke .

==> train.en <==
how do we demonstrate our sup

In [None]:
print((train_size,dev_size, test_size))

(51153, 14615, 7307)


# Building the Model

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-luo-en")
model = AutoModelWithLMHead.from_pretrained("Helsinki-NLP/opus-mt-luo-en")  #Loading from hugging face models
# model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-luo-en")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1133.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=769849.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=741813.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1108211.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=43.0, style=ProgressStyle(description_w…




In [None]:
model = AutoModelWithLMHead.from_pretrained("Helsinki-NLP/opus-mt-luo-en")



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=285893229.0, style=ProgressStyle(descri…




In [None]:
tokenizer.save_pretrained("./tokenizer")

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 PosixPath('tokenizer/source_spm'),
 PosixPath('tokenizer/target_spm'),
 PosixPath('tokenizer/vocab'),
 PosixPath('tokenizer/tokenizer_config_file'),
 './tokenizer/added_tokens.json')

In [None]:
!cp -R tokenizer './drive/My Drive/Year 4/Machine Learning/Loki/ach-en'

## Fine-Tuning the Model
Initially, we trained all the layers of the model. In this stage, I will freeze the encoder and train only the remaining layers of the model.

In [None]:
#printing the paramaters of the base model- whether they are trainable or not- Initially they are all trainable
for param in model.base_model.parameters():
    print(param.requires_grad)

In [None]:
for name, param in model.named_parameters():
   print(name)

model.shared.weight
model.encoder.embed_positions.weight
model.encoder.layers.0.self_attn.k_proj.weight
model.encoder.layers.0.self_attn.k_proj.bias
model.encoder.layers.0.self_attn.v_proj.weight
model.encoder.layers.0.self_attn.v_proj.bias
model.encoder.layers.0.self_attn.q_proj.weight
model.encoder.layers.0.self_attn.q_proj.bias
model.encoder.layers.0.self_attn.out_proj.weight
model.encoder.layers.0.self_attn.out_proj.bias
model.encoder.layers.0.self_attn_layer_norm.weight
model.encoder.layers.0.self_attn_layer_norm.bias
model.encoder.layers.0.fc1.weight
model.encoder.layers.0.fc1.bias
model.encoder.layers.0.fc2.weight
model.encoder.layers.0.fc2.bias
model.encoder.layers.0.final_layer_norm.weight
model.encoder.layers.0.final_layer_norm.bias
model.encoder.layers.1.self_attn.k_proj.weight
model.encoder.layers.1.self_attn.k_proj.bias
model.encoder.layers.1.self_attn.v_proj.weight
model.encoder.layers.1.self_attn.v_proj.bias
model.encoder.layers.1.self_attn.q_proj.weight
model.encoder.la

In [None]:
#Freezing layers of the model

#Option 1: freeze only the first four encoder layers
modules = [*model.get_encoder().layers[:4]]
for module in modules:
    # Bind `param` from this module's own parameters. Relying on a `param`
    # variable left over from an earlier cell would leave these layers trainable.
    for param in module.parameters():
        param.requires_grad = False


In [None]:
#Option 2: freeze every base-model parameter whose name starts with a given prefix
frozen_prefix = "encoder"  # swap in the prefix of whichever sub-module should be frozen
for name, param in model.base_model.named_parameters():
    if not name.startswith(frozen_prefix):
        continue
    param.requires_grad = False
    print(f"Froze layer {name}...")

Froze layer encoder.embed_positions.weight...
Froze layer encoder.layers.0.self_attn.k_proj.weight...
Froze layer encoder.layers.0.self_attn.k_proj.bias...
Froze layer encoder.layers.0.self_attn.v_proj.weight...
Froze layer encoder.layers.0.self_attn.v_proj.bias...
Froze layer encoder.layers.0.self_attn.q_proj.weight...
Froze layer encoder.layers.0.self_attn.q_proj.bias...
Froze layer encoder.layers.0.self_attn.out_proj.weight...
Froze layer encoder.layers.0.self_attn.out_proj.bias...
Froze layer encoder.layers.0.self_attn_layer_norm.weight...
Froze layer encoder.layers.0.self_attn_layer_norm.bias...
Froze layer encoder.layers.0.fc1.weight...
Froze layer encoder.layers.0.fc1.bias...
Froze layer encoder.layers.0.fc2.weight...
Froze layer encoder.layers.0.fc2.bias...
Froze layer encoder.layers.0.final_layer_norm.weight...
Froze layer encoder.layers.0.final_layer_norm.bias...
Froze layer encoder.layers.1.self_attn.k_proj.weight...
Froze layer encoder.layers.1.self_attn.k_proj.bias...
Froz

In [None]:
#Option 3
from torch import nn
from typing import Callable, Dict, Iterable, List, Tuple, Union


def freeze_params(model: nn.Module):
    """Turn off gradient tracking for every parameter of ``model`` (in place)."""
    # Module.requires_grad_ walks all parameters, including those of sub-modules.
    model.requires_grad_(False)

def freeze_embeds(model):
    """Freeze the embedding modules of a seq2seq model.

    For ``model_type == "t5"`` the shared embedding and each stack's token
    embeddings are frozen. For every other type the positional and token
    embeddings of ``model.model.encoder`` / ``model.model.decoder`` are frozen,
    preceded by ``model.model.shared`` unless the type is ``"fsmt"``.
    """
    kind = model.config.model_type

    if kind == "t5":
        freeze_params(model.shared)
        for stack in (model.encoder, model.decoder):
            freeze_params(stack.embed_tokens)
        return

    if kind != "fsmt":
        # Only the non-fsmt layout exposes a shared embedding on the inner model.
        freeze_params(model.model.shared)
    for stack in (model.model.encoder, model.model.decoder):
        freeze_params(stack.embed_positions)
        freeze_params(stack.embed_tokens)


def grad_status(model: nn.Module) -> Iterable:
    """Return a lazy iterable of the ``requires_grad`` flag of each parameter of ``model``."""
    return map(lambda par: par.requires_grad, model.parameters())


def any_requires_grad(model: nn.Module) -> bool:
    """Return True when at least one parameter of ``model`` still requires gradients."""
    for needs_grad in grad_status(model):
        if needs_grad:
            return True
    return False


def assert_all_frozen(model):
    """Raise AssertionError unless every parameter of ``model`` has gradients disabled.

    The failure message reports the share and the total number of parameter
    tensors that still require gradients.
    """
    flags: List[bool] = list(grad_status(model))
    total = len(flags)
    trainable = sum(lmap(int, flags))
    assert not any(flags), f"{trainable/total:.1%} of {total} weights require grad"


def assert_not_all_frozen(model):
    """Raise AssertionError when no parameter of ``model`` requires gradients."""
    flags: List[bool] = list(grad_status(model))
    assert any(flags), f"none of {len(flags)} weights require grad"

def lmap(f: Callable, x: Iterable) -> List:
    """Apply ``f`` to every element of ``x`` and collect the results in a list."""
    return [f(item) for item in x]

#*******FREEZE SECTION *********
# Disable gradients for every encoder parameter, then verify that none of the
# encoder's parameters is still marked as trainable.
freeze_params(model.get_encoder())
assert_all_frozen(model.get_encoder())

### Transfer the model to the GPU or CPU

In [None]:
model = model.to(device) # bind the model to the GPU device

##Testing the model before retraining

In [None]:
src_texts = [ "Gitye ki cwiny calo pa lanebi Icaya"]

In [None]:
tokens=tokenizer.prepare_seq2seq_batch(src_texts)
tokens

{'input_ids': tensor([[ 1701,  6137,    49,  2944,  8134, 39413,  8134, 13849,  4822,  6919,
           181, 17142,    47,  2837, 17140,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
type(tokens)

transformers.tokenization_utils_base.BatchEncoding

In [None]:
tokens.to(device)

{'input_ids': tensor([[ 1701,  6137,    49,  2944,  8134, 39413,  8134, 13849,  4822,  6919,
           181, 17142,    47,  2837, 17140,     0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [None]:
translated = model.generate(**tokens)

In [None]:
tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

In [None]:
tgt_text

['they are like the prophet isaiah']

In [None]:
test_predictions=[]

def test_model(model,tokenizer, test_df):

  for row in test_df["source_sentence"]:
    tokens=tokenizer.prepare_seq2seq_batch([row])
    tokens.to(device)
    translated = model.generate(**tokens)
    tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
    test_predictions.append(tgt_text[0])

  reference_preds=pd.DataFrame({"Predictions": test_predictions, "Reference": test["target_sentence"]})

  return reference_preds

In [None]:
def gen_bleu_score(pred_ref_df):
    """Compute the corpus-level BLEU score of predictions against single references.

    Args:
        pred_ref_df: mapping/DataFrame with a "Predictions" and a "Reference"
            column of whitespace-tokenizable strings, aligned row by row.

    Returns:
        The value returned by ``nltk.translate.bleu_score.corpus_bleu``.
    """
    from nltk.translate.bleu_score import corpus_bleu

    # corpus_bleu expects one *list of references* per hypothesis, each reference
    # being a token list. Passing the bare token list would make every token count
    # as a separate reference.
    references = [[reference.split()] for reference in pred_ref_df["Reference"]]
    hypotheses = [prediction.split() for prediction in pred_ref_df["Predictions"]]

    # Score only once both lists are complete.
    return corpus_bleu(references, hypotheses)

In [None]:
pred_ref_df=test_model(model, tokenizer, test)

In [None]:
test.reset_index(inplace=True, drop=True)

In [None]:
pred_ref_df.head()

Unnamed: 0,Predictions,Reference
65768,"A relatively short smile, a white horse, quick...","yes , we can because the very act of creation ..."
65769,He was also aware of the fact that if he is to...,he also knew that his disciples would need cou...
65770,I'm told you that they have been locked down a...,thanks to the explanations in the christian gr...
65771,"He is a lender, and he sees the oil that it is...",but jehovah saw a potential for good in us .
65772,"To illustrate: ""I've found it easier to turn b...",titus related that the brothers in corinth had...


In [None]:
pred_ref_df.to_csv("initial pred.csv",header=True)

In [None]:
from nltk.translate.bleu_score import corpus_bleu

# corpus_bleu(list_of_references, hypotheses) expects one *list of references* per
# hypothesis, each reference being a token list, hence the extra [...] around split().
ref_tokens = [[row.split()] for row in pred_ref_df["Reference"]]
pred_tokens = [row.split() for row in pred_ref_df["Predictions"]]

corpus_bleu(ref_tokens, pred_tokens)

Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.019551158342908225

##Transforming training data

In [None]:
train_tokens=tokenizer.prepare_seq2seq_batch(list(train["source_sentence"]), tgt_texts=list(train["target_sentence"]), padding=True,truncation=True, return_tensors="pt" )
train_tokens.to(device)

{'input_ids': tensor([[44148, 49453,    16,  ..., 52235, 52235, 52235],
        [   39,     1,   192,  ..., 52235, 52235, 52235],
        [  248, 26168,    16,  ..., 52235, 52235, 52235],
        ...,
        [   39, 21847,    44,  ..., 52235, 52235, 52235],
        [ 1474,    28,  6609,  ..., 52235, 52235, 52235],
        [   39, 13422,   453,  ..., 52235, 52235, 52235]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'), 'labels': tensor([[  166,    93,    34,  ..., 52235, 52235, 52235],
        [ 2381,    52,    42,  ..., 52235, 52235, 52235],
        [   34,   250,  1196,  ..., 52235, 52235, 52235],
        ...,
        [  411,    54,     5,  ..., 52235, 52235, 52235],
        [   18,   182,  2678,  ..., 52235, 52235, 52235],
        [   79,    22,   150,  ..., 522

In [None]:
dev_tokens=tokenizer.prepare_seq2seq_batch(list(dev["source_sentence"]), tgt_texts=list(dev["target_sentence"]), padding=True,truncation=True, return_tensors="pt" )
dev_tokens.to(device)

{'input_ids': tensor([[   32,    44,  5586,  ..., 52235, 52235, 52235],
        [  277,    44,     3,  ..., 52235, 52235, 52235],
        [   32,  1925,  4786,  ..., 52235, 52235, 52235],
        ...,
        [ 6919,  7492,  7601,  ..., 52235, 52235, 52235],
        [  111, 47451,    13,  ..., 52235, 52235, 52235],
        [ 8134, 39370,   286,  ..., 52235, 52235, 52235]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'), 'labels': tensor([[  276,   342,    44,  ..., 52235, 52235, 52235],
        [  277,    44,     3,  ..., 52235, 52235, 52235],
        [  765,    44,     3,  ..., 52235, 52235, 52235],
        ...,
        [    5,  3381,   639,  ..., 52235, 52235, 52235],
        [    5,  1273,    29,  ..., 52235, 52235, 52235],
        [ 5620,   799,   158,  ..., 522

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one sentence pair per item.

    Args:
        data: pandas DataFrame with a "source_sentence" column and, when
            ``with_labels`` is True, a "target_sentence" column.
        maxlen: stored on the instance. It is not forwarded to the tokenizer, so
            ``padding='max_length'`` presumably falls back to the tokenizer's own
            maximum length (TODO confirm against the tokenizer in use).
        tokenizer: object exposing ``prepare_seq2seq_batch``.
        with_labels: whether items include tokenized targets under "labels".
    """

    def __init__(self, data, maxlen, tokenizer, with_labels=True,):
        self.data = data  # pandas dataframe
        self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.with_labels = with_labels

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the row at position ``index``.

        Returns:
            With labels: the tokenizer's encoding with "input_ids",
            "attention_mask" and "labels" squeezed to 1-D tensors.
            Without labels: the tuple ``(input_ids, attention_mask)``.
        """
        # Positional lookup: the index is a position in 0..len-1, so this works
        # even when the DataFrame index was not reset.
        sent1 = [self.data["source_sentence"].iloc[index]]
        sent2 = [self.data["target_sentence"].iloc[index]] if self.with_labels else None

        # Tokenize the sentence pair; each returned tensor has a leading batch
        # dimension of 1, which is removed below.
        encoded_pair = self.tokenizer.prepare_seq2seq_batch(sent1, sent2,
                                      padding='max_length',  # Pad to max_length
                                      truncation=True,  # Truncate to max_length
                                      return_tensors='pt')  # Return torch.Tensor objects

        encoded_pair['input_ids'] = encoded_pair['input_ids'].squeeze(0)  # tensor of token ids
        encoded_pair['attention_mask'] = encoded_pair['attention_mask'].squeeze(0)  # "0" for padded positions, "1" otherwise

        if self.with_labels:  # True if the dataset has labels
            # NOTE(review): labels keep the tokenizer's padding ids; confirm whether
            # the model's loss ignores them or they should be masked.
            encoded_pair['labels'] = encoded_pair['labels'].squeeze(0)
            return encoded_pair

        # Unlabelled items: return the two tensors computed above.
        return encoded_pair['input_ids'], encoded_pair['attention_mask']

In [None]:
dev.reset_index(inplace=True, drop=True)

In [None]:
train_dataset= CustomDataset(train, 1024, tokenizer, with_labels=True,)
eval_dataset= CustomDataset(dev, 1024, tokenizer, with_labels=True,)

# batch_size = 1

# train_loader= DataLoader(train,shuffle=False,batch_size=batch_size,)
# valid_loader =DataLoader(valid,shuffle=False,batch_size=batch_size,)

In [None]:
from transformers import Trainer, TrainingArguments

In [None]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    #tokenizer=tokenizer
)

In [None]:
trainer.train()  

In [None]:
test_tokens=tokenizer.prepare_seq2seq_batch(list(test["source_sentence"]),padding=True,truncation=True, return_tensors="pt" )
test_tokens.to(device)

{'input_ids': tensor([[13819, 12963,    44,  ..., 52235, 52235, 52235],
        [   32,    42,   181,  ..., 52235, 52235, 52235],
        [ 8134, 39413,   293,  ..., 52235, 52235, 52235],
        ...,
        [   32,  2157,  6867,  ..., 52235, 52235, 52235],
        [ 3211,    16,  2824,  ..., 52235, 52235, 52235],
        [   28, 32022,  7219,  ..., 52235, 52235, 52235]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}

##Model Testing

In [None]:
test_predictions

#Option 1:#Gives CUDA Runtime Error -Out of memory
#test_predictions = [tokenizer.decode(t, skip_special_tokens=True) for t in model.generate(**test_tokens)]

#option2
ref_preds_final=test_model(model,tokenizer,test)

In [None]:
#ref_preds_final=pd.DataFrame({"Predictions": test_predictions, "Reference": test["target_sentence"]})

In [None]:
ref_preds_final.head(20)

Unnamed: 0,Predictions,Reference
65768,"we can be sure, for as god created humans with...","yes , we can because the very act of creation ..."
65769,he also knew that he needed to encourage his d...,he also knew that his disciples would need cou...
65770,"thankfully, the greek scriptures assure us tha...",thanks to the explanations in the christian gr...
65771,but jehovah saw something good in our lives.,but jehovah saw a potential for good in us .
65772,it explains that the brothers in corinth appli...,titus related that the brothers in corinth had...
65773,how can we be sure that a person is not guided...,how can we identify a physical person ?
65774,( read psalm 119 : 130 ; john 16 : 13. ),( read psalm 119 : 130 ; john 16 : 13 . )
65775,he left behind and lost gods favor and his fav...,he went astray and lost the favor of our patie...
65776,the apostle paul explained this when he said :...,the apostle paul was being realistic when he w...
65777,he was very angry with his heart.,queen esther was greatly distressed .


In [None]:
from nltk.translate.bleu_score import corpus_bleu

In [None]:
ref_preds_final.to_csv("ach-en-pred-ref-v2.csv",header=True)

In [None]:
from nltk.translate.bleu_score import corpus_bleu

# Score the predictions made after fine-tuning (`ref_preds_final`), not the
# `pred_ref_df` frame produced before training.
# corpus_bleu expects one *list of references* per hypothesis, each reference being
# a token list, hence the extra [...] around split().
ref_tokens = [[row.split()] for row in ref_preds_final["Reference"]]
pred_tokens = [row.split() for row in ref_preds_final["Predictions"]]

corpus_bleu(ref_tokens, pred_tokens)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.4615432484914063

###Save the model

In [None]:
# Hugging Face models are written to disk with save_pretrained(); they expose no
# `save()` method. The directory can later be reloaded with from_pretrained("./model").
model.save_pretrained("./model")