# DEPRECATED

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig, BertForMaskedLM, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
import torch
import mlflow
from nltk.translate.bleu_score import sentence_bleu
from datasets import Dataset, load_from_disk
from tqdm import tqdm
import os

In [None]:
# Raw corpus file; the following cell reads it line by line as tab-separated text.
data_file = "./en-ja.bicleaner05.txt"
# Destination for the reduced copy that keeps only the last two columns of each row.
data_modified = "./en-ja.txt"

In [None]:
# Reduce the raw corpus to two tab-separated columns. The sentence pair is
# taken from the last two columns of every input row (English, then Japanese).
with open(data_file, 'r', encoding='utf-8') as f_in, \
        open(data_modified, 'w', encoding='utf-8') as f_out:
    for line in f_in:
        columns = line.strip().split('\t')
        # Blank or single-column rows carry no sentence pair; indexing
        # columns[-2] on them would raise IndexError, so they are skipped.
        if len(columns) < 2:
            continue
        english_text, japanese_text = columns[-2:]
        f_out.write(f"{english_text}\t{japanese_text}\n")

In [None]:
# Build a small smoke-test corpus from the first rows of the reduced file.
file_path = "./en-ja-test.txt"

with open(data_modified, 'r', encoding='utf-8') as file:
    # Pull only the first 50 lines off the file iterator instead of loading
    # the entire corpus with readlines() and slicing it afterwards.
    lines = [line for _, line in zip(range(50), file)]
with open(file_path, 'w', encoding='utf-8') as file_d:
    # writelines() returns None, so its result is not bound to a name.
    file_d.writelines(lines)

# One list of tab-separated fields per sampled line.
pairs = [line.strip().split('\t') for line in lines]

In [2]:
# Load the dataset stored on disk under ./traintmp (via `datasets.load_from_disk`).
data = load_from_disk("./traintmp")

In [3]:
# Flatten the loaded dataset into a DataFrame of sentence pairs.
# (Earlier variant, built from `pairs` instead:
#  df = pd.DataFrame(pairs, columns=['english_sentence', 'japanese_sentence']))
new_col = {"en": 'english_sentence', "jp": 'japanese_sentence'}
df = (
    data.to_pandas()
    .drop(columns=["text"])    # the "text" column is not used below
    .dropna()                  # discard rows with any missing value
    .rename(columns=new_col)   # en/jp -> descriptive column names
)

# Hold out 30% of the rows for validation; the fixed seed keeps the split stable.
train_df, val_df = train_test_split(df, random_state=0, test_size=0.3)

batch_size = 8

# Disabled tokenisation / DataLoader construction, kept for reference.
# Presumably this is how the DataLoader files loaded later in the notebook
# were produced - TODO confirm before relying on it.
# train_input_ids, train_attention_masks = tokenize_sentences(train_df['japanese_sentence'])
# val_input_ids, val_attention_masks = tokenize_sentences(val_df['japanese_sentence'])
# train_labels, _ = tokenize_sentences(train_df['english_sentence'])
# val_labels, _ = tokenize_sentences(val_df['english_sentence'])
# train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
# train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
# val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [6]:
# torch.save(train_dataloader, "./BERT/train_dataloader")
# torch.save(val_dataloader, "./BERT/val_dataloader")

In [4]:
# Restore the serialized train/validation DataLoader objects from ./BERT.
# NOTE(review): torch.load unpickles arbitrary Python objects - only load files
# produced by this project. Recent PyTorch releases default to weights_only=True,
# which may refuse to load a whole DataLoader - TODO confirm against the
# installed version.
train_dataloader = torch.load("./BERT/train_dataloader")
val_dataloader = torch.load("./BERT/val_dataloader")

In [5]:
# Tokenizer for the multilingual cased BERT vocabulary; padding_side="left"
# asks it to insert padding tokens before the sentence rather than after it.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', padding_side= "left")

def tokenize_sentences(sentences, max_length=128):
    """Tokenize sentences into fixed-length id and attention-mask tensors.

    Uses the notebook-level ``tokenizer``.

    Args:
        sentences: Iterable of strings (for example a DataFrame column).
        max_length: Number of tokens every sequence is padded or truncated to.

    Returns:
        Tuple ``(input_ids, attention_masks)``. Each tensor has one row of
        ``max_length`` entries per input sentence; for empty input both
        tensors have shape ``(0, max_length)``.
    """
    texts = list(sentences)
    if not texts:
        # torch.cat() rejects an empty list, so return empty tensors instead.
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    input_ids = []
    attention_masks = []
    for sent in texts:
        encoded_dict = tokenizer(
            sent,
            add_special_tokens=True,
            max_length=max_length,
            # Replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            # Made explicit: every row must have exactly max_length entries,
            # otherwise the torch.cat() calls below fail on over-long input.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

In [12]:
# Build the model from the configuration of bert-base-multilingual-cased.
# NOTE(review): BertForMaskedLM(config=...) is constructed from the config
# alone, so no pretrained weights are loaded here and training starts from a
# random initialisation - confirm this is intended.
config = BertConfig.from_pretrained('bert-base-multilingual-cased')
model = BertForMaskedLM(config=config)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
# The literal 20 is the planned number of epochs; it has to stay in sync with
# num_epochs, which is assigned further down in this cell.
total_steps = len(train_dataloader) * 20
# Linear learning-rate schedule over total_steps with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

def save_checkpoint(state, filename, model, model_filename):
    """Persist the training state and the model to disk.

    When a target is a filesystem path, the object is first written to a
    temporary sibling file and then moved into place with ``os.replace``.
    An interruption in the middle of a write therefore cannot leave a
    truncated file where the previous good checkpoint used to be.

    Args:
        state: Picklable training state (the training loop passes a dict with
            optimizer/scheduler state, epoch and total step count).
        filename: Destination for ``state``.
        model: Model object, serialized as a whole with ``torch.save``.
        model_filename: Destination for ``model``.
    """
    for obj, target in ((state, filename), (model, model_filename)):
        if isinstance(target, (str, os.PathLike)):
            tmp_path = f"{os.fspath(target)}.tmp"
            torch.save(obj, tmp_path)
            # Atomic swap: readers see either the old or the new file.
            os.replace(tmp_path, target)
        else:
            # File-like targets cannot be renamed; write to them directly.
            torch.save(obj, target)

def load_checkpoint(filename, model_filename, model, optimizer, scheduler):
    """Restore training state from checkpoint files on disk.

    The saved weights are copied *into* the ``model`` that is passed in rather
    than returning the unpickled object. The optimizer was constructed over
    that model's parameters; swapping in a different model object would leave
    the optimizer updating tensors that are no longer being trained.

    Args:
        filename: Path of the training-state file; a dict with the keys
            ``'optimizer'``, ``'scheduler'``, ``'epoch'`` and ``'total_steps'``.
        model_filename: Path of the model file. May contain either a whole
            pickled module or a plain ``state_dict``.
        model: Live model whose parameters receive the saved weights.
        optimizer: Optimizer to restore in place.
        scheduler: Learning-rate scheduler to restore in place.

    Returns:
        Tuple ``(start_epoch, total_steps, model, optimizer, scheduler)`` where
        the last three are the same objects that were passed in.
    """
    # NOTE(review): torch.load unpickles arbitrary objects - only load
    # checkpoint files produced by this project.
    # map_location='cpu' lets a checkpoint written on a GPU machine be read
    # anywhere; load_state_dict then copies values onto each parameter's device.
    checkpoint = torch.load(filename, map_location='cpu')
    saved_model = torch.load(model_filename, map_location='cpu')
    if isinstance(saved_model, torch.nn.Module):
        saved_model = saved_model.state_dict()

    model.load_state_dict(saved_model)
    optimizer.load_state_dict(checkpoint['optimizer'])
    scheduler.load_state_dict(checkpoint['scheduler'])
    return checkpoint['epoch'], checkpoint['total_steps'], model, optimizer, scheduler

# Resume from the checkpoint files when they exist; otherwise begin at epoch 0.
checkpoint_file = './BERT/checkpoint.pt'
model_file = './BERT/checkpoint_model.pt'
start_epoch = 0
if not os.path.isfile(checkpoint_file):
    print('No checkpoint file found')
else:
    print('Recovering from previous checkpoint-----')
    # Rebinds the notebook-level training objects to the restored ones.
    (start_epoch, total_steps,
     model, optimizer, scheduler) = load_checkpoint(checkpoint_file, model_file, model, optimizer, scheduler)
    print('Starting Epoch: {}, Total Steps: {}'.format(start_epoch, total_steps))

# Track the experiment with MLflow, keeping its data under ./BERT.
mlflow.set_tracking_uri("./BERT")
mlflow.set_experiment('BERT Japanese-English Translation V5')

# Re-attach to the run recorded by a previous session so that a resumed
# training keeps logging into the same MLflow run.
last_run_file = "./last_run.txt"
previous_run_id = None
if os.path.exists(last_run_file):
    with open(last_run_file, "r") as f:
        # Named previous_run_id (not `id`) so the builtin id() is not shadowed
        # for the remainder of the notebook. An empty file means "no run".
        previous_run_id = f.readline().strip() or None

# run_id=None starts a brand-new run. The file handle is already closed here.
mlflow.start_run(run_id=previous_run_id)
with open(last_run_file, "w") as f:
    f.write(str(mlflow.active_run().info.run_id))

num_epochs = 20
# A progress line and a checkpoint are written every this many steps
# (including step 0 of each epoch).
checkpoint_every = 10000

# Train from start_epoch (0, or the epoch restored from a checkpoint) up to
# num_epochs, logging the mean training loss of every epoch to MLflow.
for epoch in range(start_epoch, num_epochs):
    model.train()
    total_loss = 0
    size = len(train_dataloader)  # number of batches per epoch
    print("Epoch {}".format(epoch))
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_masks, labels = batch

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Convert to a Python float once, without rebinding `loss` itself.
        loss_value = loss.item()
        total_loss += loss_value
        if step % checkpoint_every == 0:
            # Progress is reported as batches done out of batches per epoch,
            # so both numbers share a unit (previously a sample count was
            # shown against the batch count).
            tqdm.write(f" |   loss: {loss_value}  [{step}/{size}]")
            save_checkpoint(
                {
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict(),
                    'epoch': epoch,
                    'total_steps': total_steps,
                },
                checkpoint_file,
                model,
                model_file,
            )

    avg_train_loss = total_loss / size
    print(f'Epoch {epoch+1}/{num_epochs} - Average training loss: {avg_train_loss}')

    mlflow.log_metric('train_loss', avg_train_loss, step=epoch + 1)


def evaluate_model(model, dataloader):
    """Return the mean sentence-level BLEU score of ``model`` over ``dataloader``.

    Uses the notebook-level ``device`` and ``tokenizer``. Predictions come from
    ``model.generate``; both predictions and labels are decoded to text and
    split on whitespace before being scored with ``sentence_bleu``.

    Args:
        model: Model exposing ``eval()`` and ``generate()``.
        dataloader: Iterable of ``(input_ids, attention_masks, labels)``
            tensor batches.

    Returns:
        Mean BLEU over every sentence that was scored, or ``0.0`` when the
        dataloader yields no sentences.
    """
    model.eval()
    total_bleu_score = 0.0
    num_sentences = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_masks, labels = (t.to(device) for t in batch)
            # NOTE(review): if inputs are padded to 128 tokens, max_length=129
            # may leave room for only one generated token - TODO confirm.
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_masks, max_length= 129)
            predicted_sentences = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
            reference_sentences = [tokenizer.decode(label, skip_special_tokens=True) for label in labels]
            for predicted, reference in zip(predicted_sentences, reference_sentences):
                total_bleu_score += sentence_bleu([reference.split()], predicted.split())
                num_sentences += 1

    # Average over what was actually scored; this also avoids a
    # ZeroDivisionError when the dataloader is empty.
    if num_sentences == 0:
        return 0.0
    return total_bleu_score / num_sentences

# Score the trained model on the validation DataLoader.
avg_val_bleu_score = evaluate_model(model, val_dataloader)
print(f'Average BLEU score on validation data: {avg_val_bleu_score}')

# Save the weights (state_dict only) locally, then log the model and the
# validation metric to the active MLflow run and close that run.
model_save_path = "./BERT/bert_translation_model"
torch.save(model.state_dict(), model_save_path)
mlflow.pytorch.log_model(model, artifact_path='models')
mlflow.log_metric('val_bleu_score', avg_val_bleu_score)
mlflow.end_run()




Recovering from previous checkpoint-----
Starting Epoch: 2 / 20, Total Steps: 882360
Epoch 2


  0%|          | 0/44118 [00:00<?, ?it/s]

 |   loss: 11.759979248046875  [0/44118]


  2%|▏         | 934/44118 [15:05<7:35:10,  1.56it/s]