#### Loading libraries

In [None]:
! pip install torch
! pip install transformers
! pip install tqdm
! pip install pandas
! pip install torchmetrics
! pip install -U sentence-transformers

In [4]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
from torchmetrics.regression import PearsonCorrCoef
from sentence_transformers import SentenceTransformer, util, InputExample, losses, models, evaluation
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def load_data(file_path):
    data = pd.read_table(file_path)
    # check if any missing values
    print(data.isnull().sum())
    key = data.keys()
    # some values were missing in sentence2 column, so did the below (sentence1 didnt split properly)
    # iterate through the rows in dataframe which have missing values
    for index, row in data[data.isnull().any(axis=1)].iterrows():
        if pd.isnull(row[key[2]]):
            if(len(row[key[1]].split('\t')) > 2 or len(row[key[1]].split('\t')) < 2):
                data.drop(index, inplace=True)
                continue
            # split the sentence1 into words into 2 parts based on \t and assign to sentence1 and sentence2
            sentence1, sentence2 = row[key[1]].split('\t')
            sentenceid = row[key[0]]
            # assign to the row
            data.at[index, key[1]] = sentence1
            data.at[index, key[2]] = sentence2
            data.at[index, key[0]] = sentenceid
    return data

In [6]:
test_data = load_data('test.csv')

id           0
sentence1    0
sentence2    0
dtype: int64


### Defining Device Variable

In [14]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda:0


### Task 1C

In [81]:
from sentence_transformers import SentenceTransformer, models
# Create the Sentence Transformer model
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [82]:
class ValidationDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence1 = self.data.iloc[idx, 1]
        sentence2 = self.data.iloc[idx, 2]
        id = torch.tensor(self.data.iloc[idx, 0])
        return [sentence1, sentence2], id

test_dataset = ValidationDataset(test_data)

test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=8, shuffle=False)

In [83]:
model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [15]:
scores = []
#load checkpoint
checkpoint = torch.load('/1C_model.pth')
model.load_state_dict(checkpoint)
def validation(model, valid_loader):
    model.eval()
    all_scores = []
    with torch.no_grad():
        for batch in tqdm(valid_loader):
            sentences, targets = batch
            for i in range(len(targets)):
                sentence1_features = model.encode(sentences[0][i], convert_to_tensor=True).to(device)
                sentence2_features = model.encode(sentences[1][i], convert_to_tensor=True).to(device)
                score = util.pytorch_cos_sim(sentence1_features, sentence2_features)
                all_scores.append(score.item())
    return all_scores

In [80]:
output1c = validation(model, test_loader)

100%|██████████| 1/1 [00:00<00:00,  3.16it/s]

[0.9926149845123291, 0.7718663811683655, 0.6741741299629211, 0.9912800788879395, 0.7612790465354919, 0.9965555667877197]





In [7]:
scores = output1c
scores = [i * 5 for i in scores]
pearson = PearsonCorrCoef()
print(torch.tensor(scores))
print(torch.tensor([5.000, 4.750, 5.000, 2.400, 2.750, 2.615]))
pearson(torch.tensor(scores), torch.tensor([5.000, 4.750, 5.000, 2.400, 2.750, 2.615]))

In [13]:
#add scores to test_data
test_data['scores'] = scores
#make order as id, scores, sentence1, sentence2
test_data = test_data[['id', 'scores', 'sentence1', 'sentence2']]
test_data.to_csv('test_scores.csv', index=False, sep='\t')

## Task 2A

#### Installations

In [1]:
!pip install nltk
!pip install bert_score
!pip install datasets
!pip install evaluate
!pip install -U  torchdata
!pip install -U spacy
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm




[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.1/12.8 MB 465.5 kB/s eta 0:00:28
     - -------------------------------------- 0.4/12.8 MB 2.4 MB/s eta 0:00:06
     - -------------------------------------- 0.4/12.8 MB 2.4 MB/s eta 0:00:06
     -- ------------------------------------- 0.7/12.8 MB 2.8 MB/s eta 0:00:05
     -- ------------------------------------- 0.8/12.8 MB 3.1 MB/s eta 0:00:04
     --- ------------------------------------ 1.1/12.8 MB 3.3 MB/s eta 0:00:04
     ---- ----------------------------------- 1.4/12.8 MB 3.8 MB/s eta 0:00:04
     ---- ----------------------------------- 1.5/12.8


[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
     ---------------------------------------- 0.0/14.6 MB ? eta -:--:--
     --------------------------------------- 0.0/14.6 MB 330.3 kB/s eta 0:00:45
     --------------------------------------- 0.0/14.6 MB 330.3 kB/s eta 0:00:45
     --------------------------------------- 0.1/14.6 MB 491.5 kB/s eta 0:00:30
      --------------------------------------- 0.4/14.6 MB 2.2 MB/s eta 0:00:07
     -- ------------------------------------- 0.8/14.6 MB 3.7 MB/s eta 0:00:04
     --- ------------------------------------ 1.1/14.6 MB 4.5 MB/s eta 0:00:04
     ---- ----------------------------------- 1.7/14.6 MB 5.1 MB/s eta 0:00:03
     ----- ---------------------------------- 2.0/14.6 MB 5.7 MB/s eta 0:00:03
     ------ --------------------------------- 2.3/14.6 MB 5.5 MB/s eta 0:00:03
     ------- ----------------------


[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
import tqdm
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from typing import Iterable, List
from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
from timeit import default_timer as timer
import nltk 
from nltk.tokenize import word_tokenize
import evaluate
import bert_score
import wandb

In [3]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#### Loading data and data visualization

In [4]:
train_data = load_dataset("wmt16", "de-en", split="train[:50000]")
val_data = load_dataset("wmt16", "de-en", split="validation")
test_data = load_dataset("wmt16", "de-en", split="test")

In [5]:
print(f"Train data size: {len(train_data)}, type: {type(train_data)}")
print(f"Validation data size: {len(val_data)}, type: {type(val_data)}")
print(f"Test data size: {len(test_data)}, type: {type(test_data)}")

Train data size: 50000, type: <class 'datasets.arrow_dataset.Dataset'>
Validation data size: 2169, type: <class 'datasets.arrow_dataset.Dataset'>
Test data size: 2999, type: <class 'datasets.arrow_dataset.Dataset'>


In [6]:
for i in range(5):
    data = train_data[i]
    german = data["translation"]["de"]
    english = data["translation"]["en"]
    print(f"German: {german}")
    print(f"English: {english}")

German: Wiederaufnahme der Sitzungsperiode
English: Resumption of the session
German: Ich erkläre die am Freitag, dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen, wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe, daß Sie schöne Ferien hatten.
English: I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.
German: Wie Sie feststellen konnten, ist der gefürchtete "Millenium-Bug " nicht eingetreten. Doch sind Bürger einiger unserer Mitgliedstaaten Opfer von schrecklichen Naturkatastrophen geworden.
English: Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.
German: Im Parlament besteht der Wunsch nach einer Aussprache im Verlauf dieser Sit

In [7]:
config = dict(
    source_language = "de", 
    target_language = "en",
    EMB_SIZE = 512,
    NHEAD = 8,
    FFN_HID_DIM = 512,
    BATCH_SIZE = 64,
    NUM_ENCODER_LAYERS = 3,
    NUM_DECODER_LAYERS = 3,
    NUM_EPOCHS = 8
)

In [8]:
source_language = config["source_language"]
target_language = config["target_language"]

#### Initialising tokenizer

In [9]:
# Tokenization
token_transform = {}
token_transform[source_language] = get_tokenizer('spacy', language='de_core_news_sm')
token_transform[target_language] = get_tokenizer('spacy', language='en_core_web_sm')

In [10]:
for i in range(5): 
    print(train_data[i])
    data_pt = train_data[i]
    data_src = data_pt['translation'][source_language]
    data_tgt = data_pt['translation'][target_language]
    print(f"German: {data_src}")
    print(f"English: {data_tgt}")
    print(f"Tokenized German: {token_transform[source_language](data_src)}")
    print(f"Tokenized English: {token_transform[target_language](data_tgt)}")
    break
    

{'translation': {'de': 'Wiederaufnahme der Sitzungsperiode', 'en': 'Resumption of the session'}}
German: Wiederaufnahme der Sitzungsperiode
English: Resumption of the session
Tokenized German: ['Wiederaufnahme', 'der', 'Sitzungsperiode']
Tokenized English: ['Resumption', 'of', 'the', 'session']


In [11]:
# helper function to yield list of tokens
def yield_tokens(data_iter: Iterable, language: str) -> List[str]:
    for data_sample in data_iter:
        yield token_transform[language](data_sample['translation'][language])

In [12]:
tokens = yield_tokens(train_data, source_language)
# Checking whether tokenization works or not
for token in tokens: 
    print(token)
    break

['Wiederaufnahme', 'der', 'Sitzungsperiode']


#### Text pre-processing

In [13]:
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

In [14]:
vocab_transform = {}   
for ln in [source_language, target_language]:
    # Create torchtext's Vocab object
    vocab_transform[ln] = build_vocab_from_iterator(yield_tokens(train_data, ln), min_freq=1, specials=special_symbols)

In [15]:
for ln in [source_language, target_language]:
  vocab_transform[ln].set_default_index(UNK_IDX)

In [16]:
SRC_VOCAB_SIZE = len(vocab_transform[source_language])
TGT_VOCAB_SIZE = len(vocab_transform[target_language])

In [17]:
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones((sz, sz), device=DEVICE)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

In [18]:
def create_mask(src, tgt):
    src_seq_len = src.shape[0]
    tgt_seq_len = tgt.shape[0]

    tgt_mask = generate_square_subsequent_mask(tgt_seq_len)
    src_mask = torch.zeros((src_seq_len, src_seq_len),device=DEVICE).type(torch.bool)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

In [19]:
def sequential_transforms(*transforms):
    def func(txt_input):
        for transform in transforms:
            txt_input = transform(txt_input)
        return txt_input
    return func

In [20]:
def tensor_transform(token_ids: List[int]):
    return torch.cat((torch.tensor([BOS_IDX]), torch.tensor(token_ids), torch.tensor([EOS_IDX])))

In [21]:
text_transform = {}
for ln in [source_language, target_language]:
    text_transform[ln] = sequential_transforms(token_transform[ln], #Tokenization
                                               vocab_transform[ln], #Numericalization
                                               tensor_transform) # Add BOS/EOS and create tensor

#### Dataloaders

In [22]:
def collate_fn(batch):
    src_batch, tgt_batch = [], []
    for element in batch: 
        src_ = element["translation"][source_language]
        tgt_ = element["translation"][target_language]
        src_batch.append(text_transform[source_language](src_))
        tgt_batch.append(text_transform[target_language](tgt_))
    src_batch = nn.utils.rnn.pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = nn.utils.rnn.pad_sequence(tgt_batch, padding_value=PAD_IDX)
    element = {'src': src_batch, 'tgt': tgt_batch}
    return element

#### Model Architecture

In [23]:
# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    def __init__(self, emb_size: int, dropout: float, maxlen: int = 5000):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(pos * den)
        pos_embedding[:, 1::2] = torch.cos(pos * den)
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

In [24]:
# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size: int, emb_size):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        return self.embedding(tokens.long()) * math.sqrt(self.emb_size)

In [25]:
# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    def __init__(self, num_encoder_layers: int, num_decoder_layers: int, emb_size: int, nhead: int, src_vocab_size: int, tgt_vocab_size: int, dim_feedforward: int = 512, dropout: float = 0.1):
        super(Seq2SeqTransformer, self).__init__()
        self.transformer = Transformer(d_model=emb_size, nhead=nhead, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, dim_feedforward=dim_feedforward, dropout=dropout)
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self, src: Tensor, trg: Tensor, src_mask: Tensor, tgt_mask: Tensor, src_padding_mask: Tensor, tgt_padding_mask: Tensor, memory_key_padding_mask: Tensor):
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(src_emb, tgt_emb, src_mask, tgt_mask, None, src_padding_mask, tgt_padding_mask, memory_key_padding_mask)
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        return self.transformer.encoder(self.positional_encoding(self.src_tok_emb(src)), src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        return self.transformer.decoder(self.positional_encoding(self.tgt_tok_emb(tgt)), memory, tgt_mask)

In [26]:
torch.manual_seed(0)

transformer = Seq2SeqTransformer(config["NUM_ENCODER_LAYERS"], config["NUM_DECODER_LAYERS"], config["EMB_SIZE"], config["NHEAD"], SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, config["FFN_HID_DIM"])

for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer = transformer.to(DEVICE)

loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)

optimizer = torch.optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)



In [27]:
def evaluate(model, dataloader):
    model.eval()
    losses = 0

    for i, batch in enumerate(dataloader):
        src = batch["src"]
        tgt = batch["tgt"]
        src = src.to(DEVICE)
        tgt = tgt.to(DEVICE)

        tgt_input = tgt[:-1, :]

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        tgt_out = tgt[1:, :]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()
    return losses / len(list(dataloader))

#### Evaluation

In [28]:
# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0)).type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys

In [29]:
# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    src = text_transform[config["source_language"]](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    tgt_tokens = greedy_decode(
        model,  src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    return " ".join(vocab_transform[config["target_language"]].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

In [30]:
# load model
transformer = Seq2SeqTransformer(config["NUM_ENCODER_LAYERS"], config["NUM_DECODER_LAYERS"], config["EMB_SIZE"], config["NHEAD"], SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, config["FFN_HID_DIM"])
transformer.load_state_dict(torch.load("Task_2A/2A_transformer_run_3.pth"))
transformer = transformer.to(DEVICE)



In [None]:
# compute BLEU score using huggingface datasets library
from datasets import load_metric
metric = load_metric("bleu")
# compute actual BLEU score on validation dataset
train_predictions = []
train_references = []
# generate predictions and references for all sentences in the train dataset
for i in tqdm.tqdm(range(len(train_data))):
    src = train_data[i]["translation"][config["source_language"]]
    prediction = translate(transformer, src)
    reference = train_data[i]["translation"][config["target_language"]]
    train_predictions.append(prediction)
    train_references.append([reference])
train_reference = [r[0] for r in train_references]

In [31]:
val_predictions = []    
val_references = []
# generate predictions and references for all sentences in the validation dataset
for i in tqdm.tqdm(range(len(val_data))):
    src = val_data[i]["translation"][config["source_language"]]
    prediction = translate(transformer, src)
    reference = val_data[i]["translation"][config["target_language"]]
    val_predictions.append(prediction)
    val_references.append([reference])
val_reference = [r[0] for r in val_references]

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
100%|██████████| 2169/2169 [11:18<00:00,  3.20it/s]


In [None]:
test_predictions = []
test_references = []
# generate predictions and references for all sentences in the test dataset
for i in tqdm.tqdm(range(len(test_data))):
    src = test_data[i]["translation"][config["source_language"]]
    prediction = translate(transformer, src)
    reference = test_data[i]["translation"][config["target_language"]]
    test_predictions.append(prediction)
    test_references.append([reference])
test_reference = [r[0] for r in test_references]

In [34]:
from evaluate import load
meteor = load('meteor')
train_meteor = meteor.compute(predictions=train_predictions, references=train_references)
val_meteor = meteor.compute(predictions=val_predictions, references=val_references)
test_meteor = meteor.compute(predictions=test_predictions, references=test_references)
print(f"Train METEOR score: {train_meteor}")
print(f"Validation METEOR score: {val_meteor}")
print(f"Test METEOR score: {test_meteor}")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prakh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prakh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\prakh\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Validation METEOR score: {'meteor': 0.23495087207185508}


In [None]:
bertScore = load("bertscore")
train_bert = bertScore.compute(predictions=train_predictions, references=train_references, lang="en")
train_precision = np.mean(train_bert["precision"])
train_recall = np.mean(train_bert["recall"])
train_f1 = np.mean(train_bert["f1"])
val_bert = bertScore.compute(predictions=val_predictions, references=val_references, lang="en")
val_precision = np.mean(val_bert["precision"])
val_recall = np.mean(val_bert["recall"])
val_f1 = np.mean(val_bert["f1"])
test_bert = bertScore.compute(predictions=test_predictions, references=test_references, lang="en")
test_precision = np.mean(test_bert["precision"])
test_recall = np.mean(test_bert["recall"])
test_f1 = np.mean(test_bert["f1"])
print(f"Train data: Precision: {train_precision}, Recall: {train_recall}, F1: {train_f1}")
print(f"Validation data: Precision: {val_precision}, Recall: {val_recall}, F1: {val_f1}")
print(f"Test data: Precision: {test_precision}, Recall: {test_recall}, F1: {test_f1}")

In [36]:
from nltk.translate.bleu_score import corpus_bleu
def compute_bleu(reference_validation,translations_validation):
    w1 = [1, 0, 0, 0]
    w2 = [0.5, 0.5, 0, 0]
    w3 = [1/3, 1/3, 1/3, 0]
    w4 = [0.25, 0.25, 0.25, 0.25]
    bleu1 = corpus_bleu([[ref] for ref in reference_validation], translations_validation, weights=w1)
    bleu2 = corpus_bleu([[ref] for ref in reference_validation], translations_validation, weights=w2)
    bleu3 = corpus_bleu([[ref] for ref in reference_validation], translations_validation, weights=w3)
    bleu4 = corpus_bleu([[ref] for ref in reference_validation], translations_validation, weights=w4)
    print("BLEU-1 score:", bleu1)
    print("BLEU-2 score:", bleu2)
    print("BLEU-3 score:", bleu3)
    print("BLEU-4 score:", bleu4)
# print("Training Dataset: ")
# compute_bleu(train_reference, train_predictions)
print("Validation Dataset: ")
compute_bleu(val_reference, val_predictions)
# print("Testing Dataset: ")
# compute_bleu(test_reference, test_predictions)

Validation Dataset: 
BLEU-1 score: 0.718031894381054
BLEU-2 score: 0.5519852998864307
BLEU-3 score: 0.41669291365055505
BLEU-4 score: 0.32828448914569114


## TASK 2B

In [None]:
csv_2b = pd.read_csv("file_path")

sentences_de = csv_2b['en']

In [None]:
model_name = "google-t5/t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
tranlations_en = []

# testing_data = test_Data
print(device)
model.to(device)

# for data in [validation_data, testing_data]:
for example in tqdm(sentences_de):
    # Prepend the prefix for translation task
    input_text = "translate English to German: " + example["translation"]["en"]  # Modified here
    # Tokenize the input text
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)
    # Generate translation
    translated_ids = model.generate(input_ids,  max_length=512, num_beams=4, early_stopping=True).to(device)
    # Decode the translated ids
    translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    # if data == validation_data:
    tranlations_en.append(translated_text)

# make a new csv with columns id, de, en
csv_2b['de'] = tranlations_en
csv_2b.to_csv('en_de.csv', index=False, sep='\t')


In [None]:
# compute BLEU score using huggingface datasets library
from datasets import load_metric
metric = load_metric("bleu")
# compute actual BLEU score on validation dataset
train_predictions = []
train_references = []
# generate predictions and references for all sentences in the train dataset
for i in tqdm.tqdm(range(len(train_data))):
    src = train_data[i]["translation"][config["source_language"]]
    prediction = translate(transformer, src)
    reference = train_data[i]["translation"][config["target_language"]]
    train_predictions.append(prediction)
    train_references.append([reference])
train_reference = [r[0] for r in train_references]

In [None]:
val_predictions = []    
val_references = []
# generate predictions and references for all sentences in the validation dataset
for i in tqdm.tqdm(range(len(val_data))):
    src = val_data[i]["translation"][config["source_language"]]
    prediction = translate(transformer, src)
    reference = val_data[i]["translation"][config["target_language"]]
    val_predictions.append(prediction)
    val_references.append([reference])
val_reference = [r[0] for r in val_references]

In [None]:
test_predictions = []
test_references = []
# generate predictions and references for all sentences in the test dataset
for i in tqdm.tqdm(range(len(test_data))):
    src = test_data[i]["translation"][config["source_language"]]
    prediction = translate(transformer, src)
    reference = test_data[i]["translation"][config["target_language"]]
    test_predictions.append(prediction)
    test_references.append([reference])
test_reference = [r[0] for r in test_references]

In [None]:
meteor = evaluate.load('meteor')
# train_meteor = meteor.compute(predictions=train_predictions, references=train_references)
val_meteor = meteor.compute(predictions=val_predictions, references=val_references)
# test_meteor = meteor.compute(predictions=test_predictions, references=test_references)
# print(f"Train METEOR score: {train_meteor}")
print(f"Validation METEOR score: {val_meteor}")
# print(f"Test METEOR score: {test_meteor}")

In [None]:
bertScore = load("bertscore")
# train_bert = bertScore.compute(predictions=train_predictions, references=train_references, lang="en")
# train_precision = np.mean(train_bert["precision"])
# train_recall = np.mean(train_bert["recall"])
# train_f1 = np.mean(train_bert["f1"])
val_bert = bertScore.compute(predictions=val_predictions, references=val_references, lang="en")
val_precision = np.mean(val_bert["precision"])
val_recall = np.mean(val_bert["recall"])
val_f1 = np.mean(val_bert["f1"])
# test_bert = bertScore.compute(predictions=test_predictions, references=test_references, lang="en")
# test_precision = np.mean(test_bert["precision"])
# test_recall = np.mean(test_bert["recall"])
# test_f1 = np.mean(test_bert["f1"])
# print(f"Train data: Precision: {train_precision}, Recall: {train_recall}, F1: {train_f1}")
print(f"Validation data: Precision: {val_precision}, Recall: {val_recall}, F1: {val_f1}")
# print(f"Test data: Precision: {test_precision}, Recall: {test_recall}, F1: {test_f1}")

In [None]:
from nltk.translate.bleu_score import corpus_bleu
def compute_bleu(reference_validation,translations_validation):
    w1 = [1, 0, 0, 0]
    w2 = [0.5, 0.5, 0, 0]
    w3 = [1/3, 1/3, 1/3, 0]
    w4 = [0.25, 0.25, 0.25, 0.25]
    bleu1 = corpus_bleu([[ref] for ref in reference_validation], translations_validation, weights=w1)
    bleu2 = corpus_bleu([[ref] for ref in reference_validation], translations_validation, weights=w2)
    bleu3 = corpus_bleu([[ref] for ref in reference_validation], translations_validation, weights=w3)
    bleu4 = corpus_bleu([[ref] for ref in reference_validation], translations_validation, weights=w4)
    print("BLEU-1 score:", bleu1)
    print("BLEU-2 score:", bleu2)
    print("BLEU-3 score:", bleu3)
    print("BLEU-4 score:", bleu4)
# print("Training Dataset: ")
# compute_bleu(train_reference, train_predictions)
print("Validation Dataset: ")
compute_bleu(val_reference, val_predictions)
# print("Testing Dataset: ")
# compute_bleu(test_reference, test_predictions)

## TASK 2C

In [None]:
csv_2c = pd.read_csv("file_path")

sentences_de = csv_2c['de']

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_dataset, load_metric
from tqdm import tqdm

# Load the fine-tuned model
model_path = "/content/drive/MyDrive/fine_tuned_t5_small_task2C"  # Path to the saved fine-tuned model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = T5ForConditionalGeneration.from_pretrained(model_path).to(device)
tokenizer = T5Tokenizer.from_pretrained(model_path)

# Load the test dataset
test_dataset = load_dataset("wmt16", "de-en", split="test")

# Define the BLEU metric
bleu_metric = load_metric("sacrebleu")

# Generate translations for the test dataset
translations = []
references = []
for example in tqdm(sentences_de):
    input_text = "translate German to English: " + example
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=128, truncation=True).to(device)
    translated_ids = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
    translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    translations.append(translated_text)
    references.append(example["translation"]["en"])

# make a new csv with columns id, de, en
csv_2c['en'] = translations
csv_2c.to_csv('de_en.csv', index=False, sep='\t')

