In [2]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model = T5ForConditionalGeneration.from_pretrained('t5-large')
tokenizer = T5Tokenizer.from_pretrained('t5-large')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# LINK PREDICTION

# load the dataset

In [None]:
### IGNORE
with open('/kaggle/input/project-dataset/kb.txt', 'r') as file:
    lines = file.readlines()
    number_of_lines = len(lines)
print(number_of_lines)

predict_head = []
predict_tail = []
inputs = []
targets = []

for line_index in range(len(lines)):
    line_split = lines[line_index].strip().split('|')
    subject = line_split[0]
    predicate = line_split[1]
    obj = line_split[2]
    if(line_index < len(lines)/2):
        predict_head_input = f"? | {predicate} | {obj}"
        predict_head_output = subject
        predict_head.append((predict_head_input, predict_head_output))
        inputs.append(predict_head_input)
        targets.append(predict_head_output)
    else:
        predict_tail_input = f"{subject} | {predicate} | ?"
        predict_tail_output = obj
        predict_tail.append((predict_tail_input, predict_tail_output))
        inputs.append(predict_tail_input)
        targets.append(predict_tail_output)
        
# print(len(predict_head[0]))
# print(predict_tail[:10])
print(inputs[:10], targets[:10])


# split the data for training, validation & testing

In [None]:
### IGNORE
from sklearn.model_selection import train_test_split

# Split data into training and temporary data (temp will be split into validation and test)
input_train, input_temp, target_train, target_temp = train_test_split(
    inputs, targets, test_size=0.2, random_state=42)  # 80% train, 20% temp

# Split the temporary data into validation and test sets
input_val, input_test, target_val, target_test = train_test_split(
    input_temp, target_temp, test_size=0.5, random_state=42)  # Split temp equally into val and test

# Print the sizes of each dataset to verify the splits
print(f"Training set size: {len(input_train)}")
print(f"Validation set size: {len(input_val)}")
print(f"Test set size: {len(input_test)}")

In [None]:
### IGNORE
def save_dataset(file_path, inputs, targets):
    with open(file_path, 'w', encoding='utf-8') as file:
        for input_text, target_text in zip(inputs, targets):
            file.write(f"{input_text}\t{target_text}\n")

# Save datasets to separate files
save_dataset("/kaggle/working/train_dataset.txt", input_train, target_train)
save_dataset("/kaggle/working/validation_dataset.txt", input_val, target_val)
save_dataset("/kaggle/working/test_dataset.txt", input_test, target_test)


In [None]:
### IGNORE
import pandas as pd

def save_dataset_pandas(file_path, inputs, targets):
    df = pd.DataFrame({'input': inputs, 'target': targets})
    df.to_csv(file_path, index=False, sep='\t')

# Save datasets to separate files
save_dataset_pandas("/kaggle/working/train_dataset.tsv", input_train, target_train)
save_dataset_pandas("/kaggle/working/validation_dataset.tsv", input_val, target_val)
save_dataset_pandas("/kaggle/working/test_dataset.tsv", input_test, target_test)


In [5]:
def load_dataset(file_path):
    inputs = []
    targets = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            input_text, target_text = line.strip().split('\t')
            inputs.append(input_text)
            targets.append(target_text)
    return inputs, targets

# Load datasets
train_inputs, train_targets = load_dataset("/kaggle/input/project-dataset-split/train_dataset.txt")
# validation_inputs, validation_targets = load_dataset("/kaggle/input/project-dataset-split/validation_dataset.txt")
# test_inputs, test_targets = load_dataset("/kaggle/input/project-dataset-split/test_dataset.txt")


#  tokenize data

In [None]:
def tokenize_data(inputs, targets, tokenizer, max_input_length=128, max_target_length=30):
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize training, validation, and test sets
train_data = tokenize_data(train_inputs, train_targets, tokenizer)
# val_data = tokenize_data(input_val, target_val, tokenizer)
# test_data = tokenize_data(input_test, target_test, tokenizer)


In [None]:
print(type(model_inputs))

In [None]:
from torch.utils.data import DataLoader, Dataset

class T5Dataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

train_dataset = T5Dataset(train_data)
# val_dataset = T5Dataset(val_data)
# test_dataset = T5Dataset(test_data)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)
# test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [None]:
from transformers import AdamW, get_scheduler
import torch

optimizer = AdamW(model.parameters(), lr=5e-4)
num_epochs = 2
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Training loop
for epoch in range(num_epochs):
    model.train()
    print(f"Epoch : {epoch}")
    i = 1
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        if i%10 == 0:
            print(f"{i} Training loss: {loss.item()}")
        i = i+1


In [None]:
# Saving only the model weights
model_weights_path = "/kaggle/working/model_weights.pth"
torch.save(model.state_dict(), model_weights_path)


In [None]:
model.save_pretrained('/kaggle/working/T5-finetuned')

# load the model

In [2]:
import torch

# Path to the saved weights
model_weights_path = '/kaggle/input/t5-linkpred/model_weights.pth'

# Load the weights into the model
model.load_state_dict(torch.load(model_weights_path))


<All keys matched successfully>

# load validation and test data

In [3]:
def load_dataset(file_path):
    inputs = []
    targets = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            input_text, target_text = line.strip().split('\t')
            inputs.append(input_text)
            targets.append(target_text)
    return inputs, targets

# Load datasets
# train_inputs, train_targets = load_dataset("train_dataset.txt")
validation_inputs, validation_targets = load_dataset("/kaggle/input/project-dataset-split/validation_dataset.txt")
test_inputs, test_targets = load_dataset("/kaggle/input/project-dataset-split/test_dataset.txt")


In [4]:
print(len(validation_inputs))
print(len(test_inputs))

13474
13475


In [5]:
def tokenize_data(inputs, targets, tokenizer, max_input_length=128, max_target_length=30):
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize training, validation, and test sets
# train_data = tokenize_data(input_train, target_train, tokenizer)
val_data = tokenize_data(validation_inputs, validation_targets, tokenizer)
test_data = tokenize_data(test_inputs, test_targets, tokenizer)




In [6]:
from torch.utils.data import DataLoader, Dataset

class T5Dataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

# train_dataset = T5Dataset(train_data)
val_dataset = T5Dataset(val_data)
test_dataset = T5Dataset(test_data)

# train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Evaluate

In [7]:
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=1024, out_features=4096, bias=False)
              (wo): Linear(in_features=4096, out_features=1024, bias=False)
              (d

In [13]:
import torch

def evaluate_model(model, dataloader):
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    model.eval()  # Make sure model is in eval mode
    total_loss = 0
    predictions, true_labels = [], []

    with torch.no_grad():  # Disable gradient calculation
        for batch in dataloader:
            # Move batch to the same device as the model
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)

            # Collect the loss and calculate the average
            loss = outputs.loss
            total_loss += loss.item()

            # Decode predictions
            predicted_ids = torch.argmax(outputs.logits, dim=-1)
#             predicted_tokens = [tokenizer.decode(ids) for ids in predicted_ids]
            predicted_tokens = [tokenizer.decode(ids, skip_special_tokens=True) for ids in predicted_ids]
            predictions.extend(predicted_tokens)
            true_labels.extend([tokenizer.decode(ids, skip_special_tokens=True) for ids in batch['labels']])

    # Calculate average loss
    average_loss = total_loss / len(dataloader)
    return predictions, true_labels, average_loss

# Evaluate on validation and test datasets
print("----- Validation -----")
val_predictions, val_labels, val_loss = evaluate_model(model, val_dataloader)
print(f"Validation loss: {val_loss}")
print("----- Test -----")
test_predictions, test_labels, test_loss = evaluate_model(model, test_dataloader)
print(f"Test loss: {test_loss}")

# Here you can further calculate accuracy or other metrics based on predictions and true_labels


----- Validation -----
Validation loss: 0.3567079286900755
----- Test -----
Test loss: 0.3577617426300615


In [16]:
def calculate_accuracy(predictions, true_labels):
    correct_predictions = sum(1 for pred, true in zip(predictions, true_labels) if pred == true)
    total_predictions = len(predictions)
    accuracy = correct_predictions / total_predictions
    print(correct_predictions)
    return accuracy

# Calculate accuracy
val_accuracy = calculate_accuracy(val_predictions, val_labels)
test_accuracy = calculate_accuracy(test_predictions, test_labels)

print(f"Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}")
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

340
363
Validation Loss: 0.3567079286900755, Validation Accuracy: 0.025233783583197267
Test Loss: 0.3577617426300615, Test Accuracy: 0.026938775510204082


In [18]:
# Print some sample predictions and labels
# for i in range(len(val_predictions)):
#     if val
#     print(f"Prediction: {val_predictions[i]}, True Label: {val_labels[i]}")
for pred, true in zip(val_predictions, val_labels):
    if pred == true:
        print(f"Prediction: {pred}, True Label: {true}")

Prediction: Drama, True Label: Drama
Prediction: Drama, True Label: Drama
Prediction: Drama, True Label: Drama
Prediction: bd-r, True Label: bd-r
Prediction: Drama, True Label: Drama
Prediction: Drama, True Label: Drama
Prediction: Drama, True Label: Drama
Prediction: 2007, True Label: 2007
Prediction: Drama, True Label: Drama
Prediction: 2007, True Label: 2007
Prediction: Drama, True Label: Drama
Prediction: bd-r, True Label: bd-r
Prediction: bd-r, True Label: bd-r
Prediction: Drama, True Label: Drama
Prediction: French, True Label: French
Prediction: 2007, True Label: 2007
Prediction: Drama, True Label: Drama
Prediction: French, True Label: French
Prediction: French, True Label: French
Prediction: French, True Label: French
Prediction: French, True Label: French
Prediction: Drama, True Label: Drama
Prediction: bd-r, True Label: bd-r
Prediction: good, True Label: good
Prediction: good, True Label: good
Prediction: 2007, True Label: 2007
Prediction: Drama, True Label: Drama
Prediction:

In [21]:
def save_predictions_labels(predictions, labels, file_name):
    with open(file_name, 'w', encoding='utf-8') as file:
        for pred, label in zip(predictions, labels):
            file.write(f"{pred}\t{label}\n")

# Assuming you have val_predictions, val_labels, test_predictions, and test_labels
save_predictions_labels(val_predictions, val_labels, "/kaggle/working/validation_predictions_labels.txt")
save_predictions_labels(test_predictions, test_labels, "/kaggle/working/test_predictions_labels.txt")


# QUESTION ANWERING

In [6]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model = T5ForConditionalGeneration.from_pretrained('t5-large')
tokenizer = T5Tokenizer.from_pretrained('t5-large')

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# preprocess and load dataset

In [1]:
def preprocess_data(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    processed_lines = []

    for line in lines:
        question, answers = line.strip().split('\t')
        # Ensure the question ends with a question mark
        question = question.strip()
        if not question.endswith('?'):
            question += '?'

        # Split answers if there are multiple answers separated by '|'
        answers = answers.split('|')
        for answer in answers:
            processed_line = f"{question}\t{answer.strip()}\n"
            processed_lines.append(processed_line)

    # Write the processed lines to the output file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(processed_lines)

# Define your input and output file paths
input_file_path = '/kaggle/input/metaqa/qa_train.txt'
output_file_path = '/kaggle/working/qa_train_pp.txt'

# Call the function with the file paths
preprocess_data(input_file_path, output_file_path)

In [4]:
def load_dataset(file_path):
    inputs = []
    targets = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            input_text, target_text = line.strip().split('\t')
            inputs.append(input_text)
            targets.append(target_text)
    return inputs, targets
qa_train_inputs, qa_train_targets = load_dataset("/kaggle/working/qa_train_pp.txt")
print(qa_train_inputs[:10], qa_train_targets[:10])

['what movies are about [ginger rogers]?', 'what movies are about [ginger rogers]?', 'what movies are about [ginger rogers]?', 'which movies can be described by [moore]?', 'which movies can be described by [moore]?', 'what films can be described by [occupation]?', 'what films can be described by [occupation]?', 'which films are about [jacques tati]?', 'which films are about [jacques tati]?', 'which films are about [jacques tati]?'] ['Top Hat', 'Kitty Foyle', 'The Barkleys of Broadway', 'Fahrenheit 9/11', 'Far from Heaven', 'Red Dawn', 'The Teahouse of the August Moon', 'Mon Oncle', 'Playtime', 'Trafic']


# tokenize data

In [8]:
def tokenize_data(inputs, targets, tokenizer, max_input_length=128, max_target_length=50):
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize training, validation, and test sets
qa_train_data = tokenize_data(qa_train_inputs, qa_train_targets, tokenizer)


In [9]:
from torch.utils.data import DataLoader, Dataset

class T5Dataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

qa_train_dataset = T5Dataset(qa_train_data)
# val_dataset = T5Dataset(val_data)
# test_dataset = T5Dataset(test_data)

qa_train_dataloader = DataLoader(qa_train_dataset, batch_size=8, shuffle=True)
# val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)
# test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# load finetuned T5

In [10]:
import torch

# Path to the saved weights
model_weights_path = '/kaggle/input/t5-linkpred/model_weights.pth'

# Load the weights into the model
model.load_state_dict(torch.load(model_weights_path))

<All keys matched successfully>

In [13]:
from transformers import AdamW, get_scheduler
import torch

optimizer = AdamW(model.parameters(), lr=5e-4)
num_epochs = 1
num_training_steps = num_epochs * len(qa_train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Training loop
for epoch in range(num_epochs):
    model.train()
    print(f"Epoch : {epoch}")
    i = 1
    for batch in qa_train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        if i%10 == 0:
            print(f"{i} Training loss: {loss.item()}")
        i = i+1


Epoch : 0
10 Training loss: 0.21799565851688385
20 Training loss: 0.2503877580165863
30 Training loss: 0.24944278597831726
40 Training loss: 0.3157113194465637
50 Training loss: 0.30038830637931824
60 Training loss: 0.2885052561759949
70 Training loss: 0.2732456922531128
80 Training loss: 0.21573850512504578
90 Training loss: 0.2787550687789917
100 Training loss: 0.2711539566516876
110 Training loss: 0.41609352827072144
120 Training loss: 0.2582980990409851
130 Training loss: 0.23949582874774933
140 Training loss: 0.227314755320549
150 Training loss: 0.2575443685054779
160 Training loss: 0.2113952785730362
170 Training loss: 0.17997941374778748
180 Training loss: 0.30445680022239685
190 Training loss: 0.26993030309677124
200 Training loss: 0.2831249535083771
210 Training loss: 0.28735119104385376
220 Training loss: 0.35535141825675964
230 Training loss: 0.30472004413604736
240 Training loss: 0.20521700382232666
250 Training loss: 0.30886223912239075
260 Training loss: 0.374611258506774

In [14]:
# Saving only the model weights
model_weights_path = "/kaggle/working/model_weights_qa.pth"
torch.save(model.state_dict(), model_weights_path)


# load validation and test dataset

In [16]:
def preprocess_data(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    processed_lines = []

    for line in lines:
        question, answers = line.strip().split('\t')
        # Ensure the question ends with a question mark
        question = question.strip()
        if not question.endswith('?'):
            question += '?'

        # Split answers if there are multiple answers separated by '|'
        answers = answers.split('|')
        for answer in answers:
            processed_line = f"{question}\t{answer.strip()}\n"
            processed_lines.append(processed_line)

    # Write the processed lines to the output file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(processed_lines)

# Define your input and output file paths
input_file_path = '/kaggle/input/metaqa/qa_test.txt'
output_file_path = '/kaggle/working/qa_test_pp.txt'

# Call the function with the file paths
preprocess_data(input_file_path, output_file_path)

In [17]:
def load_dataset(file_path):
    inputs = []
    targets = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            input_text, target_text = line.strip().split('\t')
            inputs.append(input_text)
            targets.append(target_text)
    return inputs, targets
qa_val_inputs, qa_val_targets = load_dataset("/kaggle/working/qa_dev_pp.txt")
qa_test_inputs, qa_test_targets = load_dataset("/kaggle/working/qa_test_pp.txt")
print(qa_val_inputs[:10], qa_val_targets[:10])
print(qa_test_inputs[:10], qa_test_targets[:10])

['what movies did [Temuera Morrison] act in?', 'what movies did [Temuera Morrison] act in?', 'what movies did [Temuera Morrison] act in?', 'what movies did [Evelyn Venable] act in?', 'what movies did [Evelyn Venable] act in?', 'what movies did [Evelyn Venable] act in?', 'what does [Tom Cullen] act in?', 'what movies was [Shareeka Epps] an actor in?', 'what does [Peter Franzén] appear in?', 'what does [Peter Franzén] appear in?'] ['Once Were Warriors', 'Tracker', 'River Queen', 'Alice Adams', 'Death Takes a Holiday', 'The Little Colonel', 'Weekend', 'Half Nelson', 'Ambush', 'Dog Nail Clipper']
['what does [Grégoire Colin] appear in?', '[Joe Thomas] appears in which movies?', '[Joe Thomas] appears in which movies?', 'what films did [Michelle Trachtenberg] star in?', 'what films did [Michelle Trachtenberg] star in?', 'what films did [Michelle Trachtenberg] star in?', 'what films did [Michelle Trachtenberg] star in?', 'what films did [Michelle Trachtenberg] star in?', 'what does [Helen Mac

# tokenise data

In [18]:
def tokenize_data(inputs, targets, tokenizer, max_input_length=128, max_target_length=50):
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize training, validation, and test sets
qa_val_data = tokenize_data(qa_val_inputs, qa_val_targets, tokenizer)
qa_test_data = tokenize_data(qa_test_inputs, qa_test_targets, tokenizer)


In [19]:
from torch.utils.data import DataLoader, Dataset

class T5Dataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

# qa_train_dataset = T5Dataset(qa_train_data)
qa_val_dataset = T5Dataset(qa_val_data)
qa_test_dataset = T5Dataset(qa_test_data)

# qa_train_dataloader = DataLoader(qa_train_dataset, batch_size=8, shuffle=True)
qa_val_dataloader = DataLoader(qa_val_dataset, batch_size=8, shuffle=False)
qa_test_dataloader = DataLoader(qa_test_dataset, batch_size=8, shuffle=False)

In [21]:
import torch

def evaluate_model(model, dataloader):
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    model.eval()  # Make sure model is in eval mode
    total_loss = 0
    predictions, true_labels = [], []

    with torch.no_grad():  # Disable gradient calculation
        for batch in dataloader:
            # Move batch to the same device as the model
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)

            # Collect the loss and calculate the average
            loss = outputs.loss
            total_loss += loss.item()

            # Decode predictions
            predicted_ids = torch.argmax(outputs.logits, dim=-1)
#             predicted_tokens = [tokenizer.decode(ids) for ids in predicted_ids]
            predicted_tokens = [tokenizer.decode(ids, skip_special_tokens=True) for ids in predicted_ids]
            predictions.extend(predicted_tokens)
            true_labels.extend([tokenizer.decode(ids, skip_special_tokens=True) for ids in batch['labels']])

    # Calculate average loss
    average_loss = total_loss / len(dataloader)
    return predictions, true_labels, average_loss

# Evaluate on validation and test datasets
print("----- Validation -----")
qa_val_predictions, qa_val_labels, qa_val_loss = evaluate_model(model, qa_val_dataloader)
print(f"Validation loss: {qa_val_loss}")
print("----- Test -----")
qa_test_predictions, qa_test_labels, qa_test_loss = evaluate_model(model, qa_test_dataloader)
print(f"Test loss: {qa_test_loss}")

# Here you can further calculate accuracy or other metrics based on predictions and true_labels


----- Validation -----
Validation loss: 0.20789015198650354
----- Test -----
Test loss: 0.20737239391877538


In [24]:
def calculate_accuracy(predictions, true_labels):
    correct_predictions = sum(1 for pred, true in zip(predictions, true_labels) if pred == true)
    total_predictions = len(predictions)
    accuracy = correct_predictions / total_predictions
    print(correct_predictions)
    return accuracy

# Calculate accuracy
qa_val_accuracy = calculate_accuracy(qa_val_predictions, qa_val_labels)
qa_test_accuracy = calculate_accuracy(qa_test_predictions, qa_test_labels)

print(f"Validation Loss: {qa_val_loss}, Validation Accuracy: {qa_val_accuracy}")
print(f"Test Loss: {qa_test_loss}, Test Accuracy: {qa_test_accuracy}")

558
623
Validation Loss: 0.20789015198650354, Validation Accuracy: 0.029391624967079272
Test Loss: 0.20737239391877538, Test Accuracy: 0.03220303938798718


In [25]:
def calculate_hits_at_1(predictions, true_labels):
    hits = 0
    total = len(predictions)  # Total number of predictions

    # Iterate over each prediction and corresponding true label
    for pred, true in zip(predictions, true_labels):
        # Increment the hits count if the top prediction matches the true label
        if pred == true:
            hits += 1

    # Calculate the Hits@1 score
    hits_at_1 = hits / total
    return hits_at_1

# Assuming val_predictions, val_labels, test_predictions, test_labels are defined
val_hits_at_1 = calculate_hits_at_1(qa_val_predictions, qa_val_labels)
test_hits_at_1 = calculate_hits_at_1(qa_test_predictions, qa_test_labels)

print(f"Validation Hits@1: {val_hits_at_1:.2f}")
print(f"Test Hits@1: {test_hits_at_1:.2f}")


Validation Hits@1: 0.03
Test Hits@1: 0.03


In [37]:
input_text = "[Mr. North] is a film written by this person?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

# Generate the output
with torch.no_grad():
    outputs = model.generate(input_ids, max_length=50, num_beams=5, early_stopping=True)

# Decode the output
output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(output_text)

Claude Chabrol


In [29]:
# Print some sample predictions and labels
# for i in range(len(val_predictions)):
#     if val
#     print(f"Prediction: {val_predictions[i]}, True Label: {val_labels[i]}")
for pred, true in zip(qa_val_predictions, qa_val_labels):
    if pred == true:
        print(f"Prediction: {pred}, True Label: {true}")

Prediction: John Wayne, True Label: John Wayne
Prediction: John Wayne, True Label: John Wayne
Prediction: John Wayne, True Label: John Wayne
Prediction: John Wayne, True Label: John Wayne
Prediction: John Wayne, True Label: John Wayne
Prediction: John Wayne, True Label: John Wayne
Prediction: John Wayne, True Label: John Wayne
Prediction: John Wayne, True Label: John Wayne
Prediction: John Wayne, True Label: John Wayne
Prediction: John Wayne, True Label: John Wayne
Prediction: John Wayne, True Label: John Wayne
Prediction: John Huston, True Label: John Huston
Prediction: Comedy, True Label: Comedy
Prediction: Drama, True Label: Drama
Prediction: Comedy, True Label: Comedy
Prediction: Comedy, True Label: Comedy
Prediction: Comedy, True Label: Comedy
Prediction: Comedy, True Label: Comedy
Prediction: Comedy, True Label: Comedy
Prediction: Comedy, True Label: Comedy
Prediction: Drama, True Label: Drama
Prediction: Comedy, True Label: Comedy
Prediction: Comedy, True Label: Comedy
Predictio