In [None]:
pip install transformers datasets torch



In [None]:
from google.colab import drive
# Attach Google Drive so the training spreadsheet under /content/drive is readable.
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Load the Excel file
file_path = '/content/drive/MyDrive/June-2024 Work/REBEL_Train.xlsx'
data = pd.read_excel(file_path)

# Fail early, with a readable message, if the sheet lacks a column the
# formatting step relies on (otherwise the error only surfaces as a KeyError
# raised from inside DataFrame.apply).
_missing_columns = [
    column for column in ('Head', 'Relation', 'Tail', 'Text')
    if column not in data.columns
]
if _missing_columns:
    raise KeyError(f"{file_path} is missing required column(s): {_missing_columns}")
# NOTE(review): empty cells are read as NaN and would be rendered as the
# literal text "nan" in the training strings -- confirm the sheet has none.

# Prepare the data
def prepare_data(row):
    """Render one triplet row as a single training string.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the keys
            'Head', 'Relation', 'Tail' and 'Text'.

    Returns:
        str: "Head: <Head>, Relation: <Relation>, Tail: <Tail> -> <Text>".
    """
    head, relation, tail, sentence = (
        row[key] for key in ('Head', 'Relation', 'Tail', 'Text')
    )
    return "Head: {}, Relation: {}, Tail: {} -> {}".format(head, relation, tail, sentence)

# Format every row with prepare_data, keep the result as a new column on the
# frame, and also expose it as a plain Python list for the tokenization step.
input_text_column = data.apply(prepare_data, axis=1)
data['input_text'] = input_text_column
texts = input_text_column.tolist()


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset
import torch

In [None]:
!pip install transformers datasets torch accelerate -U




In [None]:
# Load pre-trained GPT-2 tokenizer and model
model_name = 'gpt2'

# Tokenizer first: reuse the end-of-sequence token as the padding token so
# that batches can be padded.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Language-model weights for the same checkpoint.
model = GPT2LMHeadModel.from_pretrained(model_name)

# Tokenize the data
def tokenize_function(examples):
    """Tokenize a batch of examples for `Dataset.map(..., batched=True)`.

    Args:
        examples: Batch mapping whose 'text' entry holds the strings to encode.

    Returns:
        The tokenizer output for the batch, with every sequence truncated
        and padded to 512 tokens.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=512)
    return tokenizer(examples['text'], **encode_options)

# Convert the data to the Hugging Face dataset format
dataset = Dataset.from_dict({'text': texts})
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['text'])

# Define training arguments
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# The notebook installs whatever version is current, so use the keyword the
# installed TrainingArguments actually accepts.
_eval_strategy_kwarg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments).parameters
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    **{_eval_strategy_kwarg: "epoch"},  # evaluate at the end of every epoch
)

# Define data collator
def data_collator(features):
    """Collate tokenized examples into a causal-LM training batch.

    Pads the examples to a common length and builds `labels` from
    `input_ids`. Padding positions are set to -100 so they are excluded from
    the loss, except the first padding slot of each row: the pad token is the
    EOS token in this notebook, so that slot is kept as the end-of-text
    target and the model still learns where to stop.

    Args:
        features: List of tokenized examples (dicts containing 'input_ids'
            and 'attention_mask').

    Returns:
        The padded batch of PyTorch tensors with an added 'labels' entry.
    """
    # max_length is not passed: with padding=True the batch is padded to its
    # longest member, so a max_length value would have no effect.
    batch = tokenizer.pad(features, padding=True, return_tensors="pt")

    labels = batch['input_ids'].clone()
    is_padding = batch['attention_mask'] == 0
    # Running count of padding slots per row; it equals 1 exactly at the
    # first padding slot (assumes right-side padding -- TODO confirm the
    # tokenizer's padding_side has not been changed).
    first_padding = is_padding & (is_padding.cumsum(dim=1) == 1)
    labels[is_padding & ~first_padding] = -100

    batch['labels'] = labels
    return batch

# Hold out 10% of the examples so the reported validation loss is measured on
# data the model was not trained on. The fixed seed keeps the split
# reproducible across runs.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

# Save the model
model.save_pretrained('./fine-tuned-gpt2')
tokenizer.save_pretrained('./fine-tuned-gpt2')

Map:   0%|          | 0/1142 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.3509,0.218811
2,0.2353,0.190473
3,0.2119,0.173052
4,0.1931,0.159347
5,0.186,0.148428
6,0.1735,0.139907
7,0.1643,0.133214
8,0.153,0.128709
9,0.1468,0.125622
10,0.1476,0.124481




('./fine-tuned-gpt2/tokenizer_config.json',
 './fine-tuned-gpt2/special_tokens_map.json',
 './fine-tuned-gpt2/vocab.json',
 './fine-tuned-gpt2/merges.txt',
 './fine-tuned-gpt2/added_tokens.json')

In [None]:
# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained('./fine-tuned-gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('./fine-tuned-gpt2')

# Generate text from a triplet
# NOTE(review): the training strings are formatted as
# "Head: ..., Relation: ..., Tail: ... -> <text>"; this prompt stops before
# the " -> " separator -- confirm that is intended.
input_text = "Head:   The Playtones , Relation: workperiod(start), Tail:2008 "
encoded = tokenizer(input_text, return_tensors='pt')
inputs = encoded['input_ids']

# Pass the attention mask and an explicit pad token id so generate() does not
# have to guess them (it warns about unreliable results when they are unset).
outputs = model.generate(
    inputs,
    attention_mask=encoded['attention_mask'],
    pad_token_id=tokenizer.eos_token_id,
    max_length=100,
    num_return_sequences=1,
)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Head:   Philippine president , Relation: haslist, Tail: 14th President of the Philippines  -> He was elected on 14 December 2011, and was re-elected on 15 December 2014.


In [None]:
import shutil

from google.colab import files

# 'results' is the Trainer output directory, and files.download() transfers a
# single file, so pack the directory into results.zip and download that.
archive_path = shutil.make_archive('results', 'zip', root_dir='results')

# Download a file from Colab VM to your local machine
files.download(archive_path)