In [None]:
!pip install transformers datasets
!pip install transformers[torch]


In [None]:
#import pymupdf
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from google.colab import userdata, drive



In [None]:
# Google Drive has to be mounted before any /content/drive path is readable.
# Calling mount again in a session where it is already mounted is harmless.
drive.mount('/content/drive')

# The file is tab-separated despite its .csv extension.
data = pd.read_csv('/content/drive/MyDrive/SLPA/input_output.csv', delimiter='\t')

# Fail here, with a readable message, rather than deep inside tokenization
# if the two columns the preprocessing step reads are not present.
missing_columns = {'description', 'json'} - set(data.columns)
if missing_columns:
    raise ValueError(f"input_output.csv is missing required column(s): {sorted(missing_columns)}")

# Convert the dataframe to a Hugging Face Dataset
dataset = Dataset.from_pandas(data)

# Initialize the tokenizer and model
model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def preprocess_function(examples):
    """Tokenize one batch of (description, json) pairs for seq2seq training.

    Args:
        examples: A batch mapping column names to lists; the 'description'
            and 'json' columns are read.

    Returns:
        The tokenizer output for the prefixed descriptions, with an added
        'labels' entry holding the tokenized targets. Padding positions in
        the labels are set to -100 so the loss ignores them.
    """
    inputs = ['JSONify: ' + description for description in examples['description']]
    targets = list(examples['json'])
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')

    # `text_target=` is the supported replacement for the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=512, truncation=True, padding='max_length')

    # Targets are padded to a fixed length; without masking, most of the loss
    # would come from predicting pad tokens. -100 is the index the
    # cross-entropy loss skips.
    pad_token_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token_id if token_id != pad_token_id else -100 for token_id in sequence]
        for sequence in labels['input_ids']
    ]
    return model_inputs

In [None]:
# Tokenize every example (batched, so the tokenizer sees lists of rows),
# then hold out 10% of the rows for evaluation.
tokenized_dataset = dataset.map(preprocess_function, batched=True)
# A fixed seed keeps train/eval membership identical across kernel restarts,
# so evaluation numbers from different runs are comparable.
split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split['train']
eval_dataset = split['test']

In [None]:
from torch.utils.data import DataLoader

# Only the tokenized columns can be collated into tensors; the raw text
# columns are excluded. `with_format` returns a formatted view and leaves
# `train_dataset` / `eval_dataset` themselves unchanged for later cells.
tensor_columns = ['input_ids', 'attention_mask', 'labels']
train_dataloader = DataLoader(
    train_dataset.with_format('torch', columns=tensor_columns),
    shuffle=True,
    batch_size=8,
)
#valid_dataloader = DataLoader(dataset['validation'], batch_size=4)
test_dataloader = DataLoader(
    eval_dataset.with_format('torch', columns=tensor_columns),
    batch_size=4,
)

In [None]:
from dataclasses import fields

# Set up training arguments.
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# Because the install above is unpinned, pick whichever keyword the installed
# version actually declares instead of hard-coding one of them.
_training_argument_names = {field.name for field in fields(TrainingArguments)}
_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_argument_names else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/SLPA/results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='/content/drive/MyDrive/SLPA/logs',            # directory for storing logs
    logging_steps=10,
    save_strategy="epoch",           # save each epoch
    **{_eval_strategy_key: "epoch"}, # evaluate each epoch
)


In [None]:
# Bundle the model, its hyperparameters and both dataset splits into the
# Trainer that drives fine-tuning and evaluation.
trainer = Trainer(model, training_args, train_dataset=train_dataset, eval_dataset=eval_dataset)

In [None]:
# Fine-tune the model. This is the long-running cell of the notebook: it runs
# the epochs configured in `training_args`, evaluating and checkpointing to
# `output_dir` at the end of each one.
trainer.train()

In [None]:
# The weights and the tokenizer files are written to the same directory, so
# that one path is enough to reload both later.
model_save_path = tokenizer_save_path = '/content/drive/MyDrive/SLPA/results/trained_model'

model.save_pretrained(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

print(f"Model and tokenizer saved to {model_save_path}")

In [None]:
import torch

eval_results = trainer.evaluate()

print(f"Evaluation results: {eval_results}")

# Generate predictions on the validation set.
# `trainer.predict` on this model returns raw logits rather than token ids,
# which cannot be passed to `batch_decode` (and are very large to hold in
# memory). Decode real generations instead, one batch at a time.
generation_batch_size = training_args.per_device_eval_batch_size
pad_token_id = tokenizer.pad_token_id

model.eval()
decoded_preds = []
decoded_labels = []
for start in range(0, len(eval_dataset), generation_batch_size):
    batch = eval_dataset[start:start + generation_batch_size]
    input_ids = torch.as_tensor(batch['input_ids']).to(model.device)
    attention_mask = torch.as_tensor(batch['attention_mask']).to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=512,      # same cap as the training targets
            num_beams=4,
            early_stopping=True,
        )

    # Convert predictions to text
    decoded_preds.extend(tokenizer.batch_decode(generated_ids.tolist(), skip_special_tokens=True))

    # Convert the labels to text for comparison. Positions masked with -100
    # are not valid token ids, so map them back to the pad token first.
    label_ids = torch.as_tensor(batch['labels'])
    label_ids = label_ids.masked_fill(label_ids == -100, pad_token_id)
    decoded_labels.extend(tokenizer.batch_decode(label_ids.tolist(), skip_special_tokens=True))

# Print some example predictions and their corresponding labels.
# The raw input text lives in the 'description' column; cap the count so a
# small validation split does not raise IndexError.
for i in range(min(5, len(eval_dataset))):
    print(f"Input: {eval_dataset[i]['description']}")
    print(f"Prediction: {decoded_preds[i]}")
    print(f"Actual: {decoded_labels[i]}")
    print("------")

In [None]:
def load_model(model_load_path='/content/drive/MyDrive/SLPA/results/trained_model'):
  """Load a fine-tuned T5 model and its tokenizer from one directory.

  Args:
    model_load_path: Directory holding both the saved model and the saved
      tokenizer. Defaults to the location the training cells save to.

  Returns:
    A ``(model, tokenizer)`` tuple.
  """
  tokenizer = T5Tokenizer.from_pretrained(model_load_path)
  model = T5ForConditionalGeneration.from_pretrained(model_load_path)
  return model, tokenizer


In [None]:
def inference(model, tokenizer, input_text, max_length=512, num_beams=4):
  """Generate the model's output text for a single input string.

  Args:
    model: Seq2seq model exposing ``generate`` and ``device``.
    tokenizer: Tokenizer matching ``model``.
    input_text: Full prompt, including any task prefix used during training.
    max_length: Upper bound on generated tokens. The library default is far
      too short for JSON outputs, so this defaults to the 512-token cap used
      for the training targets.
    num_beams: Beam width for beam search.

  Returns:
    The decoded output string with special tokens removed.
  """
  # Truncate the prompt to the same 512-token limit applied during training.
  encoded = tokenizer(input_text, return_tensors='pt', max_length=512, truncation=True)

  # Inputs must live on the same device as the model (e.g. a model that is
  # still on the GPU after training).
  input_ids = encoded.input_ids.to(model.device)
  attention_mask = encoded.attention_mask.to(model.device)

  # Generate the output
  output_ids = model.generate(
      input_ids,
      attention_mask=attention_mask,
      max_length=max_length,
      num_beams=num_beams,
      early_stopping=True,
  )
  return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
# Rebind `model` and `tokenizer` to copies reloaded from the saved directory,
# so the inference below exercises the persisted artifacts rather than the
# objects still in memory from training.
model, tokenizer = load_model()

In [None]:
# The model was fine-tuned on inputs prefixed with 'JSONify: ', so the same
# prefix must be used at inference time; 'summarize: ' is a prefix of the
# base t5-small checkpoint, not of this fine-tuned task.
output = inference(model, tokenizer, "JSONify: C shaped hands raised and moving forward")
print(output)