In [None]:
!pip install transformers datasets
!pip install transformers[torch]


In [9]:
#import pymupdf
import pandas as pd
from datasets import Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, TrainerCallback
from google.colab import userdata, drive
from torch.utils.data import DataLoader
from torch import tensor

In [10]:
data = pd.read_csv('/content/drive/MyDrive/SLPA/description_to_handshape.csv', delimiter=',')
dataset = Dataset.from_pandas(data)

In [None]:
model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [12]:
def preprocess(examples):
    inputs = ['Summarize: '+ex for ex in examples['input_text']]
    targets = [ex for ex in examples['target_text']]
    model_inputs = tokenizer(inputs, truncation=True, padding=True, max_length=512)

    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(targets, truncation=True, padding=True, max_length=512)
    labels = target_encoding.input_ids

    model_inputs['labels'] = labels
    return model_inputs

In [None]:
# Split the dataset into training and validation sets
tokenized_dataset = dataset.map(preprocess, batched=True)
split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = split['train']
eval_dataset = split['test']

In [14]:
dataset.set_format(type="torch", columns=['input_text', 'target_text'])
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_dataloader = DataLoader(eval_dataset, batch_size=4)

In [None]:
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/SLPA/results',          # output directory
    num_train_epochs=5,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='/content/drive/MyDrive/SLPA/logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # evaluate each epoch
    #save_strategy="epoch",            # save each epoch, this quickly uses up space!
    learning_rate=1e-4
)


In [17]:
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=eval_dataset,           # evaluation dataset
)

In [None]:
# Fine-tune the model
# This step takes up to 4 hours, don't let your computer fall asleep!
trainer.train()

In [None]:
model_save_path = '/content/drive/MyDrive/SLPA/results/trained_model'
tokenizer_save_path = '/content/drive/MyDrive/SLPA/results/trained_model'

model.save_pretrained(model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

print(f"Model and tokenizer saved to {model_save_path}")

In [None]:
def load_model(name):
  model_load_path = f'/content/drive/MyDrive/SLPA/results/{name}'
  tokenizer = T5Tokenizer.from_pretrained(model_load_path)
  model = T5ForConditionalGeneration.from_pretrained(model_load_path)
  return model, tokenizer


In [None]:
def inference(model, tokenizer, input_text):
  #input_text = "Show me all users who joined after January 1st, 2020"
  input_text = 'summarize: '+input_text
  input_ids = tokenizer(input_text, return_tensors='pt').input_ids

  # Generate the output
  output_ids = model.generate(input_ids, num_beams=4, early_stopping=True)
  output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

  return output_text

In [None]:
tests = ["horizontal right extended 'A' hand palm down is placed on the facing palm of the left extended 'B' the right thumb is drawn downward across the left palm",
         "right extended '8' hand palm facing up fingers pointing forward is moved from left to right in a series of successive short arcs",
         "vertical left extended 'B' hand is held in a fixed position with palm facing right. Vertical right one hand palm forward is placed against the left palm and is pushed upward",
         "right flat '0' hand is held with fingertips touching or close to the centre of the forehead while left flat '0' hand is held farther forward and slightly lower"]

In [None]:
for t in tests:
  output = inference(model, tokenizer, f"summarize: {t}")
  print(output)

right hand extended 'A' left hand.
