In [1]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-18.0.0-cp310-cp310-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.2.3-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.4-cp310-cp310-win_amd64.whl.metadata (8.0 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohappyeyeballs-2.4.3-py3-none-

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the dataset
data_path = '/content/drive/MyDrive/bio_summary_final.csv' 
df = pd.read_csv(data_path)

In [6]:
# Convert the DataFrame to a Hugging Face Dataset format
dataset = Dataset.from_pandas(df)

In [7]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



In [8]:
# Tokenization function with padding
def preprocess_data(examples):
    inputs = ["summarize: " + doc for doc in examples["longtext"]]
    model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True)  # Adding padding

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=150, padding="max_length", truncation=True)  # Adding padding

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [9]:
print(df.columns)

Index(['longtext', 'summary'], dtype='object')


In [None]:
# Apply preprocessing to the dataset
tokenized_dataset = dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/499 [00:00<?, ? examples/s]



In [11]:
# Load the model
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir="./flan_t5_finetuned",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,  
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    push_to_hub=False
)



In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset, 
)

In [14]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,2.628453
2,No log,2.55938
3,No log,2.540879


TrainOutput(global_step=375, training_loss=2.9246923828125, metrics={'train_runtime': 436.3881, 'train_samples_per_second': 3.43, 'train_steps_per_second': 0.859, 'total_flos': 1025081756614656.0, 'train_loss': 2.9246923828125, 'epoch': 3.0})

In [15]:
model.save_pretrained('/content/drive/MyDrive/flan_t5_finetuned_model')
tokenizer.save_pretrained('/content/drive/MyDrive/flan_t5_finetuned_model')

('/content/drive/MyDrive/flan_t5_finetuned_model/tokenizer_config.json',
 '/content/drive/MyDrive/flan_t5_finetuned_model/special_tokens_map.json',
 '/content/drive/MyDrive/flan_t5_finetuned_model/spiece.model',
 '/content/drive/MyDrive/flan_t5_finetuned_model/added_tokens.json',
 '/content/drive/MyDrive/flan_t5_finetuned_model/tokenizer.json')

In [None]:
# Define the path 
model_save_path = '/content/drive/MyDrive/flan_t5_finetuned_model_2nd'  # Update with your desired path

In [17]:
# Save the model
trainer.model.save_pretrained(model_save_path)

In [18]:
# Save the tokenizer
tokenizer.save_pretrained(model_save_path)

('/content/drive/MyDrive/flan_t5_finetuned_model_2nd/tokenizer_config.json',
 '/content/drive/MyDrive/flan_t5_finetuned_model_2nd/special_tokens_map.json',
 '/content/drive/MyDrive/flan_t5_finetuned_model_2nd/spiece.model',
 '/content/drive/MyDrive/flan_t5_finetuned_model_2nd/added_tokens.json',
 '/content/drive/MyDrive/flan_t5_finetuned_model_2nd/tokenizer.json')