In [1]:
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    loralib==0.1.1 \
    peft==0.3.0 --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_from_disk, Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np



In [3]:
# using GPU if you have one
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# device = "cpu"

In [4]:
device

'cuda:0'

## 1. Load Dataset and Prepare Data for Weak Supervision

In [14]:
dataset = load_from_disk("../input/chat-weak/chat_weak")

In [15]:
dataset.save_to_disk("/kaggle/working/chat_weak")

Saving the dataset (0/1 shards):   0%|          | 0/15098 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/23421 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/18729 [00:00<?, ? examples/s]

In [16]:
dataset = load_from_disk("/kaggle/working/chat_weak")

## 2. Load Model

In [21]:
model_name='google/flan-t5-base'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map = 'auto')
tokenizer = AutoTokenizer.from_pretrained(model_name)

## 2. Instruction Fine-tuning

In [17]:
def tokenize_function(example):
    start_prompt = 'Evaluate the sentiment of the following sentence.\n\n'
    end_prompt = '\n\nSentiment: '
    prompt = [start_prompt + message + end_prompt for message in example["message"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["sentiment"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    
    return example

# The dataset actually contains 3 diff splits: train, validation, test.
# The tokenize_function code is handling all data across all splits in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['conversation_id', 'message', 'sentiment', '__index_level_0__', 'speaker_id', ])


Map:   0%|          | 0/15098 [00:00<?, ? examples/s]

Map:   0%|          | 0/23421 [00:00<?, ? examples/s]

Map:   0%|          | 0/18729 [00:00<?, ? examples/s]

In [18]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")
print(f"Validation: {tokenized_datasets['val'].shape}")
print(f"Test: {tokenized_datasets['test'].shape}")

print(tokenized_datasets)


Shapes of the datasets:
Training: (15098, 2)
Validation: (23421, 2)
Test: (18729, 2)
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 15098
    })
    val: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 23421
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 18729
    })
})


### 2.1 Fine-tune Model

In [19]:
output_dir = f'/kaggle/working/model/instruction-{str(int(time.time()))}'

training_args = TrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=1,
    max_steps=1,
#     per_device_train_batch_size=1,
#     per_device_eval_batch_size=1
)

trainer = Trainer(
    model=original_model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['val']
)

In [20]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 1160, in init
    wi.setup(kwargs)
  File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_init.py", line 306, in setup
    wandb_login._login(
  File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_login.py", line 298, in _login
    wlogin.prompt_api_key()
  File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_login.py", line 221, in prompt_api_key
    key, status = self._prompt_api_key()
  File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/wandb_login.py", line 201, in _prompt_api_key
    key = apikey.prompt_api_key(
  File "/opt/conda/lib/

Error: An unexpected error occurred