In [1]:
!pip install transformers
!pip install datasets
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m84.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m92.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, https://us

In [12]:
import os
import pandas as pd
from datasets import load_dataset

from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")
model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b")


In [22]:
def preprocess_data(data):
    data["source"] = data["question"].apply(
        lambda x: x.replace("<extra_id_0>", " ").replace("<extra_id_1>", " ")
    )
    return data

def tokenize(batch):
    tokenized_input = tokenizer(
        batch["source"], padding="max_length", truncation=True, max_length=256
    )
    tokenized_label = tokenizer(
        batch["answer"], padding="max_length", truncation=True, max_length=256
    )
    return {
        "input_ids": tokenized_input["input_ids"],
        "attention_mask": tokenized_input["attention_mask"],
        "labels": tokenized_label["input_ids"],
    }

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

train_data_csv = "genV2-original-plutchik-v1-train.csv"
test_data_csv = "genV2-original-plutchik-v1-test.csv"

# Read the CSV files
train_data = pd.read_csv(train_data_csv)
test_data = pd.read_csv(test_data_csv)

# Combine the datasets
total_data = pd.concat([train_data, test_data]).reset_index(drop=True)

# Preprocess the data
total_data = preprocess_data(total_data)

# Split the data into a 95:5 ratio for training and testing
train_data, test_data = train_test_split(total_data, test_size=0.05, random_state=42)

# Save the new train and test data as CSV files if needed
train_data.to_csv("combined_train_data.csv", index=False)
test_data.to_csv("combined_test_data.csv", index=False)

In [9]:
train_dataset = load_dataset('csv', data_files='combined_train_data.csv')
test_dataset = load_dataset('csv', data_files='combined_test_data.csv')




  0%|          | 0/1 [00:00<?, ?it/s]

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-fc5f917968857b7a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-fc5f917968857b7a/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [23]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=40,
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="epoch",
    learning_rate=1e-5,
)

In [24]:
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=256)
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=256)

train_dataset.set_format(
    "torch", columns=["input_ids", "attention_mask", "labels"]
)
test_dataset.set_format(
    "torch", columns=["input_ids", "attention_mask", "labels"]
)


Map:   0%|          | 0/27447 [00:00<?, ? examples/s]

Map:   0%|          | 0/1445 [00:00<?, ? examples/s]

In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset['train'],
    eval_dataset=test_dataset['train'],
)

trainer.train()


Epoch,Training Loss,Validation Loss
1,0.2909,0.27253
2,0.2664,0.273151
3,0.2629,0.269508
4,0.2594,0.268754
5,0.2563,0.268253


TrainOutput(global_step=17155, training_loss=0.26716739095289277, metrics={'train_runtime': 18688.2931, 'train_samples_per_second': 7.343, 'train_steps_per_second': 0.918, 'total_flos': 2.5476492457672704e+17, 'train_loss': 0.26716739095289277, 'epoch': 5.0})