In [None]:
# Install dependencies (restart the kernel after installation so the freshly
# installed packages are the ones that get imported).
# NOTE(review): none of the packages are version-pinned, so a re-run may pull
# different releases — consider pinning for reproducibility.
%pip install transformers
%pip install matplotlib
# PyTorch is pulled from the PyTorch wheel index given below; the "cu124" path
# segment presumably selects CUDA 12.4 builds — confirm it matches the local driver.
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
%pip install accelerate
%pip install pandas
%pip install datasets
%pip install numpy
%pip install evaluate

In [None]:
# import dependencies
from matplotlib import pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM, GPTJForCausalLM, TrainingArguments, Trainer, \
    DataCollatorForLanguageModeling
import torch
import pandas as pd
import numpy as np

In [None]:
# Report whether PyTorch can see a CUDA-capable GPU
cuda_ready = torch.cuda.is_available()
print("GPU Available: ", cuda_ready)

In [None]:
# Load the EleutherAI GPT-J 6B tokenizer and model
MODEL_NAME = "EleutherAI/gpt-j-6B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# The tokenizer may ship without a padding token; fall back to the
# end-of-sequence token so that padded batches can be built later on.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Half precision halves the memory needed to hold the weights.
model = GPTJForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16)

# from_pretrained leaves the weights on the CPU. The generation cell further down
# places its inputs on the GPU, so the model has to live there as well; otherwise
# generate() fails with a device-mismatch error.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

In [None]:
# Put the model into evaluation mode before generating text with it.
# As the last expression of the cell, the returned module is also displayed.
model.eval()

In [None]:
# Write a local copy of the pre-trained model to ./model.
# safe_serialization=True asks for the safetensors format instead of pickle-based files.
model.save_pretrained("./model", safe_serialization=True)

In [None]:
prompt = (
    "What are you thoughts on today's weather?"
)

# Call the tokenizer directly (instead of tokenizer.encode) so that the attention
# mask is returned together with the input ids, and move the tensors to whichever
# device the model is on rather than assuming "cuda".
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    max_length=128,
    padding=True,
    truncation=True,
).to(model.device)
input_ids = inputs.input_ids

# Sample a continuation of up to 150 new tokens. The attention mask is forwarded
# through **inputs; pad and EOS ids are both set to the tokenizer's EOS token id.
gen_tokens = model.generate(
    **inputs,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    do_sample=True,
    temperature=0.9,
    max_new_tokens=150,
)

In [None]:
# Show what the pre-trained model produced for the prompt:
# decode the first (and only) generated sequence back into text.
first_sequence = gen_tokens[0]
gen_text = tokenizer.decode(first_sequence)
print(gen_text)


In [None]:
# Read the training split of the DialogSum dataset into a pandas DataFrame.
# Each split is stored as "<split name>.csv" inside the dataset repository.
splits = {split_name: f"{split_name}.csv" for split_name in ("train", "validation", "test")}
train_csv_path = f"hf://datasets/knkarthick/dialogsum/{splits['train']}"
df = pd.read_csv(train_csv_path)

In [None]:
# Preview the first rows of the training split to check what was loaded.
df.head()

In [None]:
# Load the DialogSum dataset for fine-tuning through the `datasets` library.
# The "train" and "test" splits of `ds` are the ones used further down.
# NOTE(review): this import sits outside the notebook's import cell — consider moving it there.
from datasets import load_dataset

ds = load_dataset("knkarthick/dialogsum")

In [None]:
# Tokenize dataset
def tokenize(raw_data, max_length=None):
    """Tokenize the ``dialogue`` column of a batch of dataset rows.

    No padding is applied here. Padding every example up front to the
    tokenizer's maximum length inflates each sequence to that full size,
    which wastes memory and compute; the data collator used for training
    pads each batch to its own longest sequence instead.

    Args:
        raw_data: Batch of rows as passed by ``Dataset.map(..., batched=True)``;
            only the ``"dialogue"`` entry is read.
        max_length: Optional cap on the number of tokens per example. ``None``
            (the default) leaves the limit to the tokenizer's own maximum.

    Returns:
        The tokenizer output for the batch, with over-long examples truncated.
    """
    return tokenizer(raw_data["dialogue"], truncation=True, max_length=max_length)

# Apply tokenize to every split of the dataset; batched=True hands it batches of rows at a time.
tokenized_dataset = ds.map(tokenize, batched=True)

In [None]:
# Keep only the first 20 examples of the train and test splits for a quick trial run
subset_indices = range(20)
small_train_dataset = tokenized_dataset["train"].select(subset_indices)
small_test_dataset = tokenized_dataset["test"].select(subset_indices)

In [None]:
# Training configuration handed to the Trainer
training_options = {
    # where checkpoints and logs are written
    "output_dir": "./fine_tuned_models/gpt-j-6B",
    "logging_dir": "./logs",
    # send logs to every reporting integration that is installed
    "report_to": "all",
    # one example per device per step; do not force training onto the CPU
    "per_device_train_batch_size": 1,
    "use_cpu": False,
}
training_config = TrainingArguments(**training_options)

In [None]:
import evaluate

# Load the "accuracy" metric from the evaluate library; compute_metrics below uses it
metric = evaluate.load("accuracy")

def compute_metrics(evaluate_prediction):
    """Compute accuracy for the predictions produced during evaluation.

    For a causal language model the logits are (batch, sequence, vocab) and the
    labels are (batch, sequence). The logit at position ``t`` scores the token at
    position ``t + 1``, so predictions and labels are shifted against each other
    before comparing. Positions labelled ``-100`` (the ignore index written by
    the language-modeling collator for padding) are left out, and everything is
    flattened to 1-D, which is the layout the accuracy metric expects.

    Plain classification output (2-D logits, 1-D labels) is handled as before,
    without any shifting.

    Args:
        evaluate_prediction: A ``(logits, labels)`` pair, e.g. an ``EvalPrediction``.

    Returns:
        The dictionary returned by ``metric.compute``; ``{"accuracy": nan}`` when
        no position is left to score.
    """
    logits, labels = evaluate_prediction
    # Some models hand back a tuple of outputs; the logits come first.
    if isinstance(logits, (tuple, list)):
        logits = logits[0]

    predictions = np.argmax(np.asarray(logits), axis=-1)
    labels = np.asarray(labels)

    if predictions.ndim > 1:
        # Next-token alignment: drop the last prediction and the first label.
        predictions = predictions[..., :-1]
        labels = labels[..., 1:]

    predictions = predictions.reshape(-1)
    labels = labels.reshape(-1)

    # Ignore positions that carry no target.
    scored = labels != -100
    if not scored.any():
        return {"accuracy": float("nan")}

    return metric.compute(predictions=predictions[scored], references=labels[scored])

In [None]:
# Create the data collator that assembles tokenized examples into training batches.
# mlm=False selects causal (next-token) language modeling rather than masked language modeling.
# NOTE(review): the earlier comment said this prevents out-of-memory errors; the collator
# only builds batches, so memory use is still governed by batch size and sequence length.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
# Fine-tune the model on the small training subset
trainer = Trainer(
    model=model,
    args=training_config,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=small_train_dataset,
    eval_dataset=small_test_dataset,
)

# Bracket the (long-running) training call with progress messages
print("Started training.")
trainer.train()
print("Finished training.")