# Merged code for the fine-tuning process

In [1]:
import torch
# Report whether PyTorch can see a CUDA device before anything else runs.
print("CUDA Available: ", torch.cuda.is_available())

CUDA Available:  True


#### Settings 

In [2]:
import copy
import logging
from dataclasses import dataclass, field
from typing import Dict, Optional, Sequence

import torch
import transformers
import utils
from torch.utils.data import Dataset
from transformers import Trainer

# import wandb
# wandb.login(relogin=True)



# Label value written over prompt tokens (in `preprocess`) and over label padding
# (in the data collator). Presumably chosen so those positions are skipped by the
# loss: -100 is the default `ignore_index` of torch's cross-entropy loss.
IGNORE_INDEX = -100
# Fallback special tokens; each one is only registered when the loaded tokenizer
# does not already define the corresponding token.
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"
# Prompt templates filled with `str.format_map`:
#   "prompt_input"    expects the keys `instruction` and `input`
#   "prompt_no_input" expects only `instruction`
# Both end with "### Response:" so the target text can be appended directly.
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
}

#### Set parameters

In [3]:
@dataclass
class ModelArguments:
    """Settings that identify the checkpoint to fine-tune."""

    # Hub id (or local path) handed to `from_pretrained` for both model and tokenizer.
    model_name_or_path: str = field(default="Toflamus/GPT-2_para3M")
    # Further model-related options can be declared here.

@dataclass
class DataArguments:
    """Settings that describe where the training data lives."""

    # Location of the instruction-tuning JSON file.
    data_path: str = field(default="/workspace/lost+found/Final_Project/alpaca_data.json")
    # Further data-related options can be declared here.

@dataclass
class TrainingArguments:
    """Notebook-local training settings.

    Only `cache_dir` and `model_max_length` are read by the model / tokenizer
    loading cells in this notebook; the remaining fields record the intended
    hyperparameters.

    Every attribute carries a type annotation so that it is a real dataclass
    field: it can be overridden through the constructor and shows up in
    `dataclasses.fields()`, `repr()` and equality comparisons.
    """

    model_max_length: int = 512
    cache_dir: str = "/workspace/lost+found/Final_Project/cache"  # Set your desired cache directory
    bf16: bool = True
    output_dir: str = '/workspace/lost+found/Final_Project/Output'
    num_train_epochs: int = 3
    gradient_accumulation_steps: int = 8
    # evaluation_strategy: str = "no"
    save_strategy: str = "steps"
    save_steps: int = 2000
    save_total_limit: int = 1
    learning_rate: float = 2e-5
    weight_decay: float = 0.0
    warmup_ratio: float = 0.03
    lr_scheduler_type: str = "cosine"
    logging_steps: int = 1
    # fsdp: str = "full_shard auto_wrap"
    # fsdp_transformer_layer_cls_to_wrap: str = 'LlamaDecoderLayer'
    tf32: bool = True
    # The three fields below were previously un-annotated class attributes,
    # which `@dataclass` silently ignores.
    full_determinism: bool = False
    seed: int = 42
    accelerate_version: int = 1

# Create instances of the dataclasses with your desired parameter values
# (all defaults here; pass keyword arguments to override individual settings).
model_args = ModelArguments()
data_args = DataArguments()
training_args = TrainingArguments()



#### Load the model

In [4]:
# Load the causal-LM checkpoint named in `model_args`; downloaded files are
# cached under `training_args.cache_dir`.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
)

#### Define the tokenizer

In [5]:
# Load the tokenizer that belongs to the same checkpoint. Sequences are capped at
# `training_args.model_max_length`, padding is added on the right, and the slow
# (non-"fast") tokenizer implementation is requested.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
    model_max_length=training_args.model_max_length,
    padding_side="right",
    use_fast=False,
)

#### Define special tokens

In [6]:
# Collect a default for every special token the loaded tokenizer does not define.
# The result is later passed to `smart_tokenizer_and_embedding_resize`.
special_tokens_dict = {
    attr_name: fallback
    for attr_name, fallback in (
        ("pad_token", DEFAULT_PAD_TOKEN),
        ("eos_token", DEFAULT_EOS_TOKEN),
        ("bos_token", DEFAULT_BOS_TOKEN),
        ("unk_token", DEFAULT_UNK_TOKEN),
    )
    if getattr(tokenizer, attr_name) is None
}

## Load preprocessing dataset

#### Some functions to load the dataset

In [7]:
import dataclasses
# import logging
import math
import os
import io
import sys
import time
import json
from typing import Optional, Sequence, Union

def _make_w_io_base(f, mode: str):
    if not isinstance(f, io.IOBase):
        f_dirname = os.path.dirname(f)
        if f_dirname != "":
            os.makedirs(f_dirname, exist_ok=True)
        f = open(f, mode=mode)
    return f

def _make_r_io_base(f, mode: str):
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    return f


def jdump(obj, f, mode="w", indent=4, default=str):
    """Dump a str, list or dictionary to a file in json format.

    Args:
        obj: An object to be written (`dict`, `list` or `str`).
        f: A string path to the location on disk, or an already-open file object.
        mode: Mode for opening the file.
        indent: Indent for storing json dictionaries.
        default: A function to handle non-serializable entries; defaults to `str`.

    Raises:
        ValueError: If `obj` is not a `dict`, `list` or `str`. The check runs
            before the target is opened, so an invalid call no longer creates
            or truncates the file.
    """
    if not isinstance(obj, (dict, list, str)):
        raise ValueError(f"Unexpected type: {type(obj)}")
    f = _make_w_io_base(f, mode)
    try:
        if isinstance(obj, str):
            f.write(obj)
        else:
            json.dump(obj, f, indent=indent, default=default)
    finally:
        # Close the handle even when serialization fails part-way through.
        f.close()


def jload(f, mode="r"):
    """Load a .json file and return the parsed content.

    Args:
        f: A string path, or an already-open file object.
        mode: Mode used when `f` is a path that has to be opened.

    Returns:
        Whatever `json.load` produces for the file.
    """
    f = _make_r_io_base(f, mode)
    try:
        return json.load(f)
    finally:
        # Close the handle even when the content is not valid JSON.
        f.close()

#### Load the raw data

In [8]:
from datasets import load_dataset
# Locations of the full Alpaca file and of its train / test split.
data_path = "/workspace/lost+found/Final_Project/alpaca_data.json"
train_path = "/workspace/lost+found/Final_Project/alpaca_train.json"
test_path = "/workspace/lost+found/Final_Project/alpaca_test.json"

logging.warning("Loading data...")
# Each variable now reads the split its name describes. Previously `eval_data`
# was loaded from `train_path` and `train_data` from `test_path`, the opposite
# of how `train_dataset` / `eval_dataset` are built further down.
train_data = load_dataset("json", data_files = train_path)
eval_data = load_dataset("json", data_files = test_path)
# Custom function to format the dataset
def custom_format(entry):
    """Reduce a raw record to the three fields the prompt templates use.

    Args:
        entry: Mapping with `instruction` and `output` keys and an optional
            `input` key.

    Returns:
        Dict with exactly the keys `instruction`, `input` and `output`. A
        missing `input`, or one that is present but `None` (e.g. a JSON null),
        becomes the empty string so that the record is treated as having no
        input instead of being formatted as the text "None".
    """
    input_text = entry.get('input')
    return {
        'instruction': entry['instruction'],
        'input': '' if input_text is None else input_text,
        'output': entry['output']
    }

# Convert the dataset using the custom format function

# list_data_dict = [custom_format(train_data["train"][i]) for i in range(train_data["train"].num_rows)]
# list_eval_dict = [custom_format(eval_data["train"][i]) for i in range(eval_data["train"].num_rows)]
# # list_data_dict = jload(data_path)
# # list_data_dict = train_data
# list_data_dict



#### Formatting inputs

In [9]:
# logging.warning("Formatting inputs...")
# prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
# # print(prompt_input)
# # print(prompt_no_input)
# sources = [
#     prompt_input.format_map(example) if example.get("input", "") != "" else prompt_no_input.format_map(example)
#     for example in list_data_dict
# ]# here we use the template to generate the sources
# targets = [f"{example['output']}{tokenizer.eos_token}" for example in list_data_dict]
# print(sources[:2])
# print(targets[:2])

#### Tokenizing inputs

In [10]:
def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize each string of `strings` on its own.

    Every string goes through a separate tokenizer call (truncated to
    `tokenizer.model_max_length`), so no cross-example padding is introduced here.

    Returns:
        Dict with
        - `input_ids` / `labels`: the same list object, holding one 1-D id tensor
          per input string;
        - `input_ids_lens` / `labels_lens`: the same list object, holding the
          number of non-pad tokens of each string.
    """
    encodings = []
    for text in strings:
        encodings.append(
            tokenizer(
                text,
                return_tensors="pt",
                padding="longest",
                max_length=tokenizer.model_max_length,
                truncation=True,
            )
        )

    # Each call returns a batch of size one; keep only its single row.
    token_rows = [encoding.input_ids[0] for encoding in encodings]
    # Count the real tokens by excluding positions equal to the pad id.
    token_counts = [
        encoding.input_ids.ne(tokenizer.pad_token_id).sum().item() for encoding in encodings
    ]
    return {
        "input_ids": token_rows,
        "labels": token_rows,
        "input_ids_lens": token_counts,
        "labels_lens": token_counts,
    }

def preprocess(
    sources: Sequence[str],
    targets: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    """Tokenize prompt/target pairs and build labels that cover only the target.

    Args:
        sources: Prompt strings.
        targets: Target strings, aligned with `sources`.
        tokenizer: Tokenizer forwarded to `_tokenize_fn`.

    Returns:
        Dict with `input_ids` (tokenized prompt + target) and `labels` (a deep
        copy of `input_ids` whose leading prompt positions are set to
        `IGNORE_INDEX`).
    """
    # Each training example is the prompt immediately followed by its target.
    full_texts = [prompt + answer for prompt, answer in zip(sources, targets)]
    examples_tokenized = _tokenize_fn(full_texts, tokenizer)
    sources_tokenized = _tokenize_fn(sources, tokenizer)

    input_ids = examples_tokenized["input_ids"]
    labels = copy.deepcopy(input_ids)
    # Mask step: the prompt (question) part of every label row is overwritten.
    for label_row, prompt_len in zip(labels, sources_tokenized["input_ids_lens"]):
        label_row[:prompt_len] = IGNORE_INDEX
    return {"input_ids": input_ids, "labels": labels}
# A mask is applied above: the question part of the labels is covered up.

# logging.warning("Tokenizing inputs... This may take some time...")
# data_dict = preprocess(sources, targets, tokenizer)
# data_dict

In [11]:
# print(data_dict['input_ids'][:2])
# print(data_dict['labels'][:2])
# print(tokenizer.decode(data_dict['input_ids'][0]))#, skip_special_tokens=True))

In [12]:
# data_dict.keys()

#### Resize tokenizer and embedding

In [13]:
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: transformers.PreTrainedTokenizer,
    model: transformers.PreTrainedModel,
):
    """Register special tokens and grow the model's embeddings to match.

    The rows added for the new tokens are initialised with the mean of the
    rows that existed before, for both the input and the output embeddings.

    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
    """
    added_count = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if added_count <= 0:
        return

    for embedding_layer in (model.get_input_embeddings(), model.get_output_embeddings()):
        weights = embedding_layer.weight.data
        # Average only over the pre-existing rows, then copy it into the new rows.
        weights[-added_count:] = weights[:-added_count].mean(dim=0, keepdim=True)

# Register the missing special tokens collected in `special_tokens_dict` and
# resize the model's embeddings so every tokenizer id has a row.
smart_tokenizer_and_embedding_resize(
        special_tokens_dict=special_tokens_dict,
        tokenizer=tokenizer,
        model=model,
    )

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 50257. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


#### Define the data collator

What is a data collator? See [this guide](https://saturncloud.io/blog/creating-a-custom-data-collator-for-huggingface-trainer-a-comprehensive-guide/#what-is-a-data-collator).
A `data_collator` is a function that takes a batch of data and collates it into a format suitable for model training. The default `data_collator` in the Hugging Face Trainer handles most common scenarios, but there are cases where you might need a custom one.

In [14]:
from datasets import load_dataset
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Pad a list of tokenized examples into one rectangular batch.

    `input_ids` are padded with the tokenizer's pad id, `labels` with
    `IGNORE_INDEX`, and the attention mask marks every position whose input id
    differs from the pad id.
    """

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        pad_id = self.tokenizer.pad_token_id
        pad = torch.nn.utils.rnn.pad_sequence

        batch_inputs = pad(
            [instance["input_ids"] for instance in instances],
            batch_first=True,
            padding_value=pad_id,
        )
        batch_labels = pad(
            [instance["labels"] for instance in instances],
            batch_first=True,
            padding_value=IGNORE_INDEX,
        )
        return {
            "input_ids": batch_inputs,
            "labels": batch_labels,
            "attention_mask": batch_inputs.ne(pad_id),
        }
def custom_format(entry):
    """Reduce a raw record to the three fields the prompt templates use.

    Args:
        entry: Mapping with `instruction` and `output` keys and an optional
            `input` key.

    Returns:
        Dict with exactly the keys `instruction`, `input` and `output`. A
        missing `input`, or one that is present but `None` (e.g. a JSON null),
        becomes the empty string so that the record is treated as having no
        input instead of being formatted as the text "None".
    """
    input_text = entry.get('input')
    return {
        'instruction': entry['instruction'],
        'input': '' if input_text is None else input_text,
        'output': entry['output']
    }

# Convert the dataset using the custom format function


class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning.

    Reads a JSON file, renders every record through the matching prompt
    template, tokenizes prompt + target, and keeps the resulting `input_ids`
    and `labels` lists in memory.
    """

    def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer):
        super().__init__()

        logging.warning("Loading data...")
        records = load_dataset("json", data_files = data_path)["train"]
        list_data_dict = [custom_format(record) for record in records]

        logging.warning("Formatting inputs...")
        prompt_input = PROMPT_DICT["prompt_input"]
        prompt_no_input = PROMPT_DICT["prompt_no_input"]
        sources = []
        targets = []
        for example in list_data_dict:
            # Records with a non-empty `input` use the template that has an Input section.
            template = prompt_input if example.get("input", "") != "" else prompt_no_input
            sources.append(template.format_map(example))
            # The target is the expected output followed by the end-of-sequence token.
            targets.append(f"{example['output']}{tokenizer.eos_token}")

        logging.warning("Tokenizing inputs... This may take some time...")
        tokenized = preprocess(sources, targets, tokenizer)

        self.input_ids = tokenized["input_ids"]
        self.labels = tokenized["labels"]

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        return {"input_ids": self.input_ids[i], "labels": self.labels[i]}
# Build the collator and the two datasets: training examples come from
# `train_path`, evaluation examples from `test_path`.
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
train_dataset = SupervisedDataset(tokenizer=tokenizer, data_path=train_path)
eval_dataset = SupervisedDataset(tokenizer=tokenizer,data_path=test_path)
# train_datset = data_dict
# preprocess evaldata



In [15]:
# Bundle the data-related Trainer arguments so they can be passed with `**data_module`.
data_module = dict(train_dataset = train_dataset, eval_dataset = eval_dataset, data_collator = data_collator)

#### See what the dataset looks like

#### Convert the dataset to huggingface dataset

In [16]:
# NOTE(review): this rebinds the name `Dataset`, which until here referred to
# `torch.utils.data.Dataset` (the base class of `SupervisedDataset`). Re-running
# the `SupervisedDataset` cell after this one would subclass the `datasets`
# class instead. The import is not used by the visible cells that follow.
from datasets import Dataset
# dataset = Dataset.from_dict(train_dataset)

### Define trainer

#### Log in to wandb

In [17]:
import wandb
# `relogin=True` asks for the API key again on every run (see the prompt in the
# output below), so this cell needs manual input during "Run All".
wandb.login(relogin=True)

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

#### Tokenizing inputs

#### Login to hub

In [18]:
!git config --global user.email "toflamus12138@gmail.com"
!git config --global user.name "Toflamus"

In [19]:
import os

from huggingface_hub import login

# Read the access token from the HF_TOKEN environment variable instead of
# embedding it in the notebook, where it would be saved and shared with the file.
# When the variable is unset, `token=None` is passed and `login` is expected to
# ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from transformers import Trainer, TrainingArguments

# Hyperparameters for the Hugging Face Trainer run, grouped by purpose.
training_args1 = TrainingArguments(
    # --- output and checkpoints ---
    output_dir="/workspace/lost+found/Final_Project/Output",
    save_steps=1000,
    # push_to_hub=True,
    # --- batching ---
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=8,
    # --- optimisation schedule ---
    num_train_epochs=5,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    weight_decay=0.0,
    # --- evaluation and logging ---
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    report_to="wandb",
    # --- numerics and reproducibility ---
    fp16=True,
    tf32=True,
    full_determinism=False,
    seed=42,
)

# `data_module` supplies train_dataset, eval_dataset and data_collator.
trainer = Trainer(
    args=training_args1,
    model=model,
    tokenizer=tokenizer,
    **data_module,
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mtoflamus12138[0m ([33mtoflamusteam[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
100,7.6797,7.035451
200,6.9842,6.675406


In [None]:

# `Trainer.push_to_hub` takes the commit message as its first positional
# argument, so passing the repository name there only changed the commit text;
# the destination repo came from the training arguments instead. Push the
# trained model and its tokenizer to the intended repository explicitly.
HUB_REPO_ID = "Toflamus/Finetuned_with_eval"
trainer.model.push_to_hub(HUB_REPO_ID)
tokenizer.push_to_hub(HUB_REPO_ID)


In [None]:
# !zip -r /workspace/lost+found/Final_Project/Finetune2.zip /workspace/lost+found/Final_Project/Output

In [38]:
# Load WikiText-2 (raw variant); its `test` split is the text scored by the
# evaluation cells below.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
dataset

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [40]:
dataset['test'][:6]

{'text': ['',
  ' = Robert Boulter = \n',
  '',
  ' Robert Boulter is an English film , television and theatre actor . He had a guest @-@ starring role on the television series The Bill in 2000 . This was followed by a starring role in the play Herons written by Simon Stephens , which was performed in 2001 at the Royal Court Theatre . He had a guest role in the television series Judge John Deed in 2002 . In 2004 Boulter landed a role as " Craig " in the episode " Teddy \'s Story " of the television series The Long Firm ; he starred alongside actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier Chocolate Factory in London . He was directed by John Tiffany and starred alongside Ben Whishaw , Shane Zaza , Harry Kent , Fraser Ayres , Sophie Stanton and Dominic Hall . \n',
  ' In 2006 , Boulter starred alongside Whishaw in the play Citizenship written by Ma

In [None]:
import math

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, load_metric

# Load your fine-tuned model and tokenizer
model_name = "Toflamus/Finetuned3"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.eval()  # inference mode (disables dropout)

# Initialize the metric for perplexity
# NOTE(review): perplexity is computed directly from the language-model loss
# below, so this handle is not used in this cell; it is kept only so the name
# stays defined.
perplexity_metric = load_metric("perplexity")


# Tokenize and evaluate the dataset for perplexity
def compute_metrics(p):
    """Score one batch of rows with the fine-tuned model.

    Args:
        p: Batch passed by `Dataset.map(batched=True)`; `p["text"]` is a list
            of strings.

    Returns:
        Dict of two per-row columns: `nll_sum`, the summed next-token negative
        log-likelihood of the row, and `n_tokens`, the number of predicted
        tokens. Rows that are blank or shorter than two tokens get 0.0 / 0 so
        they do not influence the corpus-level result.
    """
    # Never feed more tokens than the model has positions for.
    max_len = getattr(model.config, "max_position_embeddings", None) or tokenizer.model_max_length
    nll_sums, token_counts = [], []
    for text in p["text"]:
        n_predicted = 0
        if text.strip():
            input_ids = tokenizer(
                text, return_tensors="pt", truncation=True, max_length=max_len
            ).input_ids.to(model.device)
            # With labels == inputs the loss is the mean over the (length - 1)
            # next-token predictions.
            n_predicted = input_ids.shape[1] - 1
        if n_predicted < 1:
            nll_sums.append(0.0)
            token_counts.append(0)
            continue
        with torch.no_grad():
            loss = model(input_ids, labels=input_ids).loss
        nll_sums.append(loss.item() * n_predicted)
        token_counts.append(n_predicted)
    return {"nll_sum": nll_sums, "n_tokens": token_counts}


results = dataset['test'].map(compute_metrics, batched=True)
# Corpus perplexity = exp(total negative log-likelihood / total predicted tokens).
total_tokens = sum(results["n_tokens"])
perplexity = math.exp(sum(results["nll_sum"]) / total_tokens) if total_tokens else float("nan")

In [None]:

import torch

# The metric has to exist before `compute_bleu` results are scored; it was
# previously only created in a later cell.
bleu_metric = load_metric("bleu")

# Upper bound on generated tokens per row, to keep the evaluation fast.
BLEU_MAX_NEW_TOKENS = 64


def compute_bleu(p):
    """Generate a continuation for every row and pair it with the held-out text.

    NOTE(review): the original cell read a non-existent `p.input_ids` and
    compared the decoded generation with the full input text. This version
    prompts the model with the first half of each row's tokens and compares
    the greedy continuation with the (capped) second half — confirm this is
    the intended evaluation.

    Args:
        p: Batch passed by `Dataset.map(batched=True)`; `p["text"]` is a list
            of strings.

    Returns:
        Dict of two per-row string columns, `prediction` and `reference`. Rows
        too short to split get two empty strings.
    """
    max_len = getattr(model.config, "max_position_embeddings", None) or tokenizer.model_max_length
    predictions, references = [], []
    for text in p["text"]:
        token_ids = tokenizer(text, truncation=True, max_length=max_len).input_ids if text.strip() else []
        split = len(token_ids) // 2
        if split < 1:
            predictions.append("")
            references.append("")
            continue
        n_new = min(len(token_ids) - split, BLEU_MAX_NEW_TOKENS)
        prompt_ids = torch.tensor([token_ids[:split]], device=model.device)
        with torch.no_grad():
            generated = model.generate(
                prompt_ids,
                max_new_tokens=n_new,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
            )
        # `generate` returns prompt + continuation; keep only the new tokens.
        predictions.append(tokenizer.decode(generated[0, split:], skip_special_tokens=True))
        references.append(tokenizer.decode(token_ids[split:split + n_new], skip_special_tokens=True))
    return {"prediction": predictions, "reference": references}


results = dataset['test'].map(compute_bleu, batched=True)
# The `bleu` metric expects whitespace-tokenised predictions and, per prediction,
# a list of tokenised references. Rows without a reference are skipped.
scored_pairs = [
    (prediction.split(), [reference.split()])
    for prediction, reference in zip(results["prediction"], results["reference"])
    if reference.strip()
]
if any(prediction for prediction, _ in scored_pairs):
    bleu_score = bleu_metric.compute(
        predictions=[prediction for prediction, _ in scored_pairs],
        references=[reference for _, reference in scored_pairs],
    )["bleu"]
else:
    # Nothing was generated at all; report zero overlap instead of failing.
    bleu_score = 0.0

In [None]:
import math

import torch

# `model_gpt2` is only the checkpoint name; the loaded baseline model is `mode_2`.
# (The original cell called the string `model_gpt2` as if it were the model and
# generated text with the fine-tuned `model` instead of the baseline.)
model_gpt2 = 'gpt2'
mode_2 = AutoModelForCausalLM.from_pretrained(model_gpt2)
tokenizer_gpt2 = AutoTokenizer.from_pretrained(model_gpt2)
mode_2.eval()  # inference mode (disables dropout)

# Upper bound on generated tokens per row, to keep the evaluation fast.
GPT2_BLEU_MAX_NEW_TOKENS = 64
# Never feed more tokens than the baseline has positions for.
gpt2_max_len = getattr(mode_2.config, "max_position_embeddings", None) or tokenizer_gpt2.model_max_length


# Tokenize and evaluate the dataset for perplexity
def compute_metrics(p):
    """Score one batch of rows with the GPT-2 baseline.

    Returns per-row columns `nll_sum` (summed next-token negative
    log-likelihood) and `n_tokens` (number of predicted tokens); rows that are
    blank or shorter than two tokens get 0.0 / 0.
    """
    nll_sums, token_counts = [], []
    for text in p["text"]:
        n_predicted = 0
        if text.strip():
            input_ids = tokenizer_gpt2(
                text, return_tensors="pt", truncation=True, max_length=gpt2_max_len
            ).input_ids.to(mode_2.device)
            n_predicted = input_ids.shape[1] - 1
        if n_predicted < 1:
            nll_sums.append(0.0)
            token_counts.append(0)
            continue
        with torch.no_grad():
            loss = mode_2(input_ids, labels=input_ids).loss
        # The loss is a mean over the predicted tokens; convert it back to a sum.
        nll_sums.append(loss.item() * n_predicted)
        token_counts.append(n_predicted)
    return {"nll_sum": nll_sums, "n_tokens": token_counts}


results_gpt2 = dataset['test'].map(compute_metrics, batched=True)
gpt2_total_tokens = sum(results_gpt2["n_tokens"])
gpt2_perplexity = (
    math.exp(sum(results_gpt2["nll_sum"]) / gpt2_total_tokens) if gpt2_total_tokens else float("nan")
)

# Calculate BLEU score
bleu_metric = load_metric("bleu")


def compute_bleu(p):
    """Generate a continuation with the GPT-2 baseline for every row.

    NOTE(review): prompts with the first half of each row's tokens and keeps
    the (capped) second half as reference — confirm this is the intended
    evaluation. Returns per-row string columns `prediction` and `reference`;
    rows too short to split get two empty strings.
    """
    predictions, references = [], []
    for text in p["text"]:
        token_ids = (
            tokenizer_gpt2(text, truncation=True, max_length=gpt2_max_len).input_ids if text.strip() else []
        )
        split = len(token_ids) // 2
        if split < 1:
            predictions.append("")
            references.append("")
            continue
        n_new = min(len(token_ids) - split, GPT2_BLEU_MAX_NEW_TOKENS)
        prompt_ids = torch.tensor([token_ids[:split]], device=mode_2.device)
        with torch.no_grad():
            generated = mode_2.generate(
                prompt_ids,
                max_new_tokens=n_new,
                do_sample=False,
                pad_token_id=tokenizer_gpt2.eos_token_id,
            )
        # `generate` returns prompt + continuation; keep only the new tokens.
        predictions.append(tokenizer_gpt2.decode(generated[0, split:], skip_special_tokens=True))
        references.append(tokenizer_gpt2.decode(token_ids[split:split + n_new], skip_special_tokens=True))
    return {"prediction": predictions, "reference": references}


results_gpt2 = dataset['test'].map(compute_bleu, batched=True)
# The `bleu` metric expects whitespace-tokenised predictions and, per prediction,
# a list of tokenised references. Rows without a reference are skipped.
gpt2_scored_pairs = [
    (prediction.split(), [reference.split()])
    for prediction, reference in zip(results_gpt2["prediction"], results_gpt2["reference"])
    if reference.strip()
]
if any(prediction for prediction, _ in gpt2_scored_pairs):
    gpt2_bleu = bleu_metric.compute(
        predictions=[prediction for prediction, _ in gpt2_scored_pairs],
        references=[reference for _, reference in gpt2_scored_pairs],
    )["bleu"]
else:
    # Nothing was generated at all; report zero overlap instead of failing.
    gpt2_bleu = 0.0



In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Define the metrics and their values for your model and the "gpt2" model
metrics = ["Perplexity", "BLEU"]
your_model_values = [perplexity, bleu_score]
gpt2_values = [gpt2_perplexity, gpt2_bleu]

# Create a radar plot
fig, ax = plt.subplots(subplot_kw={'projection': 'polar'})
angles = np.linspace(0, 2 * np.pi, len(metrics), endpoint=False).tolist()
# Repeat the first point so each outline is closed.
angles += angles[:1]

your_model_values += your_model_values[:1]
gpt2_values += gpt2_values[:1]

# With only two metrics the filled polygon has no area, so draw the outline
# with markers as well; the labels feed the legend.
ax.plot(angles, your_model_values, 'b-o', linewidth=1, label="Fine-tuned model")
ax.fill(angles, your_model_values, 'b', alpha=0.1)
ax.plot(angles, gpt2_values, 'r-o', linewidth=1, label="gpt2")
ax.fill(angles, gpt2_values, 'r', alpha=0.1)

ax.set_xticks(angles[:-1])
ax.set_xticklabels(metrics)
ax.set_title("Fine-tuned model vs. gpt2 on WikiText-2 (test)")
ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1))

# Display the radar plot
plt.show()
