In [1]:
import torch
import copy
import datasets

from peft import (
    LoftQConfig,
    LoraConfig,
    TaskType,
    get_peft_model,
    PeftModel
)

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoConfig,
    BitsAndBytesConfig,
    StoppingCriteria,
    StoppingCriteriaList,
    DataCollatorForSeq2Seq,
    TrainingArguments,
    Trainer
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dataclasses import dataclass


@dataclass
class DataClass:
    """Central hyper-parameter and path configuration for this notebook.

    Every value is a plain (un-annotated) class attribute, so ``@dataclass``
    generates no fields; the notebook reads them as ``DataClass.NAME`` without
    creating an instance.
    """

    MODEL_PATH = "Qwen/Qwen2-0.5B-Instruct"      # Qwen/Qwen2-0.5B
    # Token budget per example; used for tokenizer truncation/padding and by the collator.
    # NOTE(review): the system prompt in `prompt_formatter` is long on its own, so 64
    # tokens probably truncates most of the assistant answer -- measure and raise if so.
    MAX_LENGTH = 64
    EPOCH = 1
    LORA_RANK = 2
    LORA_ALPHA = 2 * LORA_RANK
    LORA_DROPOUT = 0.3
    # Qwen2 exposes separate q/k/v and gate/up projection layers; it has no fused
    # "qkv_proj" / "gate_up_proj" modules (and "qjv_proj" was a typo), so the
    # previous names could not match any layer of MODEL_PATH.
    LORA_MODULES = ["o_proj", "q_proj", "k_proj", "v_proj", "gate_proj", "up_proj"]
    LR = 5e-5
    MODEL_SAVE_FOLDER = 'weights'

In [3]:
# Checkpoint configuration, requested with the eager attention implementation.
model_config = AutoConfig.from_pretrained(
    DataClass.MODEL_PATH, attn_implementation='eager', trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(DataClass.MODEL_PATH, trust_remote_code=True)
# Padding reuses the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

# No quantization_config is passed: the weights are requested in bfloat16 on the 'mps' device.
# NOTE: MPS does not support torch.bfloat16 finetuning
model = AutoModelForCausalLM.from_pretrained(
    DataClass.MODEL_PATH,
    torch_dtype=torch.bfloat16,
    attn_implementation='eager',  # alternative: 'flash_attention_2'
    device_map='mps',
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

In [4]:
import os

# Macbook MPS: sets PyTorch's MPS-fallback environment flag for this process.
# NOTE(review): `torch` was already imported in the first cell; this flag may only be
# honoured when it is set before `import torch` -- confirm, and if so move it above the imports.
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'

def inference(input_text):
    """Greedy-decode up to 100 new tokens for ``input_text`` with the global ``model``.

    Args:
        input_text: Prompt string, tokenised with the global ``tokenizer``.

    Returns:
        The first generated sequence decoded back to a string. ``decode`` is
        called without ``skip_special_tokens``, so special tokens are kept; the
        outputs recorded in this notebook also contain the prompt text.
    """
    encoded = tokenizer(input_text, return_tensors="pt")
    # Follow the device the model actually lives on instead of hard-coding 'mps',
    # so the helper keeps working if the model is loaded elsewhere (e.g. CPU/CUDA).
    device = model.device
    outputs = model.generate(
        input_ids=encoded['input_ids'].to(device),
        attention_mask=encoded['attention_mask'].to(device),
        max_new_tokens=100,
        # Deterministic decoding: no sampling, single beam.
        do_sample=False,
        num_beams=1,
        # Sampling knobs are explicitly passed as None alongside do_sample=False.
        temperature=None,
        top_k=None,
        top_p=None,
    )
    return tokenizer.decode(outputs[0])

In [5]:
# Baseline: feed a raw prompt (no chat template) straight to the model.
input_text = "Write a poem in machine learning.\n"
print(inference(input_text))

Write a poem in machine learning.
Machine Learning is the art of using algorithms to learn from data and make predictions or decisions. It's a powerful tool that can be used for many different purposes, including image recognition, natural language processing, and predictive analytics.

In this poem, I'll use the concept of "machine learning" as a metaphor for how we can harness the power of AI to improve our lives. The poem will explore the various ways in which machine learning can be applied to solve problems, and how it can help us become


In [6]:
# Render the same request through the tokenizer's chat template, then generate from it.
prompt = "Write a poem in machine learning."
messages = [
    dict(role="system", content="You are a helpful assistant."),
    dict(role="user", content=prompt),
]

# tokenize=False yields the templated text rather than token ids;
# add_generation_prompt=True leaves the prompt ending on the assistant header.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Show the rendered template first, then the model's completion of it.
print("Chat Template:", text, '-----', "Output:", sep="\n")
print(inference(text))

Chat Template:
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Write a poem in machine learning.<|im_end|>
<|im_start|>assistant

-----
Output:
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Write a poem in machine learning.<|im_end|>
<|im_start|>assistant
Machine Learning is the art of building models that can learn from data and make predictions or decisions based on those learned patterns.
In this poem, I will use the example of a simple neural network to illustrate how Machine Learning works.

Neural Networks are like computers that can learn from data without being explicitly programmed. They are made up of layers of interconnected nodes called neurons, which communicate with each other through weighted connections.
The first layer of a neural network is called the input layer, where we feed our data


In [7]:
from datasets import load_dataset

# Load the training and validation JSON files (in that order) as two dataset objects.
train_dataset, test_dataset = (
    load_dataset("json", data_files=json_file)
    for json_file in ("train.json", "valid.json")
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [8]:
# Summarise both datasets (splits, features, row counts), one after the other.
# NOTE(review): both report 7182 rows in the recorded output -- confirm that
# valid.json really is a distinct hold-out set and not a copy of train.json.
print(train_dataset, test_dataset, sep="\n")

DatasetDict({
    train: Dataset({
        features: ['input_disfluent', 'output_original'],
        num_rows: 7182
    })
})
DatasetDict({
    train: Dataset({
        features: ['input_disfluent', 'output_original'],
        num_rows: 7182
    })
})


In [9]:
def prompt_formatter(data):
    """Render one dataset record as a chat-formatted training example.

    Args:
        data: Mapping providing ``'input_disfluent'`` (placed in the user turn)
            and ``'output_original'`` (placed in the assistant turn).

    Returns:
        dict holding the rendered text under ``'sentence'`` and empty-string
        placeholders under ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
    """
    # The first sentence deliberately keeps its trailing space before the line break.
    system_prompt = (
        "You are an advanced language model adept at interpreting and refining noisy or imperfect user inputs. \n"
        "Given user data, your task is to accurately extract the intended question and provide precise answers "
        "or predictions, even if the input contains errors or discontinuities."
    )
    turns = (
        ("system", system_prompt),
        ("user", data['input_disfluent']),
        ("assistant", data['output_original']),
    )
    # Each turn is wrapped in <|im_start|>role ... <|im_end|>; turns are newline-separated.
    sentence = "\n".join(
        f"<|im_start|>{role}\n{content}<|im_end|>" for role, content in turns
    )
    placeholders = dict.fromkeys(('input_ids', 'attention_mask', 'labels'), '')
    return {'sentence': sentence, **placeholders}

# Preview the fully rendered prompt for the first training record.
print(prompt_formatter(train_dataset['train'][0])['sentence'])

<|im_start|>system
You are an advanced language model adept at interpreting and refining noisy or imperfect user inputs. 
Given user data, your task is to accurately extract the intended question and provide precise answers or predictions, even if the input contains errors or discontinuities.<|im_end|>
<|im_start|>user
What do petrologists no what do unstable isotope studies indicate?<|im_end|>
<|im_start|>assistant
What do unstable isotope studies indicate?<|im_end|>


In [10]:
def batch_tokenizer(batch):
    """Tokenise a batch of formatted prompts and attach causal-LM labels.

    Args:
        batch: Mapping whose ``"sentence"`` entry holds the prompt strings.

    Returns:
        The tokenizer output (truncated and padded to ``DataClass.MAX_LENGTH``)
        with an added ``'labels'`` entry: a copy of ``input_ids`` in which every
        position whose ``attention_mask`` is 0 (padding) is replaced by -100.
    """
    ignore_index = -100  # label value excluded from the loss

    model_inputs = tokenizer(
        batch["sentence"],
        max_length=DataClass.MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )
    # HF automatically performs right shift
    # Padding must not contribute to the loss. Mask via attention_mask rather than
    # by token id: this notebook sets pad_token = eos_token, so matching on the id
    # would also hide genuine end-of-sequence tokens.
    model_inputs['labels'] = [
        [token if keep else ignore_index for token, keep in zip(ids, mask)]
        for ids, mask in zip(model_inputs['input_ids'], model_inputs['attention_mask'])
    ]
    return model_inputs

In [11]:
# Two passes per dataset: render each record to text, then tokenise in batches.
# remove_columns=[] keeps the original text columns alongside the token columns.
train_dataset = (
    train_dataset
    .map(prompt_formatter)
    .map(batch_tokenizer, batched=True, remove_columns=[])
)
test_dataset = (
    test_dataset
    .map(prompt_formatter)
    .map(batch_tokenizer, batched=True, remove_columns=[])
)

Map: 100%|██████████| 7182/7182 [00:00<00:00, 7992.01 examples/s]
Map: 100%|██████████| 7182/7182 [00:00<00:00, 8180.97 examples/s]


In [12]:
# Batch collator, configured with the same MAX_LENGTH / 'max_length' padding as tokenisation.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding='max_length',
    max_length=DataClass.MAX_LENGTH,
    pad_to_multiple_of=8,
)

training_args = TrainingArguments(
    # --- checkpoints ---
    output_dir=DataClass.MODEL_SAVE_FOLDER,
    overwrite_output_dir=True,
    save_strategy='epoch',
    push_to_hub=False,
    # --- optimisation ---
    learning_rate=DataClass.LR,
    num_train_epochs=DataClass.EPOCH,
    weight_decay=0.9,  # NOTE(review): unusually large for weight decay -- confirm this is intended
    # --- evaluation / logging (once per epoch) ---
    per_device_eval_batch_size=1,
    eval_strategy='epoch',
    logging_strategy='epoch',
    disable_tqdm=True,
    report_to=[],
)

In [13]:
# Both datasets were loaded from a single file each, so their rows sit under the 'train' split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset['train'],
    eval_dataset=test_dataset['train'],
    tokenizer=tokenizer,
)

# Switch off `use_cache` on the model config before starting the training loop.
model.config.use_cache = False
trainer.train()

KeyboardInterrupt: 