### Install Requirements

In [1]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

### Model loading

We will be using [Phi-2](https://huggingface.co/microsoft/phi-2) Model

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import BitsAndBytesConfig
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

model_id = "microsoft/Phi-3-mini-4k-instruct"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map='auto',
    attn_implementation="eager",
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

config.json:   0%|          | 0.00/904 [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-4k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
print(model)

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear4bit(in_features=3072, out_features=9216, bias=False)
          (rotary_emb): Phi3RotaryEmbedding()
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear4bit(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm()
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
        (post_attention_layernorm): Phi3RMSNorm()
      )
    )
    (norm): Phi3RMSNorm()
  )
  (lm_head): Linear(in_features=3072, o

### Post-processing on the model

Then we have to apply some preprocessing to the model to prepare it for training. For that use the `prepare_model_for_kbit_training` method from PEFT.

In [5]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

### Apply LoRA

In [6]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [7]:
import torch
from transformers import Conv1D

def get_specific_layer_names(model):
    # Create a list to store the layer names
    layer_names = []

    # Recursively visit all modules and submodules
    for name, module in model.named_modules():
        # Check if the module is an instance of the specified layers
        if isinstance(module, (torch.nn.Linear, torch.nn.Embedding, torch.nn.Conv2d, Conv1D)):
            # model name parsing

            layer_names.append('.'.join(name.split('.')[4:]).split('.')[0])

    return layer_names

list(set(get_specific_layer_names(model)))

['', 'gate_up_proj', 'down_proj', 'o_proj', 'qkv_proj']

In [8]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "o_proj",
        "down_proj",
        "qkv_proj",
        "gate_up_proj",
    ],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

# peft_config = {
#     "r": 16,
#     "lora_alpha": 32,
#     "lora_dropout": 0.05,
#     "bias": "none",
#     "task_type": "CAUSAL_LM",
#     "target_modules": "all-linear",
#     "modules_to_save": None,
# }

model = get_peft_model(model, config)
print(print_trainable_parameters(model))

trainable params: 25165824 || all params: 2034306048 || trainable%: 1.237071679786895
None


Dataset

In [9]:
import pandas as pd
from datasets import Dataset
# Load and process the dataset
def load_custom_dataset(file_path):
    df = pd.read_csv(file_path)
    dataset = Dataset.from_pandas(df)
    return dataset

def apply_template(example):
    body = example["Body "]
    hook = example["Hook"]
    cta = example["CTA"]
    input_text = f"Body: {body}\n"
    output_text = f"Hook: {hook}\nCTA: {cta}"
    example["text"] = input_text + output_text
    return example

custom_dataset = load_custom_dataset("/kaggle/input/phifinetuning/Copy of AI Project Data Training Sheet - Data Sheet.csv")

processed_dataset = custom_dataset.map(
    apply_template,
    num_proc=10,
    remove_columns=["Body ", "Hook", "CTA"],
    desc="Applying template to custom dataset",
)

# Tokenize the dataset
def tokenize_function(example):
    tokenized_output = tokenizer(example["text"], padding="max_length", truncation=True, max_length=tokenizer.model_max_length)
    return tokenized_output

tokenized_dataset = processed_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=10,
    remove_columns=["text"],
    desc="Tokenizing the dataset",
)

# Split the dataset into train and test sets
split_dataset = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

Applying template to custom dataset (num_proc=10):   0%|          | 0/43 [00:00<?, ? examples/s]

Tokenizing the dataset (num_proc=10):   0%|          | 0/43 [00:00<?, ? examples/s]

In [None]:
# import pandas as pd
# from datasets import Dataset
# # Load your custom dataset
# def load_custom_dataset(file_path):
#     df = pd.read_csv(file_path)
#     dataset = Dataset.from_pandas(df)
#     return dataset

# def apply_template(example, tokenizer):
#     Body = example["Body "]
#     hook = example["Hook"]
#     cta = example["CTA"]
#     input_text = f"Body: {Body }\n"
#     output_text = f"Hook: {hook}\nCTA: {cta}"
#     example["text"] = input_text + output_text
#     return example

# custom_dataset = load_custom_dataset("/kaggle/input/phifinetuning/Copy of AI Project Data Training Sheet - Data Sheet.csv")

# # Process the dataset
# processed_dataset = custom_dataset.map(
#     apply_template,
#     fn_kwargs={"tokenizer": tokenizer},
#     num_proc=10,
#     remove_columns=["Body ", "Hook", "CTA"],
#     desc="Applying template to custom dataset",
# )

In [None]:
# # Tokenize the dataset
# def tokenize_function(example):
#     return tokenizer(example["text"], padding="max_length", truncation=True, max_length=tokenizer.model_max_length)

# tokenized_dataset = processed_dataset.map(
#     tokenize_function,
#     batched=True,
#     num_proc=10,
#     remove_columns=["text"],
#     desc="Tokenizing the dataset",
# )

In [None]:
# # Split the dataset into train and test
# split_dataset = processed_dataset.train_test_split(test_size=0.1)
# train_dataset = split_dataset["train"]
# test_dataset = split_dataset["test"]

In [None]:
# !pip install trl transformers[torch] torch accelerate -q

In [None]:
# !pip install git+https://github.com/huggingface/transformers.git

In [None]:
# import sys
# import logging
# import pandas as pd
# from datasets import Dataset
# from peft import LoraConfig
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
# from trl import SFTTrainer
# from transformers.utils import is_torch_mlu_available

# # # Disable the MLU error
# # sys.modules['transformers.utils'] = transformers.utils

# trainer = SFTTrainer(
#     model=model,
#     args=train_conf,
#     peft_config=peft_conf,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,
#     max_seq_length=2048,
#     dataset_text_field="input_ids",
#     tokenizer=tokenizer,
#     packing=True
# )

# train_result = trainer.train()
# metrics = train_result.metrics
# trainer.log_metrics("train", metrics)
# trainer.save_metrics("train", metrics)
# trainer.save_state()

In [None]:
# import pandas as pd

# # Load CSV file
# file_path = '/kaggle/input/phifinetuning/Copy of AI Project Data Training Sheet - Data Sheet.csv'
# data = pd.read_csv(file_path)

# # Ensure columns are correctly read
# data.columns = data.columns.str.strip()  # Strip any extra whitespace from column names

# # Extract necessary columns
# data = data[['Hook', 'Build Up', 'Body', 'CTA']]

# # Combine the text columns into a single input for the model
# data['input_text'] = data['Body']

# # Display the first few rows of the dataframe
# print(data.head())
# # Create a list of dictionaries for training
# training_data = [
#     {"input_text": row['input_text'], "output_text": row['CTA']} 
#     for _, row in data.iterrows()
# ]

# # Display the first few training examples
# print(training_data[:5])

In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset

2024-05-19 11:10:45.798373: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-19 11:10:45.798478: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-19 11:10:45.892091: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# # Create a Hugging Face Dataset from the training data
# dataset = Dataset.from_pandas(pd.DataFrame(training_data))

In [None]:
# # Tokenize the inputs and outputs
# def tokenize_function(examples):
#     input_encodings = tokenizer(examples['input_text'], truncation=True, padding="max_length", max_length=1024)
#     output_encodings = tokenizer(examples['output_text'], truncation=True, padding="max_length", max_length=1024)
#     encodings = {"input_ids": input_encodings['input_ids'], "attention_mask": input_encodings['attention_mask'], "labels": output_encodings['input_ids']}
#     return encodings

# tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['input_text', 'output_text'])

In [None]:
# from datasets import load_dataset

# data = load_dataset("Abirate/english_quotes")
# data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)

### Training

In [12]:
!pip install trl -q

In [13]:
import sys
import logging
import pandas as pd
from datasets import Dataset
from peft import LoraConfig
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from trl import SFTTrainer
from transformers.utils import is_torch_mlu_available
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=3,
    logging_dir='./logs',
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

# trainer = SFTTrainer(
#     model=model,
#     args=train_conf,
#     peft_config=peft_conf,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset,
#     max_seq_length=2048,
#     dataset_text_field="input_ids",
#     tokenizer=tokenizer,
#     packing=True
# )

# train_result = trainer.train()
# metrics = train_result.metrics
# trainer.log_metrics("train", metrics)
# trainer.save_metrics("train", metrics)
# trainer.save_state()

# Train the model
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU 0 has a total capacty of 15.89 GiB of which 3.02 GiB is free. Process 3579 has 12.87 GiB memory in use. Of the allocated memory 12.14 GiB is allocated by PyTorch, and 448.85 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
model.push_to_hub("piika919/phi_bnb", token=True)

In [None]:
# Save the trained model and tokenizer
model_save_path = './saved_model'
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model and tokenizer saved to {model_save_path}")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, pipeline
import torch 

# Load the saved model and tokenizer
model_save_path = '/kaggle/working/saved_model'
model = AutoModelForCausalLM.from_pretrained(model_save_path, torch_dtype=torch.float32)
tokenizer = AutoTokenizer.from_pretrained(model_save_path)

# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Initialize the pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1  # Ensure pipeline uses GPU if available
)

# Define the messages
messages = [
    {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
    {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
    {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
]

# Define the generation arguments
generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.3,
    "do_sample": True,
}

# Generate the output
output = pipe(messages, **generation_args)
print(output[0]['generated_text'])

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoTokenizer, Phi3ForCausalLM
from transformers import Phi3Model, Phi3Config
import bitsandbytes as bnb
from transformers import BitsAndBytesConfig

from transformers import Phi3Model, Phi3Config

# # Initializing a Phi-3 style configuration
# configuration = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

# # Initializing a model from the configuration
# model = Phi3Model(configuration)

# # Accessing the model configuration
# configuration = model.config

peft_model_id = "piika919/phi_bnb"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

config = Phi3Config.from_pretrained(peft_model_id)
model = Phi3ForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

messages = "When was Taj mahal built?"

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.4,
    "do_sample": True,
}

output = pipe(messages, **generation_args)
print(output[0]['generated_text'])


In [None]:
batch = tokenizer("So many books, so little ?", return_tensors='pt')

with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_model_id = "/kaggle/working/saved_model"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)

In [None]:
import transformers

tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=10,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [None]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

## Share adapters on the 🤗 Hub

In [None]:
model.push_to_hub("piika919/fine_tune_phi", use_auth_token=True)

## Load adapters from the Hub

You can also directly load adapters from the Hub using the commands below:

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

peft_model_id = "vishesh-t27/fine_tune_phi"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)

## Inference

You can then directly use the trained model or the model that you have loaded from the 🤗 Hub for inference as you would do it usually in `transformers`.

In [None]:
batch = tokenizer("So many books, so little ?", return_tensors='pt')

with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

messages = "When was Taj mahal built?"

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.4,
    "do_sample": True,
}

output = pipe(messages, **generation_args)
print(output[0]['generated_text'])


In [None]:
!pip install peft -q

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
from peft import QLoRA, prepare_model_for_qLoRA

import pandas as pd

# Load CSV file
file_path = '/kaggle/input/phifinetuning/Copy of AI Project Data Training Sheet - Data Sheet.csv'
data = pd.read_csv(file_path)

# Ensure columns are correctly read
data.columns = data.columns.str.strip()  # Strip any extra whitespace from column names

# Extract necessary columns
data = data[['Hook', 'Build Up', 'Body', 'CTA']]

# Combine the text columns into a single input for the model
data['input_text'] = data['Hook'] + " " + data['Build Up'] + " " + data['Body'] + " " + data['CTA']

# Display the first few rows of the dataframe
print(data.head())
# Create a list of dictionaries for training
training_data = [
    {"input_text": row['input_text'], "output_text": row['CTA']} 
    for _, row in data.iterrows()
]

# Display the first few training examples
print(training_data[:5])

# Create a Hugging Face Dataset from the training data
dataset = Dataset.from_pandas(pd.DataFrame(training_data))

# Tokenize the inputs and outputs
def tokenize_function(examples):
    input_encodings = tokenizer(examples['input_text'], truncation=True, padding="max_length", max_length=1024)
    output_encodings = tokenizer(examples['output_text'], truncation=True, padding="max_length", max_length=1024)
    encodings = {"input_ids": input_encodings['input_ids'], "attention_mask": input_encodings['attention_mask'], "labels": output_encodings['input_ids']}
    return encodings

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['input_text', 'output_text'])


# Step 2: Set Up the Environment
# Install necessary libraries if not already installed
# !pip install transformers torch datasets peft

# Step 3: Load the Pre-trained Model
model_name = 'microsoft/Phi3'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Step 4: Configure QLoRA
# Prepare the model for QLoRA fine-tuning
model = prepare_model_for_qLoRA(model)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Step 5: Train the Model

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
)

# Start training
trainer.train()

# Step 6: Evaluate and Save the Model
# Save the model
model.save_pretrained('./finetuned_phi3')
tokenizer.save_pretrained('./finetuned_phi3')

print("Model fine-tuning completed and saved!")


As you can see by fine-tuning for few steps we have almost recovered the quote from Albert Einstein that is present in the [training data](https://huggingface.co/datasets/Abirate/english_quotes).