<a href="https://colab.research.google.com/github/SmittyB00p/PhiuSFBv0NkHjLd3/blob/main/fine_tuning_Phi3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG Implementation

-- Note --

If using this notebook in its entirety, you will need a GPU and at least 15GB of working memory.

The top portion of this notebook implements Retrieval-Augmented Generation (RAG).

The bottom portion fine-tunes the ```microsoft/Phi-3-mini-4k-instruct``` model from HuggingFace with the help of ```unsloth``` to make the process go 2X faster.

In [2]:
!pip install trl
!pip install -U bitsandbytes
# !pip install datasets
!pip install deepspeed
!pip install llama-index
!pip install flash_attn
!pip install llama-index-embeddings-huggingface
!pip install llama-index-llms-huggingface
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import huggingface_hub, torch
# !pip install -U transformers==4.31.0
import transformers
import trl
import datasets
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, RagTokenizer, RagRetriever, RagSequenceForGeneration
from trl import SFTTrainer
import peft
import bitsandbytes
import sys, logging
from peft import LoraConfig
from trl import SFTTrainer

Collecting trl
  Downloading trl-0.13.0-py3-none-any.whl.metadata (11 kB)
Collecting datasets>=2.21.0 (from trl)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=2.21.0->trl)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets>=2.21.0->trl)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets>=2.21.0->trl)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets>=2.21.0->trl)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.13.0-py3-none-any.whl (293 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.4/293.4 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━



In [3]:
import llama_index
from llama_index.llms.openai import OpenAI

In [4]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = 'expandable_segments:True'

from google.colab import drive
drive = drive.mount('/content/drive')
from google.colab import userdata
user_data = userdata.get('HUGGINGFACE_TOKEN') ## enter your own api key from hugging face

cwd = os.getcwd()
df = pd.read_csv(cwd+'/drive/MyDrive/Data2/pre-ranked_df.csv')
candidates = df['job_title'].tolist()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
logger = logging.getLogger(__name__)


###################
# Hyper-parameters
###################
training_config = {
    "bf16": False,
    "do_eval": False,
    "learning_rate": 5.0e-06,
    "log_level": "info",
    "logging_steps": 20,
    "logging_strategy": "steps",
    "lr_scheduler_type": "cosine",
    "num_train_epochs": 1,
    "max_steps": -1,
    "output_dir": "/content/MyDrive/Phi-3FineTuning/checkpoint_dir",
    "overwrite_output_dir": True,
    "per_device_eval_batch_size": 2,
    "per_device_train_batch_size": 2,
    "remove_unused_columns": True,
    "save_steps": 60,
    "save_total_limit": 1,
    "seed": 0,
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs":{"use_reentrant": False},
    "gradient_accumulation_steps": 1,
    "warmup_ratio": 0.2,
    # "deepspeed": "/content/drive/MyDrive/Phi-3FineTuning/deepspeed.json",
    }

peft_config = {
    "r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
    "target_modules": "all-linear",
    "modules_to_save": None,
}

if torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
else:
    compute_dtype = torch.float16

train_conf = TrainingArguments(**training_config)
peft_conf = LoraConfig(**peft_config)

In [6]:
###############
# Setup logging
###############
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
)
log_level = train_conf.get_process_log_level()
logger.setLevel(log_level)
datasets.utils.logging.set_verbosity(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

# Log on each process a small summary
logger.warning(
    f"Process rank: {train_conf.local_rank}, device: {train_conf.device}, n_gpu: {train_conf.n_gpu}"
    + f" distributed training: {bool(train_conf.local_rank != -1)}, 16-bits training: {train_conf.fp16}"
)
logger.info(f"Training/evaluation parameters {train_conf}")
logger.info(f"PEFT parameters {peft_conf}")

INFO:__main__:Training/evaluation parameters TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=Non

In [7]:
# ################
# # Model Loading
# ################
checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"
# # checkpoint_path = "microsoft/Phi-3-mini-128k-instruct"
model_kwargs = dict(
    use_cache=False,
    trust_remote_code=True,
    attn_implementation="eager",  # loading the model with flash-attenstion support
    torch_dtype=torch.bfloat16,
    device_map=None
)
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
tokenizer.model_max_length = 2048
tokenizer.pad_token = tokenizer.unk_token  # use unk rather than eos token to prevent endless generation
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'

In [8]:
## after you have loaded the model and tokenizer you can save them as to not have to not load them in again
## you can then comment out the last 6 lines above
## after you have saved the model and tokenizer you can then comment these lines out and uncomment the lines in the next code block

model.save_pretrained(cwd+'/drive/MyDrive/phi3')
tokenizer.save_pretrained(cwd+'/drive/MyDrive/phi3')

In [9]:
# model = AutoModelForCausalLM.from_pretrained(cwd+'/drive/MyDrive/phi3', **model_kwargs)
# tokenizer = AutoTokenizer.from_pretrained(cwd+'/drive/MyDrive/phi3')

[INFO|tokenization_utils_base.py:2028] 2025-01-16 12:38:50,395 >> loading file tokenizer.model
[INFO|tokenization_utils_base.py:2028] 2025-01-16 12:38:50,402 >> loading file tokenizer.json
[INFO|tokenization_utils_base.py:2028] 2025-01-16 12:38:50,404 >> loading file added_tokens.json
[INFO|tokenization_utils_base.py:2028] 2025-01-16 12:38:50,424 >> loading file special_tokens_map.json
[INFO|tokenization_utils_base.py:2028] 2025-01-16 12:38:50,427 >> loading file tokenizer_config.json
[INFO|tokenization_utils_base.py:2028] 2025-01-16 12:38:50,437 >> loading file chat_template.jinja
[INFO|tokenization_utils_base.py:2300] 2025-01-16 12:38:52,014 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

In [11]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, SummaryIndex, SimpleDirectoryReader
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

def messages_to_prompt(messages):
  prompt = ''
  system_found = False
  for message in messages:
    if message.role == 'system':
      prompt += f'<|system|>\n{message.content}<|end|>\n'
      system_found = True
    elif message.role == 'user':
      prompt += f'<|user|>\n{message.content}<|end|>\n'
    elif message.role == 'assistant':
      prompt += f'<|assistant|>\n{message.content}<|end|>\n'
    else:
      prompt += f'<|user|>\n{message.content}<|end|>\n'

  prompt += '<|assistant|>\n'

  if not system_found:
    prompt = (
        '<|system|>\nYou are a helpful AI assistant that answers questions to the best of your knowledge base.<|end|>' + prompt
    )

  return prompt

llm = HuggingFaceLLM(
    # disk_offload,
    model_name="microsoft/Phi-3-mini-4k-instruct",
    tokenizer=tokenizer,
    model_kwargs={'trust_remote_code':True},
    generate_kwargs={'do_sample': True, 'temperature': 0.1},
    query_wrapper_prompt=(
        '<|system|>\n'
        'You are a helpful AI assistant that answers questions to the best of your knowledge base.<|end|>\n'
        '<|user|>\n'
        '{query_str}<|end|>\n'
        '<|assistant|>\n'
    ),
    messages_to_prompt=messages_to_prompt,
    is_chat_model=True
)

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

[INFO|configuration_utils.py:695] 2025-01-16 12:38:55,263 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/config.json


configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
[INFO|configuration_utils.py:695] 2025-01-16 12:38:55,556 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/config.json
[INFO|configuration_utils.py:762] 2025-01-16 12:38:55,563 >> Model config Phi3Config {
  "_name_or_path": "microsoft/Phi-3-mini-4k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-mini-4k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-mini-4k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_rang

modeling_phi3.py:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.5k [00:00<?, ?B/s]

[INFO|modeling_utils.py:3953] 2025-01-16 12:38:58,670 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/model.safetensors.index.json


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

[INFO|configuration_utils.py:1140] 2025-01-16 12:42:00,805 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 32000,
  "pad_token_id": 32000
}



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO|modeling_utils.py:4849] 2025-01-16 12:42:43,388 >> All model checkpoint weights were used when initializing Phi3ForCausalLM.

[INFO|modeling_utils.py:4857] 2025-01-16 12:42:43,391 >> All the weights of Phi3ForCausalLM were initialized from the model checkpoint at microsoft/Phi-3-mini-4k-instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Phi3ForCausalLM for predictions without further training.


generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

[INFO|configuration_utils.py:1095] 2025-01-16 12:42:43,926 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3-mini-4k-instruct/snapshots/0a67737cc96d2554230f90338b163bc6380a2a85/generation_config.json
[INFO|configuration_utils.py:1140] 2025-01-16 12:42:43,930 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": [
    32000,
    32001,
    32007
  ],
  "pad_token_id": 32000
}



In [12]:
Settings.llm = llm

Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5"
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

[INFO|configuration_utils.py:695] 2025-01-16 12:42:48,464 >> loading configuration file config.json from cache at /tmp/llama_index/models--BAAI--bge-small-en-v1.5/snapshots/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/config.json
[INFO|configuration_utils.py:762] 2025-01-16 12:42:48,467 >> Model config BertConfig {
  "_name_or_path": "BAAI/bge-small-en-v1.5",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.47.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "voca

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

[INFO|modeling_utils.py:3953] 2025-01-16 12:42:51,378 >> loading weights file model.safetensors from cache at /tmp/llama_index/models--BAAI--bge-small-en-v1.5/snapshots/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/model.safetensors
[INFO|modeling_utils.py:4849] 2025-01-16 12:42:51,685 >> All model checkpoint weights were used when initializing BertModel.

[INFO|modeling_utils.py:4857] 2025-01-16 12:42:51,686 >> All the weights of BertModel were initialized from the model checkpoint at BAAI/bge-small-en-v1.5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.


tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

[INFO|tokenization_utils_base.py:2030] 2025-01-16 12:42:53,243 >> loading file vocab.txt from cache at /tmp/llama_index/models--BAAI--bge-small-en-v1.5/snapshots/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/vocab.txt
[INFO|tokenization_utils_base.py:2030] 2025-01-16 12:42:53,244 >> loading file tokenizer.json from cache at /tmp/llama_index/models--BAAI--bge-small-en-v1.5/snapshots/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/tokenizer.json
[INFO|tokenization_utils_base.py:2030] 2025-01-16 12:42:53,246 >> loading file added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:2030] 2025-01-16 12:42:53,247 >> loading file special_tokens_map.json from cache at /tmp/llama_index/models--BAAI--bge-small-en-v1.5/snapshots/5c38ec7c405ec4b44b94cc5a9bb96e735b38267a/special_tokens_map.json
[INFO|tokenization_utils_base.py:2030] 2025-01-16 12:42:53,248 >> loading file tokenizer_config.json from cache at /tmp/llama_index/models--BAAI--bge-small-en-v1.5/snapshots/5c38ec7c405ec4b44b94cc5a9bb96e735

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [13]:
# from llama_index import node_parser

# node_parser = SimpleNodeParser(chunk_size_limit=512)
documents = SimpleDirectoryReader("drive/MyDrive/Data").load_data()
documents

[Document(id_='6e3cf0c7-acf7-4ad6-999a-09ec39bba816', embedding=None, metadata={'file_path': '/content/drive/MyDrive/Data/potential-talents - Aspiring human resources - seeking human resources.csv', 'file_name': 'potential-talents - Aspiring human resources - seeking human resources.csv', 'file_type': 'text/csv', 'file_size': 9314, 'creation_date': '2025-01-12', 'last_modified_date': '2025-01-12'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text="1, 2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional, Houston, Texas, 85, nan\n2, Native English Teacher at EPIK (English Program in Korea), K

In [14]:
vector_index = VectorStoreIndex.from_documents(documents)

summary_index = SummaryIndex.from_documents(documents)
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "caching_allocator"
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [15]:
print(os.environ['PYTORCH_CUDA_ALLOC_CONF'])

expandable_segments:True


In [None]:
import gc
gc.collect()

query_engine = vector_index.as_query_engine(response_mode='compact')

response = query_engine.query("aspiring")



In [None]:
from llama_index.core.response.notebook_utils import display_response
display_response(response)

In [None]:
## this line might help, but I never was able to run it...my apologies

# response.save_to_disk(cwd+"/drive/MyDrive/RAG/response.json")

## Fine Tuning

In [None]:
# MAX_SEQUENCE_LENGTH = 2048
# DATASET = load_dataset("huawei-noah/human_rank_eval", split='HumanRankEvalSoftEng')

# DATASET

In [None]:
# train_dataset = DATASET.select(range(int(DATASET.shape[0]*.80)))
# train_dataset

In [None]:
# ##################
# # Data Processing
# ##################

# # tokenizer = get_chat_template(tokenizer, chat_template = "phi-3", mapping={'role': 'text', 'content': 'votes', 'user': 'human', 'assistant': 'model'})

# def formatting_prompts_func(examples, tokenizer):
#   answers = examples['answers']
#   examples["text"] = [tokenizer.apply_chat_template(
#           answer, tokenize=False, add_generation_prompt=False) for answer in answers]
#   return examples

# train_dataset = train_dataset.map(
#     formatting_prompts_func,
#     fn_kwargs={"tokenizer": tokenizer},
#     batched=True)

# print(f"\ntrain_dataset features: {train_dataset.features}\n")

# train_dataset['text'][0]

In [None]:
# ###########
# # Training
# ###########
# trainer = SFTTrainer(
#     model=model,
#     args=train_conf,
#     peft_config=peft_conf,
#     train_dataset=train_dataset,
#     # eval_dataset=processed_test_dataset,
#     # max_seq_length=MAX_SEQUENCE_LENGTH,
#     # dataset_text_field="text",
#     tokenizer=tokenizer,
#     # packing=True
# )
# train_result = trainer.train()
# metrics = train_result.metrics
# trainer.log_metrics("train", metrics)
# trainer.save_metrics("train", metrics)
# trainer.save_state()

In [None]:
# #############
# # Evaluation
# #############
# tokenizer.padding_side = 'left'
# # metrics = trainer.evaluate()
# # metrics["eval_samples"] = len(processed_test_dataset)
# trainer.log_metrics("eval", metrics)
# trainer.save_metrics("eval", metrics)


# # ############
# # # Save model
# # ############
# trainer.save_model(train_conf.output_dir)

In [None]:
# cwd = os.getcwd()
# model_path = cwd + '/drive/MyDrive/Phi-3FineTuning/checkpoint_dir'
# model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs)

In [None]:
# messages = [
#     {
#         "role": "user",
#         "content": "Given a phrase, answer the question by giving the top items, which will be given by the user."},
# ]

# model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
# input_length = model_inputs.shape[1]
# generated_ids = model.generate(model_inputs, do_sample=True, max_new_tokens=500)

In [None]:
# print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])

In [None]:
# model.save_pretrained('/drive/MyDrive/Phi-3FineTuning/post_finetuned_model')
# tokenizer.save_pretrained('/drive/MyDrive/Phi-3FineTuning/post_finetuned_model')

In [None]:
# model = AutoModelForCausalLM.from_pretrained('/drive/MyDrive/Phi-3FineTuning/post_finetuned_model', **model_kwargs)
# tokenizer = AutoTokenizer.from_pretrained('/drive/MyDrive/Phi-3FineTuning/post_finetuned_model')

In [None]:
# def generate_candidates(candidate_list, top_n_cands):
#   '''
#   --Parameters--
#   candidate_list: list of candidates
#   top_n_cands: number of candidates to return

#   --Returns--
#   list of top n candidates
#   '''
#   message = [
#       {"role": "user",
#        "content": f"Can you rank these candidates in this list {candidate_list} by the phrase 'aspiring human resources' and return the top {top_n_cands}?"},
#   ]


#   model_inputs = tokenizer.apply_chat_template(message, add_generation_prompt=True, return_tensors="pt")
#   input_length = model_inputs.shape[1]
#   with torch.no_grad():
#     outputs = model.generate(model_inputs, max_new_tokens=500)
#   response = tokenizer.batch_decode(outputs[:, input_length:], skip_special_tokens=True)[0]
#   return response

In [None]:
# generate_candidates(candidates, 15)

## Unsloth

Run if you have access to a GPU

In [None]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps trl peft accelerate bitsandbytes
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template
import datasets
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
# import xformers

MAX_SEQUENCE_LENGTH = 2048
MODEL ="unsloth/Phi-3-mini-4k-instruct"
DATASET = load_dataset('huawei-noah/human_rank_eval', split='HumanRankEvalSoftEng')

model = AutoModelForCausalLM.from_pretrained(MODEL, trust_remote_code=True, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL)

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL,
    max_seq_length = MAX_SEQUENCE_LENGTH,
    dtype = None,
    load_in_4bit = False,
)

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    max_seq_length = MAX_SEQUENCE_LENGTH,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [None]:
DATASET

In [None]:
DATASET.select(range(int(DATASET.shape[0]*.80)))

In [None]:
DATASET['answers'][0]

In [None]:
tokenizer = get_chat_template(tokenizer, chat_template = "phi-3", mapping={'role': 'text', 'content': 'votes', 'user': 'human', 'assistant': 'model'})

def formatting_prompts_func(examples, tokenizer):
  answers = examples['answers']
  examples["text"] = [tokenizer.apply_chat_template(
          answer, tokenize=False, add_generation_prompt=False) for answer in answers]
  return examples

DATASET = DATASET.map(
    formatting_prompts_func,
    fn_kwargs={"tokenizer": tokenizer},
    batched=True)

DATASET['text'][0]

In [None]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "caching_allocator"

trainer = SFTTrainer(
    model = model,
    train_dataset = DATASET,
    dataset_text_field = "text",
    max_seq_length = MAX_SEQUENCE_LENGTH,
    tokenizer = tokenizer,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        max_steps = 60,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        output_dir = "outputs",
        optim = "adamw_8bit",
        seed = 3407
    ),
)


In [None]:
import wandb
trainer.train()

In [None]:
tokenizer = get_chat_template(tokenizer, chat_template = "phi-3", add_eos_token = True)

FastLanguageModel.for_inference(model)

messages = [
    {
        "role": "system",
        "content": "You are a ranking algorithm that ranks a list of items based on how similar each item is to a given phrase and returns the top 'n' items that the user requests."},
    {
        'role': 'user',
        'content': ''
    }
]

model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
# print(model_inputs.shape)
input_length = model_inputs.shape[1]
generated_ids = model.generate(model_inputs, do_sample=True, max_new_tokens=50)

In [None]:
print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])