<a href="https://colab.research.google.com/github/TheRealSirBen/LLM-Finetuning-Using-REFT/blob/main/LLM_Finetune_Inference_PyReft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install libraries

In [1]:
!pip install git+https://github.com/stanfordnlp/pyreft.git
!pip install huggingface_hub

Collecting git+https://github.com/stanfordnlp/pyreft.git
  Cloning https://github.com/stanfordnlp/pyreft.git to /tmp/pip-req-build-wyqivtcj
  Running command git clone --filter=blob:none --quiet https://github.com/stanfordnlp/pyreft.git /tmp/pip-req-build-wyqivtcj
  Resolved https://github.com/stanfordnlp/pyreft.git to commit 97768c053c3bbb05194ffef00f523ca9952c9a2c
  Running command git submodule update --init --recursive -q
  Preparing metadata (setup.py) ... [?25l[?25hdone


# Import libraries

In [25]:
import pyreft
import transformers
import torch
import pandas as pd
from google.colab import drive
from json import load
import os
from re import sub
import time
from huggingface_hub import interpreter_login

# Project directories

In [3]:
# Mount point for Google Drive inside the Colab runtime.
drive_dir = '/content/drive'
# Mount Google Drive at drive_dir.
drive.mount(drive_dir)

Mounted at /content/drive


In [4]:
# Drive folder for model data (not read by any of the cells visible here).
data_dir = '/content/drive/MyDrive/Models Data'
# Drive folder for saved models (not read by any of the cells visible here).
saved_models_dir = '/content/drive/MyDrive/Models'

# Helper methods

In [5]:
# Get device
def get_device():
  """Return the name of the torch device to run on.

  Returns:
    'cuda' when torch reports an available CUDA device, otherwise 'cpu'.
  """
  if torch.cuda.is_available():
    return 'cuda'
  return 'cpu'

In [6]:
# Get model token
def get_model_token(_config_obj: dict, _selected_model: str) -> str:
  """Look up the access token to use for a model.

  Args:
    _config_obj: Mapping from model name to token. It must also contain the
      'sirben public' entry, which is used as the fallback.
    _selected_model: Model name to look up.

  Returns:
    The entry stored under ``_selected_model`` when present, otherwise the
    entry stored under 'sirben public'.

  Raises:
    KeyError: If neither ``_selected_model`` nor 'sirben public' is a key.
  """
  fallback_key = 'sirben public'
  lookup_key = _selected_model if _selected_model in _config_obj else fallback_key
  return _config_obj[lookup_key]


In [7]:
# Prompt template
def prompt_template(prompt):
    """Wrap a user prompt in the instruction template used by this notebook.

    The returned text has three lines: the system header, the prompt, and the
    closing ``[/INST]`` tag. The last two lines are each preceded by eight
    spaces, which are part of the emitted string.

    Args:
        prompt: Text to embed in the template (formatted with ``format()``).

    Returns:
        The templated prompt string.
    """
    indent = " " * 8
    header = "<s>[INST]<<sys>>You are a helpful assistant<</sys>>"
    return f"{header}\n{indent}{prompt}\n{indent}[/INST]"

In [8]:
def remove_special_chars(input_string: str) -> str:
    """Strip every character that is not an ASCII letter, a digit or whitespace.

    Args:
        input_string: Text to clean.

    Returns:
        ``input_string`` with all other characters removed; whitespace
        (spaces, tabs, newlines) is kept as-is.
    """
    # Negated character class: anything outside a-z, A-Z, 0-9 and whitespace.
    return sub(r'[^a-zA-Z0-9\s]', '', input_string)

# Environment variables

In [9]:
# Device string ('cuda' or 'cpu') chosen by get_device().
device = get_device()
# JSON file on Drive; it is loaded into config_obj and used for token lookup.
config_file_path = '/content/drive/MyDrive/Configs/HF Keys.json'
# Hugging Face model id used for both the base model and the tokenizer.
selected_model = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'

In [10]:
device

'cpu'

In [34]:
# Load the JSON config that get_model_token() reads tokens from.
# The `with` block closes the file on exit, so no explicit close() is needed;
# the encoding is stated explicitly because JSON files are UTF-8.
with open(config_file_path, 'r', encoding='utf-8') as config_file:
  config_obj = load(config_file)

In [26]:
# Interactive Hugging Face login; prompts for an access token when run.
interpreter_login()


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: ··········
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [27]:
# Name passed to the model and tokenizer loaders.
model_name = selected_model
# Token stored for the selected model, or the 'sirben public' fallback entry.
token = get_model_token(config_obj, selected_model)

# Running inference on models

### Load models

In [28]:
# Get base model: load the causal LM in bfloat16 and place it on `device`.
model_infer = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map=device,
    token=token
)

In [29]:
# Load the trained ReFT interventions from the Hugging Face Hub and attach
# them to the base model.
reft_model_infer = pyreft.ReftModel.load(
    "TheRealSirBen/TinyLlama-TinyLlama-1.1B-Chat-v1.0-uap-agent",
    model_infer
)
# Move the loaded interventions to the same device as the base model so that
# generation does not hit a CPU/GPU mismatch when a GPU runtime is used.
reft_model_infer.set_device(device)

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/518 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

(…)comp.block_output.unit.pos.nunit.1#0.bin:   0%|          | 0.00/51.3k [00:00<?, ?B/s]



In [30]:
# Load the slow (use_fast=False) tokenizer for the same checkpoint, with
# right-side padding and a 2048-token maximum length.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_name,
    model_max_length=2048,
    padding_side="right",
    use_fast=False,
    token=token
)
# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

In [35]:
# Test case: the same templated prompt is fed to both the base and ReFT models.
# NOTE(review): "..." looks like a placeholder for the real question — confirm.
prompt = prompt_template("...")

### Run on base model

In [36]:
# Encode the prompt to a tensor of input ids on `device`, generate with the
# base model using its default generation settings, and print the decoded
# first sequence.
model_infer_tokens = tokenizer.encode(prompt, return_tensors='pt').to(device)
model_infer_response = model_infer.generate(model_infer_tokens)
print(tokenizer.decode(model_infer_response[0]))

### Run on reft model

In [37]:
# Generate a prediction with the ReFT model
reft_model_infer_tokens = tokenizer(prompt, return_tensors='pt').to(device)
# Last index along the final axis of input_ids (the last prompt token);
# passed below as the single intervention location.
base_unit_position = reft_model_infer_tokens['input_ids'].shape[-1] -1
# generate() returns a pair; only the second element is kept and decoded.
_, reft_model_infer_response = reft_model_infer.generate(
    reft_model_infer_tokens,
    unit_locations={'sources->base':(None, [[[base_unit_position]]])},
    intervene_on_prompt=True
)
print(tokenizer.decode(reft_model_infer_response[0]))

In [33]:
#