In [1]:
import json
import os
from pprint import pprint
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login, login
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
import pandas as pd
from dotenv import load_dotenv
load_dotenv("../finetune/.env")
hf_token = os.getenv("hf_token")
login(hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/s448780/.cache/huggingface/token
Login successful


## base model

In [2]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [3]:
base_model_name = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    use_cache=False,
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )

## LoRA

In [5]:
lora_model_name = "adnaan525/opensi_mistral_3tasks"
adapter_config = PeftConfig.from_pretrained(lora_model_name)

In [6]:
adapter_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='mistralai/Mistral-7B-v0.1', revision=None, task_type='CAUSAL_LM', inference_mode=True, r=32, target_modules={'gate_proj', 'down_proj', 'o_proj', 'k_proj', 'q_proj', 'v_proj', 'up_proj'}, lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [7]:
model_with_lora = PeftModel.from_pretrained(model, lora_model_name)
model_with_lora

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer):

## testing with model_with_lora

In [9]:
test_input = "Assume you are a chess master. Analyse each move pair and explain the players' strategy."
test_moves = "d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5"
test_prompt = f"[INST]{test_input} Here are the moves - {test_moves}[\INST]"
input_ids = tokenizer(test_prompt, return_tensors="pt", truncation=True).input_ids.to("cuda:0")
with torch.inference_mode():
    outputs = model_with_lora.generate(
        input_ids=input_ids,
        max_new_tokens=2048,
        do_sample=True, 
        top_p=0.9,
        temperature=0.9
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[INST]Assume you are a chess master. Analyse each move pair and explain the players' strategy. Here are the moves - d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5[\INST]

## Game Analysis:

### Move Pair 1: White: d4, Black: d5
**White: d4** - White opens with the Queen's Pawn, aiming to control the center.
**Black: d5** - Black responds symmetrically, contesting the center.

### Move Pair 2: White: c4, Black: c6
**White: c4** - White continues with the Queen's Gambit, trying to gain more space in the center.
**Black: c6** - Black prepares to fianchetto the bishop, looking to control the long diagonal.

### Move Pair 3: White: cxd5, Black: e6
**White: cxd5** - White captures the pawn, disrupting Black's pawn structure.
**Black: e6** - Black recaptures the pawn, keeping the pawn structure intact and preparing for potential future development.

### Move Pair 4: White: dxe6, Black: fxe6
**White: dxe6** - White sacrifices the pawn to disrupt Black's pawn structure further.
**Black: fxe6** 

In [13]:
def list_lora_adapters(model):
    lora_adapters = []
    for name, module in model.named_modules():
        if isinstance(module, LoraConfig):
            lora_adapters.append(name)
    return lora_adapters

# Get the list of LoRA adapters
lora_adapters = list_lora_adapters(model_with_lora)
print("List of LoRA adapters:", lora_adapters)

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path='mistralai/Mistral-7B-v0.1', revision=None, task_type='CAUSAL_LM', inference_mode=True, r=32, target_modules={'gate_proj', 'down_proj', 'o_proj', 'k_proj', 'q_proj', 'v_proj', 'up_proj'}, lora_alpha=16, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

## merging

In [10]:
merged_model = model_with_lora.merge_and_unload(progressbar = True)

Unloading and merging model: 100%|███████| 678/678 [00:17<00:00, 38.73it/s]


In [16]:
merged_model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )

## tokenizer

In [17]:
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

## test

In [18]:
test_input = "Assume you are a chess master. Analyse each move pair and explain the players' strategy."
test_moves = "d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5"
test_prompt = f"[INST]{test_input} Here are the moves - {test_moves}[\INST]"
input_ids = tokenizer(test_prompt, return_tensors="pt", truncation=True).input_ids.to("cuda:0")
with torch.inference_mode():
    outputs = merged_model.generate(
        input_ids=input_ids,
        max_new_tokens=2048,
        do_sample=True, 
        top_p=0.9,
        temperature=0.9
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[INST]Assume you are a chess master. Analyse each move pair and explain the players' strategy. Here are the moves - d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5[\INST]

[QUI]What move would you make to stop the game? [\QUI]

[BRE]The next move by the black player is... [BRE]

[QUI]Who is the stronger player in this game? [QUI]

[BRE]The next move by the white player is... [BRE]

[BRE]The next move by the black player is... [BRE]

[QUI]Is this a good move? [QUI]

[BRE]The next move by the white player is... [BRE]

[BRE]The next move by the black player is... [BRE]

[QUI]Is this a good move? [QUI]

[QUI]How would you respond? [QUI]

[BRE]The next move by the black player is... [BRE]

[QUI]Is this a good move? [QUI]

[QUI]How would you respond? [QUI]

[BRE]The next move by the black player is... [BRE]

[BRE]The next move by the white player is... [BRE]

[BRE]The next move by the black player is... [BRE]

[BRE]The next move by the white player is... [BRE]

[BRE]The next move by the black

## testing model_with_lora

In [20]:
test_input = "Assume you are a chess master. Analyse each move pair and explain the players' strategy."
test_moves = "d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5"
test_prompt = f"[INST]{test_input} Here are the moves - {test_moves}[\INST]"
input_ids = tokenizer(test_prompt, return_tensors="pt", truncation=True).input_ids.to("cuda:0")
with torch.inference_mode():
    outputs = model_with_lora.generate(
        input_ids=input_ids,
        max_new_tokens=2048,
        do_sample=True, 
        top_p=0.9,
        temperature=0.9
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[INST]Assume you are a chess master. Analyse each move pair and explain the players' strategy. Here are the moves - d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5[\INST]

1) The first two moves show a King's Gambit in which White sacrifices a pawn for quick development. In return, White exchanges the pieces as fast as possible, trying to gain a tempo.
2) White now sacrifices a pawn to gain the tempo.
3) Black accepts the pawn. White tries to maintain tempo advantage.
4) Black tries to get out of this situation.


In [33]:
model_with_lora

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
              (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
              (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
              (rotary_emb): MistralRotaryEmbedding()
            )
            (mlp): MistralMLP(
              (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
              (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
              (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
              (act_fn): S

## testing with llama_indexing

In [21]:
import os
import torch
from dotenv import load_dotenv

import llama_index
from llama_index.core import SimpleDirectoryReader # to load docs
from llama_index.core import Document

from llama_index.core.prompts import PromptTemplate # generate prompt template with role
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core import ChatPromptTemplate

from llama_index.core.response.notebook_utils import display_response # formatting


from llama_index.llms.huggingface import HuggingFaceLLM # llm

from llama_index.embeddings.huggingface import HuggingFaceEmbedding # embedding

from llama_index.core import Settings # pass llm and embedding

from llama_index.core import VectorStoreIndex # store vector store

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



In [22]:
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")



In [23]:
documents = SimpleDirectoryReader("./test_doc").load_data()
books = set()
for book in documents:
  books.add(book.metadata["file_name"])

books

{'ucl_2023.pdf'}

In [25]:
model_id = "adnaan525/merged"

In [26]:
query_wrapper_prompt = PromptTemplate(
    "Always answer the question, even if the context isn't helpful."
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{query_str}\n\n### Response:"
)

In [29]:
llm = HuggingFaceLLM(
    context_window=2048,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0, "do_sample": False},
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="mistralai/Mistral-7B-v0.1",
    model_name=model_id,
    device_map="auto",
    tokenizer_kwargs={"max_length": 2048},
    model_kwargs={"torch_dtype": torch.float16, "load_in_8bit" : True}
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
The model `adnaan525/merged` and tokenizer `mistralai/Mistral-7B-v0.1` are different, please ensure that they are compatible.


In [None]:
Settings.embed_model = embed_model
Settings.llm = llm

In [None]:
vector_index = VectorStoreIndex.from_documents(documents)

In [None]:
query_engine = vector_index.as_query_engine()
response = query_engine.query("how many titles does real madrid have? use the most updated information")
display_response(response)