In [16]:
import logging
import os

# Send every record at DEBUG level and above to two places: the file
# ``eval.log`` (opened in append mode, so earlier runs are kept) and the
# notebook's output stream.
#
# ``force=True`` (Python 3.8+) removes and closes any handlers already attached
# to the root logger before installing these ones. Without it ``basicConfig``
# silently does nothing when the root logger is already configured -- e.g. when
# this cell is re-run -- while still opening a fresh, never-attached
# ``FileHandler`` on ``eval.log`` each time.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    handlers=[
                        logging.FileHandler('eval.log', mode="a"),
                        logging.StreamHandler()
                    ],
                    force=True)
# Logger used by the cells of this notebook.
logger = logging.getLogger(__name__)

In [17]:
# Each non-hidden sub-directory of the current working directory counts as one
# test; entries whose name starts with "." and plain files are skipped.
folders = [
    entry
    for entry in os.listdir('.')
    if not entry.startswith(".") and os.path.isdir(entry)
]

# Report how many tests were discovered and which ones.
logger.info(f"{len(folders)} test(s) found.")
logger.info(f"Tests: {folders}")

2024-07-23 00:34:26,977 - __main__ - INFO - 1 tests found.
2024-07-23 00:34:26,980 - __main__ - INFO - Tests: ['memory']


# LLM

In [20]:
from dotenv import load_dotenv
from huggingface_hub import login
# Read environment variables from the fine-tuning project's .env file, then
# authenticate against the Hugging Face Hub with the token stored in the
# ``hf_token`` variable.
env_loaded = load_dotenv("../../finetune/.env")
if not env_loaded:
    # load_dotenv() reports False when it set nothing (missing or empty file);
    # surface that instead of failing later with a confusing auth error.
    logger.warning("No variables were loaded from ../../finetune/.env")

hf_token = os.getenv("hf_token")
if not hf_token:
    logger.warning("Environment variable 'hf_token' is not set; "
                   "login() will not receive a usable token.")

# The call itself is unchanged: whatever os.getenv returned is passed through.
login(hf_token)

2024-07-23 00:37:49,913 - urllib3.connectionpool - DEBUG - Starting new HTTPS connection (1): huggingface.co:443


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.


2024-07-23 00:37:50,166 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "GET /api/whoami-v2 HTTP/11" 200 389


Token is valid (permission: write).
Your token has been saved to /home/s448780/.cache/huggingface/token
Login successful


In [21]:
# Hugging Face Hub repository ids used by the cells below:
#   "finetuned"    - checkpoint loaded through AutoModelForCausalLM
#   "lora_adapter" - adapter repository attached to that checkpoint with PEFT
models = dict(
    finetuned="mistralai/Mistral-7B-v0.1",
    lora_adapter="OpenSI/cognitive_AI",
)

In [22]:
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Repository id of the checkpoint to load (see the `models` mapping).
model_id = models["finetuned"]

# bitsandbytes settings: load weights in 4-bit NF4 with double quantisation
# enabled and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantised causal LM; `device_map="auto"` leaves device placement
# to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    use_cache=True,
    quantization_config=bnb_config,
)

# Matching tokenizer, configured to add a BOS token.
tokenizer = AutoTokenizer.from_pretrained(model_id, add_bos_token=True)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

2024-07-23 00:48:26,152 - urllib3.connectionpool - DEBUG - Resetting dropped connection: huggingface.co
2024-07-23 00:48:28,358 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /mistralai/Mistral-7B-v0.1/resolve/main/config.json HTTP/11" 200 0
2024-07-23 00:48:28,833 - bitsandbytes.cextension - DEBUG - Loading bitsandbytes native library from: /home/s448780/miniconda3/envs/pydev/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda121.so
2024-07-23 00:48:29,429 - accelerate.utils.modeling - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

2024-07-23 00:48:36,025 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /mistralai/Mistral-7B-v0.1/resolve/main/generation_config.json HTTP/11" 200 0
2024-07-23 00:48:36,829 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /mistralai/Mistral-7B-v0.1/resolve/main/tokenizer_config.json HTTP/11" 200 0


In [25]:
from peft import PeftConfig, PeftModel
# Repository id of the LoRA adapter (see the `models` mapping).
lora = models["lora_adapter"]

# Fetch the adapter's configuration from the Hub.
adapter_config = PeftConfig.from_pretrained(lora)

# Wrap the already-loaded `model` with the adapter weights.
# NOTE: running this cell again on the same `model` object attaches the adapter
# a second time; PEFT then logs "Already found a `peft_config` attribute in the
# model". Reload `model` before re-running.
model_f = PeftModel.from_pretrained(model=model, model_id=lora)

2024-07-23 00:53:46,141 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /OpenSI/cognitive_AI/resolve/main/adapter_config.json HTTP/11" 200 0
2024-07-23 00:53:53,170 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /OpenSI/cognitive_AI/resolve/main/adapter_config.json HTTP/11" 200 0
2024-07-23 00:53:55,149 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /OpenSI/cognitive_AI/resolve/main/adapter_config.json HTTP/11" 200 0
2024-07-23 00:53:55,184 - peft.tuners.tuners_utils - INFO - Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing!
2024-07-23 00:53:58,128 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /OpenSI/cognitive_AI/resolve/main/adapter_model.safetensors HTTP/11" 302 0
2024-07-23 00:54:00,903 - urllib3.connectionpool - DEBUG - https://huggingface.co:443 "HEAD /OpenSI/cognitive_AI/resolve/main/adapter_model.s

In [24]:
# Put the adapter-wrapped model into evaluation mode (this turns off the LoRA
# dropout layers listed in the repr) and display its module tree.
# NOTE(review): the recorded execution count of this cell (24) is lower than
# that of the cell defining `model_f` (25); run the notebook top to bottom.
model_f.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer):

# Inference

In [27]:
# Single smoke-test prompt. The eight spaces in front of "## Response:" are
# part of the string, so they are spelled out explicitly here.
prompt = (
    "### Question: What is the name of the piece that moves in an L shape?\n"
    "        ## Response:"
)

In [29]:
# Tokenise the prompt into a PyTorch tensor of token ids and move it to the
# device the model lives on, instead of hard-coding "cuda:0".
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
# Display the ids for inspection.
input_ids

tensor([[    1,   774, 22478, 28747,  1824,   349,   272,  1141,   302,   272,
          5511,   369, 11662,   297,   396,   393,  5843, 28804,    13,  5390,
           531, 12107, 28747]], device='cuda:0')

In [30]:
# Sample one completion for `input_ids` from the adapter-wrapped model.
# - Generation goes through `model_f` (the PEFT wrapper that was put in eval
#   mode) rather than the bare `model` variable, so the cell no longer depends
#   on the adapter having been injected into `model` as a side effect.
# - The attention mask is derived from the tokenizer's pad token id instead of
#   the literal 2; positions holding that id are masked out, as before.
# - `pad_token_id` is passed explicitly, which avoids the
#   "Setting `pad_token_id` to `eos_token_id`" notice on every call.
with torch.inference_mode():
    outputs = model_f.generate(
        input_ids=input_ids,
        attention_mask=(input_ids != tokenizer.pad_token_id).long(),
        pad_token_id=tokenizer.pad_token_id,
        max_new_tokens=2048,
        do_sample=True,
        top_p=0.9,
        temperature=0.5,
    )

# Decode the first (only) sequence; the decoded text starts with the prompt.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


### Question: What is the name of the piece that moves in an L shape?
        ## Response: The piece that moves in an L shape is called a knight.

Here's why:
- The knight is unique among chess pieces in that it moves in an L shape: two squares in one direction and then one square perpendicular to that.
- This unusual move makes the knight particularly powerful, as it can jump over other pieces and quickly reposition itself on the board.

Therefore, the piece that moves in an L shape is called a knight.

<ANSWER>: Knight


# Memory

In [31]:
import pandas as pd
# Load the memory-test question/answer pairs. Only the two columns used below
# are read, which drops the leftover "Unnamed: 0" index column stored in the
# file while keeping the default 0..n-1 row index.
memory = pd.read_csv(
    "../../data/test_framework/memory/memory.csv",
    usecols=["Question", "Answer"],
)
# Preview the first rows.
memory.head()

Unnamed: 0,Question,Answer
0,What is the name of the piece that moves in an...,Knight
1,How many squares are there on a standard chess...,64
2,What is the term for a situation where a king ...,Checkmate
3,Which piece is the most powerful in chess?,Queen
4,What is the initial position of the white queen?,d1


In [None]:
from tqdm.auto import tqdm

# Questions and their ground-truth answers from the memory test set.
memory_questions = memory["Question"]
memory_answers = memory["Answer"]

# One record per question, collected in a plain list and turned into a
# DataFrame once at the end instead of growing the frame row by row. The list
# is filled incrementally, so partial results survive an interrupted run.
memory_records = []

# Pair questions with answers by position (no reliance on the index labels)
# and give tqdm the total so it can render a real progress bar.
qa_pairs = zip(memory_questions, memory_answers)
for question, gt in tqdm(qa_pairs, total=len(memory_questions)):
    # Same prompt text as before: 4-space indent before "## Response:" and a
    # trailing newline + 4 spaces, written out explicitly.
    prompt = f"### Question: {question}\n    ## Response:\n    "
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

    # Sample from the adapter-wrapped, eval-mode model; mask and pad id come
    # from the tokenizer rather than the literal token id 2.
    with torch.inference_mode():
        outputs = model_f.generate(
            input_ids=input_ids,
            attention_mask=(input_ids != tokenizer.pad_token_id).long(),
            pad_token_id=tokenizer.pad_token_id,
            max_new_tokens=2048,
            do_sample=True,
            top_p=0.9,
            temperature=0.5,
        )

    # The decoded sequence contains the prompt followed by the continuation.
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(question)
    print(answer)
    print("-"*10)

    # "Score" starts at 0 for every row; nothing in this cell updates it.
    memory_records.append({"Question" : question, "GT" : gt, "Answer" : answer, "Score" : 0})

memory_result_finetuned = pd.DataFrame(memory_records, columns=["Question", "GT", "Answer", "Score"])

In [36]:
# Write the collected results to memory_ft.csv in the working directory,
# without the DataFrame index column.
memory_result_finetuned.to_csv("memory_ft.csv", index = False)