In [1]:
import json
import os
from pprint import pprint
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login, login
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
import pandas as pd
from dotenv import load_dotenv
load_dotenv("../finetune/.env")

False

In [2]:
hf_token = os.getenv("hf_token")

In [3]:
login(hf_token)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Load model

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [5]:
model_id = "mistralai/Mistral-7B-v0.1"

# load model 
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    quantization_config=bnb_config, 
    use_cache=False, 
    device_map="auto"
)
model.config.pretraining_tp = 1 #parallel GPU

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_id, 
                                          add_eos_token = True, 
                                          add_bos_token = True)
tokenizer.pad_token = tokenizer.eos_token # default is none
tokenizer.eos_token_id # for attention mask? 

2

## dataset

In [7]:
dataset = load_dataset("csv", data_files="../data/sub_finetune_2.csv")
dataset

DatasetDict({
    train: Dataset({
        features: ['Question', 'Context', 'Answer'],
        num_rows: 348
    })
})

In [8]:
def create_text_row(question, context, answer):
    return f"""<s>### Instruction:\n{question}\n### Context: \n{context}\n### Response: {answer}</s>"""

In [9]:
def formatting_func(df):
    questions = df["Question"]
    contexts = df["Context"]
    answers = df["Answer"]
    texts = []
    for q, c, a in zip(questions, contexts, answers):
        text = create_text_row(q, c, a)
        texts.append(text)
    return {"text" : texts}

In [10]:
dataset = dataset.map(formatting_func, batched = True)

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Question', 'Context', 'Answer', 'text'],
        num_rows: 348
    })
})

In [12]:
print(dataset["train"]["text"][0])

<s>### Instruction:
Assume you are a chess master, explain the strategy used by each player based on the provided chess moves. - d4 d5 Nf3 Nf6 Bf4 Bf5 c4 e6 e3 Bb4+ Nc3 O-O Qb3 c5 Be2 dxc4 Bxc4 Nc6 dxc5 Bxc5 Qxb7 Na5 Qb5 Nxc4 Qxc4 Qa5 O-O Rac8 Qb3 Rfd8 Rfd1 Rxd1+ Rxd1 Bg4 a3 Bxf3 gxf3 h6 Qc4
### Context: 
None
### Response: ## Game Analysis

1. **White: d4, Black: d5**
   - **White's move (d4)**: Common opening move controlling the center and allowing the development of the bishop and queen.
   - **Black's move (d5)**: Symmetrical response, fighting for central control.

2. **White: Nf3, Black: Nf6**
   - **White's move (Nf3)**: Develops the knight, attacking the d5 pawn and preparing to castle.
   - **Black's move (Nf6)**: Develops the knight, defends the d5 pawn, and prepares to castle.

3. **White: Bf4, Black: Bf5**
   - **White's move (Bf4)**: Develops the bishop to an active square, supporting the d-pawn and planning to play e3.
   - **Black's move (Bf5)**: Mirrors White's move, d

## LoRA

In [13]:
model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )

In [14]:
'''
lora_alpha - scaling factor applied to the low-rank matrices. It helps in balancing the contribution of the low-rank update to the original weights. 
Higher values of lora_alpha can increase the influence of the low-rank updates. It's a form of regularization to ensure the model doesn't deviate too much from the original weights.

bias - "none", "all", or "lora_only".
need more research on this.

'''
peft_config = LoraConfig(
    r=32,
    lora_alpha=16, 
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj"
    ],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

## Training

In [15]:
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [16]:
from trl import SFTTrainer, SFTConfig
from transformers import TrainingArguments

model_args = TrainingArguments(
    output_dir="mistral_7b_sub_df",
    num_train_epochs=3,
    # max_steps=50,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit", # apparently more efficient for 32 bit GPUs
    logging_steps=20,
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=True,
    tf32=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    disable_tqdm=False
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    dataset_text_field = "text",
    peft_config=peft_config,
    max_seq_length=2048,
    tokenizer=tokenizer,
    packing=True,
    args=model_args,
)

trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Step,Training Loss
20,0.8177
40,0.6007



Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json.
Access to model mistralai/Mistral-7B-v0.1 is restricted and you are not in the authorized list. Visit https://huggingface.co/mistralai/Mistral-7B-v0.1 to ask for access. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-v0.1.

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json.
Access to model mistralai/Mistral-7B-v0.1 is restricted and you are not in the authorized list. Visit https://huggingface.co/mistralai/Mistral-7B-v0.1 to ask for access. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-v0.1.

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json.
Access to model mistralai/Mistral-7B-v0.1 is restricted and you are not in the authorized list. Visit https://huggingface.co/mistralai/Mistral-7B-v0.1 to ask for

TrainOutput(global_step=57, training_loss=0.6536283994975843, metrics={'train_runtime': 1015.9703, 'train_samples_per_second': 0.446, 'train_steps_per_second': 0.056, 'total_flos': 4.004818228504166e+16, 'train_loss': 0.6536283994975843, 'epoch': 3.0})

## test

In [17]:
# test_prompt = '''[INST] Assume you are a chess master, explain the strategy used by each player based on the provided chess moves. Here are the chess moves in Algenraic Notaion - e4 e6 d4 b6 e5 Bb7 Nf3 h6 Bd3 g5 O-O g4 Nfd2 h5 Ne4 Nc6 Be3 Qe7 Qd2 Bh6 Bxh6 Nxh6 Nf6+ Kd8 Bh7 Nf5 Bxf5 exf5 c3 h4 Qg5 g3 fxg3 hxg3 Qxg3 Qf8 Rxf5 Ne7 Rg5 Ng6 Nd2 Qh6 Rh5 Qg7 Qg4 Bc8 Rxh8+ Qxh8 Rf1 d6 Qg5 Qh4 Qe3 Bb7 e6 [/INST]'''
# test_prompt = '''### Question:
# What is the capital of Nepal?
# ### Context: 
# ### Response:
# '''
test_prompt = '''### Question : Who discovered the Ruy Lopez opening?
### Context: This opening is popularly known as the Spanish game and was named after a
Spanish priest, Ruy Lopez, who discovered this opening in the year of 1561. This
opening was however not appreciated or used much at that point of time. Only
over the years, this has become a favorite among pros (grandmaster levels as well)
and is regarded as one of the most powerful chess openings. It is used as White’s
best attempt in gaining an advantage after double king pawn formations. A major
plus of this opening is that it gives the white player enough opportunity to develop
a complex offensive strategy and also slows down Black’s pawn formation.
### Response: '''

In [18]:
eval_tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    add_bos_token=True,
)

In [19]:
input_ids = eval_tokenizer(test_prompt, return_tensors="pt").input_ids.to("cuda:0")
input_ids

tensor([[    1,   774, 22478,   714,  6526,  8324,   272,   399,  4533,   393,
          1845, 28764,  7032, 28804,    13, 27332, 14268, 28747,   851,  7032,
           349,  4387,   346,  2651,   390,   272, 10177,  2039,   304,   403,
          5160,  1024,   264,    13, 13116,   789, 16032, 28725,   399,  4533,
           393,  1845, 28764, 28725,   693,  8324,   456,  7032,   297,   272,
           879,   302, 28705, 28740, 28782, 28784, 28740, 28723,   851,    13,
           410,  3250,   403,  3545,   459, 22359,   442,  1307,  1188,   438,
           369,  1305,   302,   727, 28723,  6352,    13,  1483,   272,  1267,
         28725,   456,   659,  2727,   264,  6656,  3352, 14138,   325, 24433,
          9548,  6157,   390,  1162, 28731,    13,   391,   349, 15390,   390,
           624,   302,   272,  1080,  6787,   997,   819,  1565,   742, 28723,
           661,   349,  1307,   390,  5673, 28809, 28713,    13, 13521,  4236,
           297, 25221,   396,  7859,  1024,  3579,  

In [20]:
# input_ids = tokenizer(test_prompt, return_tensors="pt").input_ids.to("cuda:0")
# input_ids

In [21]:
model.eval()
with torch.inference_mode():
    outputs = model.generate(
        input_ids=input_ids,
        # attention_mask = torch.where(input_ids == 2, 0, 1),
        max_new_tokens=2048,
        do_sample=True, 
        top_p=0.9,
        temperature=0.5
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


### Question : Who discovered the Ruy Lopez opening?
### Context: This opening is popularly known as the Spanish game and was named after a
Spanish priest, Ruy Lopez, who discovered this opening in the year of 1561. This
opening was however not appreciated or used much at that point of time. Only
over the years, this has become a favorite among pros (grandmaster levels as well)
and is regarded as one of the most powerful chess openings. It is used as White’s
best attempt in gaining an advantage after double king pawn formations. A major
plus of this opening is that it gives the white player enough opportunity to develop
a complex offensive strategy and also slows down Black’s pawn formation.
### Response: 
The context mentions that the Ruy Lopez opening was discovered by a Spanish priest named Ruy Lopez in the year 1561. The opening was not widely used or appreciated at that time but has become a favorite among pros and is considered one of the most powerful openings. It is particularl

## saving

In [22]:
trainer.save_model()


Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json.
Access to model mistralai/Mistral-7B-v0.1 is restricted and you are not in the authorized list. Visit https://huggingface.co/mistralai/Mistral-7B-v0.1 to ask for access. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-v0.1.


## saving to hub

In [23]:
# import warnings
# warnings.filterwarnings("ignore")

In [24]:
# model = AutoModelForCausalLM.from_pretrained("./mistral_7b/", device_map="cuda:0", quantization_config=bnb_config)

In [25]:
# model_id = "mistralai/Mistral-7B-v0.1"
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# tokenizer.pad_token = tokenizer.eos_token

In [26]:
model.push_to_hub("OpenSI/cognitive_AI")


Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-v0.1/resolve/main/config.json.
Access to model mistralai/Mistral-7B-v0.1 is restricted and you are not in the authorized list. Visit https://huggingface.co/mistralai/Mistral-7B-v0.1 to ask for access. - silently ignoring the lookup for the file config.json in mistralai/Mistral-7B-v0.1.


adapter_model.safetensors:   0%|          | 0.00/336M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/OpenSI/cognitive_AI/commit/5c0ff4bd7e12455e8a7f21fcad1adba86ee29c03', commit_message='Upload model', commit_description='', oid='5c0ff4bd7e12455e8a7f21fcad1adba86ee29c03', pr_url=None, pr_revision=None, pr_num=None)