In [1]:
import json
import os
from pprint import pprint
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import load_dataset
from huggingface_hub import notebook_login, login
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    get_peft_model,
    prepare_model_for_kbit_training
)
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)
import pandas as pd
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
hf_token = os.getenv("hf_token")

In [3]:
login(hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/s448780/.cache/huggingface/token
Login successful


## Load model

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [5]:
model_id = "mistralai/Mistral-7B-v0.1"

# load model 
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    quantization_config=bnb_config, 
    use_cache=False, 
    device_map="auto"
)
model.config.pretraining_tp = 1 #parallel GPU

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## dataset

In [6]:
dataset = load_dataset("csv", data_files="./filtered_data/diverse_data.csv")
dataset

DatasetDict({
    train: Dataset({
        features: ['moves', 'explanation', 'instruction'],
        num_rows: 300
    })
})

In [7]:
# def format_instruction(sample):
#     return f"""You are a chess expert. Explain the rationale behind the last move from the given chess moves in Algebraic notation - 
#         {sample["moves"]}

#         ### Response:
#         {sample["explanation"]}
#     """

In [8]:
# sample = dataset["train"][0]
# print(format_instruction(sample))

In [9]:
def create_text_row(instruction, moves, explanation):
    return f"""<s>[INST] {instruction}\nHere are the chess moves in Algenraic Notaion - {moves} [/INST]\n{explanation} </s>"""

In [10]:
EOS_TOKEN = tokenizer.eos_token
def formatting_func(df):
    instructions = df["instruction"]
    inputs      = df["moves"]
    outputs     = df["explanation"]
    texts = []
    for instruction, inp, output in zip(instructions, inputs, outputs):
        text = create_text_row(instruction, inp, output) # eos token coming from create_text_row function
        #print(text)
        texts.append(text)
    return {"text" : texts}

In [11]:
dataset = dataset.map(formatting_func, batched = True)

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [12]:
dataset

DatasetDict({
    train: Dataset({
        features: ['moves', 'explanation', 'instruction', 'text'],
        num_rows: 300
    })
})

In [13]:
print(dataset["train"]["text"][0])

<s>[INST] Assume you are a chess master. Who do you think will win the game based on the provided chess moves in Algebraic Notation.
Here are the chess moves in Algenraic Notaion - e4 e6 d4 d5 Nd2 c5 exd5 exd5 dxc5 Bxc5 Nb3 Bb6 Nf3 Nf6 Bd3 Nc6 c3 h6 O-O O-O Bf4 Be6 Qd2 Ne4 Qc2 f5 Nbd4 Nxd4 Nxd4 Bd7 Rfe1 Rc8 Qb3 Nc5 Qxd5+ Kh8 Bc2 Ne4 Bxe4 fxe4 Qxe4 Bxd4 Qxd4 Bc6 Qxd8 Rcxd8 Bg3 Rd2 b4 Rc2 Re3 Rd8 a3 Rdd2 h4 g6 Rae1 Ra2 c4 a6 c5 Kg8 Kf1 Bb5+ Kg1 Bc6 Re6 Kf7 R1e3 h5 Rd6 Re2 Rxe2 Rxe2 f3 Ra2 Rd3 Ke6 Kf1 Bb5 c6 Bxd3+ Kg1 bxc6 [/INST]
White. The game dynamics favored White, who effectively transitioned from a sound opening into an actively simplified endgame, converting piece activity and centralized power into a winning material advantage. </s>


## LoRA

In [14]:
model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )

In [15]:
'''
lora_alpha - scaling factor applied to the low-rank matrices. It helps in balancing the contribution of the low-rank update to the original weights. 
Higher values of lora_alpha can increase the influence of the low-rank updates. It's a form of regularization to ensure the model doesn't deviate too much from the original weights.

bias - "none", "all", or "lora_only".
need more research on this.

'''
peft_config = LoraConfig(
    r=32,
    lora_alpha=16, 
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj"
    ],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

## Training

In [16]:
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [17]:
from trl import SFTTrainer
from transformers import TrainingArguments

model_args = TrainingArguments(
    output_dir="mistral_7b",
    num_train_epochs=3,
    # max_steps=50,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit", # apparently more efficient for 32 bit GPUs
    logging_steps=20,
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=True,
    tf32=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    disable_tqdm=False
)

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    dataset_text_field = "text",
    peft_config=peft_config,
    max_seq_length=2048,
    tokenizer=tokenizer,
    packing=True,
    args=model_args,
)

trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Generating train split: 0 examples [00:00, ? examples/s]



Step,Training Loss
20,1.0717
40,0.8008




TrainOutput(global_step=51, training_loss=0.8929822211172066, metrics={'train_runtime': 906.5274, 'train_samples_per_second': 0.457, 'train_steps_per_second': 0.056, 'total_flos': 3.571625969791795e+16, 'train_loss': 0.8929822211172066, 'epoch': 2.914285714285714})

## test

In [14]:
# test_input = "Explain the rationale behind the last move from the given chess moves in Algebraic notation -"
# test_moves = "d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5 Bf4"
# test_prompt = f""" {test_input}
#         ### Input:
#         {test_moves}

#         ### Response:

#     """

In [18]:
test_prompt = '''
[INST] Assume you are a chess master, explain the strategy used by each player based on the provided chess moves. Here are the chess moves in Algenraic Notaion - e4 e6 d4 b6 e5 Bb7 Nf3 h6 Bd3 g5 O-O g4 Nfd2 h5 Ne4 Nc6 Be3 Qe7 Qd2 Bh6 Bxh6 Nxh6 Nf6+ Kd8 Bh7 Nf5 Bxf5 exf5 c3 h4 Qg5 g3 fxg3 hxg3 Qxg3 Qf8 Rxf5 Ne7 Rg5 Ng6 Nd2 Qh6 Rh5 Qg7 Qg4 Bc8 Rxh8+ Qxh8 Rf1 d6 Qg5 Qh4 Qe3 Bb7 e6 [/INST]
'''

In [19]:
input_ids = tokenizer(test_prompt, return_tensors="pt", truncation=True).input_ids.to("cuda:0")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [20]:
model.eval()
with torch.inference_mode():
    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=2048,
        do_sample=True, 
        top_p=0.9,
        temperature=0.9
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



[INST] Assume you are a chess master, explain the strategy used by each player based on the provided chess moves. Here are the chess moves in Algenraic Notaion - e4 e6 d4 b6 e5 Bb7 Nf3 h6 Bd3 g5 O-O g4 Nfd2 h5 Ne4 Nc6 Be3 Qe7 Qd2 Bh6 Bxh6 Nxh6 Nf6+ Kd8 Bh7 Nf5 Bxf5 exf5 c3 h4 Qg5 g3 fxg3 hxg3 Qxg3 Qf8 Rxf5 Ne7 Rg5 Ng6 Nd2 Qh6 Rh5 Qg7 Qg4 Bc8 Rxh8+ Qxh8 Rf1 d6 Qg5 Qh4 Qe3 Bb7 e6 [/INST]
Black adopted the King's Indian Defense to counter White's opening moves, aiming to create imbalances and potential counterplay on the queenside. White capitalized on the open lines and central control to mount a strong pawn attack, with the eventual goal of promoting pawns on the queenside. The game showcased an aggressive and strategic approach from White, leveraging central control and potential queenside play, while Black aimed to create counterplay and leverage the King's Indian Defense's inherent weaknesses. The game is not fully concluded, but White has established a strong attacking position, pa

## testing base knowledge

In [21]:
test_prompt = '''
[INST]You will be given the name of a chess piece. Explain how the chess piece can move -
Knight.[/INST]
'''
input_ids = tokenizer(test_prompt, return_tensors="pt", truncation=True).input_ids.to("cuda:0")
with torch.inference_mode():
    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=2048,
        do_sample=True, 
        top_p=0.9,
        temperature=0.9)
    
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



[INST]You will be given the name of a chess piece. Explain how the chess piece can move -
Knight.[/INST]
The Knight can move to a square that is two squares away in one direction and one square in the other, as long as the squares are not of the same color. The Knight can also jump over other pieces, but cannot move to a square that is already occupied by a piece of the same color.


In [22]:
test_prompt = '''
[INST]Assume you are chess master. Who do you think will win the game based on the provided chess moves from a real game -
d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5 Bf4.[/INST]
'''
input_ids = tokenizer(test_prompt, return_tensors="pt", truncation=True).input_ids.to("cuda:0")
with torch.inference_mode():
    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=2048,
        do_sample=True, 
        top_p=0.9,
        temperature=0.9)
    
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



[INST]Assume you are chess master. Who do you think will win the game based on the provided chess moves from a real game -
d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5 Bf4.[/INST]
White. 


In [23]:
test_prompt ='''
[INST]Assume you are a geography expert. You will be provided with the name of a country. What's the capital of the country?",
Australia[/INST].
'''
input_ids = tokenizer(test_prompt, return_tensors="pt", truncation=True).input_ids.to("cuda:0")

with torch.inference_mode():
    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=2048,
        do_sample=True, 
        top_p=0.9,
        temperature=0.9)
    
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



[INST]Assume you are a geography expert. You will be provided with the name of a country. What's the capital of the country?",
Australia[/INST].
Canberra

# Canberra
Canberra is the capital city of Australia. It is located in the Australian Capital Territory (ACT), an area of land within New South Wales that was granted self-government in 1988.

Canberra was designed as the capital of Australia in 1908 by a competition held by the Australian government. It was named after Lady Canberra, the wife of the Duke of Albany, who was also a close friend of Australia's first Prime Minister, Edmund Barton.

Canberra is home to a number of important government buildings, such as Parliament House, the Australian War Memorial, and the National Gallery of Australia. It is also home to several universities, including the Australian National University and the University of Canberra.

Canberra has a population of around 400,000 people, making it the eighth largest city in Australia. It is a popular t

In [24]:
test_input = "Assume you are a chess master. Your job is to suggest the best next move based on a list of previous chess moves."
test_moves = "d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5 Bf4"
test_prompt = f"[INST]{test_input} Here are the moves - {test_moves}[\INST]"
input_ids = tokenizer(test_prompt, return_tensors="pt", truncation=True).input_ids.to("cuda:0")
with torch.inference_mode():
    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=2048,
        do_sample=True, 
        top_p=0.9,
        temperature=0.9
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[INST]Assume you are a chess master. Your job is to suggest the best next move based on a list of previous chess moves. Here are the moves - d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5 Bf4[\INST]
    Nf6 


In [25]:
test_input = "Assume you are a chess master. Analyse each move pair and explain the players' strategy."
test_moves = "d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5 Bf4"
test_prompt = f"[INST]{test_input} Here are the moves - {test_moves}[\INST]"
input_ids = tokenizer(test_prompt, return_tensors="pt", truncation=True).input_ids.to("cuda:0")
with torch.inference_mode():
    outputs = model.generate(
        input_ids=input_ids,
        max_new_tokens=2048,
        do_sample=True, 
        top_p=0.9,
        temperature=0.9
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[INST]Assume you are a chess master. Analyse each move pair and explain the players' strategy. Here are the moves - d4 d5 c4 c6 cxd5 e6 dxe6 fxe6 Nf3 Bb4+ Nc3 Ba5 Bf4[\INST]

### Game analysis

1. **White: d4, Black: d5**  
   - **White:** Opens with the Queen's Pawn Opening, controlling the center and aiming to develop the bishop.
   - **Black:** Responds symmetrically to control the center and prepare to develop the bishop.

2. **White: c4, Black: c6**  
   - **White:** Advances the Queen's Pawn to support the d4 pawn and prepares for a potential cxd5 exchange.
   - **Black:** Plays c6 to prepare to challenge the center with ...dxc4 and avoids being pinned by a potential Nf3-g5.

3. **White: cxd5, Black: e6**  
   - **White:** Exchanges pawns to disrupt Black’s central control and open lines for the bishop and queen.
   - **Black:** Advances the e-pawn to prepare to recapture the d-pawn and to control more central space.

4. **White: dxe6, Black: fxe6**  
   - **White:** Captures the

## saving

In [93]:
trainer.save_model()

