# Set Up

## Package Installs

In [1]:
! pip install -q trl
! pip install -q peft
! pip install -q scipy
! pip install -q accelerate
! pip install -q bitsandbytes
! pip install -q transformers
! pip install -q huggingface_hub
! pip install -q wandb
! pip install -q gcsfs==2023.6.0
! pip install -q fsspec==2023.6.0
! pip install -q -U datasets

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cubinlinker, which is not installed.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 23.8.0 requires ptxcompiler, which is not installed.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 23.8.0 requires cuda-python<12.0a0,>=11.7.1, but you have cuda-python 12.4.0 which is incompatible.
cudf 23.8.0 requires pandas<1.6.0dev0,>=1.3, but you have pandas 2.1.4 which is incompatible.
cudf 23.8.0 requires protobuf<5,>=4.21, but you have protobuf 3.20.3 which is incompatible.
cuml 23.8.0 requires dask==2023.7.1, but you have dask 2024.3.1 which is incompatible.
dask-cuda 23.8.0 requires dask==2023.7.1, but you have dask 2024.3.1 which is incompatible.
dask-cuda 23.8.0 

## Imports

In [2]:
import os
import gc

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments

from datasets import load_dataset

from trl import SFTTrainer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model

import wandb
import huggingface_hub
from kaggle_secrets import UserSecretsClient

2024-04-09 20:46:33.800063: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-09 20:46:33.800183: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-09 20:46:33.931340: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "garbage_collection_threshold:0.6,max_split_size_mb:512"

## Secrets

In [4]:
user_secrets = UserSecretsClient()
WANDB_KEY = user_secrets.get_secret("WANDB_KEY")
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

## Third-Party Services

### Weights and Biases

In [5]:
wandb.login(key=WANDB_KEY)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

### Hugging Face

In [6]:
huggingface_hub.login(token=HF_TOKEN)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Model

## Configuration

In [7]:
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

## Load Tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = 'right'

## Load Model

In [8]:
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=bnb_config,
                                             device_map={"":0})

model.config.pad_token_id = tokenizer.pad_token_id

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

# Dataset

## Generate Prompt Function for Given Dataset Point

In [9]:
def generate_prompt(data_point):
    
    prompt = f"""
    [INST] You are an AI assistant specializing in Ludwig Wittgenstein. Your task is to generate an appropriate response to a philosophy student's question about Ludwig Wittgenstein's philosophy given in square brackets to clarify his/her confusion.
    Your answer should be accurate, detailed, thorough and relevant. Your tone should be coherent and conversational.
    {data_point["question"]} [/INST]
    
    {data_point["answer"]}</s>""".strip()
    
    return prompt

## Load the Dataset

In [10]:
dataset = load_dataset("descartesevildemon/Ludwig-Wittgenstein-QA-Pairs", split="train")
dataset

Downloading data:   0%|          | 0.00/6.41M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['question', 'answer'],
    num_rows: 3144
})

In [11]:
df = dataset.to_pandas()
df.head(10)

Unnamed: 0,question,answer
0,"What does Wittgenstein mean when he says ""mean...","Wittgenstein's statement ""meaning is use"" is a..."
1,How does Wittgenstein's idea of meaning as use...,"Wittgenstein's idea of meaning, as expressed i..."
2,In what ways does Wittgenstein's concept of la...,"Wittgenstein's concept of ""language games"" is ..."
3,Can you provide an example of how meaning as u...,"Absolutely, I'd be happy to provide an example..."
4,How does Wittgenstein's idea of meaning as use...,Wittgenstein's idea of meaning as use is a cri...
5,What are the implications of Wittgenstein's cl...,"Wittgenstein's claim that ""the meaning of a wo..."
6,How does Wittgenstein's concept of meaning as ...,Wittgenstein's concept of meaning as use is a ...
7,In what ways does Wittgenstein's view of meani...,"Wittgenstein's view of meaning as use, as arti..."
8,How does Wittgenstein's concept of meaning as ...,Wittgenstein's concept of meaning as use is a ...
9,How would Wittgenstein respond to the criticis...,Wittgenstein might respond to this criticism b...


## Add "Prompt" Column to Dataset

In [12]:
text_column = [generate_prompt(data_point) for data_point in dataset]
dataset = dataset.add_column("prompt", text_column)

In [13]:
df = dataset.to_pandas()
df.head(10)

Unnamed: 0,question,answer,prompt
0,"What does Wittgenstein mean when he says ""mean...","Wittgenstein's statement ""meaning is use"" is a...",[INST] You are an AI assistant specializing in...
1,How does Wittgenstein's idea of meaning as use...,"Wittgenstein's idea of meaning, as expressed i...",[INST] You are an AI assistant specializing in...
2,In what ways does Wittgenstein's concept of la...,"Wittgenstein's concept of ""language games"" is ...",[INST] You are an AI assistant specializing in...
3,Can you provide an example of how meaning as u...,"Absolutely, I'd be happy to provide an example...",[INST] You are an AI assistant specializing in...
4,How does Wittgenstein's idea of meaning as use...,Wittgenstein's idea of meaning as use is a cri...,[INST] You are an AI assistant specializing in...
5,What are the implications of Wittgenstein's cl...,"Wittgenstein's claim that ""the meaning of a wo...",[INST] You are an AI assistant specializing in...
6,How does Wittgenstein's concept of meaning as ...,Wittgenstein's concept of meaning as use is a ...,[INST] You are an AI assistant specializing in...
7,In what ways does Wittgenstein's view of meani...,"Wittgenstein's view of meaning as use, as arti...",[INST] You are an AI assistant specializing in...
8,How does Wittgenstein's concept of meaning as ...,Wittgenstein's concept of meaning as use is a ...,[INST] You are an AI assistant specializing in...
9,How would Wittgenstein respond to the criticis...,Wittgenstein might respond to this criticism b...,[INST] You are an AI assistant specializing in...


## Shuffle and Tokenize Dataset

In [14]:
dataset = dataset.shuffle(seed=1234)
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

Map:   0%|          | 0/3144 [00:00<?, ? examples/s]

In [15]:
train_data = dataset

In [16]:
print(train_data)

Dataset({
    features: ['question', 'answer', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 3144
})


# Fine-Tuning

## Set Up

In [17]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [18]:
print(model)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )

### Find All Linear Layers in Model

In [19]:
import bitsandbytes as bnb
def find_all_linear_names(model):
  cls = bnb.nn.Linear4bit
  lora_module_names = set()
  for name, module in model.named_modules():
    if isinstance(module, cls):
      names = name.split('.')
      lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names: # needed for 16-bit
      lora_module_names.remove('lm_head')
  return list(lora_module_names)

In [20]:
modules = find_all_linear_names(model)
print(modules)

['o_proj', 'k_proj', 'down_proj', 'q_proj', 'v_proj', 'up_proj', 'gate_proj']


## Running the Fine-Tuning

Supervised fine-tuning using QLoRA

### Fine-Tuning Parameters

In [21]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=modules,
    lora_dropout=0.15,
    bias="none",
    task_type="CAUSAL_LM"
)

In [22]:
model = get_peft_model(model, lora_config)

In [23]:
training_args = TrainingArguments(
    run_name="Mistral-7b-Instruct-v0p2",
    output_dir="/kaggle/working/finetune_output",
    logging_dir="/kaggle/working/finetune_logs",
    report_to="wandb",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    warmup_ratio=0.03,
    learning_rate=2e-4,
    weight_decay=1e-3,
    optim="paged_adamw_8bit",
    fp16=True,
    logging_steps=20,
    save_strategy="epoch"
)

In [24]:
torch.cuda.empty_cache()

trainer = SFTTrainer(model=model,
                     train_dataset=train_data,
                     dataset_text_field="prompt",
                     peft_config=lora_config,
                     args=training_args,
                     tokenizer=tokenizer)



Map:   0%|          | 0/3144 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


### Starting Training Process

In [25]:
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

Trainable: 20971520 | total: 7262703616 | Percentage: 0.2888%


In [26]:
model.config.use_cache = False

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mdescartesevildemon[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.16.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.16.4
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240409_204901-y6b1v6gl[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mMistral-7b-Instruct-v0p2 Prof to Assistant[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/descartesevildemon/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/descartesevildemon/huggingface/runs/y6b1v6gl[0m


Step,Training Loss
20,1.329
40,0.7083
60,0.6202
80,0.6052
100,0.5671
120,0.5714
140,0.5565
160,0.5628
180,0.56
200,0.5499




TrainOutput(global_step=2358, training_loss=0.3697198325036642, metrics={'train_runtime': 39878.087, 'train_samples_per_second': 0.237, 'train_steps_per_second': 0.059, 'total_flos': 2.4133509852207514e+17, 'train_loss': 0.3697198325036642, 'epoch': 3.0})

## Post-Fine-Tuning

### Saving Fine-Tuned Model

In [27]:
new_model = "wittgenbot-finetune-test"
trainer.model.save_pretrained(new_model)

### Cleaning-up GPU Memory before Merging

In [28]:
! nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Wed Apr 10 07:53:40 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P0              48W / 250W |  12544MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                         

In [29]:
del [model, tokenizer, lora_config, trainer, train_data, bnb_config, training_args, df]
del [TrainingArguments, SFTTrainer, LoraConfig, BitsAndBytesConfig]

for _ in range(100):
    torch.cuda.empty_cache()
    gc.collect()

In [30]:
! nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Wed Apr 10 07:54:11 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   57C    P0              39W / 250W |    456MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                         

### Merging Fine-Tuned Model with Base Model (Mistral-7B-Instruct-v0.2)

In [31]:
base_model = AutoModelForCausalLM.from_pretrained(model_id,
                                                  low_cpu_mem_usage=True,
                                                  return_dict=True,
                                                  torch_dtype=torch.float16,
                                                  device_map={"": 0})

merged_model = PeftModel.from_pretrained(base_model, new_model)
merged_model = merged_model.merge_and_unload()

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = 'right'

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

### Saving Merged Model & Tokenizer

In [32]:
merged_model_path = '/kaggle/working/wittgenbot-merged-model'

merged_model.save_pretrained(merged_model_path, safe_serialization=True)
tokenizer.save_pretrained(merged_model_path)

('/kaggle/working/wittgenbot-merged-model/tokenizer_config.json',
 '/kaggle/working/wittgenbot-merged-model/special_tokens_map.json',
 '/kaggle/working/wittgenbot-merged-model/tokenizer.model',
 '/kaggle/working/wittgenbot-merged-model/added_tokens.json',
 '/kaggle/working/wittgenbot-merged-model/tokenizer.json')

### Pushing Merged Model & Tokenizer to Hugging Face

In [33]:
repo_id = 'descartesevildemon/Wittgenbot-7B'

merged_model.push_to_hub(repo_id=repo_id)
tokenizer.push_to_hub(repo_id=repo_id)

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/descartesevildemon/Wittgenbot-7B-no-clue/commit/4297357fc43386f26d7d8003000587df8f61c4b2', commit_message='Upload tokenizer', commit_description='', oid='4297357fc43386f26d7d8003000587df8f61c4b2', pr_url=None, pr_revision=None, pr_num=None)

In [34]:
wandb.finish()

[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:                    train/epoch ▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
[34m[1mwandb[0m:              train/global_step ▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
[34m[1mwandb[0m:                train/grad_norm █▄▂▁▂▂▁▂▂▁▂▁▂▁▂▂▄▁▄▃▃▂▃▃▃▃▂▄▄▄▅▄▃▅▄▄▃▄▆▃
[34m[1mwandb[0m:            train/learning_rate ▃▇███▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁
[34m[1mwandb[0m:                     train/loss █▄▃▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m:               train/total_flos ▁
[34m[1mwandb[0m:               train/train_loss ▁
[34m[1mwandb[0m:            train/train_runtime ▁
[34m[1mwandb[0m: train/train_samples_per_second ▁
[34m[1mwandb[0m:   train/train_steps_per_second ▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:                    train/epoch 3.0
[34m[1mwan