# Set Up

## Package Installs

In [None]:
! pip install trl
! pip install peft
! pip install scipy
! pip install accelerate
! pip install bitsandbytes
! pip install transformers
! pip install huggingface_hub
! pip install wandb
! pip install gcsfs==2023.6.0
! pip install fsspec==2023.6.0
! pip install -U datasets

## Imports

In [None]:
import bitsandbytes as bnb
import huggingface_hub
import torch
import wandb
from datasets import load_dataset
from kaggle_secrets import UserSecretsClient
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
)
from trl import SFTTrainer

## Secrets

In [None]:
# Read the Weights & Biases key and the Hugging Face token from Kaggle's secret
# store so that neither credential is hard-coded in the notebook.
user_secrets = UserSecretsClient()
WANDB_KEY = user_secrets.get_secret("WANDB_KEY")
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

## Third-Party Services

### Weights and Biases

In [None]:
# Authenticate with Weights & Biases using the key read from the Kaggle secrets.
wandb.login(key=WANDB_KEY)

### Hugging Face

In [None]:
# Authenticate with the Hugging Face Hub using the token read from the Kaggle secrets.
huggingface_hub.login(token=HF_TOKEN)

# Model

## Configuration

In [None]:
# Hub id of the base checkpoint that gets fine-tuned.
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# Quantization settings passed to `from_pretrained` when the model is loaded:
# 4-bit loading with the "nf4" quantization type, double quantization enabled,
# and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

## Load Model

In [None]:
# Load the base model with the quantization config; the device map places the
# whole model (the "" key) on device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map={"": 0},
)

# The tokenizer is created with add_eos_token=True.
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

# Dataset

## Generate Prompt Function for Given Dataset Point

In [None]:
def generate_prompt(data_point):
    """Build the full training text for one question/answer record.

    Args:
        data_point: Mapping with "question" and "answer" entries.

    Returns:
        One string: the system-style instruction and the bracketed question
        wrapped in ``[INST] ... [/INST]``, followed by the answer, with the
        whole text enclosed in ``<s> ... </s>``.
    """
    # The template is written out line by line so the whitespace that ends up in
    # the training text (a 4-space indent on every line after the first,
    # including the otherwise empty separator line) is explicit.
    # NOTE(review): the tokenizer in this notebook is created with
    # add_eos_token=True, so the literal <s>/</s> markers here may lead to
    # duplicated BOS/EOS tokens — confirm against the tokenized output.
    template_lines = [
        "<s>",
        "    [INST] You are a philosophy professor specializing in Ludwig Wittgenstein. Your task is to generate an appropriate response to a philosophy student's question about Ludwig Wittgenstein's philosophy given in square brackets to clarify his/her confusion.",
        "    Your answer should be accurate, detailed, thorough and relevant. Your tone should be coherent and conversational.",
        f'    [{data_point["question"]}] [/INST]',
        "    ",
        f'    {data_point["answer"]}</s>',
    ]
    return "\n".join(template_lines).strip()

## Load the Dataset

In [None]:
# Load the "train" split of the question/answer dataset from the Hugging Face Hub;
# the bare `dataset` expression shows its summary as the cell output.
dataset = load_dataset("descartesevildemon/Ludwig-Wittgenstein-QA-Pairs", split="train")
dataset

In [None]:
# Convert to a pandas DataFrame and preview the first 10 records.
df = dataset.to_pandas()
df.head(10)

## Add "Prompt" Column to Dataset

In [None]:
# Render every record through generate_prompt and attach the results as a new
# "prompt" column.
text_column = list(map(generate_prompt, dataset))
dataset = dataset.add_column("prompt", text_column)

In [None]:
# Preview the first 10 records again, now including the "prompt" column.
df = dataset.to_pandas()
df.head(10)

## Shuffle and Tokenize Dataset

In [None]:
# Shuffle with a fixed seed, then tokenize the "prompt" column in batches; the
# tokenizer's output fields are added to the dataset by `map`.
dataset = (
    dataset
    .shuffle(seed=1234)
    .map(lambda batch: tokenizer(batch["prompt"]), batched=True)
)

## Split Dataset into "Train" and "Test"

In [None]:
# Hold out 10% of the records for evaluation. The split is seeded (same seed as
# the shuffle) so that train/test membership is identical on every
# Restart & Run All; without a seed each run would evaluate on different rows.
dataset = dataset.train_test_split(test_size=0.1, seed=1234)
train_data = dataset["train"]
test_data = dataset["test"]

In [None]:
# Show the summary of the held-out split.
print(test_data)

# Fine-Tuning

## Set Up

In [None]:
# Turn on gradient checkpointing, then run PEFT's prepare_model_for_kbit_training
# helper on the quantized model before adapters are attached.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
# Print the module tree to inspect the layer names available as LoRA targets.
print(model)

### Find All Linear Layers in Model

In [None]:
def find_all_linear_names(model, linear_cls=None):
    """Return the leaf names of every linear layer of a given class in ``model``.

    Walks ``model.named_modules()`` and, for each module that is an instance of
    ``linear_cls``, records the last dot-separated component of its qualified
    name (e.g. ``"model.layers.0.self_attn.q_proj"`` -> ``"q_proj"``).

    Args:
        model: Any object exposing ``named_modules()`` that yields
            ``(qualified_name, module)`` pairs.
        linear_cls: Layer class (or tuple of classes) to match. Defaults to
            ``bnb.nn.Linear4bit``, the class this notebook originally
            hard-coded.

    Returns:
        A sorted list of unique leaf names, with ``"lm_head"`` excluded. The
        list is sorted because set iteration order is not stable across kernel
        restarts.
    """
    if linear_cls is None:
        linear_cls = bnb.nn.Linear4bit

    lora_module_names = {
        name.split(".")[-1]
        for name, module in model.named_modules()
        if isinstance(module, linear_cls)
    }

    # The output head is never used as a LoRA target (the original note on this
    # line read "needed for 16-bit"). One removal after the scan is enough.
    lora_module_names.discard("lm_head")
    return sorted(lora_module_names)

In [None]:
# Collect the linear-layer names of the loaded model; they are used as the LoRA
# target modules.
modules = find_all_linear_names(model)
print(modules)

In [None]:
model = get_peft_model(model, lora_config)

In [None]:
trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable/total*100:.4f}%")

## Running the Fine-Tuning

Supervised fine-tuning using QLoRA

### Fine-Tuning Parameters

In [None]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=modules,
    lora_dropout=0.15,
    bias="none",
    task_type="CAUSAL_LM"
)

In [None]:
training_args = TrainingArguments(
    # Run identity, output locations and reporting.
    run_name="Mistral-7b-Instruct-v0p2 FTing #1",
    output_dir="/kaggle/working/finetune_output",
    logging_dir="/kaggle/working/finetune_logs",
    report_to="wandb",
    logging_steps=20,
    save_strategy="epoch",
    # Schedule and batch sizing: 8 sequences per device, accumulated over 4 steps.
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    # Optimizer.
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    warmup_ratio=0.03,
    # Mixed precision.
    # NOTE(review): fp16 is requested here while the quantization config uses
    # bfloat16 as compute dtype — confirm this combination is intended.
    fp16=True,
)

In [None]:
# Reuse EOS as the padding token; the collator below is built from this tokenizer.
tokenizer.pad_token = tokenizer.eos_token
torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    dataset_text_field="prompt",
    peft_config=lora_config,
    args=training_args,
    # Causal-LM collation (mlm=False). The class is imported directly because
    # the bare `transformers` module name is never bound in this notebook, so
    # `transformers.DataCollatorForLanguageModeling` raised a NameError.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

### Starting Training Process

In [None]:
# Switch off the model's generation cache for training (presumably because it is
# not compatible with the gradient checkpointing enabled earlier — TODO confirm),
# then run the fine-tuning loop.
model.config.use_cache = False
trainer.train()

## Post-Fine-Tuning

### Saving Fine-Tuned Model

In [None]:
# Save the trainer's model under a local directory name; the merge step reloads
# it from there. Presumably only the LoRA adapter weights are written, since the
# trained model is PEFT-wrapped — verify.
new_model = "wittgenbot-finetune-test"
trainer.model.save_pretrained(new_model)

### Merging Fine-Tuned Model with Base Model (Mistral-7B-Instruct-v0.2)

In [None]:
# Reload the base checkpoint without a quantization config, in float16, on device 0.
# NOTE(review): the 4-bit training model may still be resident on the same
# device at this point — confirm there is enough GPU memory for both.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map={"": 0},
    low_cpu_mem_usage=True,
    return_dict=True,
)

# Attach the saved adapter to the reloaded base model and fold it into the
# base weights.
merged_model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

### Saving Merged Model

In [None]:
merged_model_path = '/kaggle/working/wittgenbot-merged-model'

# Apply the post-training tokenizer settings BEFORE saving. Previously they were
# set after `tokenizer.save_pretrained`, so the local copy kept the training-time
# configuration while the copy pushed to the Hub afterwards carried these values.
# NOTE(review): these look like the settings intended for inference — confirm.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = 'right'
tokenizer.add_eos_token = False
tokenizer.add_bos_token = False

# Write the merged weights (safetensors format) and the tokenizer side by side.
merged_model.save_pretrained(merged_model_path, safe_serialization=True)
tokenizer.save_pretrained(merged_model_path)

### Pushing Merged Model & Tokenizer to Hugging Face

In [None]:
# Target repository on the Hugging Face Hub.
repo_id = 'descartesevildemon/Wittgenbot'

# Upload the merged model and the tokenizer to the same repository.
merged_model.push_to_hub(repo_id=repo_id)
tokenizer.push_to_hub(repo_id=repo_id)

In [None]:
# Close the Weights & Biases run.
wandb.finish()