# HIGH SCHOOL AI TUTOR

## Libraries

In [1]:
# !pip install -q -U bitsandbytes transformers peft accelerate datasets trl tqdm

In [2]:
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login

user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")

login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [23]:
import wandb

wandb.init(mode='disabled')

In [3]:
import os
from dataclasses import dataclass, field
from typing import Optional

In [4]:
import torch
from datasets import load_dataset, load_from_disk
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging
)
from peft import (
    LoraConfig,
    PeftModel,
    get_peft_model
)

from trl import SFTTrainer, setup_chat_format

In [5]:
from tqdm.notebook import tqdm

## Load model & tokenizer

In [6]:
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
finetuned_model_name = "Llama-3-8b-healthcare-assistant"
attn_implementation = "eager"

**QLoRA parameters**

In [7]:
# LoRA attention dimension
lora_r = 16
# LoRA scaling
lora_alpha = 32
# dropout probability
lora_dropout = 0.05

**`bitsandbytes` parameters**

In [8]:
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

In [9]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)


# check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

In [10]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model, tokenizer = setup_chat_format(model, tokenizer)

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [12]:
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj',
                    'k_proj', 'v_proj', 'v_proj', 'o_proj']
)

model = get_peft_model(model, peft_config)

## Prepare Dataset

In [13]:
dataset = load_dataset("Amod/mental_health_counseling_conversations")
dataset = dataset.shuffle(seed=65)

README.md:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

combined_dataset.json:   0%|          | 0.00/4.79M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3512 [00:00<?, ? examples/s]

In [14]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Context', 'Response'],
        num_rows: 3512
    })
})

In [15]:
def format_chat_template(row):
    row_json = [{'role': 'user', 'content': row['Context']},
                {'role': 'assistant', 'content': row['Response']}]
    row['text'] = tokenizer.apply_chat_template(row_json, tokenize=False)

    return row

In [16]:
dataset = dataset.map(format_chat_template)

Map:   0%|          | 0/3512 [00:00<?, ? examples/s]

In [17]:
dataset = dataset['train']
dataset = dataset.train_test_split(test_size=0.1)

In [18]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Context', 'Response', 'text'],
        num_rows: 3160
    })
    test: Dataset({
        features: ['Context', 'Response', 'text'],
        num_rows: 352
    })
})

## Train

In [19]:
training_arguments = TrainingArguments(
    output_dir=finetuned_model_name,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=3,
    evaluation_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True
)



In [20]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/3160 [00:00<?, ? examples/s]

Map:   0%|          | 0/352 [00:00<?, ? examples/s]

In [24]:
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 186.12 MiB is free. Process 8324 has 14.56 GiB memory in use. Of the allocated memory 14.29 GiB is allocated by PyTorch, and 142.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)