## PURPOSE OF THIS NOTEBOOK: 

This notebook contains the training and testing of the third version of the Job Title to ONET Family Matching model. This time, we are focusing on maintaining model performance while significantly reducing the size of the model. I will be experimenting with different models until I land one that is exactly what I need.

In [25]:
from transformers import (
        AutoModelForCausalLM, 
        AutoTokenizer, 
        BitsAndBytesConfig, 
        TrainingArguments,
        HfArgumentParser,
        Trainer,
        TrainingArguments,
        DataCollatorForLanguageModeling,
        EarlyStoppingCallback,
        pipeline,
        logging,
        set_seed)
from tqdm.autonotebook import tqdm 
from torch import cuda
from huggingface_hub import notebook_login
from CommonFunctions import read_and_prepare_data, CustomDataset, Loss
from torch.utils.data import DataLoader
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
from trl import SFTTrainer

import pandas as pd 
import numpy as np
import torch
import bitsandbytes as bnb

model_name = "meta-llama/Llama-2-7b-hf"

In [26]:
device = 'cuda' if cuda.is_available() else 'cpu'

In [5]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
bnb_4bit_use_double_quant

In [6]:
def create_model_and_tokenizer():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4',
        bnb_4_bit_compute_dtype=torch.bfloat16,
    )

    n_gpus = torch.cuda.device_count()
    max_memory = f'{6900}MB'

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        use_safetensors=True,
        quantization_config=bnb_config,
        trust_remote_code=True,
        device_map="auto",
        max_memory = {i: max_memory for i in range(n_gpus)},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side="right"

    return model, tokenizer

In [7]:
model, tokenizer = create_model_and_tokenizer()
model.eval()

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )


## Data Prep

In [17]:
# Setting the directory of the output 
OUTPUT_DIR = "Data"

In [9]:
test_df, train_df, label_df = read_and_prepare_data()

In [10]:
train_set = CustomDataset(train_df, tokenizer, MAX_LEN)
test_set = CustomDataset(test_df, tokenizer, MAX_LEN)

In [11]:
training_loader = DataLoader(train_set, **train_params)
testing_loader = DataLoader(test_set, **test_params)

## Start training of model

In [12]:
lora_alpha = 32
lora_dropout = 0.05
lora_r = 16

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias='none',
    task_type="CASUAL_LM"
)

In [19]:
training_arguments = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim='paged_adamw_32bit',
    logging_steps=1,
    learning_rate=1e-4,
    fp16=True,
    max_grad_norm=0.3,
    num_train_epochs=5,
    evaluation_strategy='steps',
    eval_steps=0.2,
    warmup_ratio=0.05,
    save_strategy='epoch',
    group_by_length=True,
    output_dir=OUTPUT_DIR,
    report_to='tensorboard',
    save_safetensors=True,
    lr_scheduler_type='cosine',
    seed=42
)

In [24]:
trainer = SFTTrainer(
    model=model,
    train_dataset=test_df['Reported_Jobs'],
    eval_dataset=train_df['Label'],
    peft_config=peft_config,
    dataset_text_field='text',
    max_seq_length=4096,
    tokenizer=tokenizer,
    args=training_arguments
)

  


AttributeError: 'Series' object has no attribute 'column_names'

In [16]:
OUTPUT_DIR = 'Data/'
%load_ext tensorboard
%tensorboard --logdir Data/runs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
