In [None]:
%%capture
# Cell magic above silences/captures the installer output for this cell.
# Step 1: install the released unsloth package (with its dependencies).
!pip install unsloth

# Step 2: swap in the current GitHub source; --no-deps leaves the already
# installed dependency set untouched. Then make sure `datasets` is present.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
!pip install datasets

In [None]:
from unsloth import FastLanguageModel
import torch
# Loader settings. `max_seq_length` is reused later when the trainer is built.
max_seq_length = 4096
dtype = None
load_in_4bit = True

# Collect the loader arguments first, then unpack them into the call.
load_kwargs = dict(
    model_name="unsloth/Llama-3.2-3B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

In [None]:
# Names of the modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters. This rebinds `model`, so the cell
# is meant to run once per freshly loaded base model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

In [None]:
from datasets import load_dataset, Dataset
from collections import Counter
import pandas as pd


# Fetch the labelled resume / job-description dataset by its repo id.
dataset = load_dataset('yifiyifan/synthetic-resume-fit-labelled')
print(type(dataset))


# Textual fit label -> integer class id.
label_mapping = {'No Fit': 0, 'Potential Fit': 1, 'Good Fit': 2}


def _encode_label(row):
    """Return the replacement `label` column value for one example."""
    return {'label': label_mapping[row['label']]}


dataset = dataset.map(_encode_label)

def add_text_field(entry):
    """Attach the full chat-formatted training string to `entry` as `entry['text']`.

    The string holds a system turn (task instructions), a user turn (resume
    and job description) and an assistant turn containing the label. The
    same `entry` mapping is mutated and returned.
    """
    resume = entry['resume_text']
    job_description = entry['job_description_text']
    label = entry['label']
    # The template bytes (including every newline) are significant: other
    # cells match on the header strings and slice the trailing label off.
    entry['text'] = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
\nYou are an expert resume analyzer. Your job is to analyze both the resume and job description below together and categorize how well the resume fits the job description and give it as a response. Your response should either be 0 1 or 2. 0 means no fit between the resume and job description. 1 means potential fit between the resume and job description, 2 means good fit between the resume and job description.\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>
\nResume: {resume}
Job Description:{job_description}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n
{label}<|eot_id|>
"""
    return entry


def remove_outliers(dataset, tokenizer, max_tokens=4096):
    """Keep only the examples whose `text` encodes to at most `max_tokens` tokens.

    Args:
        dataset: object exposing `.filter(fn)`; `fn` receives one example.
        tokenizer: object exposing `.encode(text, truncation=False)`.
        max_tokens: inclusive upper bound on the encoded length.

    Returns:
        Whatever `dataset.filter` returns for the length predicate.
    """
    def _within_limit(example):
        # Encode without truncation so the true length is measured.
        token_ids = tokenizer.encode(example['text'], truncation=False)
        return not len(token_ids) > max_tokens

    return dataset.filter(_within_limit)
# Build the prompt text for every example, then drop the ones that do not fit
# the context window. The limit is taken from `max_seq_length` (defined in the
# model-loading cell) rather than the helper's default, so the filter and the
# trainer's sequence length cannot drift apart; the label sits at the very end
# of the text, so an over-long example that gets cut would lose its target.
dataset = dataset.map(add_text_field)
print(len(dataset['train']))
dataset = remove_outliers(dataset, tokenizer, max_tokens=max_seq_length)
print(len(dataset['train']))


SEED = 3407  # fixed seed so the sampled subsets and their order are reproducible


def _balance_by_label(frame, n_per_class, seed=SEED):
    """Return exactly `n_per_class` randomly sampled rows for each `label` value.

    Args:
        frame: pandas DataFrame with a `label` column.
        n_per_class: number of rows to draw from every class.
        seed: random state passed to the sampler.

    Raises:
        ValueError: if the frame is empty or any class has fewer than
            `n_per_class` rows.
    """
    class_sizes = frame['label'].value_counts()
    if class_sizes.empty or class_sizes.min() < n_per_class:
        raise ValueError(
            f"Need {n_per_class} rows per label but found: {class_sizes.to_dict()}"
        )
    # GroupBy.sample keeps the `label` column in the result, which
    # groupby(...).apply(...) no longer guarantees on recent pandas versions.
    return (
        frame.groupby('label')
             .sample(n=n_per_class, random_state=seed)
             .reset_index(drop=True)
    )


df = dataset['train'].to_pandas()
min_count = 1100
balanced_df = _balance_by_label(df, min_count)

df2 = dataset['test'].to_pandas()
min_count = 300
# Draw the evaluation subset from the TEST frame (`df2`). Sampling it from the
# training frame would make the reported accuracy a measure of memorisation.
balanced_df2 = _balance_by_label(df2, min_count)

new_dataset_train = Dataset.from_pandas(balanced_df).shuffle(seed=SEED)
new_dataset_test = Dataset.from_pandas(balanced_df2).shuffle(seed=SEED)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Probe bf16 support once; fp16 is the fallback when it is unavailable.
use_bf16 = is_bfloat16_supported()

# Optimisation settings, built separately to keep the trainer call short.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=170,  # Set for 10 percent of total steps
    num_train_epochs=4,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    output_dir="outputs",
    report_to="none",  # Use this for WandB etc
)

# Supervised fine-tuning over the pre-rendered `text` column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=new_dataset_train,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args,
)

In [None]:
from unsloth.chat_templates import train_on_responses_only
# Header strings that open the user turn and the assistant turn in the
# rendered prompt text (each followed by a blank line).
user_turn_header = "<|start_header_id|>user<|end_header_id|>\n\n"
assistant_turn_header = "<|start_header_id|>assistant<|end_header_id|>\n\n"

# Hand both markers to unsloth's helper and keep the trainer it returns.
# NOTE(review): per its name this limits training to the response part —
# confirm the exact masking behaviour against the unsloth documentation.
trainer = train_on_responses_only(
    trainer,
    instruction_part=user_turn_header,
    response_part=assistant_turn_header,
)

In [None]:
from huggingface_hub import login
from google.colab import userdata
# Read the Hugging Face token from Colab's user secrets (never hard-code it)
# and authenticate this session with it.
access_token = userdata.get('HF_TOKEN')
login(token=access_token)

In [None]:
# Run fine-tuning and keep whatever the trainer returns in `trainer_stats`.
trainer_stats = trainer.train()

# Upload the trained model and its tokenizer to the same Hub repository.
adapter_repo = "Sabar1/resume_modell2"
hub_token = userdata.get('HF_TOKEN')
model.push_to_hub(adapter_repo, token=hub_token)
tokenizer.push_to_hub(adapter_repo, token=hub_token)


In [None]:
from peft import PeftModel
from transformers import LlamaForCausalLM

# Load the base model and attach the fine-tuned PEFT adapters from the Hub.
# NOTE(review): no dtype or device is passed here, so the library defaults
# apply to the merged weights — confirm that is what you want to publish.
base_model = LlamaForCausalLM.from_pretrained("unsloth/Llama-3.2-3B-Instruct")
model = PeftModel.from_pretrained(base_model, "Sabar1/resume_modell2")

# Merge the adapters with the base model (rebinds `model` to the merged model).
model = model.merge_and_unload()

# Save the merged model together with its tokenizer so the local directory
# and the Hub repo are self-contained, matching what the adapter upload does.
merged_dir = "merged_model"
model.save_pretrained(merged_dir)
tokenizer.save_pretrained(merged_dir)

merged_repo = "Sabar1/merged_model"
hub_token = userdata.get("HF_TOKEN")
model.push_to_hub(merged_repo, token=hub_token)
tokenizer.push_to_hub(merged_repo, token=hub_token)

In [None]:
import torch
# Evaluate whatever model is currently bound to `model` on `new_dataset_test`:
# for each example, feed the prompt up to the assistant header, generate one
# token, and compare it with the stored label.

# The assistant header that ends the prompt; the label follows it in `text`.
RESPONSE_MARKER = "<|start_header_id|>assistant<|end_header_id|>\n\n"

device = model.device
# Set the model to evaluation mode
model.eval()
FastLanguageModel.for_inference(model)

total = len(new_dataset_test)  # derive the size instead of hard-coding it
counter = 0
for i in range(total):
    example = new_dataset_test[i]  # index the dataset once per iteration
    text = example["text"]
    expected = str(example["label"])

    # Cut the prompt right after the last assistant header rather than by a
    # fixed character count, so it does not depend on the label's width.
    marker_at = text.rfind(RESPONSE_MARKER)
    if marker_at == -1:
        raise ValueError(f"Example {i} has no assistant header; cannot build a prompt")
    input_text = text[: marker_at + len(RESPONSE_MARKER)]

    # The text already starts with <|begin_of_text|>; skip the tokenizer's own
    # special tokens so the prompt does not get a second BOS.
    inputs = tokenizer(input_text, return_tensors="pt", add_special_tokens=False)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Perform inference. Greedy decoding keeps the reported accuracy
    # repeatable regardless of the model's default sampling settings.
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=1, do_sample=False)

    # Keep only the tokens generated after the prompt.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = outputs[0][prompt_length:]
    new_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    print(expected)
    print(new_text)
    print("\n")
    if new_text == expected:
        counter += 1

# Guard against an empty evaluation set.
accuracy = counter / total * 100 if total else 0.0
print("Acc: ", accuracy)



