# STEP 1: LOAD DATA


In [None]:
import json

# Load the question/answer records. The `with` block guarantees the file handle
# is closed, and the explicit UTF-8 encoding avoids platform-dependent decoding.
with open("career_qa_dataset.json", "r", encoding="utf-8") as dataset_fh:
    file = json.load(dataset_fh)

# Print one record to confirm the expected 'question' / 'answer' layout.
print(file[1])

{'question': 'What are the key skills required for a Software Engineer?', 'answer': 'The key skills for a Software Engineer include: Coding; Problem Solving; Data Structures.'}


# STEP 2: INSTALL REQUIRED LIBRARIES



In [None]:
!pip install unsloth trl peft accelerate bitsandbytes

Collecting unsloth
  Downloading unsloth-2026.2.1-py3-none-any.whl.metadata (69 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/69.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.28.0-py3-none-any.whl.metadata (11 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.49.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting unsloth_zoo>=2026.2.1 (from unsloth)
  Downloading unsloth_zoo-2026.2.1-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth)
  Downloading tyro-1.0.7-py3-none-any.whl.metadata (12 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.35-py39-none-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting datasets!=4.0.*,!=4.1.0,<4.4.0,>=3.4.1 (from unsloth)
  Downloading datasets-4.3.0-py3-none-any.whl.metadata (18 kB)
Collecting hf_transfer (

In [None]:
import torch

# Probe the GPU once and report both availability and the device name.
cuda_ready = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_ready else 'None'

print(f"CUDA available: {cuda_ready}")
print(f"GPU: {gpu_name}")

CUDA available: True
GPU: Tesla T4


# STEP 3: CHOOSE THE MODEL FOR TRAINING
In this case, the Phi-3 Mini (4k-instruct, 4-bit) model is used.




In [None]:
from unsloth import FastLanguageModel
import torch

# Settings for the base checkpoint; later cells reuse `max_seq_length`.
model_name = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
max_seq_length = 2048  # Maximum sequence length passed to the loader
dtype = None           # None lets the loader pick the dtype automatically

# Gather the loader arguments in one place, then load model and tokenizer together.
load_kwargs = dict(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=True,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2026.2.1: Fast Mistral patching. Transformers: 4.57.6.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.563 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.35. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/458 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
from datasets import Dataset

# 1. Prompt template applied to every training example.
#    NOTE: despite the variable name, this is an Alpaca-style
#    "### Instruction / ### Response" layout, not Phi-3's native chat template.
#    The prompt built at inference time must use exactly the same layout.
phi3_prompt = """### Instruction:
{}

### Response:
{}"""

# 2. EOS (end-of-sequence) token appended to each example so the model learns
#    where an answer stops. Fail early with a clear message if it is missing.
EOS_TOKEN = tokenizer.eos_token
if EOS_TOKEN is None:
    raise ValueError("tokenizer.eos_token is not set; cannot terminate training examples")


def format_prompt(example):
    """Turn one question/answer record into a single training string.

    Args:
        example: Mapping with a "question" and an "answer" entry; a missing
            key is treated as an empty string.

    Returns:
        A dict with one key, "text", holding the filled-in prompt template
        followed by the EOS token.
    """
    instruction = example.get("question", "")
    response = example.get("answer", "")
    return {"text": phi3_prompt.format(instruction, response) + EOS_TOKEN}


# 3. Format every record in `file` (the list loaded from career_qa_dataset.json)
#    and build the dataset directly from the resulting list of {"text": ...} rows.
formatted_data = [format_prompt(item) for item in file]
dataset = Dataset.from_list(formatted_data)

# STEP 4: ADD LoRA (LOW-RANK ADAPTERS) FOR PARAMETER-EFFICIENT FINE-TUNING



In [None]:
# LoRA hyper-parameters, gathered in one place before wrapping the base model.
lora_settings = dict(
    r=64,                      # LoRA rank: higher means more capacity and more memory
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=128,            # LoRA scaling factor (here 2x the rank)
    lora_dropout=0,            # Any value is supported, but 0 is the optimized setting
    bias="none",               # Any value is supported, but "none" is the optimized setting
    use_gradient_checkpointing="unsloth",  # Unsloth's optimized checkpointing
    random_state=3407,
    use_rslora=False,          # Rank-stabilized LoRA disabled
    loftq_config=None,         # LoftQ disabled
)

# Attach the adapters; `model` is rebound to the adapter-wrapped model.
model = FastLanguageModel.get_peft_model(model, **lora_settings)

Unsloth: Already have LoRA adapters! We shall skip this step.


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Decide the mixed-precision mode once: bf16 when the GPU reports support,
# otherwise fall back to fp16.
use_bf16 = torch.cuda.is_bf16_supported()

# Optimizer, schedule and checkpointing settings for the run.
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # Effective batch size = 2 x 4 = 8
    warmup_steps=10,
    num_train_epochs=3,
    learning_rate=2e-4,
    fp16=not use_bf16,
    bf16=use_bf16,
    logging_steps=25,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    save_strategy="epoch",
    save_total_limit=2,
    dataloader_pin_memory=False,
)

# Supervised fine-tuning trainer over the "text" column of the formatted dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=training_args,
)

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/300 [00:00<?, ? examples/s]

In [None]:
# Run the fine-tuning loop; whatever train() returns is kept in `trainer_stats`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 300 | Num Epochs = 3 | Total steps = 114
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 119,537,664 of 3,940,617,216 (3.03% trained)


Step,Training Loss
25,1.2348
50,0.2544
75,0.2087
100,0.1809


0,1
train/epoch,▁▃▅▇█
train/global_step,▁▃▅▇█
train/grad_norm,█▃▁▁
train/learning_rate,█▆▃▁
train/loss,█▁▁▁

0,1
total_flos,875403216433152.0
train/epoch,3.0
train/global_step,114.0
train/grad_norm,0.36076
train/learning_rate,3e-05
train/loss,0.1809
train_loss,0.43291
train_runtime,198.72
train_samples_per_second,4.529
train_steps_per_second,0.574


# STEP 5: TEST THE FINE-TUNED MODEL



In [None]:
from unsloth import FastLanguageModel

def generate_response(instruction):
    """Generate the fine-tuned model's answer to a single instruction.

    The prompt is built from the same ``phi3_prompt`` template used for the
    training examples, with the response slot left empty, so the training and
    inference layouts cannot drift apart.

    Args:
        instruction: The user's question as plain text.

    Returns:
        The newly generated text only (prompt tokens excluded), decoded with
        special tokens skipped and surrounding whitespace stripped.
    """
    # An empty response slot leaves the prompt ending right after "### Response:\n".
    inference_prompt = phi3_prompt.format(instruction, "")

    # Put the inputs on the model's own device instead of hard-coding "cuda".
    inputs = tokenizer(inference_prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=512,  # Limit the length of the generated response
        use_cache=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # generate() returns prompt + completion; slice off the prompt tokens.
    prompt_length = inputs.input_ids.shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    return response.strip()

# Example usage
# Switch the model to Unsloth's inference mode before generating; this is the
# call the FastLanguageModel import in this cell is for.
FastLanguageModel.for_inference(model)

instruction = input()
response = generate_response(instruction)
print(response)

what should i pursue to become software engineer
To become a Software Engineer, you should study Computer Science.


# STEP 6: EXPORT THE MODEL TO GGUF AND SAVE IT TO GOOGLE DRIVE



In [None]:
from google.colab import drive

# Mount Google Drive so the exported model can be written under /content/drive.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

from google.colab import drive

# Only mount when Drive is not already available, so re-running this cell
# does not start another authorization round trip.
if not os.path.isdir('/content/drive/MyDrive'):
    drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [None]:
# Destination folder on the mounted Drive for the merged model.
save_path = "/content/drive/MyDrive/career_counselor_bot"

# GGUF quantization preset requested for the export.
gguf_quantization = "q4_k_m"

# Merge, convert and quantize; the call's return value is displayed as the cell output.
model.save_pretrained_gguf(save_path, tokenizer, quantization_method=gguf_quantization)

Unsloth: Merging model weights to 16-bit format...


config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00002.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  50%|█████     | 1/2 [01:55<01:55, 115.35s/it]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.65G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 2/2 [02:51<00:00, 85.63s/it]
Unsloth: Merging weights into 16bit: 100%|██████████| 2/2 [04:42<00:00, 141.36s/it]


Unsloth: Merge process complete. Saved to `/content/drive/MyDrive/career_counselor_bot`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF f16 might take 3 minutes.
\        /    [2] Converting GGUF f16 to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: Updating system package directories
Unsloth: All required system packages already installed!
Unsloth: Install llama.cpp and building - please wait 1 to 3 minutes
Unsloth: Cloning llama.cpp repository
Unsloth: Install GGUF and other packages
Unsloth: Successfully installed llama.cpp!
Unsloth: Preparing converter script...




Unsloth: [1] Converting model into f16 GGUF format.
This might take 3 minutes...
Unsloth: Initial conversion completed! Files: ['/content/drive/MyDrive/career_counselor_bot_gguf/phi-3-mini-4k-instruct.F16.gguf']
Unsloth: [2] Converting GGUF f16 into q4_k_m. This might take 10 minutes...
Unsloth: Model files cleanup...
Unsloth: All GGUF conversions completed successfully!
Generated files: ['/content/drive/MyDrive/career_counselor_bot_gguf/phi-3-mini-4k-instruct.Q4_K_M.gguf']
Unsloth: example usage for text only LLMs: llama.cpp/llama-cli --model /content/drive/MyDrive/career_counselor_bot_gguf/phi-3-mini-4k-instruct.Q4_K_M.gguf -p "why is the sky blue?"
Unsloth: Saved Ollama Modelfile to /content/drive/MyDrive/career_counselor_bot_gguf/Modelfile
Unsloth: convert model to ollama format by running - ollama create model_name -f /content/drive/MyDrive/career_counselor_bot_gguf/Modelfile


{'save_directory': '/content/drive/MyDrive/career_counselor_bot',
 'gguf_directory': '/content/drive/MyDrive/career_counselor_bot_gguf',
 'gguf_files': ['/content/drive/MyDrive/career_counselor_bot_gguf/phi-3-mini-4k-instruct.Q4_K_M.gguf'],
 'modelfile_location': '/content/drive/MyDrive/career_counselor_bot_gguf/Modelfile',
 'want_full_precision': False,
 'is_vlm': False,
 'fix_bos_token': False}