In [None]:
# Upload the training data from the local machine into the Colab runtime.
# The return value of files.upload() is kept in `uploaded`; the data-preparation
# cell further down reads the uploaded file from disk rather than from this variable.
from google.colab import files
uploaded = files.upload()


Saving job_skills.jsonl to job_skills.jsonl


In [None]:
# Install Dependencies
!pip install torch transformers datasets peft accelerate

!pip install bitsandbytes --no-cache-dir
!pip install --upgrade accelerate transformers





In [None]:
# Authenticate this session with the Hugging Face Hub.
# login() is called without arguments, so no access token is hardcoded in the notebook.
# NOTE(review): presumably required because the checkpoint loaded in the next cell
# is access-restricted on the Hub — confirm.
from huggingface_hub import login
login()

In [None]:
# Load the Llama 3 base checkpoint in 4-bit precision, together with its tokenizer

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Hub identifier of the checkpoint that will be fine-tuned
model_name = "meta-llama/Meta-Llama-3-8B"

# bitsandbytes settings: 4-bit weights, double quantization, FP16 compute
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Tokenizer matching the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Quantized model; device placement is delegated to the loader via device_map="auto"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)



In [None]:
# Report whether a CUDA device is visible to PyTorch and, if so, which one
import torch

cuda_ok = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_ok else "No GPU detected"

print("CUDA Available:", cuda_ok)
print("GPU Name:", gpu_name)


In [None]:
#Prepare Data for Fine-Tuning
import json
import pandas as pd
from datasets import Dataset

# Define the path to your JSONL file
dataset_path = "/content/job_skills.jsonl"  # Change if using Google Drive


def load_jsonl(path):
    """Read a JSON Lines file into a list of Python objects.

    Blank lines (for example a trailing empty line at the end of the file) are
    skipped instead of being handed to ``json.loads``, which would raise.

    Args:
        path: Location of the ``.jsonl`` file, one JSON document per line.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        ValueError: If a non-blank line is not valid JSON. The message contains
            the path and 1-based line number. ``json.JSONDecodeError`` is itself
            a ``ValueError`` subclass, so existing handlers keep working.
    """
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as exc:
                raise ValueError(
                    f"{path}: invalid JSON on line {line_number}: {exc}"
                ) from exc
    return records


def has_response(entry):
    """Return True when the record carries a non-empty string under "Responsibilities"."""
    response = entry.get("Responsibilities")
    return isinstance(response, str) and bool(response.strip())


def format_example(entry):
    """Turn one raw job record into a prompt/response training pair.

    Args:
        entry: Mapping with the keys "Title", "Category", "Location" and
            "Responsibilities".

    Returns:
        A dict with the keys "prompt" and "response".

    Raises:
        KeyError: If one of the required keys is missing from ``entry``.
    """
    return {
        "prompt": f"Job Title: {entry['Title']}\nCategory: {entry['Category']}\nLocation: {entry['Location']}\nWhat are the responsibilities?",
        "response": entry["Responsibilities"]
    }


# Load JSONL file into a Python list
data = load_jsonl(dataset_path)

# Convert JSONL data into a structured format.
# Records without a usable response are dropped: a null/empty target cannot be
# tokenized and would otherwise fail later, far away from the actual cause.
formatted_data = [format_example(entry) for entry in data if has_response(entry)]

skipped = len(data) - len(formatted_data)
if skipped:
    print(f"Skipped {skipped} record(s) without a 'Responsibilities' text")
if not formatted_data:
    raise ValueError(f"No usable training examples found in {dataset_path}")

# Convert to Hugging Face dataset format
dataset = Dataset.from_pandas(pd.DataFrame(formatted_data))

# Example: Print first sample
print(dataset[0])


In [None]:
# Tokenize the Dataset
from transformers import DataCollatorForSeq2Seq

# Set a padding token manually (the tokenizer is given the end-of-sequence token as pad).
# Padded positions are masked out of the loss by the collator below, so the real
# end-of-sequence token appended to each response is still learned.
tokenizer.pad_token = tokenizer.eos_token

MAX_LENGTH = 512                  # Maximum number of tokens per training example
IGNORE_INDEX = -100               # Label value ignored by the cross-entropy loss
PROMPT_RESPONSE_SEPARATOR = "\n"  # Inserted between the prompt and the response


def tokenize_function(example):
    """Build causal-LM training features from prompt/response pairs.

    A causal language model computes its loss by comparing the prediction at
    each position with the label at the (shifted) same position, so ``labels``
    must line up token-for-token with ``input_ids``. Each example is therefore
    encoded as ``prompt + separator + response + EOS``; the labels are a copy of
    that sequence in which the prompt part is replaced by ``IGNORE_INDEX`` so
    only the response contributes to the loss.

    No padding is applied here; the data collator pads each batch to its
    longest member and fills label padding with ``IGNORE_INDEX``.

    Args:
        example: Mapping with the keys "prompt" and "response". Each value is
            either a list of strings (``Dataset.map(..., batched=True)``) or a
            single string.

    Returns:
        A dict with "input_ids", "attention_mask" and "labels": lists of token
        lists for batched input, plain token lists for a single example.
        Sequences are truncated to ``MAX_LENGTH`` tokens.
    """
    prompts = example["prompt"]
    responses = example["response"]

    # Accept a single (non-batched) example as well as a batch.
    is_single = isinstance(prompts, str)
    if is_single:
        prompts, responses = [prompts], [responses]

    features = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt, response in zip(prompts, responses):
        # The prompt keeps the tokenizer's usual special tokens; the response
        # must not get its own, and is terminated explicitly with EOS so the
        # model learns where an answer ends.
        prompt_ids = tokenizer(prompt + PROMPT_RESPONSE_SEPARATOR)["input_ids"]
        response_ids = tokenizer(response, add_special_tokens=False)["input_ids"]
        response_ids = response_ids + [tokenizer.eos_token_id]

        input_ids = (prompt_ids + response_ids)[:MAX_LENGTH]
        # Same length as input_ids by construction; prompt positions are masked.
        labels = ([IGNORE_INDEX] * len(prompt_ids) + response_ids)[:MAX_LENGTH]

        features["input_ids"].append(input_ids)
        features["attention_mask"].append([1] * len(input_ids))
        features["labels"].append(labels)

    if is_single:
        return {key: values[0] for key, values in features.items()}
    return features


# Apply tokenization to the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["prompt", "response"])

# Define data collator for batching: pads inputs with the pad token and labels
# with IGNORE_INDEX, each batch to the length of its longest example.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding=True,
    label_pad_token_id=IGNORE_INDEX,
)


In [None]:
# Pass Tokenized Data to Trainer
# NOTE: the Trainer is intentionally NOT constructed in this cell.
# At this point neither `Trainer` has been imported nor `training_args` defined,
# so building it here raises NameError on a fresh kernel (Restart & Run All).
# It would also bind the quantized base model before it is wrapped with LoRA.
# The fine-tuning cell below imports `Trainer`, defines `training_args`, wraps
# the model and then creates the trainer with `tokenized_dataset` and `data_collator`.


In [None]:
# Fine-Tune with LoRA (Efficient Fine-Tuning)
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import TrainingArguments, Trainer

# LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap model with LoRA.
# The isinstance guard keeps this cell idempotent: re-running it must not wrap
# an already adapted model a second time.
if not isinstance(model, PeftModel):
    # The base model was loaded with 4-bit quantization; PEFT's helper prepares
    # such a model for training before the adapters are attached.
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)

# Training arguments
training_args = TrainingArguments(
    output_dir="./llama-finetuned",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch size per device: 2 * 4 = 8
    num_train_epochs=3,
    save_steps=100,
    save_total_limit=2,             # keep only the two most recent checkpoints
    logging_dir="./logs",
    logging_steps=10
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,  # Use tokenized dataset
    data_collator=data_collator,  # Enable proper batch padding
)

# Start fine-tuning
trainer.train()



In [None]:
# Persist the fine-tuned model and its tokenizer into the same directory
save_dir = "/content/llama-finetuned"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


Download the fine-tuned model as a zip archive

In [None]:
import shutil
from google.colab import files

# Pack the saved model directory into llama-finetuned.zip in the working directory
shutil.make_archive(
    base_name="llama-finetuned",
    format="zip",
    root_dir="/content/llama-finetuned",
)

# Hand the archive to the browser for download
files.download("llama-finetuned.zip")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>