Install Required Libraries

In [1]:
!pip install -U transformers datasets peft trl bitsandbytes accelerate evaluate rouge-score nltk absl-py



Login to Hugging Face

In [2]:
from huggingface_hub import login
login("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")

  from .autonotebook import tqdm as notebook_tqdm


Load Dataset and Format for Completion Style

In [17]:
from datasets import load_dataset

dataset = load_dataset("azizshaw/text_to_json")

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 845
    })
})

In [29]:
# See column names
print(dataset["train"].column_names)

# View the first row
print(dataset["train"][0])

# View first few examples (useful for debugging)
for i in range(3):
    print(f"Example {i}:\n", dataset["train"][i], "\n")


['input', 'output', 'instruction']
{'input': '1. Send a mobile app notification to customers whose total recharge amount for the last 30 days is equal to 300 and whose total recharge amount for the last 2 weeks is greater than 80.', 'output': "{'featureId': '', 'appName': '', 'username': '', 'password': '', 'reqTxnId': '', 'msgOrigin': '', 'msgDest': '', 'timestamp': '', 'id': '', 'ruletype': '', 'data': {'detail': {'rules': {'id': '0', 'pid': '#', 'childrens': [{'id': '0_0', 'pid': '0', 'type': 'conditions', 'option': 'All', 'childrens': [{'id': '0_0_0', 'pid': '0_0', 'type': 'condition', 'profile': {'id': 1, 'name': 'TOTAL_RECHARGE_REV_LAST_30_DAYS'}, 'operator': '=', 'values': {'value': '300'}}, {'id': '0_0_1', 'pid': '0_0', 'type': 'condition', 'profile': {'id': 3, 'name': 'TOTAL_RECHARGE_REV_LAST_2_WEEKS'}, 'operator': '>', 'values': {'value': '80'}}, {'id': '0_0_2', 'pid': '0_0', 'type': 'action', 'action': {'id': 98, 'name': 'Mobile App Notification'}, 'field': [{'name': 'Action

In [18]:
def format_example(example):
    return {
        "text": f"Convert the following text into JSON:\n\n{example['instruction']}\n\nJSON:\n{example['output']}"
    }

train_data = dataset["train"].map(format_example)

Load Meta-Llama 3.1–8B Base Model

In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training
import torch

model_name = "meta-llama/Meta-Llama-3.1-8B"

In [8]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [9]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16
)

Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.28it/s]


In [10]:
model = prepare_model_for_kbit_training(model)

Apply LoRA (QLoRA)

In [11]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

model = get_peft_model(model, lora_config)

Load Tokenizer

In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

Fine-Tune with SFTTrainer

In [13]:
from trl import SFTTrainer, SFTConfig

sft_config = SFTConfig(
    output_dir="./llama3-8b-base-json",
    per_device_train_batch_size=2,
    gradient_checkpointing=True,
    bf16=True,
    logging_steps=10,
    num_train_epochs=1,
    save_strategy="epoch"
)

In [21]:
def formatting_func(example):
    return example["text"]

trainer = SFTTrainer(
    model=model,
    
    args=sft_config,
    train_dataset=train_data,
    formatting_func=formatting_func,
    
)

Adding EOS to train dataset: 100%|██████████| 845/845 [00:00<00:00, 10391.93 examples/s]
Tokenizing train dataset: 100%|██████████| 845/845 [00:02<00:00, 347.86 examples/s]
Truncating train dataset: 100%|██████████| 845/845 [00:00<00:00, 23571.97 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [22]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mprajaktapatil328[0m ([33mprajaktapatil328-blue-polaris[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,1.659
20,1.6313
30,1.5908
40,1.5396
50,1.4777
60,1.4055
70,1.3189
80,1.2131
90,1.0893
100,0.9606




TrainOutput(global_step=423, training_loss=0.44192269307079046, metrics={'train_runtime': 1223.7599, 'train_samples_per_second': 0.69, 'train_steps_per_second': 0.346, 'total_flos': 3.898085990203392e+16, 'train_loss': 0.44192269307079046})

In [23]:
finetuned_model = trainer.model

 Evaluate (ROUGE + JSON Validity)

In [24]:
from evaluate import load
from tqdm import tqdm
import json

rouge = load("rouge")

def evaluate_model(model, tokenizer, data, check_json=False):
    predictions, references = [], []
    valid_json = 0
    model.eval()

    for example in tqdm(data, desc="Evaluating"):
        prompt = f"Convert the following text into JSON:\n\n{example['instruction']}\n\nJSON:"
        input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.to(model.device)

        with torch.no_grad():
            output_ids = model.generate(input_ids, max_new_tokens=256)
        output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        generated = output.split("JSON:")[-1].strip()

        predictions.append(generated)
        references.append(example["output"])

        if check_json:
            try:
                json.loads(generated)
                valid_json += 1
            except:
                pass

    score = rouge.compute(predictions=predictions, references=references)
    if check_json:
        score["valid_json_percent"] = round(100 * valid_json / len(predictions), 2)

    return score

eval_data = dataset["test"] if "test" in dataset else dataset["train"].select(range(10))
results = evaluate_model(finetuned_model, tokenizer, eval_data, check_json=True)

print("📊 Fine-tuned Base Model Results:", results)

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Evaluating:  10%|█         | 1/10 [00:20<03:02, 20.28s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:  20%|██        | 2/10 [00:40<02:41, 20.25s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected be

📊 Fine-tuned Base Model Results: {'rouge1': np.float64(0.5943741625790877), 'rouge2': np.float64(0.510153517236997), 'rougeL': np.float64(0.5889065390632012), 'rougeLsum': np.float64(0.30456100208854686), 'valid_json_percent': 0.0}





In [26]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

base_model_name = "meta-llama/Meta-Llama-3.1-8B"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)

base_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base_tokenizer.pad_token = base_tokenizer.eos_token


Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.32it/s]


In [27]:
# Select a small subset to test (e.g., 10 examples)
eval_data = dataset["test"] if "test" in dataset else dataset["train"].select(range(10))

# Base model (Llama 3.1 8B)
results_base = evaluate_model(base_model, base_tokenizer, eval_data, check_json=True)

# Fine-tuned model
results_finetuned = evaluate_model(finetuned_model, tokenizer, eval_data, check_json=True)

# Display results
print("🔹 Base Model (Llama-3.1-8B):", results_base)
print("✅ Fine-tuned Model:", results_finetuned)


Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:  10%|█         | 1/10 [00:16<02:25, 16.18s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:  20%|██        | 2/10 [00:32<02:09, 16.18s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:  30%|███       | 3/10 [00:48<01:53, 16.19s/it]The attention mask a

🔹 Base Model (Llama-3.1-8B): {'rouge1': np.float64(0.7089763678309513), 'rouge2': np.float64(0.6889421338155516), 'rougeL': np.float64(0.7039641870401184), 'rougeLsum': np.float64(0.4733931682193441), 'valid_json_percent': 0.0}
✅ Fine-tuned Model: {'rouge1': np.float64(0.5983489134834254), 'rouge2': np.float64(0.5242576397916189), 'rougeL': np.float64(0.596164133879008), 'rougeLsum': np.float64(0.31024050491470817), 'valid_json_percent': 0.0}





In [28]:
import pandas as pd
pd.DataFrame([results_base, results_finetuned], index=["Base", "Fine-tuned"])


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum,valid_json_percent
Base,0.708976,0.688942,0.703964,0.473393,0.0
Fine-tuned,0.598349,0.524258,0.596164,0.310241,0.0


Step-by-Step: Compare Model Outputs1. Define the prompt and generate output from both models

In [None]:
def compare_models(example, base_model, base_tokenizer, finetuned_model, finetuned_tokenizer):
    instruction = example["instruction"]
    reference_output = example["output"]

    prompt = f"Convert the following text into JSON:\n\n{instruction}\n\nJSON:"

    # Tokenize and move to device
    base_inputs = base_tokenizer(prompt, return_tensors="pt").to(base_model.device)
    finetune_inputs = finetuned_tokenizer(prompt, return_tensors="pt").to(finetuned_model.device)

    # Generate outputs
    with torch.no_grad():
        base_output_ids = base_model.generate(base_inputs["input_ids"], max_new_tokens=256)
        finetune_output_ids = finetuned_model.generate(finetune_inputs["input_ids"], max_new_tokens=256)

    base_output = base_tokenizer.decode(base_output_ids[0], skip_special_tokens=True).split("JSON:")[-1].strip()
    finetuned_output = finetuned_tokenizer.decode(finetune_output_ids[0], skip_special_tokens=True).split("JSON:")[-1].strip()

    print("🔹 Prompt:\n", prompt)
    print(" Ground Truth:\n", reference_output)
    print(" Base Model Output:\n", base_output)
    print(" Fine-tuned Model Output:\n", finetuned_output)


 Use it on a sample from the dataset

In [31]:
sample = dataset["train"][0]  # You can change index here
compare_models(sample, base_model, base_tokenizer, finetuned_model, tokenizer)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


🔹 Prompt:
 Convert the following text into JSON:

The task is to generate a json format from a set of keywords.
I will help you with some details about this format.
Consider we have a database with us which has telecom details about customers, their puchase, their demographics, their usage of voice,sms.data & so on.
For ex. when I say "action_name = push notification, KPI1 = P_AON,OP1 = >, VAL1 = 30, KPI2 = P_DEVICE_TYPE,OP2 = =, VAL2 = phone,option=All" then it means,
"Send a mobile app notification to customers whose Age on Network (AON) is greater than 30 days & their device type is a phone"
based on this we will get the below json:
{
	"featureId": "",
	"appName": "",
	"username": "",
	"password": "",
	"reqTxnId": "",
	"msgOrigin": "",
	"msgDest": "",
	"timestamp": "",
	"id": "",
	"ruletype": "",
	"data": {
		"detail": {
			"rules": {
				"id": "0",
				"pid": "#",
				"childrens": [
					{
						"id": "0_0",
						"pid": "0",
						"type": "conditions",
						"option": "All",
					

In [38]:
def compare_models(example, base_model, base_tokenizer, finetuned_model, finetuned_tokenizer):
    instruction = example["instruction"]
    reference_output = example["output"]

    prompt = f"Convert the following text into JSON:\n\n{instruction}\n\nJSON:"
    print("\n🔹 Prompt:\n", prompt)
    print("\n Ground Truth:\n", reference_output)

    # Base Model
    base_inputs = base_tokenizer(prompt, return_tensors="pt").to(base_model.device)
    with torch.no_grad():
        base_output_ids = base_model.generate(base_inputs["input_ids"], max_new_tokens=512)
    base_output = base_tokenizer.decode(base_output_ids[0], skip_special_tokens=True).split("JSON:")[-1].strip()
    print("\n Base Model Output:\n", base_output)

    # Fine-tuned Model
    finetune_inputs = finetuned_tokenizer(prompt, return_tensors="pt").to(finetuned_model.device)
    with torch.no_grad():
        finetune_output_ids = finetuned_model.generate(finetune_inputs["input_ids"], max_new_tokens=512)
    finetuned_output = finetuned_tokenizer.decode(finetune_output_ids[0], skip_special_tokens=True).split("JSON:")[-1].strip()
    print("\n Fine-tuned Model Output:\n", finetuned_output)


In [39]:
compare_models(dataset["train"][0], base_model, base_tokenizer, finetuned_model, tokenizer)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



🔹 Prompt:
 Convert the following text into JSON:

The task is to generate a json format from a set of keywords.
I will help you with some details about this format.
Consider we have a database with us which has telecom details about customers, their puchase, their demographics, their usage of voice,sms.data & so on.
For ex. when I say "action_name = push notification, KPI1 = P_AON,OP1 = >, VAL1 = 30, KPI2 = P_DEVICE_TYPE,OP2 = =, VAL2 = phone,option=All" then it means,
"Send a mobile app notification to customers whose Age on Network (AON) is greater than 30 days & their device type is a phone"
based on this we will get the below json:
{
	"featureId": "",
	"appName": "",
	"username": "",
	"password": "",
	"reqTxnId": "",
	"msgOrigin": "",
	"msgDest": "",
	"timestamp": "",
	"id": "",
	"ruletype": "",
	"data": {
		"detail": {
			"rules": {
				"id": "0",
				"pid": "#",
				"childrens": [
					{
						"id": "0_0",
						"pid": "0",
						"type": "conditions",
						"option": "All",
				

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



 Base Model Output:
 {
	"featureId": "",
	"appName": "",
	"username": "",
	"password": "",
	"reqTxnId": "",
	"msgOrigin": "",
	"msgDest": "",
	"timestamp": "",
	"id": "",
	"ruletype": "",
	"data": {
		"detail": {
			"rules": {
				"id": "0",
				"pid": "#",
				"childrens": [
					{
						"id": "0_0",
						"pid": "0",
						"type": "conditions",
						"option": "All",
						"childrens": [
							{
								"id": "0_0_0",
								"pid": "0_0",
								"type": "condition",
								"profile": {
									"id": 1,
									"name": "TOTAL_RECHARGE_REV_LAST_30_DAYS"
								},
								"operator": "=",
								"values": {
									"value": "300"
								}
							},
							{
								"id": "0_0_1",
								"pid": "0_0",
								"type": "condition",
								"profile": {
									"id": 2,
									"name": "TOTAL_RECHARGE_REV_LAST_2_WEEKS"
								},
								"operator": ">",
								"values": {
									"value": "80"
								}
							},
							{
								"id": "0_0_2",
								"pid": "0_0",
							