Install Required Libraries

In [49]:
!pip install -U transformers datasets peft trl bitsandbytes accelerate evaluate rouge-score nltk absl-py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting absl-py
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Downloading absl_py-2.3.1-py3-none-any.whl (135 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.8/135.8 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: absl-py
  Attempting uninstall: absl-py
    Found existing installation: absl-py 2.3.0
    Uninstalling absl-py-2.3.0:
      Successfully uninstalled absl-py-2.3.0
Successfully installed absl-py-2.3.1


Login to Hugging Face

In [50]:
from huggingface_hub import login
login("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")

Load Dataset and Format for Completion Style

In [51]:
from datasets import load_dataset

dataset = load_dataset("azizshaw/text_to_json")

In [52]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 845
    })
})

In [53]:
# See column names
print(dataset["train"].column_names)

# View the first row
print(dataset["train"][0])

# View first few examples (useful for debugging)
for i in range(3):
    print(f"Example {i}:\n", dataset["train"][i], "\n")


['input', 'output', 'instruction']
{'input': '1. Send a mobile app notification to customers whose total recharge amount for the last 30 days is equal to 300 and whose total recharge amount for the last 2 weeks is greater than 80.', 'output': "{'featureId': '', 'appName': '', 'username': '', 'password': '', 'reqTxnId': '', 'msgOrigin': '', 'msgDest': '', 'timestamp': '', 'id': '', 'ruletype': '', 'data': {'detail': {'rules': {'id': '0', 'pid': '#', 'childrens': [{'id': '0_0', 'pid': '0', 'type': 'conditions', 'option': 'All', 'childrens': [{'id': '0_0_0', 'pid': '0_0', 'type': 'condition', 'profile': {'id': 1, 'name': 'TOTAL_RECHARGE_REV_LAST_30_DAYS'}, 'operator': '=', 'values': {'value': '300'}}, {'id': '0_0_1', 'pid': '0_0', 'type': 'condition', 'profile': {'id': 3, 'name': 'TOTAL_RECHARGE_REV_LAST_2_WEEKS'}, 'operator': '>', 'values': {'value': '80'}}, {'id': '0_0_2', 'pid': '0_0', 'type': 'action', 'action': {'id': 98, 'name': 'Mobile App Notification'}, 'field': [{'name': 'Action

In [None]:
"""def format_example(example):
    return {
        "text": f"Convert the following text into JSON:\n\n{example['instruction']}\n\nJSON:\n{example['output']}"
    }

train_data = dataset["train"].map(format_example)"""

In [54]:
def format_example(example):
    return {
        "text": f"Convert the following text into JSON: {example['instruction']}\n\nInput: {example['input']}\n\nJSON:\n{example['output']}"
    }
train_data = dataset["train"].map(format_example)

Map: 100%|██████████| 845/845 [00:00<00:00, 9699.13 examples/s]


Load Meta-Llama 3.1–8B Base Model

In [55]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training
import torch

model_name = "meta-llama/Meta-Llama-3.1-8B"

In [56]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [57]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.31it/s]


In [58]:
model = prepare_model_for_kbit_training(model)

Apply LoRA (QLoRA)

In [59]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

model = get_peft_model(model, lora_config)

Load Tokenizer

In [60]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

Fine-Tune with SFTTrainer

In [61]:
from trl import SFTTrainer, SFTConfig

sft_config = SFTConfig(
    output_dir="./llama3-8b-base-json",
    per_device_train_batch_size=1,
    gradient_checkpointing=True,
    bf16=True,
    logging_steps=10,
    num_train_epochs=3,
    save_strategy="epoch"
)

In [62]:
def formatting_func(example):
    return example["text"]

trainer = SFTTrainer(
    model=model,
    
    args=sft_config,
    train_dataset=train_data,
    formatting_func=formatting_func,
    
)

Applying formatting function to train dataset: 100%|██████████| 845/845 [00:00<00:00, 10733.49 examples/s]
Adding EOS to train dataset: 100%|██████████| 845/845 [00:00<00:00, 12061.87 examples/s]
Tokenizing train dataset: 100%|██████████| 845/845 [00:02<00:00, 317.85 examples/s]
Truncating train dataset: 100%|██████████| 845/845 [00:00<00:00, 23842.33 examples/s]
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
trainer.train()

Step,Training Loss
10,1.6776
20,1.6504
30,1.6078
40,1.5535
50,1.4862
60,1.4052
70,1.3056
80,1.1804
90,1.0423
100,0.9161


TrainOutput(global_step=2535, training_loss=0.07425699958934116, metrics={'train_runtime': 4426.7475, 'train_samples_per_second': 0.573, 'train_steps_per_second': 0.573, 'total_flos': 1.1694257970610176e+17, 'train_loss': 0.07425699958934116})

In [45]:
finetuned_model = trainer.model

 Evaluate (ROUGE + JSON Validity)

In [46]:
from evaluate import load
from tqdm import tqdm
import json

rouge = load("rouge")

def evaluate_model(model, tokenizer, data, check_json=False):
    predictions, references = [], []
    valid_json = 0
    model.eval()

    for example in tqdm(data, desc="Evaluating"):
        prompt = f"Convert the following text into JSON:\n\n{example['instruction']}\n\nJSON:"
        input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.to(model.device)

        with torch.no_grad():
            output_ids = model.generate(input_ids, max_new_tokens=256)
        output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        generated = output.split("JSON:")[-1].strip()

        predictions.append(generated)
        references.append(example["output"])

        if check_json:
            try:
                json.loads(generated)
                valid_json += 1
            except:
                pass

    score = rouge.compute(predictions=predictions, references=references)
    if check_json:
        score["valid_json_percent"] = round(100 * valid_json / len(predictions), 2)

    return score

eval_data = dataset["test"] if "test" in dataset else dataset["train"].select(range(10))
results = evaluate_model(finetuned_model, tokenizer, eval_data, check_json=True)

print("📊 Fine-tuned Base Model Results:", results)

Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Evaluating:  10%|█         | 1/10 [00:14<02:10, 14.52s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:  20%|██        | 2/10 [00:31<02:07, 15.90s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:  30%|███       | 3/10 [00:48<01:54, 16.34s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:  40%|████      | 4/10 [01:05<01:39, 16.55s/it]The attentio

📊 Fine-tuned Base Model Results: {'rouge1': np.float64(0.5408918967613718), 'rouge2': np.float64(0.396147617526993), 'rougeL': np.float64(0.49322191130966864), 'rougeLsum': np.float64(0.31012134795580737), 'valid_json_percent': 0.0}





In [47]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

base_model_name = "meta-llama/Meta-Llama-3.1-8B"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True, 
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)

base_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base_tokenizer.pad_token = base_tokenizer.eos_token


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 4/4 [00:13<00:00,  3.40s/it]


In [21]:
# Select a small subset to test (e.g., 10 examples)
eval_data = dataset["test"] if "test" in dataset else dataset["train"].select(range(10))

# Base model (Llama 3.1 8B)
results_base = evaluate_model(base_model, base_tokenizer, eval_data, check_json=True)

# Fine-tuned model
results_finetuned = evaluate_model(finetuned_model, tokenizer, eval_data, check_json=True)

# Display results
print("🔹 Base Model (Llama-3.1-8B):", results_base)
print("✅ Fine-tuned Model:", results_finetuned)


Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:  10%|█         | 1/10 [00:13<02:01, 13.45s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:  20%|██        | 2/10 [00:27<01:48, 13.58s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:  30%|███       | 3/10 [00:40<01:35, 13.65s/it]The attention mask a

🔹 Base Model (Llama-3.1-8B): {'rouge1': np.float64(0.7089763678309513), 'rouge2': np.float64(0.6889421338155516), 'rougeL': np.float64(0.7039641870401184), 'rougeLsum': np.float64(0.4733931682193441), 'valid_json_percent': 0.0}
✅ Fine-tuned Model: {'rouge1': np.float64(0.616991695233368), 'rouge2': np.float64(0.5697565786913337), 'rougeL': np.float64(0.611339706236674), 'rougeLsum': np.float64(0.37832270264491885), 'valid_json_percent': 0.0}





In [48]:
import pandas as pd
pd.DataFrame([results_base, results_finetuned], index=["Base", "Fine-tuned"])


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum,valid_json_percent
Base,0.708976,0.688942,0.703964,0.473393,0.0
Fine-tuned,0.616992,0.569757,0.61134,0.378323,0.0


Step-by-Step: Compare Model Outputs1. Define the prompt and generate output from both models

In [None]:
def compare_models(example, base_model, base_tokenizer, finetuned_model, finetuned_tokenizer):
    instruction = example["instruction"]
    reference_output = example["output"]

    prompt = f"Convert the following text into JSON:\n\n{instruction}\n\nJSON:"

    # Tokenize and move to device
    base_inputs = base_tokenizer(prompt, return_tensors="pt").to(base_model.device)
    finetune_inputs = finetuned_tokenizer(prompt, return_tensors="pt").to(finetuned_model.device)

    # Generate outputs
    with torch.no_grad():
        base_output_ids = base_model.generate(base_inputs["input_ids"], max_new_tokens=256)
        finetune_output_ids = finetuned_model.generate(finetune_inputs["input_ids"], max_new_tokens=256)

    base_output = base_tokenizer.decode(base_output_ids[0], skip_special_tokens=True).split("JSON:")[-1].strip()
    finetuned_output = finetuned_tokenizer.decode(finetune_output_ids[0], skip_special_tokens=True).split("JSON:")[-1].strip()

    print("🔹 Prompt:\n", prompt)
    print(" Ground Truth:\n", reference_output)
    print(" Base Model Output:\n", base_output)
    print(" Fine-tuned Model Output:\n", finetuned_output)


 Use it on a sample from the dataset

In [31]:
sample = dataset["train"][0]  # You can change index here
compare_models(sample, base_model, base_tokenizer, finetuned_model, tokenizer)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


🔹 Prompt:
 Convert the following text into JSON:

The task is to generate a json format from a set of keywords.
I will help you with some details about this format.
Consider we have a database with us which has telecom details about customers, their puchase, their demographics, their usage of voice,sms.data & so on.
For ex. when I say "action_name = push notification, KPI1 = P_AON,OP1 = >, VAL1 = 30, KPI2 = P_DEVICE_TYPE,OP2 = =, VAL2 = phone,option=All" then it means,
"Send a mobile app notification to customers whose Age on Network (AON) is greater than 30 days & their device type is a phone"
based on this we will get the below json:
{
	"featureId": "",
	"appName": "",
	"username": "",
	"password": "",
	"reqTxnId": "",
	"msgOrigin": "",
	"msgDest": "",
	"timestamp": "",
	"id": "",
	"ruletype": "",
	"data": {
		"detail": {
			"rules": {
				"id": "0",
				"pid": "#",
				"childrens": [
					{
						"id": "0_0",
						"pid": "0",
						"type": "conditions",
						"option": "All",
					

In [38]:
def compare_models(example, base_model, base_tokenizer, finetuned_model, finetuned_tokenizer):
    instruction = example["instruction"]
    reference_output = example["output"]

    prompt = f"Convert the following text into JSON:\n\n{instruction}\n\nJSON:"
    print("\n🔹 Prompt:\n", prompt)
    print("\n Ground Truth:\n", reference_output)

    # Base Model
    base_inputs = base_tokenizer(prompt, return_tensors="pt").to(base_model.device)
    with torch.no_grad():
        base_output_ids = base_model.generate(base_inputs["input_ids"], max_new_tokens=512)
    base_output = base_tokenizer.decode(base_output_ids[0], skip_special_tokens=True).split("JSON:")[-1].strip()
    print("\n Base Model Output:\n", base_output)

    # Fine-tuned Model
    finetune_inputs = finetuned_tokenizer(prompt, return_tensors="pt").to(finetuned_model.device)
    with torch.no_grad():
        finetune_output_ids = finetuned_model.generate(finetune_inputs["input_ids"], max_new_tokens=512)
    finetuned_output = finetuned_tokenizer.decode(finetune_output_ids[0], skip_special_tokens=True).split("JSON:")[-1].strip()
    print("\n Fine-tuned Model Output:\n", finetuned_output)


In [39]:
compare_models(dataset["train"][0], base_model, base_tokenizer, finetuned_model, tokenizer)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



🔹 Prompt:
 Convert the following text into JSON:

The task is to generate a json format from a set of keywords.
I will help you with some details about this format.
Consider we have a database with us which has telecom details about customers, their puchase, their demographics, their usage of voice,sms.data & so on.
For ex. when I say "action_name = push notification, KPI1 = P_AON,OP1 = >, VAL1 = 30, KPI2 = P_DEVICE_TYPE,OP2 = =, VAL2 = phone,option=All" then it means,
"Send a mobile app notification to customers whose Age on Network (AON) is greater than 30 days & their device type is a phone"
based on this we will get the below json:
{
	"featureId": "",
	"appName": "",
	"username": "",
	"password": "",
	"reqTxnId": "",
	"msgOrigin": "",
	"msgDest": "",
	"timestamp": "",
	"id": "",
	"ruletype": "",
	"data": {
		"detail": {
			"rules": {
				"id": "0",
				"pid": "#",
				"childrens": [
					{
						"id": "0_0",
						"pid": "0",
						"type": "conditions",
						"option": "All",
				

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



 Base Model Output:
 {
	"featureId": "",
	"appName": "",
	"username": "",
	"password": "",
	"reqTxnId": "",
	"msgOrigin": "",
	"msgDest": "",
	"timestamp": "",
	"id": "",
	"ruletype": "",
	"data": {
		"detail": {
			"rules": {
				"id": "0",
				"pid": "#",
				"childrens": [
					{
						"id": "0_0",
						"pid": "0",
						"type": "conditions",
						"option": "All",
						"childrens": [
							{
								"id": "0_0_0",
								"pid": "0_0",
								"type": "condition",
								"profile": {
									"id": 1,
									"name": "TOTAL_RECHARGE_REV_LAST_30_DAYS"
								},
								"operator": "=",
								"values": {
									"value": "300"
								}
							},
							{
								"id": "0_0_1",
								"pid": "0_0",
								"type": "condition",
								"profile": {
									"id": 2,
									"name": "TOTAL_RECHARGE_REV_LAST_2_WEEKS"
								},
								"operator": ">",
								"values": {
									"value": "80"
								}
							},
							{
								"id": "0_0_2",
								"pid": "0_0",
							