In [3]:
!pip install peft
!pip install accelerate
!pip install bitsandBytes
!pip install transformers
!pip install datasets



In [4]:
!pip install GPUtil




In [5]:

import torch
import GPUtil
import os

GPUtil.showUtilization()

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU instead")

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"

os.environ["CUDA_VISIBLE_DEVICES"]="0"

| ID | GPU | MEM |
------------------
|  0 | 30% | 56% |
GPU is available


In [6]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, LlamaTokenizer
from huggingface_hub import notebook_login
from datasets import load_dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

if "COLAB_GPU" in os.environ:
  from google.colab import output
  output.enable_custom_widget_manager()

In [7]:
if "COLAB_GPU" in os.environ:
    !huggingface-cli login
else:
    %pip install ipywidgets
    from huggingface_hub import notebook_login
    notebook_login()

Note: you may need to restart the kernel to use updated packages.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, LlamaTokenizer
from huggingface_hub import notebook_login
from datasets import load_dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import os # Import os

if "COLAB_GPU" in os.environ:
  from google.colab import output
  output.enable_custom_widget_manager()

base_model_id = "meta-llama/Llama-3.2-1B"

# Check if GPU is available
if torch.cuda.is_available():
    print("GPU is available. Loading model with quantization.")
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)
    device = torch.device("cuda")
else:
    print("GPU is not available. Loading model without quantization (on CPU).")
    # Load model without quantization config
    model = AutoModelForCausalLM.from_pretrained(base_model_id)
    device = torch.device("cpu")

print(f"Model loaded on device: {device}")

GPU is available. Loading model with quantization.
Model loaded on device: cuda


In [9]:
!git clone https://github.com/Rajangupta9/context.git

fatal: destination path 'context' already exists and is not an empty directory.


In [10]:
from datasets import Dataset, Features, Value, Sequence
import json
import tempfile
import os
import shutil

def load_json_as_dataset(file_path, dataset_name="dataset"):
    """Load JSON file directly and convert to HuggingFace Dataset"""
    try:
        print(f"📂 Loading {dataset_name} directly from JSON...")
        
        # Read JSON file
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        print(f"   ✅ JSON loaded: {len(data)} records")
        
        # Create temporary directory for this specific dataset
        temp_dir = tempfile.mkdtemp(prefix=f"dataset_{dataset_name}_")
        
        try:
            # Create Dataset directly from the loaded data
            dataset = Dataset.from_list(data)
            print(f"   ✅ Dataset created successfully!")
            print(f"   📊 Number of examples: {len(dataset)}")
            print(f"   🏗️  Features: {list(dataset.features.keys())}")
            
            return dataset, None
            
        except Exception as e:
            print(f"   ❌ Error creating dataset: {str(e)}")
            return None, str(e)
        finally:
            # Clean up temporary directory
            try:
                shutil.rmtree(temp_dir)
            except:
                pass
                
    except Exception as e:
        print(f"   ❌ Error loading JSON: {str(e)}")
        return None, str(e)

def display_dataset_info(dataset, dataset_name):
    """Display comprehensive information about the dataset"""
    if dataset is None:
        print(f"❌ {dataset_name} is None")
        return
    
    print(f"\n📋 {dataset_name.upper()} DATASET INFO:")
    print(f"   📊 Total examples: {len(dataset)}")
    print(f"   🏗️  Features ({len(dataset.features)}):")
    
    for feature_name, feature_type in dataset.features.items():
        print(f"      • {feature_name}: {feature_type}")
    
    # Show first example
    if len(dataset) > 0:
        print(f"\n   📝 SAMPLE DATA (First Example):")
        example = dataset[0]
        for key, value in example.items():
            if isinstance(value, (list, dict)):
                if isinstance(value, list):
                    print(f"      • {key}: List with {len(value)} items")
                    if len(value) > 0:
                        print(f"        └─ First item type: {type(value[0])}")
                        if isinstance(value[0], dict) and len(value[0]) > 0:
                            first_keys = list(value[0].keys())[:3]
                            print(f"        └─ Sample keys: {first_keys}{'...' if len(value[0]) > 3 else ''}")
                else:
                    print(f"      • {key}: Dict with {len(value)} keys")
                    if len(value) > 0:
                        sample_keys = list(value.keys())[:3]
                        print(f"        └─ Keys: {sample_keys}{'...' if len(value) > 3 else ''}")
            else:
                # Truncate long strings for display
                str_value = str(value)
                if len(str_value) > 50:
                    str_value = str_value[:47] + "..."
                print(f"      • {key}: {str_value}")

def analyze_nested_structure(dataset, dataset_name):
    """Analyze nested structures in the dataset"""
    if dataset is None or len(dataset) == 0:
        return
    
    print(f"\n🔍 NESTED STRUCTURE ANALYSIS - {dataset_name.upper()}:")
    
    example = dataset[0]
    for key, value in example.items():
        if isinstance(value, list) and len(value) > 0:
            print(f"\n   📋 {key} (List):")
            print(f"      └─ Length: {len(value)}")
            
            if isinstance(value[0], dict):
                print(f"      └─ Item structure (first item):")
                for sub_key, sub_value in value[0].items():
                    print(f"         • {sub_key}: {type(sub_value).__name__}")
                    
        elif isinstance(value, dict) and len(value) > 0:
            print(f"\n   📋 {key} (Dict):")
            for sub_key, sub_value in value.items():
                print(f"      • {sub_key}: {type(sub_value).__name__}")

print("="*70)
print("🚀 DIRECT JSON TO DATASET CONVERSION")
print("="*70)

# Load both datasets directly
print("\n1️⃣ LOADING OUTPUT.JSON")
print("-" * 40)
output_dataset, output_error = load_json_as_dataset("context/output.json", "output")

print("\n2️⃣ LOADING TEMPLATE.JSON")
print("-" * 40)
template_dataset, template_error = load_json_as_dataset("context/template.json", "template")

print("\n" + "="*70)
print("📊 DETAILED DATASET INFORMATION")
print("="*70)

# Display detailed information for both datasets
if output_dataset:
    display_dataset_info(output_dataset, "output")
    analyze_nested_structure(output_dataset, "output")

if template_dataset:
    display_dataset_info(template_dataset, "template")
    analyze_nested_structure(template_dataset, "template")

print("\n" + "="*70)
print("🎯 FINAL SUMMARY")
print("="*70)

success_count = 0
if output_dataset:
    print(f"✅ OUTPUT DATASET: Successfully loaded with {len(output_dataset)} examples")
    success_count += 1
else:
    print(f"❌ OUTPUT DATASET: Failed to load - {output_error}")

if template_dataset:
    print(f"✅ TEMPLATE DATASET: Successfully loaded with {len(template_dataset)} examples")
    success_count += 1
else:
    print(f"❌ TEMPLATE DATASET: Failed to load - {template_error}")

if success_count == 2:
    print(f"\n🎉 SUCCESS! Both datasets are ready for use!")
    print(f"\n💡 Next steps:")
    print(f"   • Use output_dataset for component analysis")
    print(f"   • Use template_dataset for template structure analysis")
    print(f"   • Both datasets support all HuggingFace Dataset operations")
    print(f"   • You can now proceed with your machine learning tasks!")
    
    # Show some example operations you can perform
    print(f"\n🔧 Example operations you can now perform:")
    print(f"   • output_dataset.filter(lambda x: x['component'] == 'profile')")
    print(f"   • output_dataset.map(lambda x: {{...}})")
    print(f"   • output_dataset.select(range(5))  # Get first 5 examples")
    print(f"   • output_dataset.to_pandas()  # Convert to pandas DataFrame")
    
elif success_count == 1:
    print(f"\n⚠️  PARTIAL SUCCESS: One dataset loaded successfully")
else:
    print(f"\n❌ FAILURE: Neither dataset could be loaded")

print("\n" + "="*70)

🚀 DIRECT JSON TO DATASET CONVERSION

1️⃣ LOADING OUTPUT.JSON
----------------------------------------
📂 Loading output directly from JSON...
   ✅ JSON loaded: 8 records
   ✅ Dataset created successfully!
   📊 Number of examples: 8
   🏗️  Features: ['component', 'card_background', 'card_enable', 'card_open', '_id', 'pr_img', 'name', 'name_config', 'desc', 'desc_config', 'company', 'contact_shortcut_enable', 'enable_pr', 'contact_shortcuts', 'show_brand_img', 'enable_br', 'br_img']

2️⃣ LOADING TEMPLATE.JSON
----------------------------------------
📂 Loading template directly from JSON...
   ✅ JSON loaded: 1 records
   ✅ Dataset created successfully!
   📊 Number of examples: 1
   🏗️  Features: ['template_id', 'qr_codes']

📊 DETAILED DATASET INFORMATION

📋 OUTPUT DATASET INFO:
   📊 Total examples: 8
   🏗️  Features (17):
      • component: Value(dtype='string', id=None)
      • card_background: Value(dtype='int64', id=None)
      • card_enable: Value(dtype='int64', id=None)
      • card_o

In [11]:
from transformers import AutoTokenizer

# Use AutoTokenizer to automatically select the correct tokenizer class
tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=False, trust_remote_code=True, add_eos_token=True)

# Add padding token if it doesn't exist
if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Set padding side for consistency, often 'left' for generation, 'right' for training/classification
# Based on typical Llama usage for generation/training, 'right' is common
tokenizer.padding_side = "right"

# Display the tokenizer to confirm it loaded correctly
print(tokenizer)
print(f"Padding token: {tokenizer.pad_token}")
print(f"Padding token ID: {tokenizer.pad_token_id}")
print(f"EOS token: {tokenizer.eos_token}")
print(f"EOS token ID: {tokenizer.eos_token_id}")
print(f"Padding side: {tokenizer.padding_side}")

PreTrainedTokenizerFast(name_or_path='meta-llama/Llama-3.2-1B', vocab_size=128000, model_max_length=131072, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|begin_of_text|>', 'eos_token': '<|end_of_text|>', 'pad_token': '<|end_of_text|>'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	128000: AddedToken("<|begin_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128001: AddedToken("<|end_of_text|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128002: AddedToken("<|reserved_special_token_0|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128003: AddedToken("<|reserved_special_token_1|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128004: AddedToken("<|finetune_right_pad_id|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	128005: AddedToken("<|re

In [12]:
# Ensure tokenizer is defined (run cell 8 if not)
assert 'tokenizer' in globals(), "Please run the cell that defines 'tokenizer' (cell 8) before running this cell."

# Tokenize the datasets using an existing text column.
# Replace "component" and "qr_name" with the actual column(s) you want to tokenize.
# Note: template_dataset has 'qr_name' at the top level within the 'qr_codes' list of dictionaries.
# We need to access it correctly.

# For output_dataset, tokenizing the 'component' column as an example
# Batched=True is generally good for larger datasets
tokenized_output_dataset = output_dataset.map(
    lambda examples: tokenizer(examples["component"], padding=True, truncation=True),
    batched=True
)

# For template_dataset, tokenizing the 'qr_name' within the 'qr_codes' list as an example.
# Accessing nested data is simpler with batched=False for small datasets or complex structures
tokenized_template_dataset = template_dataset.map(
    lambda examples: tokenizer([qr_code['qr_name'] for qr_code in examples['qr_codes']], padding=True, truncation=True),
    batched=False # Changed to False to simplify access to nested list of dictionaries
)


print("\n" + "="*70)
print("📊 TOKENIZED DATASET INFO")
print("="*70)

if tokenized_output_dataset:
    display_dataset_info(tokenized_output_dataset, "tokenized_output")

if tokenized_template_dataset:
    display_dataset_info(tokenized_template_dataset, "tokenized_template")

print("\n🎉 Tokenization complete for both datasets!")
print("You can now proceed with training using 'tokenized_output_dataset' and 'tokenized_template_dataset'.")

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]


📊 TOKENIZED DATASET INFO

📋 TOKENIZED_OUTPUT DATASET INFO:
   📊 Total examples: 8
   🏗️  Features (19):
      • component: Value(dtype='string', id=None)
      • card_background: Value(dtype='int64', id=None)
      • card_enable: Value(dtype='int64', id=None)
      • card_open: Value(dtype='int64', id=None)
      • _id: Value(dtype='string', id=None)
      • pr_img: Value(dtype='string', id=None)
      • name: Value(dtype='string', id=None)
      • name_config: {}
      • desc: Value(dtype='string', id=None)
      • desc_config: {'align': Value(dtype='string', id=None), 'bold': Value(dtype='int64', id=None), 'italic': Value(dtype='int64', id=None), 'lock': Value(dtype='string', id=None)}
      • company: Value(dtype='string', id=None)
      • contact_shortcut_enable: Value(dtype='int64', id=None)
      • enable_pr: Value(dtype='int64', id=None)
      • contact_shortcuts: [{'_id': Value(dtype='string', id=None), 'type': Value(dtype='string', id=None), 'type_config': {}, 'value': Value(

In [13]:
tokenizer.eos_token

'<|end_of_text|>'

In [14]:
# Format the tokenized datasets for causal language modeling
# This is a general approach; you might need to adjust based on your specific task
# (e.g., instruction tuning requires specific chat templates)

def format_for_causal_lm(examples):
    # Concatenate input_ids and attention_mask
    # For simple causal LM, we often train on the entire sequence, so input_ids are the labels
    examples["labels"] = examples["input_ids"].copy()
    return examples

# Apply the formatting function to both tokenized datasets
# You might want to combine your datasets before this step for training
# For demonstration, applying to both separately
formatted_output_dataset = tokenized_output_dataset.map(format_for_causal_lm, batched=True)
formatted_template_dataset = tokenized_template_dataset.map(format_for_causal_lm, batched=True)


print("\n" + "="*70)
print("📊 FORMATTED DATASET INFO")
print("="*70)

if formatted_output_dataset:
    print("Formatted Output Dataset:")
    print(formatted_output_dataset)
    if len(formatted_output_dataset) > 0:
        print("\nFirst example (formatted_output_dataset):")
        # Display input_ids, attention_mask, and labels for the first example
        first_example = formatted_output_dataset[0]
        print(f"  Input IDs: {first_example['input_ids'][:50]}...") # Print first 50 token IDs
        print(f"  Attention Mask: {first_example['attention_mask'][:50]}...") # Print first 50 attention mask values
        print(f"  Labels: {first_example['labels'][:50]}...") # Print first 50 label IDs

if formatted_template_dataset:
    print("\nFormatted Template Dataset:")
    print(formatted_template_dataset)
    if len(formatted_template_dataset) > 0:
        print("\nFirst example (formatted_template_dataset):")
        # Display input_ids, attention_mask, and labels for the first example
        first_example = formatted_template_dataset[0]
        print(f"  Input IDs: {first_example['input_ids'][:50]}...") # Print first 50 token IDs
        print(f"  Attention Mask: {first_example['attention_mask'][:50]}...") # Print first 50 attention mask values
        print(f"  Labels: {first_example['labels'][:50]}...") # Print first 50 label IDs


print("\n🎉 Datasets formatted for causal language modeling!")
print("You now have 'formatted_output_dataset' and 'formatted_template_dataset' ready for fine-tuning.")
print("Remember to combine them or use them as needed for your training setup.")

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]


📊 FORMATTED DATASET INFO
Formatted Output Dataset:
Dataset({
    features: ['component', 'card_background', 'card_enable', 'card_open', '_id', 'pr_img', 'name', 'name_config', 'desc', 'desc_config', 'company', 'contact_shortcut_enable', 'enable_pr', 'contact_shortcuts', 'show_brand_img', 'enable_br', 'br_img', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 8
})

First example (formatted_output_dataset):
  Input IDs: [128000, 5478, 128001]...
  Attention Mask: [1, 1, 0]...
  Labels: [128000, 5478, 128001]...

Formatted Template Dataset:
Dataset({
    features: ['template_id', 'qr_codes', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1
})

First example (formatted_template_dataset):
  Input IDs: [[128000]]...
  Attention Mask: [[1]]...
  Labels: [[128000]]...

🎉 Datasets formatted for causal language modeling!
You now have 'formatted_output_dataset' and 'formatted_template_dataset' ready for fine-tuning.
Remember to combine them or use them as needed for your training

In [15]:
import torch
import transformers
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import DataCollatorForLanguageModeling
from datasets import Dataset

def clean_and_inspect_dataset(dataset, dataset_name="dataset"):
    """
    Clean the dataset and inspect its structure to identify problematic fields
    """
    print(f"\n🔍 Inspecting {dataset_name}:")
    print(f"Dataset type: {type(dataset)}")
    
    if len(dataset) == 0:
        print("❌ Dataset is empty!")
        return None
    
    # Look at the first example in detail
    first_example = dataset[0]
    print(f"📋 Fields in first example: {list(first_example.keys())}")
    
    # Analyze each field
    problematic_fields = []
    good_fields = {}
    
    for field_name, value in first_example.items():
        print(f"\n  📊 Field '{field_name}':")
        print(f"     Type: {type(value)}")
        
        if isinstance(value, list):
            if len(value) > 0:
                print(f"     Length: {len(value)}")
                print(f"     First element type: {type(value[0])}")
                print(f"     First few elements: {value[:5]}")
                
                # Check if it's a list of integers (good for input_ids, labels, attention_mask)
                if all(isinstance(x, int) for x in value[:10]):  # Check first 10 elements
                    good_fields[field_name] = value
                    print(f"     ✅ Looks good (list of integers)")
                else:
                    problematic_fields.append(field_name)
                    print(f"     ❌ Problematic (not all integers)")
            else:
                problematic_fields.append(field_name)
                print(f"     ❌ Empty list")
        elif isinstance(value, str):
            print(f"     Value preview: '{value[:100]}...'")
            problematic_fields.append(field_name)
            print(f"     ❌ String field (should be tokenized)")
        else:
            problematic_fields.append(field_name)
            print(f"     ❌ Unexpected type: {type(value)}")
    
    print(f"\n✅ Good fields: {list(good_fields.keys())}")
    print(f"❌ Problematic fields: {problematic_fields}")
    
    return good_fields, problematic_fields

def create_clean_dataset(original_dataset, required_fields=['input_ids', 'attention_mask', 'labels']):
    """
    Create a clean dataset with only the required fields
    """
    print(f"\n🧹 Cleaning dataset...")
    
    clean_examples = []
    
    for i, example in enumerate(original_dataset):
        clean_example = {}
        valid_example = True
        
        # Extract only the required fields
        for field in required_fields:
            if field in example:
                value = example[field]
                
                # Ensure it's a list of integers
                if isinstance(value, list) and len(value) > 0:
                    if all(isinstance(x, int) for x in value):
                        clean_example[field] = value
                    else:
                        print(f"❌ Example {i}: {field} contains non-integers")
                        valid_example = False
                        break
                else:
                    print(f"❌ Example {i}: {field} is not a valid list")
                    valid_example = False
                    break
            else:
                print(f"❌ Example {i}: Missing field {field}")
                valid_example = False
                break
        
        if valid_example:
            clean_examples.append(clean_example)
        else:
            print(f"⚠️ Skipping example {i} due to invalid data")
    
    if clean_examples:
        clean_dataset = Dataset.from_list(clean_examples)
        print(f"✅ Created clean dataset with {len(clean_dataset)} examples")
        return clean_dataset
    else:
        print("❌ No valid examples found!")
        return None

def fix_dataset_structure(dataset):
    """
    Alternative approach: try to fix the dataset structure automatically
    """
    print(f"\n🔧 Attempting to fix dataset structure...")
    
    if len(dataset) == 0:
        return None
    
    # Look for the pattern in your data
    first_example = dataset[0]
    
    # Common problematic patterns and fixes
    fixed_examples = []
    
    for i, example in enumerate(dataset):
        try:
            fixed_example = {}
            
            # Handle different field names and structures
            for field_name, value in example.items():
                if field_name in ['input_ids', 'attention_mask', 'labels']:
                    if isinstance(value, list):
                        # Ensure all elements are integers
                        if all(isinstance(x, int) for x in value):
                            fixed_example[field_name] = value
                        else:
                            # Try to convert to integers if possible
                            try:
                                fixed_example[field_name] = [int(x) for x in value]
                            except (ValueError, TypeError):
                                print(f"❌ Cannot convert {field_name} to integers in example {i}")
                                break
                    else:
                        print(f"❌ {field_name} is not a list in example {i}")
                        break
                # Skip other fields (like 'component' which is causing issues)
            
            # Only add if we have all required fields
            if all(field in fixed_example for field in ['input_ids', 'attention_mask', 'labels']):
                fixed_examples.append(fixed_example)
            else:
                print(f"⚠️ Example {i} missing required fields")
                
        except Exception as e:
            print(f"❌ Error processing example {i}: {e}")
            continue
    
    if fixed_examples:
        return Dataset.from_list(fixed_examples)
    else:
        return None

# =============================================================================
# DATASET CLEANING AND INSPECTION
# =============================================================================

print("\n" + "="*70)
print("🔍 DATASET INSPECTION AND CLEANING")
print("="*70)

# First, let's inspect your current dataset
good_fields, problematic_fields = clean_and_inspect_dataset(formatted_output_dataset, "formatted_output_dataset")

# Try to create a clean dataset
clean_dataset = None

# Method 1: Extract only good fields
if good_fields and all(field in good_fields for field in ['input_ids', 'attention_mask', 'labels']):
    print("\n🎯 Method 1: Using existing good fields...")
    clean_dataset = create_clean_dataset(formatted_output_dataset)

# Method 2: Fix dataset structure
if clean_dataset is None:
    print("\n🎯 Method 2: Attempting to fix dataset structure...")
    clean_dataset = fix_dataset_structure(formatted_output_dataset)

# Method 3: Manual reconstruction if needed
if clean_dataset is None:
    print("\n🎯 Method 3: Manual dataset reconstruction...")
    print("Your dataset might need manual reconstruction. Let's see what we can extract:")
    
    # Show detailed structure of first few examples
    for i in range(min(3, len(formatted_output_dataset))):
        print(f"\nExample {i}:")
        example = formatted_output_dataset[i]
        for key, value in example.items():
            if isinstance(value, list):
                print(f"  {key}: list[{len(value)}] - {value[:10]}...")
            elif isinstance(value, str):
                print(f"  {key}: str - '{value[:50]}...'")
            else:
                print(f"  {key}: {type(value)} - {value}")

if clean_dataset is None:
    print("\n❌ Could not create a clean dataset. Please check your data structure.")
    print("The dataset should have 'input_ids', 'attention_mask', and 'labels' fields,")
    print("each containing lists of integers.")
    exit()

# =============================================================================
# TRAINING SETUP WITH CLEAN DATASET
# =============================================================================

print(f"\n✅ Using clean dataset with {len(clean_dataset)} examples")

# Test the clean dataset structure
print("\n🧪 Testing clean dataset structure...")
test_example = clean_dataset[0]
print("Clean dataset first example:")
for key, value in test_example.items():
    print(f"  {key}: type={type(value)}, length={len(value)}, sample={value[:5]}...")

# Setup model for training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nUsing device: {device}")

# Apply LoRA (unload existing adapters first if they exist)
model = model.cpu()
torch.cuda.empty_cache()

if hasattr(model, 'peft_config'):
    print("⚠️ Unloading existing PEFT adapters...")
    model = model.unload()

config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
model = model.to(device)
model.print_trainable_parameters()

# Simple, robust data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=None,
    return_tensors="pt"
)

# Test the data collator with clean dataset
print("\n🧪 Testing data collator with clean dataset...")
try:
    test_samples = [clean_dataset[i] for i in range(min(2, len(clean_dataset)))]
    test_batch = data_collator(test_samples)
    
    print("✅ Data collator test successful!")
    print(f"Batch keys: {list(test_batch.keys())}")
    for key, value in test_batch.items():
        if torch.is_tensor(value):
            print(f"  {key}: shape={value.shape}, dtype={value.dtype}")
            
except Exception as e:
    print(f"❌ Data collator test still failing: {e}")
    print("\nLet's try a custom data collator...")
    
    # Custom data collator as last resort
    def custom_data_collator(features):
        # Manual batching
        batch = {}
        keys = features[0].keys()
        
        for key in keys:
            values = [f[key] for f in features]
            
            # Find max length for padding
            max_len = max(len(v) for v in values)
            
            # Pad sequences
            padded_values = []
            for v in values:
                if len(v) < max_len:
                    if key == 'labels':
                        # Pad labels with -100 (ignore index)
                        padded = v + [-100] * (max_len - len(v))
                    else:
                        # Pad other fields with 0
                        padded = v + [0] * (max_len - len(v))
                else:
                    padded = v
                padded_values.append(padded)
            
            batch[key] = torch.tensor(padded_values, dtype=torch.long)
        
        return batch
    
    # Test custom collator
    try:
        test_batch = custom_data_collator(test_samples)
        print("✅ Custom data collator works!")
        data_collator = custom_data_collator
    except Exception as e:
        print(f"❌ Even custom data collator failed: {e}")
        exit()

# Training arguments
training_args = transformers.TrainingArguments(
    output_dir="./finetunedModel",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    learning_rate=1e-4,
    max_steps=10,  # Small number for testing
    bf16=torch.cuda.is_available() and torch.cuda.get_device_properties(0).major >= 8,
    optim="paged_adamw_8bit",
    logging_dir="./log",
    save_strategy="steps",
    save_steps=5,
    logging_steps=1,
    report_to="none",
    dataloader_pin_memory=False,
    dataloader_num_workers=0,
    remove_unused_columns=False,
    gradient_checkpointing=True,
    logging_first_step=True,
    eval_strategy="no",
)

model.config.use_cache = False

# Initialize trainer
trainer = transformers.Trainer(
    model=model,
    train_dataset=clean_dataset,
    args=training_args,
    data_collator=data_collator,
    processing_class=tokenizer,  # Use processing_class instead of tokenizer (new API)
)

print("\n" + "="*70)
print("🚀 STARTING TRAINING WITH CLEAN DATASET")
print("="*70)

try:
    trainer.model.zero_grad()
    print("🏃 Beginning training loop...")
    trainer.train()
    print("\n🎉 Training completed successfully!")
    
except Exception as e:
    print(f"\n❌ Training error: {e}")
    import traceback
    traceback.print_exc()

print("\n" + "="*70)
print("✅ TRAINING COMPLETE")
print("="*70)

# Save model if training succeeded
if hasattr(trainer, 'state') and trainer.state.global_step > 0:
    print("💾 Saving model...")
    trainer.save_model("./finetunedModel")
    tokenizer.save_pretrained("./finetunedModel")
    print("✅ Model saved successfully!")
else:
    print("⚠️ No training progress, model not saved.")

print(f"\n📊 Final dataset info: {len(clean_dataset)} examples successfully processed")


🔍 DATASET INSPECTION AND CLEANING

🔍 Inspecting formatted_output_dataset:
Dataset type: <class 'datasets.arrow_dataset.Dataset'>
📋 Fields in first example: ['component', 'card_background', 'card_enable', 'card_open', '_id', 'pr_img', 'name', 'name_config', 'desc', 'desc_config', 'company', 'contact_shortcut_enable', 'enable_pr', 'contact_shortcuts', 'show_brand_img', 'enable_br', 'br_img', 'input_ids', 'attention_mask', 'labels']

  📊 Field 'component':
     Type: <class 'str'>
     Value preview: 'profile...'
     ❌ String field (should be tokenized)

  📊 Field 'card_background':
     Type: <class 'int'>
     ❌ Unexpected type: <class 'int'>

  📊 Field 'card_enable':
     Type: <class 'int'>
     ❌ Unexpected type: <class 'int'>

  📊 Field 'card_open':
     Type: <class 'int'>
     ❌ Unexpected type: <class 'int'>

  📊 Field '_id':
     Type: <class 'str'>
     Value preview: 'uFpMGYHp1750825849079H...'
     ❌ String field (should be tokenized)

  📊 Field 'pr_img':
     Type: <class 

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 0 || all params: 1,241,450,496 || trainable%: 0.0000

🧪 Testing data collator with clean dataset...
✅ Data collator test successful!
Batch keys: ['input_ids', 'attention_mask', 'labels']
  input_ids: shape=torch.Size([2, 3]), dtype=torch.int64
  attention_mask: shape=torch.Size([2, 3]), dtype=torch.int64
  labels: shape=torch.Size([2, 3]), dtype=torch.int64

🚀 STARTING TRAINING WITH CLEAN DATASET
🏃 Beginning training loop...


Step,Training Loss
1,6.6296
2,7.5089
3,7.4876
4,6.6589
5,6.8539
6,7.3949
7,6.6589
8,7.4876
9,7.5955
10,6.5104



🎉 Training completed successfully!

✅ TRAINING COMPLETE
💾 Saving model...
✅ Model saved successfully!

📊 Final dataset info: 8 examples successfully processed


In [24]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch

base_model_id = "meta-llama/Llama-3.2-1B"  # replace with your correct model

nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=False)

# Fix pad token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Load model
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=nf4_config,
    device_map="auto",
    trust_remote_code=True
)

# Load PEFT adapter
peft_model_path = "./finetunedModel/checkpoint-10"
model = PeftModel.from_pretrained(base_model, peft_model_path)

# Set to eval mode
model.eval()


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 2048)
        (layers): ModuleList(
          (0-15): 16 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.

In [27]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import torch

base_model_id = "meta-llama/Llama-3.2-1B"

nf4Config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# ✅ Use auto device placement (no manual map)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=nf4Config,
    device_map={"": "cpu"},  # Force all layers to CPU
    trust_remote_code=True
)


# Load PEFT adapter
peft_model_id = "./finetunedModel/checkpoint-10"
model = PeftModel.from_pretrained(base_model, peft_model_id)

# Prepare tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=False, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

model.eval()

print("\n✅ Fine-tuned model loaded and set to evaluation mode.")
print(f"Adapter loaded from: {peft_model_id}")



✅ Fine-tuned model loaded and set to evaluation mode.
Adapter loaded from: ./finetunedModel/checkpoint-10


In [28]:
user_question = "do you know qrcode componant and desc and links"

eval_prompt = f"Question: {user_question} Just answer this question accurately and concisely.\n"

promptTokenized = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()

with torch.no_grad():
  print(tokenizer.decode(model.generate(**promptTokenized, max_new_tokens=1024)[0], skip_special_tokens=True))
  torch.cuda.empty_cache()

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [None]:

tokenizer = LlamaTokenizer.from_pretrained(base_model_id, use_fast=False, trust_remote_code=True, add_eos_token=True

                              )

modelFinetuned = PeftModel.from_pretrained(base_model, "finetunedModel/checkpoint-10")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'. 
The class this function is called from is 'LlamaTokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


TypeError: not a string

In [25]:
# Function to generate text
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

def generate_text(prompt, model, tokenizer, max_new_tokens=2048):
    # Encode the prompt
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    # Generate text
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id, # Use the defined pad token ID
            eos_token_id=tokenizer.eos_token_id # Use the defined EOS token ID
        )

    # Decode the generated text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    return generated_text

# Example usage:
prompt = """
You are a JSON generation system that formats user business card data into structured JSON.

You will receive:
1. User information (name, phone, email, company, etc.)
2. A JSON template with placeholders or default data.

Your task is:
- Parse the user info and merge it into the template
- Replace placeholders in JSON with actual user values
- Ensure the JSON format remains valid and fully populated
- Do not skip any components (e.g., profile, contact, web_links, etc.)
- Return ONLY a JSON array as final output (no explanation or extra notes)

User Info:
Name: Rajan
Designation: Mr
Company: Tez Minds
Phone: 9709590075
Email: rajang797@gmail.com
Website: https://www.mycoolbrand.com
Socials: Facebook, Instagram, Twitter
Description: Description
Address: Street, City, State, Zipcode, Country
Calendly: Link to book meetings

JSON Template:
(paste your full template JSON array here)

Now output the updated JSON:
[
    {
        "component": "profile",
        "card_background": 0,
        "card_enable": 1,
        "card_open": 1,
        "_id": "uFpMGYHp1750825849079H",
        "pr_img": "https://www.qrcodechimp.com/images/digitalCard/dbcv2/profile_1.webp?v=1750825849004",
        "name": "Rajan ",
        "name_config": {},
        "desc": "Mr",
        "desc_config": {},
        "company": "Tez Minds",
        "contact_shortcut_enable": 1,
        "enable_pr": 1,
        "contact_shortcuts": [
            {
                "_id": "woMsSJ9x17508258490051",
                "type": "mobile",
                "type_config": {},
                "value": "9709590075",
                "value_config": {}
            },
            {
                "_id": "3xyoEBz117508258490052",
                "type": "email",
                "type_config": {},
                "value": "rajang797@gmail.com",
                "value_config": {}
            },
            {
                "_id": "92zAMWLI17508258490053",
                "type": "sms",
                "type_config": {},
                "value": "9709590075",
                "value_config": {}
            }
        ],
        "show_brand_img": 1,
        "enable_br": 1,
        "br_img": "https://www.qrcodechimp.com/images/digitalCard/dbcv2/barand_logo_9.webp?v=1750825849004"
    },
    {
        "component": "text_desc",
        "title": "About Me",
        "desc": "Description",
        "title_config": {
            "bold": 1,
            "italic": 0,
            "align": "center",
            "lock": "unlock"
        },
        "desc_config": {
            "bold": 0,
            "italic": 0,
            "align": "center",
            "lock": "unlock"
        },
        "_id": "QUtpTNrB1750825849079L",
        "card_enable": 1
    },
    {
        "component": "contact",
        "contact_title": "Contact Us",
        "icon_img": "/images/digitalCard/contactus.png",
        "floating_button_enable": 1,
        "floating_button_label": "Add to Contact",
        "ebusiness_card_enable": 1,
        "contact_infos": [
            {
                "type": "number",
                "title": "Call Us",
                "label": "Mobile ",
                "number": "123 456 7890",
                "_id": "60jggoha1750825849079N"
            },
            {
                "type": "email",
                "title": "Email",
                "label": "Email ",
                "email": "contactme@domain.com",
                "_id": "ziyf6bLt1750825849079O"
            },
            {
                "type": "address",
                "title": "Address",
                "street": "Street",
                "city": "City",
                "country": "Country",
                "state": "State",
                "zip": "Zipcode",
                "action_button_enable": 1,
                "action_button_label": "Direction",
                "action_button_link": "#",
                "_id": "ndD3M75Y1750825849079P"
            }
        ],
        "_id": "2p9TJT6q1750825849079M",
        "card_enable": 1
    },
    {
        "component": "images",
        "header_enable": 0,
        "title": "",
        "desc": "",
        "title_config": {
            "bold": 1,
            "italic": 0,
            "align": "center",
            "lock": "unlock"
        },
        "desc_config": {
            "bold": 0,
            "italic": 0,
            "align": "center",
            "lock": "unlock"
        },
        "view_type": "list",
        "images": [
            "/images/digitalCard/image_1.png",
            "/images/digitalCard/image_2.png",
            "/images/digitalCard/image_1.png",
            "/images/digitalCard/image_2.png"
        ],
        "_id": "mhqSEqL11750825849079Q",
        "card_enable": 1
    },
    {
        "component": "social_link",
        "header_enable": 1,
        "title": "Social Links",
        "desc": "Description",
        "title_config": {
            "bold": 1,
            "italic": 0,
            "align": "center",
            "lock": "unlock"
        },
        "desc_config": {
            "bold": 0,
            "italic": 0,
            "align": "center",
            "lock": "unlock"
        },
        "links": [
            {
                "type": "facebook",
                "url": "",
                "title": "Facebook",
                "subtitle": "Follow us on Facebook",
                "subtitle_enable": 1,
                "icon_img": "/images/digitalCard/fb_icon@72x.png",
                "_id": "3uQie4Y91750825849080S"
            },
            {
                "type": "instagram",
                "url": "",
                "title": "Instagram",
                "subtitle": "Follow us on Instagram",
                "subtitle_enable": 0,
                "icon_img": "/images/digitalCard/insta_icon@72x.png",
                "_id": "VL4W2Xnt1750825849080T"
            },
            {
                "type": "twitter",
                "url": "",
                "title": "Twitter",
                "subtitle": "Follow us on Twitter",
                "subtitle_enable": 0,
                "icon_img": "/images/digitalCard/tw_icon@72x.png",
                "_id": "Jkyh7VyZ1750825849080U"
            }
        ],
        "_id": "M8CTSg2A1750825849079R",
        "card_enable": 1
    },
    {
        "component": "web_links",
        "header_enable": 1,
        "title": "Web Links",
        "desc": "Description",
        "title_config": {
            "bold": 1,
            "italic": 0,
            "align": "center",
            "lock": "unlock"
        },
        "desc_config": {
            "bold": 0,
            "italic": 0,
            "align": "center",
            "lock": "unlock"
        },
        "links": [
            {
                "url": "https://www.mycoolbrand.com",
                "title": "Title",
                "subtitle": "Sub Title",
                "subtitle_enable": 1,
                "icon_img": "/images/digitalCard/weblink.png",
                "_id": "f7ZG9Fxq1750825849080W"
            }
        ],
        "_id": "EtyUyAYA1750825849080V",
        "card_enable": 1
    },
    {
        "component": "appointment",
        "header_enable": 1,
        "title": "Schedule Meeting",
        "desc": "Schedule a meeting to discuss potential opportunities for collaboration",
        "title_config": {
            "bold": 1,
            "italic": 0,
            "align": "center",
            "lock": "unlock"
        },
        "desc_config": {
            "bold": 0,
            "italic": 0,
            "align": "center",
            "lock": "unlock"
        },
        "appointments": [
            {
                "link": "",
                "label": "Book on Calendly"
            },
            {
                "link": "",
                "label": "Add to Calendar"
            }
        ],
        "card_enable": 1,
        "_id": "sOOYMPOD1750825849080X"
    },
    {
        "component": "form",
        "card_label": "Collect Contacts",
        "card_delete_disabled": 1,
        "card_enable": 0,
        "card_desc": "Enable this feature to collect your prospect's contact details",
        "form_name": "Contact Collection",
        "form_config": [
            {
                "header": {
                    "title": "Hi, great to connect with you!",
                    "desc": "Please provide the information below to proceed further",
                    "header_enable": 1
                },
                "enable_header_img": 1,
                "header_img": "/images/defaultImages/businesspage/b_brand_logo.png",
                "form_fields": [
                    {
                        "type": "oneLine",
                        "label": "Your Name",
                        "required": true,
                        "_id": "sg7CjRcf1750825849010D"
                    },
                    {
                        "type": "email",
                        "label": "Your Email",
                        "required": true,
                        "_id": "7plFZYz31750825849010E"
                    },
                    {
                        "type": "tel",
                        "label": "Your Phone",
                        "required": true,
                        "_id": "TSlYbths1750825849010F"
                    }
                ],
                "button_label": "Submit",
                "terms_label": "I agree to Terms and Privacy Policy"
            }
        ],
        "view_config": {
            "delay_time": "1",
            "dismiss_form": 1,
            "form_trigger": "delay",
            "form_view": "full",
            "view_type": "overlay"
        },
        "form_integration": {},
        "_id": "rfwgQgSc1750825849080Y"
    }
]"""

print(f"\nGenerating text with prompt: '{prompt}'")
generated_output = generate_text(prompt, model, tokenizer)
print(f"Generated text:\n{generated_output}")

# Example using a prompt related to your data (e.g., a component or QR name)
# Replace with a relevant prompt based on the 'component' or 'qr_name' data
prompt_related_to_data = "Describe the 'profile' component: "
print(f"\nGenerating text with prompt: '{prompt_related_to_data}'")
generated_output_related = generate_text(prompt_related_to_data, model, tokenizer)
print(f"Generated text:\n{generated_output_related}")

prompt_related_to_data_qr = "Explain the purpose of 'MyQR.png': "
print(f"\nGenerating text with prompt: '{prompt_related_to_data_qr}'")
generated_output_qr = generate_text(prompt_related_to_data_qr, model, tokenizer)
print(f"Generated text:\n{generated_output_qr}")


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Generating text with prompt: '
You are a JSON generation system that formats user business card data into structured JSON.

You will receive:
1. User information (name, phone, email, company, etc.)
2. A JSON template with placeholders or default data.

Your task is:
- Parse the user info and merge it into the template
- Replace placeholders in JSON with actual user values
- Ensure the JSON format remains valid and fully populated
- Do not skip any components (e.g., profile, contact, web_links, etc.)
- Return ONLY a JSON array as final output (no explanation or extra notes)

User Info:
Name: Rajan
Designation: Mr
Company: Tez Minds
Phone: 9709590075
Email: rajang797@gmail.com
Website: https://www.mycoolbrand.com
Socials: Facebook, Instagram, Twitter
Description: Description
Address: Street, City, State, Zipcode, Country
Calendly: Link to book meetings

JSON Template:
(paste your full template JSON array here)

Now output the updated JSON:
[
    {
        "component": "profile",
       

KeyboardInterrupt: 

In [None]:
import torch
import json

# Your model and tokenizer are already loaded from the previous setup
# No need to reload them - using the existing 'model' and 'tokenizer' variables

user_question = """Digital Business Card - CV Demo
Rajan Gupta - Full Stack Developer
Tez Minds
Email: rajan@example.com | Phone: 9876543210
Address: 123 Developer Lane, Code City, India
About Me
A passionate developer with 5+ years of experience in building scalable web and backend systems.
Enthusiastic about Go, React, and cloud architecture.
Skills
Go, JavaScript, React, Node.js, MongoDB, REST APIs, Docker, Kubernetes
Projects
Parcel Management System
A web app to track and manage parcel deliveries, with features like booking, invoice generation, and
customer support.
AI Prompt Platform
Built a backend service that dynamically generates prompts using LLaMA for digital card generation
based on user input.
Education
B.Tech in Computer Science - ABC University (2016-2020)
Languages
English, Hindi
Social Links
LinkedIn: https://linkedin.com/in/rajang
GitHub: https://github.com/rajan-dev
Twitter: @rajan_dev"""
# JSON template as a separate variable for better readability
json_template = {
    "template_id": "b_685bb94064ea9b78d567cade",
    "qr_codes": [
        {
            "qr_name": "",
            "short_url": "",
            "content": [
                {
                    "component": "profile",
                    "pr_img": "/images/digitalCard/dbcv2/profile_1.webp",
                    "br_img": "/images/digitalCard/dbcv2/barand_logo_9.webp",
                    "name": "Name",
                    "desc": "Title",
                    "company": "Company",
                    "contact_shortcuts": [
                        {"type": "mobile", "value": "0000000000"},
                        {"type": "email", "value": "youremail@domain.com"},
                        {"type": "sms", "value": "0000000000"}
                    ]
                },
                {
                    "component": "text_desc",
                    "title": "About Me",
                    "desc": "Description",
                    "title_config": {"bold": 1, "italic": 0, "align": "center", "lock": "unlock"},
                    "desc_config": {"bold": 0, "italic": 0, "align": "center", "lock": "unlock"}
                },
                {
                    "component": "contact",
                    "contact_title": "Contact Us",
                    "icon_img": "/images/digitalCard/contactus.png",
                    "floating_button_label": "Add to Contact",
                    "ebusiness_card_enable": 1,
                    "contact_infos": [
                        {"type": "number", "title": "Call Us", "label": "Mobile ", "number": "123 456 7890"},
                        {"type": "email", "title": "Email", "label": "Email ", "email": "contactme@domain.com"},
                        {
                            "type": "address",
                            "title": "Address",
                            "street": "Street",
                            "city": "City",
                            "country": "Country",
                            "state": "State",
                            "zip": "Zipcode",
                            "action_button_label": "Direction",
                            "action_button_link": "#"
                        }
                    ]
                },
                {
                    "component": "images",
                    "title": "",
                    "desc": "",
                    "title_config": {"bold": 1, "italic": 0, "align": "center", "lock": "unlock"},
                    "desc_config": {"bold": 0, "italic": 0, "align": "center", "lock": "unlock"},
                    "view_type": "list",
                    "images": [
                        "/images/digitalCard/image_1.png",
                        "/images/digitalCard/image_2.png",
                        "/images/digitalCard/image_1.png",
                        "/images/digitalCard/image_2.png"
                    ]
                },
                {
                    "component": "social_link",
                    "title": "Social Links",
                    "desc": "Description",
                    "title_config": {"bold": 1, "italic": 0, "align": "center", "lock": "unlock"},
                    "desc_config": {"bold": 0, "italic": 0, "align": "center", "lock": "unlock"},
                    "links": [
                        {
                            "type": "facebook",
                            "url": "",
                            "title": "Facebook",
                            "subtitle": "Follow us on Facebook",
                            "icon_img": "/images/digitalCard/fb_icon@72x.png"
                        },
                        {
                            "type": "instagram",
                            "url": "",
                            "title": "Instagram",
                            "subtitle": "Follow us on Instagram",
                            "icon_img": "/images/digitalCard/insta_icon@72x.png"
                        },
                        {
                            "type": "twitter",
                            "url": "",
                            "title": "Twitter",
                            "subtitle": "Follow us on Twitter",
                            "icon_img": "/images/digitalCard/tw_icon@72x.png"
                        }
                    ]
                },
                {
                    "component": "web_links",
                    "title": "Web Links",
                    "desc": "Description",
                    "title_config": {"bold": 1, "italic": 0, "align": "center", "lock": "unlock"},
                    "desc_config": {"bold": 0, "italic": 0, "align": "center", "lock": "unlock"},
                    "links": [
                        {
                            "url": "https://www.mycoolbrand.com",
                            "title": "Title",
                            "subtitle": "Sub Title",
                            "icon_img": "/images/digitalCard/weblink.png"
                        }
                    ]
                },
                {
                    "component": "appointment",
                    "title": "Schedule Meeting",
                    "desc": "Schedule a meeting to discuss potential opportunities for collaboration",
                    "title_config": {"bold": 1, "italic": 0, "align": "center", "lock": "unlock"},
                    "desc_config": {"bold": 0, "italic": 0, "align": "center", "lock": "unlock"},
                    "appointments": [
                        {"link": "", "label": "Book on Calendly"},
                        {"link": "", "label": "Add to Calendar"}
                    ]
                },
                {
                    "component": "form",
                    "card_label": "Collect Contacts",
                    "card_delete_disabled": 1,
                    "card_desc": "Enable this feature to collect your prospect's contact details",
                    "form_name": "Contact Collection",
                    "form_config": [
                        {
                            "header": {
                                "title": "Hi, great to connect with you!",
                                "desc": "Please provide the information below to proceed further",
                                "header_enable": 1
                            },
                            "enable_header_img": 1,
                            "header_img": "/images/defaultImages/businesspage/b_brand_logo.png",
                            "form_fields": [
                                {"type": "oneLine", "label": "Your Name", "required": True, "_id": "m5jMN0971750841633067D"},
                                {"type": "email", "label": "Your Email", "required": True, "_id": "8gWDWfpY1750841633067E"},
                                {"type": "tel", "label": "Your Phone", "required": True, "_id": "y4zLRmhk1750841633067F"}
                            ],
                            "button_label": "Submit",
                            "terms_label": "I agree to Terms and Privacy Policy"
                        }
                    ],
                    "view_config": {
                        "delay_time": "1",
                        "dismiss_form": 1,
                        "form_trigger": "delay",
                        "form_view": "full",
                        "view_type": "overlay"
                    },
                    "form_integration": []
                }
            ]
        }
    ]
}

# Create the evaluation prompt
eval_prompt = f"""Question: {user_question}

check user in full detailed mannar and extract it all info and integrate all info this json template all field are required do not loss any fiels and donot add any field
and i want all data integrated in this json formate strictly follow this json template

{json.dumps(json_template, indent=2)}"""
# Tokenize the prompt

prompt_tokenized = tokenizer(eval_prompt, return_tensors="pt", padding=True, truncation=True, max_length=2048)
# Move to the same device as your model (should be auto-handled by device_map="auto")
prompt_tokenized = {k: v.to(model.device) for k, v in prompt_tokenized.items()}

# Generate response
model.eval()
with torch.no_grad():
    output_tokens = model.generate(
        **prompt_tokenized,
        max_new_tokens=2048,
        pad_token_id=tokenizer.pad_token_id,
        do_sample=True,
        temperature=0.7,
        top_p=0.9
    )

    # Decode only the new tokens (excluding the input prompt)
    input_length = prompt_tokenized['input_ids'].shape[1]
    generated_tokens = output_tokens[0][input_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    print("Generated Response:")
    print(response)

    # Clear GPU cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()