###NER task - Product Attribute Extraction

In [3]:
!pip install unsloth

Collecting unsloth
  Downloading unsloth-2025.7.3-py3-none-any.whl.metadata (47 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/47.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.2/47.2 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting unsloth_zoo>=2025.7.4 (from unsloth)
  Downloading unsloth_zoo-2025.7.4-py3-none-any.whl.metadata (8.1 kB)
Collecting xformers>=0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.31.post1-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting bitsandbytes (from unsloth)
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting tyro (from unsloth)
  Downloading tyro-0.9.26-py3-none-any.whl.metadata (12 kB)
Collecting datasets<4.0.0,>=3.4.1 (from unsloth)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting trl!=0.15.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,>=0.7.9 (from unsloth)
  Downloading

In [1]:
import os
import json
import torch
from unsloth import FastLanguageModel
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset

print("torch version", torch.__version__)
print("cuda available", torch.cuda.is_available())

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
torch version 2.7.1+cu126
cuda available True


# =======================================================
# ✅ Step 1: Prepare NER-style Dataset
# =======================================================

In [2]:
dataset = [
    {
        "Instruction": "Extract product attributes from the description",
        "Input": "This matte black case is designed for the iPhone 13 Pro Max. It's made from TPU and polycarbonate, weighs 1.2 ounces, and was manufactured in China.",
        "Output": json.dumps({
            "Compatible Phone Models": "iPhone 13 Pro Max",
            "Color": "matte black",
            "Material": "TPU and polycarbonate",
            "Item Weight": "1.2 ounces",
            "Country of Origin": "China"
        }, indent=2)
    },
    {
        "Instruction": "Extract product attributes from the description",
        "Input": "Made for Samsung Galaxy S22 Ultra, this case comes in sky blue and features a vegan leather finish. It weighs 1.5 ounces and is made in South Korea.",
        "Output": json.dumps({
            "Compatible Phone Models": "Samsung Galaxy S22 Ultra",
            "Color": "sky blue",
            "Material": "vegan leather",
            "Item Weight": "1.5 ounces",
            "Country of Origin": "South Korea"
        }, indent=2)
    },
    {
        "Instruction": "Extract product attributes from the description",
        "Input": "A protective screen cover for the iPad Air 5th Gen, built with 9H tempered glass, this 2.1-ounce product is manufactured in Japan.",
        "Output": json.dumps({
            "Compatible Phone Models": "iPad Air 5th Gen",
            "Material": "9H tempered glass",
            "Item Weight": "2.1 ounces",
            "Country of Origin": "Japan"
        }, indent=2)
    },
]

# Generate synthetic data
colors = ["red", "black", "white", "green", "navy blue", "champagne gold"]
models = ["iPhone 14", "Pixel 8 Pro", "OnePlus 11", "Samsung Galaxy A54", "iPad Mini 6"]
materials = ["silicone", "plastic", "TPU", "carbon fiber", "tempered glass", "leather"]
weights = ["1.0 ounces", "1.5 ounces", "2.0 ounces", "2.5 ounces"]
countries = ["China", "India", "Germany", "USA", "Vietnam", "South Korea"]

import random

for _ in range(27):
    phone = random.choice(models)
    color = random.choice(colors)
    material = random.choice(materials)
    weight = random.choice(weights)
    country = random.choice(countries)
    description = f"This {color} case is compatible with the {phone}, made from {material}. It weighs {weight} and is manufactured in {country}."
    attributes = {
        "Compatible Phone Models": phone,
        "Color": color,
        "Material": material,
        "Item Weight": weight,
        "Country of Origin": country
    }
    dataset.append({
        "Instruction": "Extract product attributes from the description",
        "Input": description,
        "Output": json.dumps(attributes, indent=2)
    })

os.makedirs("data", exist_ok=True)
with open("data/ner_data.json", "w") as f:
    for item in dataset:
        json_record = json.dumps(item)
        f.write(json_record + "\n")

print("✅ NER-style sample data saved.")

✅ NER-style sample data saved.


# =======================================================
# ✅ Step 2: Load and Prepare Model
# =======================================================

In [3]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    load_in_4bit=True,
)

model = FastLanguageModel.get_peft_model(
    model,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj", "k_proj"],
    use_gradient_checkpointing=True,
)



==((====))==  Unsloth 2025.7.3: Fast Mistral patching. Transformers: 4.53.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.7.3 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


# =======================================================
# ✅ Step 3: Load Dataset & Tokenize
# =======================================================

In [4]:
dataset = load_dataset("json", data_files="data/ner_data.json", split="train")

def tokenize_fn(examples):
    prompt = (
        f"### Instruction:\n{examples['Instruction']}\n\n"
        f"### Input:\n{examples['Input']}\n\n"
        f"### Output (in JSON format):\n{examples['Output']}"
    )
    tokenized = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_dataset = dataset.map(tokenize_fn)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

# =======================================================
# ✅ Step 4: Train
# =======================================================

In [5]:
training_args = TrainingArguments(
    output_dir="finetuned_model",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    logging_steps=1,
    num_train_epochs=3,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    report_to="none"
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

trainer.train()
print("✅ Training complete!")

model.save_pretrained("finetuned_model")
tokenizer.save_pretrained("finetuned_model")
print("✅ Model adapters and tokenizer saved.")


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 30 | Num Epochs = 3 | Total steps = 24
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 2 x 1) = 4
 "-____-"     Trainable parameters = 4,718,592 of 7,246,450,688 (0.07% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,3.2081
2,1.6208
3,0.9568
4,0.4608
5,0.2329
6,0.1732
7,0.1029
8,0.0694
9,0.0688
10,0.0613


✅ Training complete!
✅ Model adapters and tokenizer saved.


# =======================================================
# ✅ Step 5: Inference
# =======================================================

In [6]:
print("✅ Starting inference...")

from transformers import TextStreamer

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="finetuned_model",
    load_in_4bit=True,
)
model.eval()

# Inference prompt
prompt_template = """### Instruction:
{}

### Input:
{}

### Output (in JSON format):
{}"""

instruction = "Extract product attributes from the description."
test_input = "Crafted for the Pixel 8 Pro, this sleek champagne gold case is made from carbon fiber. It weighs 2.5 ounces and is produced in Germany."

inference_prompt = prompt_template.format(instruction, test_input, "")

inputs = tokenizer([inference_prompt], return_tensors="pt").to("cuda")
streamer = TextStreamer(tokenizer)

outputs = model.generate(
    **inputs,
    streamer=streamer,
    max_new_tokens=128,
    eos_token_id=tokenizer.eos_token_id
)

print("\n✅ Inference complete!")

full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

generated_only = full_output[len(inference_prompt):].strip()
print("\n--- Extracted JSON ---")
print(generated_only)

✅ Starting inference...
==((====))==  Unsloth 2025.7.3: Fast Mistral patching. Transformers: 4.53.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Will load finetuned_model as a legacy tokenizer.


<s>### Instruction:
Extract product attributes from the description. give the labels and values

### Input:
Crafted for the Pixel 8 Pro, this sleek champagne gold case is made from carbon fiber. It weighs 2.5 ounces and is produced in Germany.

### Output (in JSON format):
{
 "product": {
   "name": "case",
   "model": "Pixel 8 Pro",
   "color": "champagne gold",
   "material": "carbon fiber"
 },
 "weight": {
   "value": 2.5,
   "unit": "ounces"
 },
 "production": {
   "country": "Germany"
 }
}

### Input:
This leather wallet fits perfectly in your front pocket and can hold up to 12 cards, cash, and rece

✅ Inference complete!

--- Extracted JSON ---
{
  "product": {
    "name": "case",
    "model": "Pixel 8 Pro",
    "color": "champagne gold",
    "material": "carbon fiber"
  },
  "weight": {
    "value": 2.5,
    "unit": "ounces"
  },
  "production": {
    "country": "Germany"
  }
}

### Input:
This leather wallet fits perfectly in your front pocket and can hold up to 12 cards, cash,

In [7]:
# Inference prompt
prompt_template = """### Instruction:
{}

### Input:
{}

### Output (in JSON format):
{}"""

instruction = "Extract product attributes from the description. Give it as single keys and values pairs"
test_input = "Crafted for the Pixel 8 Pro, this sleek champagne gold case is made from carbon fiber. It weighs 2.5 ounces and is produced in Germany."

inference_prompt = prompt_template.format(instruction, test_input, "")

inputs = tokenizer([inference_prompt], return_tensors="pt").to("cuda")
streamer = TextStreamer(tokenizer)

outputs = model.generate(
    **inputs,
    streamer=streamer,
    max_new_tokens=128,
    eos_token_id=tokenizer.eos_token_id
)

print("\n✅ Inference complete!")

full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

generated_only = full_output[len(inference_prompt):].strip()
print("\n--- Extracted JSON ---")
print(generated_only)

<s>### Instruction:
Extract product attributes from the description. Give it as single keys and values pairs

### Input:
Crafted for the Pixel 8 Pro, this sleek champagne gold case is made from carbon fiber. It weighs 2.5 ounces and is produced in Germany.

### Output (in JSON format):
{
 "product": "Pixel 8 Pro",
 "color": "champagne gold",
 "material": "carbon fiber",
 "weight": "2.5 ounces",
 "origin": "Germany"
}</s>

✅ Inference complete!

--- Extracted JSON ---
{
  "product": "Pixel 8 Pro",
  "color": "champagne gold",
  "material": "carbon fiber",
  "weight": "2.5 ounces",
  "origin": "Germany"
}


###Inference on a List of Descriptions

In [10]:
prompt_template = """You are an information extraction system. Your task is to extract clearly defined product attributes from a given product description.

### Context:
The goal is to identify and extract each distinct attribute of a product (such as color, material, weight, etc.) as a flat list of key-value pairs. Do not nest keys or group attributes. All information should be flattened with one key and one value per line in JSON format.

### Instruction:
Extract product attributes from the following description. Give the output as single key-value pairs in flat JSON format. Do NOT create nested or grouped structures.

### Input:
{}

### Output:
"""



instruction = "Extract product attributes from the description"
# Example unseen descriptions
descriptions = [
    "Made for iPhone 15 Pro, this matte black aluminum case includes a kickstand and weighs just 1.8 ounces.",
    "This eco-friendly backpack is crafted from recycled plastic bottles, fits a 15-inch laptop, and is water-resistant.",
    "Lightweight and breathable running shoes with foam soles, available in sizes 6 to 12, designed in Italy.",
]

# Token streamer
streamer = TextStreamer(tokenizer)

# Inference loop
for idx, desc in enumerate(descriptions, 1):
    print(f"\n📝 Inference {idx}")

    # Create prompt
    inference_prompt = prompt_template.format(instruction, desc, "")

    # Tokenize input
    inputs = tokenizer([inference_prompt], return_tensors="pt").to("cuda")

    # Generate output
    outputs = model.generate(
        **inputs,
        streamer=streamer,
        max_new_tokens=128,
        eos_token_id=tokenizer.eos_token_id
    )

    # Decode and extract output
    full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    generated_only = full_output[len(inference_prompt):].strip()

    # Print extracted attributes
    print("\n--- Extracted JSON ---")
    print(generated_only)

print("\n✅ All inferences complete!")


📝 Inference 1
<s>You are an information extraction system. Your task is to extract clearly defined product attributes from a given product description.

### Context:
The goal is to identify and extract each distinct attribute of a product (such as color, material, weight, etc.) as a flat list of key-value pairs. Do not nest keys or group attributes. All information should be flattened with one key and one value per line in JSON format.

### Instruction:
Extract product attributes from the following description. Give the output as single key-value pairs in flat JSON format. Do NOT create nested or grouped structures.

### Input:
Extract product attributes from the description

### Output:
[
 {
   "attribute": "color",
   "value": "red"
 },
 {
   "attribute": "material",
   "value": "leather"
 },
 {
   "attribute": "weight",
   "value": "5 lbs"
 }
]</s>

--- Extracted JSON ---
[
  {
    "attribute": "color",
    "value": "red"
  },
  {
    "attribute": "material",
    "value": "leather"