In [8]:
import numpy as np
import pandas as pd

data = np.load(r"C:\Personal\Educational\Projects\NLP-Reg\LLM_training.npz", allow_pickle=True)
df = pd.DataFrame({key: data[key] for key in data.files})

df=df[['sample_id', 'catalog_content', 'log_price', 'img_pca_128']]
print(df.head())


   sample_id                                    catalog_content  log_price  \
0      33127  Item Name: La Victoria Green Taco Sauce Mild, ...   1.773256   
1     198967  Item Name: Salerno Cookies, The Original Butte...   2.647592   
2     261251  Item Name: Bear Creek Hearty Soup Bowl, Creamy...   1.088562   
3      55858  Item Name: Judee’s Blue Cheese Powder 11.25 oz...   3.444895   
4     292686  Item Name: kedem Sherry Cooking Wine, 12.7 Oun...   4.211979   

                                         img_pca_128  
0  [-0.15211318, -0.25649774, 0.107396826, -0.100...  
1  [-0.20088889, 0.054210357, -0.17757861, -0.079...  
2  [-0.08486046, -0.12251824, -0.16236168, -0.162...  
3  [0.029396318, -0.08662167, -0.1567853, -0.0016...  
4  [0.09248337, -0.13690022, 0.1076192, 0.0300138...  


In [11]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
import pandas as pd
import numpy as np


In [12]:
class PriceDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=256):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        # convert img_pca_128 from list string to float tensor if needed
        self.df['img_emb'] = self.df['img_pca_128'].apply(lambda x: torch.tensor(x, dtype=torch.float32))
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text = self.df.iloc[idx]['catalog_content']
        log_price = torch.tensor(self.df.iloc[idx]['log_price'], dtype=torch.float32)
        img_emb = self.df.iloc[idx]['img_emb']
        
        # tokenize text
        tokenized = self.tokenizer(text,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=self.max_length,
                                   return_tensors='pt')
        input_ids = tokenized.input_ids.squeeze(0)       # shape [seq_len]
        attention_mask = tokenized.attention_mask.squeeze(0)
        
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "img_emb": img_emb,
            "log_price": log_price
        }


In [13]:
model_name = "Qwen/Qwen-3B"  # HuggingFace model
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Load model with offloading
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",          # automatically places parts on GPU/CPU
    offload_folder="./offload", # folder for CPU offloaded weights
    torch_dtype=torch.float16   # reduce memory usage
)


OSError: Qwen/Qwen-3B is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`