In [1]:
# pip install evaluate

In [2]:
# !pip install -U datasets


In [3]:
# pip install transformers

In [4]:
import torch
from datasets import load_dataset,DatasetDict
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
import numpy as np
import pandas as pd
from transformers import EarlyStoppingCallback
from datasets import Dataset
from sklearn.metrics import mean_squared_error
import math
import os
import re
import string

In [5]:
import re
import string


def clean_catalog_content(text):
    """Extract and normalise the item name(s) from a raw catalog entry.

    Every fragment that follows an ``Item Name:`` marker (up to the end of
    its line) is collected. The fragments are joined with single spaces,
    stripped of every character that is not a word character, whitespace or
    a period, whitespace-collapsed and lower-cased.

    Parameters
    ----------
    text : str
        Raw catalog content. Any non-string value (for example ``None`` or
        the float NaN that pandas uses for an empty CSV cell) is treated as
        an empty entry instead of raising ``TypeError``.

    Returns
    -------
    str
        The cleaned item name, or ``""`` when ``text`` is not a string or
        contains no ``Item Name:`` marker.
    """
    # Guard against missing values: re.findall() only accepts str/bytes.
    if not isinstance(text, str):
        return ""

    # Extract only text after "Item Name:"
    item_names = re.findall(r'Item Name:\s*(.*)', text)

    # Join extracted item names with space
    cleaned_text = " ".join(item_names)

    # Remove all punctuation except period '.' (word characters are kept)
    cleaned_text = re.sub(r"[^\w\s\.]", "", cleaned_text)

    # Lower case and normalize spaces
    return re.sub(r"\s+", " ", cleaned_text).strip().lower()

In [6]:
# Hand-written catalog entry used to eyeball what the cleaner keeps.
raw = """
Item Name: Tiesta Tea - Fruity Loose Leaf Dry Flight Mother’s Day Tea Gift Set Sampler | Caffeine-Free | Hot & Iced Ready | Assorted Fruit Blends with Mango, Peach, Orange & More | 8 Resealable Sample Pouches
Bullet Point 1: EXPLORE A WORLD OF FLAVORS: Enjoy a variety of flavors with Tiesta Tea’s Fruity Sampler
Bullet Point 2: INCLUDES 8 DIFFERENT TEA BLENDS: This tea set includes an assortment of fruity blends
Product Description: Experience the vibrant world of Tiesta Tea’s Fruity Sampler Dry Flight Tea Set
Value: 8.0
Unit: Ounce
"""
cleaned = clean_catalog_content(raw)

# First line: character counts before and after cleaning; second line: the cleaned text.
print(f"{len(raw)} {len(cleaned)}", cleaned, sep="\n")

539 179
tiesta tea fruity loose leaf dry flight mothers day tea gift set sampler caffeinefree hot iced ready assorted fruit blends with mango peach orange more 8 resealable sample pouches


In [7]:
# Load the training sample, rename the columns to the names used downstream
# ("text" for the input, "labels" for the target) and reduce each catalog
# entry to its cleaned item name.
X = (
    pd.read_csv("sample_train_test.csv", encoding='utf-8')
    .rename(columns={"catalog_content": "text", "price": "labels"})
    .assign(text=lambda frame: frame["text"].apply(clean_catalog_content))
)
X.head()

Unnamed: 0,sample_id,text,labels
0,240216,tiesta tea fruity loose leaf dry flight mother...,25.95
1,17925,almond vanilla yerba mate tea loose 8 oz zin 5...,26.14
2,132345,viva doria white peppercorn whole white pepper...,13.99
3,269022,sour belts by its delish watermelon 1 lb 16 oz,18.99
4,215644,crazy cups creamy eggnog hot chocolate pods pe...,21.12


In [8]:
# Sanity check: (rows, columns) of the cleaned training frame.
X.shape

(60000, 3)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs.
train_df, valid_df = train_test_split(X, test_size=0.3, random_state=42)

# Convert to Hugging Face Datasets, keeping only the model input and target.
# preserve_index=False: after the shuffled split the pandas index is no longer
# a plain range, and from_pandas would otherwise store it as an extra
# "__index_level_0__" column in each dataset.
train_ds = Dataset.from_pandas(train_df[["text", "labels"]], preserve_index=False)
valid_ds = Dataset.from_pandas(valid_df[["text", "labels"]], preserve_index=False)

dataset = DatasetDict({"train": train_ds, "validation": valid_ds})

In [10]:
# Checkpoint used for both the tokenizer and the regression model.
model_name = "microsoft/Phi-3.5-mini-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# num_labels=1 together with problem_type="regression" requests a
# single-output sequence-classification head used as a regressor.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
    problem_type="regression",
    trust_remote_code=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of Phi3ForSequenceClassification were not initialized from the model checkpoint at microsoft/Phi-3.5-mini-instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def preprocess(examples):
    """Tokenize the ``text`` field of a batch, truncating to at most 256 tokens."""
    encoded = tokenizer(examples["text"], truncation=True, max_length=256)
    return encoded


# Tokenize every split; batched=True hands `preprocess` many rows per call.
dataset = dataset.map(function=preprocess, batched=True)

Map:   0%|          | 0/42000 [00:00<?, ? examples/s]

Map:   0%|          | 0/18000 [00:00<?, ? examples/s]

In [12]:
# Collator that pads each batch using the tokenizer (no padding was applied
# during tokenization).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
# Load the "smape" metric from the `evaluate` library; compute_metrics calls
# its .compute() on the validation predictions.
smape = evaluate.load("smape")

In [14]:
def compute_metrics(eval_pred):
    """Score validation predictions with the loaded ``smape`` metric.

    ``eval_pred`` unpacks into ``(predictions, labels)``. The predictions
    are flattened before being compared with the labels, and the metric's
    result is returned unchanged.
    """
    predictions, references = eval_pred
    return smape.compute(predictions=predictions.flatten(), references=references)

In [15]:
import os
# Set TOKENIZERS_PARALLELISM to "false" for this process.
# NOTE(review): presumably meant to disable tokenizer-internal parallelism
# before training starts — confirm, and consider moving this above the first tokenizer use.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [16]:
# pip install transformers\[torch\]


In [17]:
# pip install "accelerate>=0.26.0"

In [18]:
# Hyper-parameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="./phi3.5-mini-regression",
    # --- optimisation ---
    learning_rate=3e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    fp16=torch.cuda.is_available(),  # enabled only when CUDA is available
    # --- evaluate and checkpoint once per epoch ---
    eval_strategy="epoch",
    save_strategy="epoch",
    # --- reload the checkpoint with the lowest "smape" when training ends ---
    load_best_model_at_end=True,
    metric_for_best_model="smape",
    greater_is_better=False,
)

# Trainer: wires the model, the tokenized splits, the padding collator and the
# metric function together.
# `processing_class` replaces the deprecated `tokenizer` argument of
# Trainer.__init__, which emits a deprecation warning in recent transformers releases.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [19]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()
# trainer.evaluate()

Epoch,Training Loss,Validation Loss,Smape
1,656.7044,876.564209,0.617596
2,496.6409,729.385254,0.565734


TrainOutput(global_step=21000, training_loss=802.7771242559523, metrics={'train_runtime': 3198.1976, 'train_samples_per_second': 26.265, 'train_steps_per_second': 6.566, 'total_flos': 7.423855383394714e+16, 'train_loss': 802.7771242559523, 'epoch': 2.0})

In [20]:
# Load the test set, clean the catalog text the same way as for training and
# keep only the id and the model input.
# Note: this rebinds `X`, replacing the training frame loaded earlier.
X = (
    pd.read_csv("test.csv", encoding='utf-8-sig')
    .rename(columns={"catalog_content": "text"})
    .assign(text=lambda frame: frame["text"].apply(clean_catalog_content))
    [["sample_id", "text"]]
)
X.head()

Unnamed: 0,sample_id,text
0,100179,rani 14spice eshamayas mango chutney indian pr...
1,245611,natural milk tea flavoring extract by halo pan...
2,146263,honey filled hard candy bulk pack 2 pounds ind...
3,95658,vlasic snackmms kosher dill 16 oz pack of 2
4,36806,mccormick culinary vanilla extract 32 fl oz on...


In [22]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


In [23]:
# Tokenize the whole test set as one padded batch and move every tensor to `device`.
# NOTE(review): `inputs` is reassigned for each batch inside the inference loop
# further down, so this full-dataset encoding looks unused — consider dropping
# this cell to avoid holding the entire padded test set in device memory.
inputs = tokenizer(X["text"].tolist(), padding=True, truncation=True, max_length=256, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()} 

In [25]:
# Number of test rows scored per forward pass in the inference loop.
# NOTE(review): the inference cell assigns batch_size = 8 again, so this cell is redundant.
batch_size = 8  # or smaller


In [26]:
# Score the test set in small batches so only one batch of padded inputs is
# on the device at a time.
batch_size = 8
predictions = []

model.eval()
with torch.no_grad():
    for start in range(0, len(X), batch_size):
        batch_texts = X["text"].iloc[start:start + batch_size].tolist()
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt",
        ).to(device)
        logits = model(**inputs).logits
        # reshape(-1) always yields a 1-D tensor, so a final batch holding a
        # single row needs no special case (a bare squeeze() would collapse it
        # to 0-dim). float() makes the values safe to export whatever the
        # model's compute dtype; tolist() yields plain Python floats.
        predictions.extend(logits.reshape(-1).float().cpu().tolist())

# Exactly one prediction per test row is required before attaching to X.
assert len(predictions) == len(X), "prediction count does not match number of test rows"


In [27]:
# model.eval()
# with torch.no_grad():
#     outputs = model(**inputs)
#     predictions = outputs.logits.squeeze().cpu().numpy()  # For regression, shape should be (num_samples,)

# Attach the predictions to the test frame as a new "price" column
# (one value per row, in row order).
X = X.assign(price=predictions)

# Persist the frame, including the new column, without the pandas index.
X.to_csv(path_or_buf="test_predictions.csv", index=False)


In [28]:
# Preview the first rows of the test frame with the attached "price" column.
X.head()

Unnamed: 0,sample_id,text,price
0,100179,rani 14spice eshamayas mango chutney indian pr...,18.828125
1,245611,natural milk tea flavoring extract by halo pan...,13.679688
2,146263,honey filled hard candy bulk pack 2 pounds ind...,21.3125
3,95658,vlasic snackmms kosher dill 16 oz pack of 2,11.820312
4,36806,mccormick culinary vanilla extract 32 fl oz on...,29.953125
