In [None]:
import numpy as np
import pandas as pd

import transformers
from datasets import Dataset,load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split

In [None]:
import re
import string

def clean_catalog_content(text):
    """Reduce a raw catalog entry to its key fields as one normalized string.

    Only the ``Item Name``, ``Product Description``, ``Value`` and ``Unit``
    lines are kept; bullet points and any other lines are dropped. The kept
    lines are joined with single spaces, stripped of all punctuation except
    ``.``, whitespace-collapsed and lower-cased.

    Args:
        text: Raw multi-line catalog string. Non-string input (e.g. ``None``
            or the float ``NaN`` pandas uses for missing cells) is treated as
            an empty entry instead of raising ``TypeError``.

    Returns:
        str: The cleaned text, or ``""`` when no field line is found.
    """
    # Missing cells arrive as None/NaN when this is applied over a DataFrame
    # column; re.findall would raise TypeError on them.
    if not isinstance(text, str):
        return ""

    # Anchor every label at the start of a line (optional indentation allowed)
    # so that e.g. "Value:" in the middle of a bullet point is not mistaken
    # for the Value field.
    pattern = r'^[ \t]*((?:Item Name|Product Description|Value|Unit):.*)'
    matches = re.findall(pattern, text, flags=re.MULTILINE)

    # Join extracted lines with space
    cleaned_text = " ".join(matches)

    # Remove all punctuation except period '.'. Characters are deleted, not
    # replaced by a space, so hyphenated words merge ("sugar-free" -> "sugarfree").
    cleaned_text = re.sub(r"[^\w\s\.]", "", cleaned_text)

    # Lower case and normalize spaces
    cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip().lower()

    return cleaned_text


In [None]:
raw = """
Item Name: Tiesta Tea - Fruity Loose Leaf Dry Flight Mother’s Day Tea Gift Set Sampler | Caffeine-Free | Hot & Iced Ready | Assorted Fruit Blends with Mango, Peach, Orange & More | 8 Resealable Sample Pouches
Bullet Point 1: EXPLORE A WORLD OF FLAVORS: Enjoy a variety of flavors with Tiesta Tea’s Fruity Sampler
Bullet Point 2: INCLUDES 8 DIFFERENT TEA BLENDS: This tea set includes an assortment of fruity blends
Product Description: Experience the vibrant world of Tiesta Tea’s Fruity Sampler Dry Flight Tea Set
Value: 8.0
Unit: Ounce
"""
# Sanity check on the first sample: print raw vs. cleaned character counts,
# then the cleaned text itself so the dropped bullet points can be verified by eye.
cleaned = clean_catalog_content(raw)
print(len(raw),len(cleaned))
print(cleaned)


540 308
item name tiesta tea fruity loose leaf dry flight mothers day tea gift set sampler caffeinefree hot iced ready assorted fruit blends with mango peach orange more 8 resealable sample pouches product description experience the vibrant world of tiesta teas fruity sampler dry flight tea set value 8.0 unit ounce


In [None]:
raw = """
Item Name: NuNaturals Stevia Syrup, Sugar-Free Sweetener, Plant-Based Sugar Substitute, Zero Calorie, Sugar-Free Syrup, Maple, 6.6oz (3-Pack)
Bullet Point 1: Sweeten your day with our sugar-free syrup made using only pure Stevia extract and natural flavors; Tastes just as sweet and satisfying as the original, without the sugar and added calories
Bullet Point 2: Satisfy your sugar cravings with this diet-friendly syrup; Contains zero carbs; Enhance the taste and flavor of your favorite recipes for everyone in the family to enjoy
Bullet Point 3: Enjoy a decadent twist to your favorite treats and beverages; Add it to milk, coffee and lattes, top off ice cream and desserts or drizzle over waffles, pancakes or baked goods
Bullet Point 4: Gluten-free, low-glycemic, vegan, Non-GMO and sugar-free; Ideal for keto, low carb and sugar sensitive lifestyles
Bullet Point 5: NuNaturals makes naturally-sweetened sugar-free products that taste great and are good for you; We're committed to creating ethically and sustainably sourced foods that support your wellness
Product Description: Treat your taste buds to something sweet and delectable, minus the sugar and calories with our NuNaturals Stevia Syrup. Sweetened with premium-grade stevia extract, this dessert syrup is a healthy and diet-friendly way to fix your sugar cravings. Drizzle over your favorite treats or top off your beverages with this sugar-free version of the delicious childhood staple you grew up loving. You only need a little to add decadent flavor, so you can enjoy this rich syrup for a long time.
Value: 19.8
Unit: Fl Oz

"""
# Second sample (long product description): compare raw vs. cleaned length
# and inspect the cleaned text.
cleaned = clean_catalog_content(raw)
print(len(raw),len(cleaned))
print(cleaned)

1598 651
item name nunaturals stevia syrup sugarfree sweetener plantbased sugar substitute zero calorie sugarfree syrup maple 6.6oz 3pack product description treat your taste buds to something sweet and delectable minus the sugar and calories with our nunaturals stevia syrup. sweetened with premiumgrade stevia extract this dessert syrup is a healthy and dietfriendly way to fix your sugar cravings. drizzle over your favorite treats or top off your beverages with this sugarfree version of the delicious childhood staple you grew up loving. you only need a little to add decadent flavor so you can enjoy this rich syrup for a long time. value 19.8 unit fl oz


In [None]:
raw = """
Item Name: Earth's Best Organic Baby Food Jars, Stage 2 Fruit Puree for Babies 6 Months and Older, Organic Fruit Variety Pack, 4 oz Resealable Glass Jar (Pack of 12)
Bullet Point 1: ORGANIC FRUIT PUREE: Made with wholesome ingredients like organic fruit puree, these baby food jars help nourish your little one as they explore new foods and textures
Bullet Point 2: RESEALABLE GLASS BABY FOOD JARS: Resealable glass jars make it easy to store leftovers or portion out meals and can even be washed and re-used as baby food storage containers
Bullet Point 3: STAGE 2 BABY FOOD: Expand your little one's pallet to stage 2 foods and introduce them to the exciting new flavor combinations of our fruit puree jars
Bullet Point 4: EARTH'S BEST: Explore our full collection of organic baby food and toddler snacks to find easy-to-digest baby formula, toddler cookies, and more stage 1 and stage 2 baby food jars
Bullet Point 5: ORGANIC FRUIT VARIETY PACK: Includes twelve 4 oz glass jars of Earth's Best Organic baby food (4 Peach Oatmeal Banana, 4 Pear and Raspberries, 4 Apples and Blueberries)
Value: 48.0
Unit: Ounce

"""
# Third sample has no "Product Description:" line, so only the item name,
# value and unit lines are expected in the cleaned text.
cleaned = clean_catalog_content(raw)
print(len(raw),len(cleaned))
print(cleaned)


1115 180
item name earths best organic baby food jars stage 2 fruit puree for babies 6 months and older organic fruit variety pack 4 oz resealable glass jar pack of 12 value 48.0 unit ounce


In [None]:
# Load the training sample and rename its columns to the names used by the
# rest of the notebook: "text" (model input) and "labels" (regression target).
# Re-enabled so the frame is rebuilt on a fresh kernel (Restart & Run All).
# NOTE(review): absolute Colab-style path; adjust when running elsewhere.
X = pd.read_csv("/content/sample_train_test.csv", encoding='utf-8-sig')
X = X.rename(columns={"catalog_content":"text","price":"labels"})
# Keep only the key catalog fields and normalize them before tokenization.
X['text'] = X['text'].apply(clean_catalog_content)
X.head()

Unnamed: 0,sample_id,text,labels
0,240216,item name tiesta tea fruity loose leaf dry fli...,25.95
1,17925,item name almond vanilla yerba mate tea loose ...,26.14
2,132345,item name viva doria white peppercorn whole wh...,13.99
3,269022,item name sour belts by its delish watermelon ...,18.99
4,215644,item name crazy cups creamy eggnog hot chocola...,21.12


In [None]:
# Wrap the cleaned DataFrame `X` (columns "text" and "labels") in a Hugging Face
# Dataset and hold out 20% of the rows for evaluation.
# Re-enabled because the tokenization cell calls `dataset.map(...)` and would
# raise NameError on a fresh kernel while this cell is commented out.
dataset = Dataset.from_pandas(X,preserve_index=False)
# A fixed seed makes the train/test partition reproducible across runs.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

dataset

DatasetDict({
    train: Dataset({
        features: ['sample_id', 'text', 'labels'],
        num_rows: 48000
    })
    test: Dataset({
        features: ['sample_id', 'text', 'labels'],
        num_rows: 12000
    })
})

In [None]:
from transformers import AutoTokenizer
# Load the pretrained tokenizer of the "distilbert-base-uncased" checkpoint;
# the same checkpoint name is used for the model created further down.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


In [None]:
def tokenize_function(examples):
    """Tokenize a batch of examples and carry the regression targets along.

    Args:
        examples: Batch mapping with a "text" entry (passed to the
            notebook-level ``tokenizer``) and a "labels" entry.

    Returns:
        The tokenizer output for ``examples["text"]``, produced with
        ``padding="max_length"`` and ``truncation=True``, with the batch's
        "labels" stored under the "labels" key.
    """
    tokenizer_options = {"padding": "max_length", "truncation": True}
    encoded_batch = tokenizer(examples["text"], **tokenizer_options)
    # Copy the targets explicitly so they sit next to the encoded inputs.
    encoded_batch["labels"] = examples["labels"]
    return encoded_batch

# Apply the tokenizer to every split in batches. Only the raw "text" column is
# dropped afterwards; "labels" (and any other column) is kept.
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])


Map:   0%|          | 0/48000 [00:00<?, ? examples/s]

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

In [None]:
# Verify the columns of the tokenized training split.
print(tokenized_datasets["train"].column_names)
# Should include: ['input_ids','attention_mask','labels',…]
# Columns other than "text" (e.g. an id column) are still present, because the
# map call above removes only "text".


['sample_id', 'labels', 'input_ids', 'attention_mask']


In [None]:
# Load the pretrained DistilBERT checkpoint with a sequence-classification head
# that has a single output (num_labels=1) and is configured for regression.
# NOTE(review): the validation loss in the training log equals RMSE squared,
# which is consistent with a mean-squared-error loss being used.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=1,
    problem_type="regression"  # This is crucial for regression tasks
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Resize the model's token-embedding matrix to the tokenizer's vocabulary size.
# NOTE(review): no tokens are added to the tokenizer in the cells shown, so this
# presumably leaves the embedding matrix unchanged — confirm and drop if so.
model.resize_token_embeddings(len(tokenizer))

Embedding(30522, 768, padding_idx=0)

In [None]:
from sklearn.metrics import mean_squared_error
import math

def compute_metrics(eval_pred):
    """Compute the root mean squared error for an evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; both members are
            flattened to 1-D with ``.ravel()`` before being compared.

    Returns:
        dict: ``{"rmse": value}`` where ``value`` is the square root of the
        mean squared error between labels and predictions.
    """
    raw_preds, raw_targets = eval_pred
    flat_preds, flat_targets = raw_preds.ravel(), raw_targets.ravel()
    # RMSE is the square root of the MSE returned by scikit-learn.
    squared_error_mean = mean_squared_error(flat_targets, flat_preds)
    return {"rmse": math.sqrt(squared_error_mean)}





In [None]:
import numpy as np

def smape(y_true, y_pred):
    """
    Compute Symmetric Mean Absolute Percentage Error (SMAPE)

    Each term is ``|y_pred - y_true| / ((|y_true| + |y_pred|) / 2)``. Pairs
    whose denominator is zero (both values are 0) are excluded from the mean.

    Args:
        y_true (array-like): Actual values
        y_pred (array-like): Predicted values

    Both inputs are flattened to 1-D, so a column vector of predictions with
    shape (n, 1) can be compared against labels with shape (n,).

    Returns:
        float: SMAPE value in percentage. ``0.0`` when every pair is (0, 0);
        ``nan`` when the inputs are empty.

    Raises:
        ValueError: If the inputs do not contain the same number of values.
    """
    # Flatten both sides (as compute_metrics does) so that (n, 1) vs (n,)
    # inputs are compared element-wise instead of being broadcast to (n, n).
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    if actual.size != predicted.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of values "
            f"(got {actual.size} and {predicted.size})"
        )
    if actual.size == 0:
        # Nothing to average; keep NaN but avoid numpy's empty-mean warning.
        return float("nan")

    # Avoid division by zero
    denominator = (np.abs(actual) + np.abs(predicted)) / 2
    mask = denominator != 0  # Only compute where denominator != 0

    if not mask.any():
        # Every prediction and target is exactly 0: a perfect match.
        return 0.0

    smape_values = np.abs(predicted[mask] - actual[mask]) / denominator[mask]

    return float(np.mean(smape_values) * 100)  # Percentage


In [None]:
# !pip install -q --upgrade transformers


In [None]:
import transformers
# Record the installed transformers version next to the results; the
# TrainingArguments keywords used below can differ between releases.
print(transformers.__version__)


4.57.0


In [None]:
import os
# Set WANDB_DISABLED before the Trainer is created (presumably to turn off
# Weights & Biases logging).
# NOTE(review): the training output in this notebook warns that this variable is
# deprecated and will be removed in v5; it recommends `report_to` instead.
os.environ["WANDB_DISABLED"] = "true"


In [None]:
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback

# Training configuration for the regression fine-tune: evaluate and checkpoint
# once per epoch, and restore the checkpoint with the lowest validation RMSE.
training_args = TrainingArguments(
    output_dir="./test_trainer",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",                # must match eval_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    # The string "none" disables every logging integration. The Python value
    # None does not: in transformers 4.x it falls back to the default of
    # reporting to all installed integrations.
    report_to="none",
    metric_for_best_model="rmse",        # use your main validation metric
    greater_is_better=False,             # because lower rmse is better
    save_total_limit=3                   # limit checkpoints to save space
)



# Build the Trainer from the model, the training arguments and the tokenized
# splits. The "test" split of `tokenized_datasets` serves as the per-epoch
# evaluation set, and `compute_metrics` supplies the "rmse" value that
# `metric_for_best_model` refers to.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Stops if metric doesn't improve in 2 evals
)


# Train the model (the logged run below took about 11,800 s for 5 epochs)
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rmse
1,338.2486,601.477417,24.525037
2,288.5008,541.10437,23.261652
3,425.6979,524.166931,22.894692
4,1248.4772,514.89325,22.691259
5,202.5128,508.905914,22.558943


TrainOutput(global_step=30000, training_loss=561.470920928955, metrics={'train_runtime': 11809.2746, 'train_samples_per_second': 20.323, 'train_steps_per_second': 2.54, 'total_flos': 3.179160870912e+16, 'train_loss': 561.470920928955, 'epoch': 5.0})

In [None]:
# Persist the trained model and the tokenizer into the same directory so both
# can later be reloaded from that single path with `from_pretrained`.
trainer.save_model("./fine_tuned_regression_model")
tokenizer.save_pretrained("./fine_tuned_regression_model")


('./fine_tuned_regression_model/tokenizer_config.json',
 './fine_tuned_regression_model/special_tokens_map.json',
 './fine_tuned_regression_model/vocab.txt',
 './fine_tuned_regression_model/added_tokens.json',
 './fine_tuned_regression_model/tokenizer.json')

# Model Predictions for Test data

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset

# 1. Load your saved model & tokenizer
# Both are read back from the directory the fine-tuned model was saved to.
# Note that this rebinds the notebook-level names `tokenizer` and `model`.
model_dir = "./fine_tuned_regression_model"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
# Switch the module to evaluation mode before running predictions.
model.eval()


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
