In [4]:
import numpy as np
import pandas as pd

import transformers
from datasets import Dataset,load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split

In [5]:
import re
import string

def clean_catalog_content(text):
    # Regex to extract desired fields (including Product Description)
    pattern = r'(Item Name:.*|Product Description:.*|Value:.*|Unit:.*)'
    matches = re.findall(pattern, text)

    # Join extracted lines with space
    cleaned_text = " ".join(matches)

    # Remove all punctuation except period '.'
    cleaned_text = re.sub(r"[^\w\s\.]", "", cleaned_text)

    # Lower case and normalize spaces
    cleaned_text = re.sub(r"\s+", " ", cleaned_text).strip().lower()

    return cleaned_text


In [6]:
raw = """
Item Name: Tiesta Tea - Fruity Loose Leaf Dry Flight Mother’s Day Tea Gift Set Sampler | Caffeine-Free | Hot & Iced Ready | Assorted Fruit Blends with Mango, Peach, Orange & More | 8 Resealable Sample Pouches
Bullet Point 1: EXPLORE A WORLD OF FLAVORS: Enjoy a variety of flavors with Tiesta Tea’s Fruity Sampler
Bullet Point 2: INCLUDES 8 DIFFERENT TEA BLENDS: This tea set includes an assortment of fruity blends
Product Description: Experience the vibrant world of Tiesta Tea’s Fruity Sampler Dry Flight Tea Set
Value: 8.0
Unit: Ounce
"""
cleaned = clean_catalog_content(raw)
print(len(raw),len(cleaned))
print(cleaned)


539 308
item name tiesta tea fruity loose leaf dry flight mothers day tea gift set sampler caffeinefree hot iced ready assorted fruit blends with mango peach orange more 8 resealable sample pouches product description experience the vibrant world of tiesta teas fruity sampler dry flight tea set value 8.0 unit ounce


In [7]:
raw = """
Item Name: NuNaturals Stevia Syrup, Sugar-Free Sweetener, Plant-Based Sugar Substitute, Zero Calorie, Sugar-Free Syrup, Maple, 6.6oz (3-Pack)
Bullet Point 1: Sweeten your day with our sugar-free syrup made using only pure Stevia extract and natural flavors; Tastes just as sweet and satisfying as the original, without the sugar and added calories
Bullet Point 2: Satisfy your sugar cravings with this diet-friendly syrup; Contains zero carbs; Enhance the taste and flavor of your favorite recipes for everyone in the family to enjoy
Bullet Point 3: Enjoy a decadent twist to your favorite treats and beverages; Add it to milk, coffee and lattes, top off ice cream and desserts or drizzle over waffles, pancakes or baked goods
Bullet Point 4: Gluten-free, low-glycemic, vegan, Non-GMO and sugar-free; Ideal for keto, low carb and sugar sensitive lifestyles
Bullet Point 5: NuNaturals makes naturally-sweetened sugar-free products that taste great and are good for you; We're committed to creating ethically and sustainably sourced foods that support your wellness
Product Description: Treat your taste buds to something sweet and delectable, minus the sugar and calories with our NuNaturals Stevia Syrup. Sweetened with premium-grade stevia extract, this dessert syrup is a healthy and diet-friendly way to fix your sugar cravings. Drizzle over your favorite treats or top off your beverages with this sugar-free version of the delicious childhood staple you grew up loving. You only need a little to add decadent flavor, so you can enjoy this rich syrup for a long time.
Value: 19.8
Unit: Fl Oz

"""
cleaned = clean_catalog_content(raw)
print(len(raw),len(cleaned))
print(cleaned)

1598 651
item name nunaturals stevia syrup sugarfree sweetener plantbased sugar substitute zero calorie sugarfree syrup maple 6.6oz 3pack product description treat your taste buds to something sweet and delectable minus the sugar and calories with our nunaturals stevia syrup. sweetened with premiumgrade stevia extract this dessert syrup is a healthy and dietfriendly way to fix your sugar cravings. drizzle over your favorite treats or top off your beverages with this sugarfree version of the delicious childhood staple you grew up loving. you only need a little to add decadent flavor so you can enjoy this rich syrup for a long time. value 19.8 unit fl oz


In [8]:
raw = """
Item Name: Earth's Best Organic Baby Food Jars, Stage 2 Fruit Puree for Babies 6 Months and Older, Organic Fruit Variety Pack, 4 oz Resealable Glass Jar (Pack of 12)
Bullet Point 1: ORGANIC FRUIT PUREE: Made with wholesome ingredients like organic fruit puree, these baby food jars help nourish your little one as they explore new foods and textures
Bullet Point 2: RESEALABLE GLASS BABY FOOD JARS: Resealable glass jars make it easy to store leftovers or portion out meals and can even be washed and re-used as baby food storage containers
Bullet Point 3: STAGE 2 BABY FOOD: Expand your little one's pallet to stage 2 foods and introduce them to the exciting new flavor combinations of our fruit puree jars
Bullet Point 4: EARTH'S BEST: Explore our full collection of organic baby food and toddler snacks to find easy-to-digest baby formula, toddler cookies, and more stage 1 and stage 2 baby food jars
Bullet Point 5: ORGANIC FRUIT VARIETY PACK: Includes twelve 4 oz glass jars of Earth's Best Organic baby food (4 Peach Oatmeal Banana, 4 Pear and Raspberries, 4 Apples and Blueberries)
Value: 48.0
Unit: Ounce

"""
cleaned = clean_catalog_content(raw)
print(len(raw),len(cleaned))
print(cleaned)


1115 180
item name earths best organic baby food jars stage 2 fruit puree for babies 6 months and older organic fruit variety pack 4 oz resealable glass jar pack of 12 value 48.0 unit ounce


In [9]:
import numpy as np

def smape(y_true, y_pred):
    """
    Compute Symmetric Mean Absolute Percentage Error (SMAPE)

    Args:
        y_true (array-like): Actual values
        y_pred (array-like): Predicted values

    Returns:
        float: SMAPE value in percentage
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # Avoid division by zero
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    mask = denominator != 0  # Only compute where denominator != 0

    smape_values = np.abs(y_pred[mask] - y_true[mask]) / denominator[mask]

    return np.mean(smape_values) * 100  # Percentage


In [10]:
import transformers
print(transformers.__version__)


4.53.3


In [11]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [12]:
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback


# Model Predictions for Test data

In [13]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset

# 1. Load your saved model & tokenizer
model_dir = "/kaggle/input/distil_bert_amazon_ml_challenge/pytorch/default/1/fine_tuned_regression_model"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.eval()


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [14]:
X = pd.read_csv("/kaggle/input/test-data/test.csv", encoding='utf-8-sig')
X = X.rename(columns={"catalog_content":"text"})
X['text'] = X['text'].apply(clean_catalog_content)
X=X[["sample_id","text"]]
X.head()

Unnamed: 0,sample_id,text
0,100179,item name rani 14spice eshamayas mango chutney...
1,245611,item name natural milk tea flavoring extract b...
2,146263,item name honey filled hard candy bulk pack 2 ...
3,95658,item name vlasic snackmms kosher dill 16 oz pa...
4,36806,item name mccormick culinary vanilla extract 3...


In [15]:

dataset_test = Dataset.from_pandas(X, preserve_index=False)


In [16]:
# 4. Tokenize (keep only necessary columns)
def tokenize_fn(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=tokenizer.model_max_length
    )

tokenized_test = dataset_test.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"]
)

# 5. Run inference
#    Use Trainer.predict() or call model directly
from transformers import Trainer

trainer = Trainer(model=model)

Map:   0%|          | 0/75000 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [18]:

predictions = trainer.predict(tokenized_test)
# predictions.predictions shape: (num_samples, 1)
preds = predictions.predictions.squeeze(-1)

# 6. Attach predictions back to DataFrame
X["price"] = preds

# 7. View results
X.head()

Unnamed: 0,sample_id,text,price
0,100179,item name rani 14spice eshamayas mango chutney...,12.593371
1,245611,item name natural milk tea flavoring extract b...,9.971018
2,146263,item name honey filled hard candy bulk pack 2 ...,27.302692
3,95658,item name vlasic snackmms kosher dill 16 oz pa...,6.319869
4,36806,item name mccormick culinary vanilla extract 3...,45.660789


In [20]:
test=X[["sample_id","price"]]
test["price"] = test["price"].round(2)
test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["price"] = test["price"].round(2)


Unnamed: 0,sample_id,price
0,100179,12.59
1,245611,9.97
2,146263,27.299999
3,95658,6.32
4,36806,45.66


In [21]:
# Assuming 'test' is your DataFrame and 'predicted_value' is the column added with predictions
test.to_csv("/kaggle/working/test_predictions.csv", index=False)