In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import regex as re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack
from xgboost import XGBRegressor
pd.set_option('display.max_colwidth', None)

DATA PREPROCESSING

In [2]:
# Load the training data
train = pd.read_excel(r"C:\Users\Dell\Downloads\Amazon ML Challenge 2025\student_resource\mine\train.xlsx")
print(train.head())
print(train.info())
print(train.describe())
print(train.isnull().sum())
print(train.columns)

   sample_id  \
0      33127   
1     198967   
2     261251   
3      55858   
4     292686   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [3]:
# Cleaning the text by removing special characters, links, and converting to lowercase
def clean_text(text):
    text = str(text).lower()                      # converts to lowercase
    text = re.sub(r'http\S+', ' ', text)          # remove links
    text = re.sub(r'[^a-z0-9\s\.]', ' ', text)    # remove punctuation except periods, since quantity can have a decimal
    text = re.sub(r'\s+', ' ', text).strip()      # remove extra spaces
    return text

train['catalog_content_clean'] = train['catalog_content'].apply(clean_text)

In [4]:
# Reviewing cleaned text samples
for line in train['catalog_content_clean'].head():
    print(line)

item name la victoria green taco sauce mild 12 ounce pack of 6 value 72.0 unit fl oz
item name salerno cookies the original butter cookies 8 ounce pack of 4 bullet point 1 original butter cookies classic butter cookies made with real butter bullet point 2 variety pack includes 4 boxes with 32 cookies total bullet point 3 occasion perfect delicious cookies for birthdays weddings anniversaries bullet point 4 shareable treats fun to give and enjoy with friends and family bullet point 5 salerno brand trusted brand of delicious butter cookies since 1925 value 32.0 unit ounce
item name bear creek hearty soup bowl creamy chicken with rice 1.9 ounce pack of 6 bullet point 1 loaded with hearty long grain wild rice and vegetables bullet point 2 full of hearty goodness bullet point 3 single serve bowls bullet point 4 easy to prepare mix bullet point 5 0 grams trans fat value 11.4 unit ounce
item name judee s blue cheese powder 11.25 oz gluten free and nut free use in seasonings and salad dressing

In [5]:
# Extracting brand and quantity from the cleaned catalog content 
def extract_quantity(text):
    text = re.sub(r'\s+', ' ', text)
    pattern1 = r"(\d+(\.\d+)?\s*(oz|ounce|g|gram|kg|ml|l|liter|lb|pound|fl oz|fluid ounce))\s*(pack of|pack|count of|count)?\s*(\d+)?"
    match = re.search(pattern1, text)
    if match:
        size = match.group(1)
        pack_type = match.group(4) or ""
        pack_count = match.group(5) or ""
        quantity = f"{size} {pack_type} {pack_count}".strip()
        return quantity
    return ""

def extract_brand(text):
    text = re.sub(r'^item name\s*', '', text)
    brand = ' '.join(text.split()[:2])
    return brand.lower()

train['quantity'] = train['catalog_content_clean'].apply(extract_quantity)
train['brand'] = train['catalog_content_clean'].apply(extract_brand)

In [6]:
# Initialize LabelEncoder
le = LabelEncoder()

# Split dataset first before fitting encoders
X_train, X_val, y_train, y_val = train_test_split(
    train,
    train['price'],
    test_size=0.2,
    random_state=42
)

# Fit on all unique brands
all_brands = pd.concat([X_train['brand'], X_val['brand']]).fillna('unknown').unique()
le.fit(all_brands)

brand_train = le.transform(X_train['brand'].fillna('unknown'))
brand_val = le.transform(X_val['brand'].fillna('unknown'))

# Convert quantities (fallback to 0 if cannot be numeric)
def convert_to_numeric(q):
    try:
        val = re.findall(r'\d+(\.\d+)?', q)
        return float(val[0]) if val else 0.0
    except:
        return 0.0

quantity_train = X_train['quantity'].apply(convert_to_numeric).values.astype(np.float32)
quantity_val = X_val['quantity'].apply(convert_to_numeric).values.astype(np.float32)

In [7]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(
    max_features=100000,
    ngram_range=(1, 2),
    stop_words='english',
    min_df=3
)

X_train_tfidf = tfidf.fit_transform(X_train['catalog_content_clean'])
X_val_tfidf = tfidf.transform(X_val['catalog_content_clean'])

In [8]:
# Training a baseline Ridge regression model using only TF-IDF features
model = Ridge(alpha=1.0)
model.fit(X_train_tfidf, y_train)
preds = model.predict(X_val_tfidf)

smape = np.mean(np.abs(preds - y_val) / ((np.abs(y_val) + np.abs(preds)) / 2)) * 100
print("SMAPE using only Tfidf:", smape)

SMAPE using only Tfidf: 68.36264862961396


In [9]:
# Combining TF-IDF features with brand and quantity features into a single feature set
X_train_full = hstack([X_train_tfidf, brand_train.reshape(-1, 1), quantity_train.reshape(-1, 1)])
X_val_full = hstack([X_val_tfidf, brand_val.reshape(-1, 1), quantity_val.reshape(-1, 1)])

print("Combined Feature Shapes:\n", X_train_full.shape, X_val_full.shape)

Combined Feature Shapes:
 (60000, 100002) (15000, 100002)


In [12]:
# Refitting Ridge regression with combined features
model = Ridge(alpha=1.0)
model.fit(X_train_full, y_train)
preds = model.predict(X_val_full)

smape = np.mean(np.abs(preds - y_val) / ((np.abs(y_val) + np.abs(preds)) / 2)) * 100
print("SMAPE after combining features:", smape)

SMAPE after combining features: 68.37349495769114


In [15]:
# Training an XGBoost model with combined features
model = XGBRegressor(
    n_estimators=800,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.7,
    tree_method='hist',
    n_jobs=-1,
    random_state=42
)

model.fit(X_train_full, y_train)
preds = model.predict(X_val_full)

smape = np.mean(np.abs(preds - y_val) / ((np.abs(y_val) + np.abs(preds)) / 2)) * 100
print("SMAPE after using XGBoost:", smape)

SMAPE after using XGBoost: 62.96078951259315


In [16]:
# Log-transforming the target variable for XGBoost
y_train_log = np.log1p(y_train)
model.fit(X_train_full, y_train_log)

preds_log = model.predict(X_val_full)
preds = np.expm1(preds_log)

smape = np.mean(np.abs(preds - y_val) / ((np.abs(y_val) + np.abs(preds)) / 2)) * 100
print("SMAPE after using XGBoost with log transformed price:", smape)

SMAPE after using XGBoost with log transformed price: 55.36629250545527
