In [1]:
!pip install --quiet pandas numpy scikit-learn joblib tqdm


In [86]:
# CELL 4 — Fit SVD (TruncatedSVD) on train TF-IDF, transform train+test
from sklearn.decomposition import TruncatedSVD
n_components = 200   # change to 100 if memory/time concern
svd = TruncatedSVD(n_components=n_components, random_state=42)
X_train_svd = svd.fit_transform(X_train_text) # Fit on X_train_text (from new train)
X_val_svd  = svd.transform(X_val_text) # Transform X_val_text

joblib.dump(svd, "svd_for_clustering_new_train.joblib") # Save with new name
print("SVD output shapes -> new train:", X_train_svd.shape, " val:", X_val_svd.shape)

SVD output shapes -> new train: (60000, 200)  val: (15000, 200)


In [32]:
import unicodedata
import re

def clean_text(s):
    if pd.isna(s): return s
    s = unicodedata.normalize('NFKC', str(s))
    s = re.sub(r'[\r\n\t]+', ' ', s)        # newlines -> space
    s = re.sub(r'[–—―]', '-', s)            # unify dashes
    s = re.sub(r'\s+', ' ', s).strip()     # collapse spaces
    # unify common units into simple tokens (conservative)
    s = re.sub(r'\bounces?\b', 'oz', flags=re.I, string=s)
    s = re.sub(r'\bfl\s*oz\b', 'fl_oz', flags=re.I, string=s)
    s = re.sub(r'\bounces?\b', 'oz', flags=re.I, string=s)
    s = re.sub(r'\bgrams\b', 'g', flags=re.I, string=s)
    s = re.sub(r'\bkilogram?s\b', 'kg', flags=re.I, string=s)
    return s

df['catalog_clean'] = df['catalog_content'].apply(clean_text)
# quick sample
display(df[['catalog_content','catalog_clean']].sample(5, random_state=2).to_dict(orient='records'))

NameError: name 'df' is not defined

In [53]:
# Create numeric fills for clustering on test data
import re
import numpy as np
import pandas as pd
import unicodedata # Import unicodedata if not already in scope

# Ensure catalog_clean is created for test data
def clean_text(s):
    if pd.isna(s): return s
    s = unicodedata.normalize('NFKC', str(s))
    s = re.sub(r'[\r\n\t]+', ' ', s)        # newlines -> space
    s = re.sub(r'[–—―]', '-', s)            # unify dashes
    s = re.sub(r'\s+', ' ', s).strip()     # collapse spaces
    # unify common units into simple tokens (conservative)
    s = re.sub(r'\bounces?\b', 'oz', flags=re.I, string=s)
    s = re.sub(r'\bfl\s*oz\b', 'fl_oz', flags=re.I, string=s)
    s = re.sub(r'\bounces?\b', 'oz', flags=re.I, string=s)
    s = re.sub(r'\bgrams\b', 'g', flags=re.I, string=s)
    s = re.sub(r'\bkilogram?s\b', 'kg', flags=re.I, string=s)
    return s

# Check if 'catalog_clean' exists, create if not
if 'catalog_clean' not in test.columns:
    print("Creating 'catalog_clean' for test data...")
    test['catalog_clean'] = test['catalog_content'].apply(clean_text)


# Re-defining parse_numeric_fields function if it's not available in the current scope
def parse_numeric_fields(text):
    out = {'pack_count': np.nan, 'unit_value': np.nan, 'unit_type': None, 'total_units': np.nan}
    if pd.isna(text):
        return out
    pack_pat = re.compile(r'\bpack\s*of\s*(\d+)\b', flags=re.I)
    pack_pat2 = re.compile(r'\b(\d+)\s*[-]?\s*pack\b', flags=re.I)
    mult_pat = re.compile(r'(\d+)\s*[x×]\s*(\d+\.?\d*)\s*(fl_oz|oz|ml|g|kg)?', flags=re.I)
    unit_pat = re.compile(r'(\d+\.?\d*)\s*(fl_oz|oz|ml|g|kg|count|ct|piece|pcs|bottle|bottles)\b', flags=re.I)

    # pack_count
    m = pack_pat.search(text) or pack_pat2.search(text)
    if m:
        try: out['pack_count'] = int(m.group(1))
        except: out['pack_count'] = np.nan
    # multiplication like "2 x 250 ml"
    m = mult_pat.search(text)
    if m:
        try:
            num = float(m.group(1)); size = float(m.group(2)); unit = (m.group(3) or '').lower()
            out['total_units'] = num * size
            out['unit_value'] = size
            out['unit_type'] = unit if unit else None
        except:
            pass
    # simple unit pattern
    if np.isnan(out['unit_value']):
        m = unit_pat.search(text)
        if m:
            try:
                out['unit_value'] = float(m.group(1))
                out['unit_type'] = m.group(2).lower()
            except:
                pass
    return out

parsed = test['catalog_clean'].apply(parse_numeric_fields).apply(pd.Series)

# Create a new DataFrame with a clean index and copy relevant columns
test_processed = pd.DataFrame(index=test.index)
test_processed['catalog_clean'] = test['catalog_clean']
test_processed['pack_count'] = parsed['pack_count']
test_processed['unit_value'] = parsed['unit_value']
test_processed['unit_type'] = parsed['unit_type']
test_processed['total_units'] = parsed['total_units']


def normalize_unit(unit_value, unit_type):
    # Ensure scalar values are passed and handle potential Series
    if isinstance(unit_value, pd.Series) or isinstance(unit_type, pd.Series):
        # This should not happen with axis=1, but as a safeguard:
        return (np.nan, None)
    if pd.isna(unit_value) or pd.isna(unit_type):
        return (np.nan, None)
    try:
        val = float(unit_value)
    except (ValueError, TypeError):
        return (np.nan, None)
    u = str(unit_type).lower() # Ensure unit_type is a string

    if u == 'oz':
        # conservative assumption: treat oz as grams (documented!)
        return (val * 28.3495, 'g')
    if u == 'fl_oz':
        return (val * 29.5735, 'ml')
    if u == 'kg':
        return (val * 1000.0, 'g')
    if u == 'g':
        return (val, 'g')
    if u == 'ml':
        return (val, 'ml')
    if u in ('count','ct','piece','pcs','bottle','bottles'):
        return (val, 'count')
    return (val, u)

# Apply normalization to the new DataFrame
norms = test_processed.apply(lambda r: normalize_unit(r['unit_value'], r['unit_type']), axis=1)
test_processed['norm_quantity'] = norms.map(lambda x: x[0])
test_processed['norm_unit'] = norms.map(lambda x: x[1])

# If total_units not filled from parse, fill with unit_value*pack_count where both present
test_processed['total_units'] = test_processed['total_units'].fillna(test_processed['unit_value'] * test_processed['pack_count'])
# and a normalized total if possible
test_processed['total_norm_quantity'] = test_processed['total_units'].fillna(test_processed['unit_value'] * test_processed['pack_count'])

mask = test_processed['total_norm_quantity'].isna() & test_processed['norm_quantity'].notna() & test_processed['pack_count'].notna()
test_processed.loc[mask, 'total_norm_quantity'] = test_processed.loc[mask, 'norm_quantity'] * test_processed.loc[mask, 'pack_count']


test_processed['norm_quantity_fill'] = test_processed.get('norm_quantity', pd.Series(index=test_processed.index)).fillna(0.0).astype(float)
test_processed['pack_count_fill'] = test_processed.get('pack_count', pd.Series(index=test_processed.index)).fillna(0.0).astype(float)
test_processed['desc_len'] = test_processed['catalog_clean'].str.split().str.len().fillna(0).astype(int)

# Copy the new processed columns back to the original test DataFrame
for col in ['pack_count', 'unit_value', 'unit_type', 'total_units',
            'norm_quantity', 'norm_unit', 'total_norm_quantity',
            'norm_quantity_fill', 'pack_count_fill', 'desc_len']:
    test[col] = test_processed[col]


print("Numeric fills created for test data.")

Creating 'catalog_clean' for test data...
Numeric fills created for test data.


In [57]:
# CELL 7 — Inspect a few clusters (human-check)
# print sample product texts from top clusters for quick inspection
top_clusters = train['cluster'].value_counts().head(8).index.tolist()
for c in top_clusters:
    print("\n=== Cluster", c, "size:", int((train['cluster']==c).sum()), "===\n")
    print(train[train['cluster']==c]['text_for_clustering'].sample(6, random_state=1).tolist())


=== Cluster 53 size: 4488 ===

['Other M&Ms Peanut Chocolate Candy Singles, 48 Count', 'Other Green Mountain Flavored Variety (22 K cups) Keurig', 'Other Vernors Ginger Soda', 'Other Artisana Nut Bttr Almond Raw', 'Other USA Grown Organic Garbanzo Beans (ChickPeas) Raw/Non GMO/Kosher 7LB', 'Other Snickers Bars (, )']

=== Cluster 45 size: 3410 ===

['Other POLAND SPRING Sparkling Water 24 pk, 0.5 LT', 'Other Butterfly Flavoring Paste, Pack of 1 (Pineapple, )', 'Other Byrne Half & Half Original Liquid Creamer (902 00071)', "Other Bisquick Shake 'N Pour Buttermilk Pancake Mix,", 'Other Tea Zone 2.2 lb Mango Pudding Mix Powder', 'Other Frontier Peppermint Flavor Certified Organic, Bottle']

=== Cluster 108 size: 3228 ===

['Other Christmas Tree Swirl Pops (1 dozen) Holiday Party and Decor Accessories', 'Other EARTHS BEST Organic Strawberry Sweet & Veggie Straws,', 'Other Cafe Tastle Single Serve Packets, Rich Hot Chocolate Mix,', 'Other Primal Palate Organic Spices Taco Seasoning, Certif

In [55]:
# CELL 7 — Cluster the training data with MiniBatchKMeans
from sklearn.cluster import MiniBatchKMeans
import joblib
K = 150   # start; change to 100/200 as needed
kmeans = MiniBatchKMeans(n_clusters=K, random_state=42, batch_size=5000, n_init=10) # Added n_init for clarity
kmeans.fit(X_train_cl)
train_labels = kmeans.predict(X_train_cl)
test_labels  = kmeans.predict(X_test_cl)

# attach labels
train['cluster'] = train_labels
test['cluster']  = test_labels

joblib.dump(kmeans, "kmeans_clustering.joblib")
print("Cluster sizes (train top 10):\n", pd.Series(train_labels).value_counts().head(10).to_dict())

Cluster sizes (train top 10):
 {53: 4488, 45: 3410, 108: 3228, 104: 2199, 149: 1749, 38: 1749, 141: 1700, 86: 1588, 24: 1523, 87: 1484}


In [58]:
# CELL 8 — Compute cluster-level stats on train and save them
cluster_stats = train.groupby('cluster')['price_clean'].agg(['count','median','mean','std']).rename(columns={'median':'median_price'}).reset_index()
display(cluster_stats.head(10))
# Save for later lookups
cluster_stats.to_csv("cluster_stats_train.csv", index=False)

print("\nSaved cluster_stats_train.csv")

Unnamed: 0,cluster,count,median_price,mean,std
0,0,1264,16.99,27.881551,36.926884
1,1,1172,7.96,11.670282,15.468718
2,2,84,9.085,14.127679,13.79166
3,3,132,6.35,12.035152,14.170129
4,4,418,10.44,15.316818,12.109764
5,5,43,11.85,18.211628,22.915131
6,6,28,40.595,50.572321,37.08575
7,7,397,21.99,25.290743,16.256973
8,8,897,10.99,18.27981,19.315072
9,9,1157,17.99,25.755627,34.943936



Saved cluster_stats_train.csv


In [54]:
# CELL 6 — Build final clustering matrices (concatenate SVD + scaled numerics)
import numpy as np
X_train_cl = np.concatenate([X_train_svd, X_train_num_s], axis=1)
X_test_cl  = np.concatenate([X_test_svd, X_test_num_s], axis=1)

print("Final clustering matrices shapes -> train:", X_train_cl.shape, " test:", X_test_cl.shape)

Final clustering matrices shapes -> train: (75000, 203)  test: (75000, 203)


In [87]:
# CELL 5 — Prepare numeric extras and scale (fit scaler on train, transform test)
from sklearn.preprocessing import StandardScaler

num_cols = ['norm_quantity_fill','pack_count_fill','desc_len']
X_train_num = train_new[num_cols].fillna(0).astype(float).values # Use train_new
X_test_num  = val[num_cols].fillna(0).astype(float).values # Use val

scaler = StandardScaler()
X_train_num_s = scaler.fit_transform(X_train_num)
X_test_num_s  = scaler.transform(X_test_num)

joblib.dump(scaler, "scaler_num_extras.joblib")
print("Numeric extras shapes:", X_train_num_s.shape, X_test_num_s.shape)

Numeric extras shapes: (60000, 3) (15000, 3)


In [36]:
# Apply text cleaning to test data
def clean_text(s):
    if pd.isna(s): return s
    s = unicodedata.normalize('NFKC', str(s))
    s = re.sub(r'[\r\n\t]+', ' ', s)        # newlines -> space
    s = re.sub(r'[–—―]', '-', s)            # unify dashes
    s = re.sub(r'\s+', ' ', s).strip()     # collapse spaces
    # unify common units into simple tokens (conservative)
    s = re.sub(r'\bounces?\b', 'oz', flags=re.I, string=s)
    s = re.sub(r'\bfl\s*oz\b', 'fl_oz', flags=re.I, string=s)
    s = re.sub(r'\bounces?\b', 'oz', flags=re.I, string=s)
    s = re.sub(r'\bgrams\b', 'g', flags=re.I, string=s)
    s = re.sub(r'\bkilogram?s\b', 'kg', flags=re.I, string=s)
    return s

test['catalog_clean'] = test['catalog_content'].apply(clean_text)
print("Test catalog_clean created.")

Test catalog_clean created.


In [37]:
# Apply product_name_short extraction to test data
def make_product_name_short(s, max_len=200):
    if pd.isna(s):
        return ''
    # remove "Item Name:" prefix if present
    out = re.sub(r'Item Name:\s*', '', str(s), flags=re.I)
    # remove parenthetical "(Pack of ...)" and trailing unit e.g. "12 oz" etc.
    out = re.sub(r'\(.*?pack.*?\)', '', out, flags=re.I)
    out = re.sub(r'\b\d+\.?\d*\s*(fl_oz|oz|ml|g|kg|count|ct|pcs|pieces|bottle|bottles)\b', '', out, flags=re.I)
    # remove stray punctuation at ends and collapse spaces
    out = re.sub(r'[-–—]', ' ', out)
    out = re.sub(r'\s{2,}', ' ', out).strip()
    return out[:max_len]

test['product_name_short'] = test['catalog_clean'].apply(make_product_name_short)
print("Test product_name_short created.")

Test product_name_short created.


In [38]:
# Apply brand extraction to test data
def extract_brand(text):
    if pd.isna(text):
        return np.nan
    m = re.search(r'Item Name:\s*(.+)', str(text), flags=re.I)
    if not m:
        return np.nan
    rest = m.group(1)
    # split rest by common separators to get the initial segment
    first = re.split(r'[,\(\-]| - ', rest, maxsplit=1)[0].strip()
    # accept up to first 3 tokens if they look like a brand (start alnum or capital)
    tokens = first.split()
    if len(tokens)==0:
        return np.nan
    brand_tokens = []
    for t in tokens[:4]:
        if re.match(r'^[A-Za-z0-9&\']', t):
            brand_tokens.append(t)
        else:
            break
    brand = ' '.join(brand_tokens).strip()
    # sanity checks: avoid capturing generic words
    if len(brand) < 2 or brand.lower() in {'item','new','pack','sample'}:
        return np.nan
    return brand

test['brand_extracted'] = test['catalog_clean'].apply(extract_brand)
print("Test brand_extracted created.")

Test brand_extracted created.


In [39]:
# Apply brand_topK mapping to test data
K = 100 # Must match K used for training
def brand_topk_map(b):
    if pd.isna(b):
        return np.nan
    if b in topk_brands: # Use the topk_brands set created from training data
        return b
    return 'Other'

test['brand_topK'] = test['brand_extracted'].apply(brand_topk_map)
print("Test brand_topK created.")

Test brand_topK created.


In [40]:
# Create text_for_clustering for test data
def make_text_for_clustering(row):
    brand = row.get('brand_topK') if 'brand_topK' in row else row.get('brand_extracted','')
    brand = '' if pd.isna(brand) else str(brand)
    pname = row.get('product_name_short') if row.get('product_name_short') else ''
    if not pname:
        pname = row.get('catalog_clean','') or ''
    return (brand + ' ' + str(pname)).strip()

test['text_for_clustering'] = test.apply(make_text_for_clustering, axis=1)
print("Test text_for_clustering created.")

Test text_for_clustering created.


In [1]:
# CELL 5 (from plan) — Cluster the new training data with MiniBatchKMeans
from sklearn.cluster import MiniBatchKMeans
import joblib

K = 150   # Use the same K as before, or tune if needed
kmeans_new = MiniBatchKMeans(n_clusters=K, random_state=42, batch_size=5000, n_init=10) # Added n_init for clarity
kmeans_new.fit(X_train_cl_new)

train_new_labels = kmeans_new.predict(X_train_cl_new)
val_labels  = kmeans_new.predict(X_val_cl)

# attach labels to the new dataframes
train_new['cluster'] = train_new_labels
val['cluster']  = val_labels

# Optionally save the new kmeans model
joblib.dump(kmeans_new, "kmeans_clustering_new_train.joblib")

print("Cluster sizes (new train top 10):\n", pd.Series(train_new_labels).value_counts().head(10).to_dict())

NameError: name 'X_train_cl_new' is not defined

## Summary:

### Data Analysis Key Findings

* The SMAPE score for the `pred_cluster_median` predictions compared to the actual `price` was 38.2202%.
* The SMAPE score for the `pred_hybrid` predictions compared to the actual `price` was 39.2181%.

### Insights or Next Steps

* The `pred_cluster_median` model performed slightly better than the `pred_hybrid` model based on the SMAPE metric.
* Further analysis could investigate the specific instances where the `pred_hybrid` model performed worse to identify potential areas for improvement.

**Reasoning**:
The SMAPE function has been defined. Now, calculate the SMAPE for both prediction columns using the `smape` function and the 'price' column from the merged DataFrame as the true values.

In [8]:
smape_cluster_median = smape(df_merged['price'], df_merged['pred_cluster_median'])
smape_hybrid = smape(df_merged['price'], df_merged['pred_hybrid'])

print(f"SMAPE for cluster median baseline: {smape_cluster_median:.4f}")
print(f"SMAPE for hybrid prediction: {smape_hybrid:.4f}")

SMAPE for cluster median baseline: 38.2202
SMAPE for hybrid prediction: 39.2181


## Merge data

### Subtask:
Merge the two DataFrames on a common identifier (assuming 'sample_id').

**Reasoning**:
Merge the two DataFrames on a common identifier (assuming 'sample_id').

In [7]:
df_merged = pd.merge(df_predictions, df_actuals, on='sample_id', how='inner')
display(df_merged.head())

Unnamed: 0,sample_id,pred_cluster_median,pred_hybrid,price
0,100179,18.28,18.28,25.94581
1,245611,18.28,18.28,22.815003
2,146263,18.28,18.28,27.192009
3,95658,10.12,18.910109,17.980817
4,36806,18.28,18.28,31.610154


## Define smape function

### Subtask:
Create a Python function to calculate SMAPE based on the provided formula.

**Reasoning**:
Define the SMAPE function as requested in the instructions.

In [6]:
import numpy as np

def smape(y_true, y_pred):
    """
    Calculates the Symmetric Mean Absolute Percentage Error (SMAPE).

    Args:
        y_true: The true values.
        y_pred: The predicted values.

    Returns:
        The SMAPE score.
    """
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    # Add a small epsilon to avoid division by zero where both y_true and y_pred are zero
    epsilon = 1e-8
    ratio = numerator / (denominator + epsilon)
    return np.mean(ratio) * 100

## Load data

### Subtask:
Load `test_predictions.csv` and `test_out.csv` into pandas DataFrames.

**Reasoning**:
Load the two CSV files into pandas DataFrames and display their heads and columns to confirm they are loaded correctly.

In [5]:
import pandas as pd

df_predictions = pd.read_csv('test_predictions.csv')
df_actuals = pd.read_csv('test_out.csv')

print("df_predictions head:")
display(df_predictions.head())
print("\ndf_predictions columns:", df_predictions.columns.tolist())

print("\ndf_actuals head:")
display(df_actuals.head())
print("\ndf_actuals columns:", df_actuals.columns.tolist())

df_predictions head:


Unnamed: 0,sample_id,pred_cluster_median,pred_hybrid
0,100179,18.28,18.28
1,245611,18.28,18.28
2,146263,18.28,18.28
3,95658,10.12,18.910109
4,36806,18.28,18.28



df_predictions columns: ['sample_id', 'pred_cluster_median', 'pred_hybrid']

df_actuals head:


Unnamed: 0,sample_id,price
0,100179,25.94581
1,245611,22.815003
2,146263,27.192009
3,95658,17.980817
4,36806,31.610154



df_actuals columns: ['sample_id', 'price']


In [None]:
empty_cols = [c for c in df.columns if df[c].isna().all()]
print("Empty columns found:", empty_cols)
df = df.drop(columns=empty_cols)

Empty columns found: []


In [None]:
import re

keywords = ['organic','gluten-free','gluten','keto','sugar-free','premium','imported','baby','natural','diet']
for kw in keywords:
    df[f'kw_{kw.replace("-","_")}'] = df['catalog_clean'].str.contains(r'\b' + re.escape(kw) + r'\b', case=False, na=False)

# Check frequencies
freqs = {c: int(df[c].sum()) for c in df.columns if c.startswith('kw_')}
print(freqs)

{'kw_organic': 7603, 'kw_gluten_free': 1587, 'kw_gluten': 6612, 'kw_keto': 1967, 'kw_sugar_free': 411, 'kw_premium': 2114, 'kw_imported': 299, 'kw_baby': 657, 'kw_natural': 3786, 'kw_diet': 319}


In [None]:
import pandas as pd, numpy as np
PATH = "train_preprocessed_for_clustering.csv"   # change if filename differs
df = pd.read_csv(PATH)
print("rows, cols:", df.shape)
print("columns:", df.columns.tolist())
# quick preview
display(df.head(3))
# check target presence
print("price_clean: non-null / total =", df['price_clean'].notna().sum(), "/", len(df))

rows, cols: (75000, 19)
columns: ['sample_id', 'catalog_content', 'catalog_clean', 'product_name_short', 'brand_extracted', 'brand_topK', 'text_for_clustering', 'unit_value', 'unit_type', 'pack_count', 'total_units', 'norm_quantity', 'norm_unit', 'total_norm_quantity', 'price', 'price_clean', 'norm_quantity_fill', 'pack_count_fill', 'desc_len']


Unnamed: 0,sample_id,catalog_content,catalog_clean,product_name_short,brand_extracted,brand_topK,text_for_clustering,unit_value,unit_type,pack_count,total_units,norm_quantity,norm_unit,total_norm_quantity,price,price_clean,norm_quantity_fill,pack_count_fill,desc_len
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...","Item Name: La Victoria Green Taco Sauce Mild, ...","La Victoria Green Taco Sauce Mild,",La Victoria Green Taco,Other,"Other La Victoria Green Taco Sauce Mild,",12.0,oz,6.0,72.0,340.194,g,72.0,4.89,4.89,340.194,6.0,13
1,198967,"Item Name: Salerno Cookies, The Original Butte...","Item Name: Salerno Cookies, The Original Butte...","Salerno Cookies, The Original Butter Cookies,",Salerno Cookies,Other,"Other Salerno Cookies, The Original Butter Coo...",8.0,oz,4.0,32.0,226.796,g,32.0,13.12,13.12,226.796,4.0,13
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...","Item Name: Bear Creek Hearty Soup Bowl, Creamy...","Bear Creek Hearty Soup Bowl, Creamy Chicken wi...",Bear Creek Hearty Soup,Other,"Other Bear Creek Hearty Soup Bowl, Creamy Chic...",1.9,oz,6.0,11.4,53.86405,g,11.4,1.97,1.97,53.86405,6.0,16


price_clean: non-null / total = 75000 / 75000


In [None]:
try:
    df_test = pd.read_csv(test_path, engine='python')
    print("Successfully loaded test.csv using python engine.")
    print("test shape:", df_test.shape)
    print("test sample columns:", df_test.columns.tolist()[:20])
except Exception as e:
    print(f"Error reading test.csv with python engine: {e}")

Error reading test.csv with python engine: unexpected end of data


In [48]:
# Cell 1: change paths if needed
import pandas as pd, numpy as np, re, unicodedata
train_path = "sample_data/train_preprocessed_for_clustering.csv"   # your processed train file
test_raw_path = "sample_data/test.csv"   # replace with actual test filename (raw)
test_out_path = "test_preprocessed_for_clustering.csv"

# load files
train = pd.read_csv(train_path)
test  = pd.read_csv(test_raw_path)

print("train rows,cols:", train.shape)
print("test  rows,cols:", test.shape)
# quick col lists
print("Train cols sample:", train.columns.tolist()[:20])
print("Test cols sample:", test.columns.tolist()[:20])

train rows,cols: (75000, 19)
test  rows,cols: (75000, 3)
Train cols sample: ['sample_id', 'catalog_content', 'catalog_clean', 'product_name_short', 'brand_extracted', 'brand_topK', 'text_for_clustering', 'unit_value', 'unit_type', 'pack_count', 'total_units', 'norm_quantity', 'norm_unit', 'total_norm_quantity', 'price', 'price_clean', 'norm_quantity_fill', 'pack_count_fill', 'desc_len']
Test cols sample: ['sample_id', 'catalog_content', 'image_link']


In [19]:
# Apply text cleaning to test data
def clean_text(s):
    if pd.isna(s): return s
    s = unicodedata.normalize('NFKC', str(s))
    s = re.sub(r'[\r\n\t]+', ' ', s)        # newlines -> space
    s = re.sub(r'[–—―]', '-', s)            # unify dashes
    s = re.sub(r'\s+', ' ', s).strip()     # collapse spaces
    # unify common units into simple tokens (conservative)
    s = re.sub(r'\bounces?\b', 'oz', flags=re.I, string=s)
    s = re.sub(r'\bfl\s*oz\b', 'fl_oz', flags=re.I, string=s)
    s = re.sub(r'\bounces?\b', 'oz', flags=re.I, string=s)
    s = re.sub(r'\bgrams\b', 'g', flags=re.I, string=s)
    s = re.sub(r'\bkilogram?s\b', 'kg', flags=re.I, string=s)
    return s

test['catalog_clean'] = test['catalog_content'].apply(clean_text)
print("Test catalog_clean created.")

Test catalog_clean created.


In [20]:
# Apply product_name_short extraction to test data
def make_product_name_short(s, max_len=200):
    if pd.isna(s):
        return ''
    # remove "Item Name:" prefix if present
    out = re.sub(r'Item Name:\s*', '', str(s), flags=re.I)
    # remove parenthetical "(Pack of ...)" and trailing unit e.g. "12 oz" etc.
    out = re.sub(r'\(.*?pack.*?\)', '', out, flags=re.I)
    out = re.sub(r'\b\d+\.?\d*\s*(fl_oz|oz|ml|g|kg|count|ct|pcs|pieces|bottle|bottles)\b', '', out, flags=re.I)
    # remove stray punctuation at ends and collapse spaces
    out = re.sub(r'[-–—]', ' ', out)
    out = re.sub(r'\s{2,}', ' ', out).strip()
    return out[:max_len]

test['product_name_short'] = test['catalog_clean'].apply(make_product_name_short)
print("Test product_name_short created.")

Test product_name_short created.


In [67]:
# CELL 12 — Save predictions and artifacts
test[['sample_id','pred_cluster_median','pred_hybrid']].to_csv("test_predictions.csv", index=False)
print("Saved test_predictions.csv and saved models (tfidf,svd,scaler,kmeans,cluster_models).")

Saved test_predictions.csv and saved models (tfidf,svd,scaler,kmeans,cluster_models).


In [66]:
# CELL 11 — Predict on test with hybrid approach and evaluate
import numpy as np
preds = []
for i, row in test.iterrows(): # Changed df_test to test
    c = row['cluster']
    model = cluster_models.get(c, None)
    if model is None:
        preds.append(row['pred_cluster_median'])   # fallback median
    else:
        # Need to get the correct row from X_test_cl based on the original index of the test DataFrame
        # Assuming the index of X_test_cl aligns with the index of the test DataFrame after preprocessing
        x_row = X_test_cl[test.index.get_loc(row.name)].reshape(1,-1)
        # Assuming the model in Cell 10 predicts price directly, not log price
        p = model.predict(x_row)[0]
        preds.append(float(p))

test['pred_hybrid'] = preds

# Evaluation (Evaluation on test requires ground truth 'price_clean' which is not available)
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# y_true = test['price_clean'].values # This line caused the error previously
# y_pred = test['pred_hybrid'].values
# print("Hybrid Test RMSE: {:.4f}, MAE: {:.4f}, R2: {:.4f}".format(
#     mean_squared_error(y_true, y_pred, squared=False),
#     mean_absolute_error(y_true, y_pred),
#     r2_score(y_true, y_pred)))

print("Hybrid predictions (pred_hybrid) added to test data.")

Hybrid predictions (pred_hybrid) added to test data.


In [65]:
# CELL 10 — (Optional) Improve within-cluster using per-cluster regression (hybrid)

# Strategy:
# Train a small regressor for clusters with sufficient train rows (e.g., >=200).
# Use per-cluster models for those clusters; fallback to cluster median otherwise.

from sklearn.linear_model import Ridge
import joblib, numpy as np

cluster_models = {}
min_rows = 200
y_train = train['price_clean'].values # Use 'train' DataFrame

for c, grp in train.groupby('cluster'): # Use 'train' DataFrame for grouping
    if len(grp) < min_rows:
        cluster_models[c] = None
        continue
    idx = grp.index.to_numpy()
    # Ensure X_train_cl is available in the current scope
    try:
        Xc = X_train_cl[idx]     # use same transformed features
    except NameError:
        print(f"Error: X_train_cl is not defined. Please ensure Cell 6 was executed.")
        cluster_models[c] = None
        continue


    model = Ridge(alpha=1.0) # Example regressor, can be tuned
    model.fit(Xc, y_train[idx])
    cluster_models[c] = model

joblib.dump(cluster_models, "cluster_regression_models.joblib")

print("Per-cluster regression models trained where sufficient data was available.")

# Note: The evaluation part of the hybrid model is in the next step (Cell 11)

Per-cluster regression models trained where sufficient data was available.


In [62]:
# Re-running CELL 8 — Compute cluster-level stats on train and save them
# This is done to ensure cluster_stats is available for the next step.
cluster_stats = train.groupby('cluster')['price_clean'].agg(['count','median','mean','std']).rename(columns={'median':'median_price'}).reset_index()
display(cluster_stats.head(10))
# Save for later lookups
cluster_stats.to_csv("cluster_stats_train.csv", index=False)

print("\nSaved cluster_stats_train.csv")

Unnamed: 0,cluster,count,median_price,mean,std
0,0,1264,16.99,27.881551,36.926884
1,1,1172,7.96,11.670282,15.468718
2,2,84,9.085,14.127679,13.79166
3,3,132,6.35,12.035152,14.170129
4,4,418,10.44,15.316818,12.109764
5,5,43,11.85,18.211628,22.915131
6,6,28,40.595,50.572321,37.08575
7,7,397,21.99,25.290743,16.256973
8,8,897,10.99,18.27981,19.315072
9,9,1157,17.99,25.755627,34.943936



Saved cluster_stats_train.csv


In [64]:
# Re-running CELL 9 — Baseline prediction on test using train-cluster medians
# merge median_price into test using cluster id; fallback to global median
global_median = train['price_clean'].median()

# Remove 'median_price' from test if it exists to avoid merge conflicts
if 'median_price' in test.columns:
    test = test.drop(columns=['median_price'])

test = test.merge(cluster_stats[['cluster','median_price']], on='cluster', how='left')
test['pred_cluster_median'] = test['median_price'].fillna(global_median)

# Evaluation (Evaluation on test requires ground truth 'price_clean' which is not available)
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# y_true = test['price_clean'].values # This line caused the error
# y_pred = test['pred_cluster_median'].values

# rmse = mean_squared_error(y_true, y_pred, squared=False)
# mae  = mean_absolute_error(y_true, y_pred)
# r2   = r2_score(y_true, y_pred)

# print("Test baseline (cluster median) -> RMSE: {:.4f}, MAE: {:.4f}, R2: {:.4f}".format(rmse, mae, r2))

print("Baseline predictions (pred_cluster_median) added to test data.")

Baseline predictions (pred_cluster_median) added to test data.


In [21]:
# Apply brand extraction to test data
def extract_brand(text):
    if pd.isna(text):
        return np.nan
    m = re.search(r'Item Name:\s*(.+)', str(text), flags=re.I)
    if not m:
        return np.nan
    rest = m.group(1)
    # split rest by common separators to get the initial segment
    first = re.split(r'[,\(\-]| - ', rest, maxsplit=1)[0].strip()
    # accept up to first 3 tokens if they look like a brand (start alnum or capital)
    tokens = first.split()
    if len(tokens)==0:
        return np.nan
    brand_tokens = []
    for t in tokens[:4]:
        if re.match(r'^[A-Za-z0-9&\']', t):
            brand_tokens.append(t)
        else:
            break
    brand = ' '.join(brand_tokens).strip()
    # sanity checks: avoid capturing generic words
    if len(brand) < 2 or brand.lower() in {'item','new','pack','sample'}:
        return np.nan
    return brand

test['brand_extracted'] = test['catalog_clean'].apply(extract_brand)
print("Test brand_extracted created.")

Test brand_extracted created.


In [22]:
# Apply brand_topK mapping to test data
K = 100 # Must match K used for training
def brand_topk_map(b):
    if pd.isna(b):
        return np.nan
    if b in topk_brands: # Use the topk_brands set created from training data
        return b
    return 'Other'

test['brand_topK'] = test['brand_extracted'].apply(brand_topk_map)
print("Test brand_topK created.")

Test brand_topK created.


In [16]:
# Create text_for_clustering for test data
def make_text_for_clustering(row):
    brand = row.get('brand_topK') if 'brand_topK' in row else row.get('brand_extracted','')
    brand = '' if pd.isna(brand) else str(brand)
    pname = row.get('product_name_short') if row.get('product_name_short') else ''
    if not pname:
        pname = row.get('catalog_clean','') or ''
    return (brand + ' ' + str(pname)).strip()

test['text_for_clustering'] = test.apply(make_text_for_clustering, axis=1)
print("Test text_for_clustering created.")

Test text_for_clustering created.


In [85]:
# CELL 3 — Fit TF-IDF on train text and transform train+test
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1,2), min_df=5)
X_train_text = tfidf.fit_transform(train_new['text_for_clustering'].fillna('')) # Fit on train_new
X_val_text  = tfidf.transform(val['text_for_clustering'].fillna('')) # Transform val

joblib.dump(tfidf, "tfidf_for_clustering_new_train.joblib") # Save with new name
print("TF-IDF shapes -> new train:", X_train_text.shape, " val:", X_val_text.shape)

TF-IDF shapes -> new train: (60000, 30269)  val: (15000, 30269)


In [13]:
# Cell 2: build the same Top-K brand set used for train
K = 100   # must match the K used for train_preprocessing
# train may already have brand_extracted or brand_topK; prefer brand_extracted
brand_col = 'brand_extracted' if 'brand_extracted' in train.columns else ('brand' if 'brand' in train.columns else None)
assert brand_col is not None, "Train file lacks brand_extracted/brand column"

brand_counts = train[brand_col].fillna('UNKNOWN').value_counts()
topk_brands = set(brand_counts.head(K).index.tolist())
print("Top-K brands (sample 20):", list(topk_brands)[:20])

Top-K brands (sample 20): ['Wish', 'Torani Syrup', 'Pride Of India', 'Monin', 'Food to Live Organic', 'Starbucks Ground Coffee', 'Cheez', 'Beech', 'English Tea Store Loose', 'Fire Department Coffee', 'Coca', 'Starkist Tuna Creations', 'Luster Dust', 'Food to Live Dry', "Bob's Red Mill Organic", 'The Tao of Tea', 'Pop', 'Goya', 'Thick', 'The Bean Organic Coffee']


In [61]:
# CELL 9 — Baseline prediction on test using train-cluster medians
# merge median_price into test using cluster id; fallback to global median
global_median = train['price_clean'].median()
test = test.merge(cluster_stats[['cluster','median_price']], on='cluster', how='left')
test['pred_cluster_median'] = test['median_price'].fillna(global_median)

# Evaluate (Evaluation on test requires ground truth 'price_clean' which is not available)
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# y_true = test['price_clean'].values # This line caused the error
# y_pred = test['pred_cluster_median'].values

# rmse = mean_squared_error(y_true, y_pred, squared=False)
# mae  = mean_absolute_error(y_true, y_pred)
# r2   = r2_score(y_true, y_pred)

# print("Test baseline (cluster median) -> RMSE: {:.4f}, MAE: {:.4f}, R2: {:.4f}".format(rmse, mae, r2))

print("Baseline predictions (pred_cluster_median) added to test data.")

Baseline predictions (pred_cluster_median) added to test data.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

## Preprocess new training data

### Subtask:
Apply the necessary preprocessing steps (text cleaning, feature extraction, numeric fills, TF-IDF, SVD, scaling) to the new training set.

**Reasoning**:
The previous cell successfully split the training data. The next steps involve applying the same preprocessing steps to the new training set (`train_new`) as were originally applied to the full training set. This includes text cleaning, feature extraction, numeric field parsing, and creating the `text_for_clustering` column. I will combine steps 1-10 from the instructions into a single code block.

In [74]:
# 1. Define the clean_text function (already defined in previous cells, but redefining for clarity within this scope)
import re
import unicodedata
import pandas as pd
import numpy as np

def clean_text(s):
    if pd.isna(s): return s
    s = unicodedata.normalize('NFKC', str(s))
    s = re.sub(r'[\r\n\t]+', ' ', s)        # newlines -> space
    s = re.sub(r'[–—―]', '-', s)            # unify dashes
    s = re.sub(r'\s+', ' ', s).strip()     # collapse spaces
    # unify common units into simple tokens (conservative)
    s = re.sub(r'\bounces?\b', 'oz', flags=re.I, string=s)
    s = re.sub(r'\bfl\s*oz\b', 'fl_oz', flags=re.I, string=s)
    s = re.sub(r'\bounces?\b', 'oz', flags=re.I, string=s)
    s = re.sub(r'\bgrams\b', 'g', flags=re.I, string=s)
    s = re.sub(r'\bkilogram?s\b', 'kg', flags=re.I, string=s)
    return s

# 2. Apply the clean_text function
train_new['catalog_clean'] = train_new['catalog_content'].apply(clean_text)
print("train_new catalog_clean created.")

# 3. Define the make_product_name_short function (already defined, redefining)
def make_product_name_short(s, max_len=200):
    if pd.isna(s):
        return ''
    # remove "Item Name:" prefix if present
    out = re.sub(r'Item Name:\s*', '', str(s), flags=re.I)
    # remove parenthetical "(Pack of ...)" and trailing unit e.g. "12 oz" etc.
    out = re.sub(r'\(.*?pack.*?\)', '', out, flags=re.I)
    out = re.sub(r'\b\d+\.?\d*\s*(fl_oz|oz|ml|g|kg|count|ct|pcs|pieces|bottle|bottles)\b', '', out, flags=re.I)
    # remove stray punctuation at ends and collapse spaces
    out = re.sub(r'[-–—]', ' ', out)
    out = re.sub(r'\s{2,}', ' ', out).strip()
    return out[:max_len]

# 4. Apply the make_product_name_short function
train_new['product_name_short'] = train_new['catalog_clean'].apply(make_product_name_short)
print("train_new product_name_short created.")

# 5. Define the extract_brand function (already defined, redefining)
def extract_brand(text):
    if pd.isna(text):
        return np.nan
    m = re.search(r'Item Name:\s*(.+)', str(text), flags=re.I)
    if not m:
        return np.nan
    rest = m.group(1)
    # split rest by common separators to get the initial segment
    first = re.split(r'[,\(\-]| - ', rest, maxsplit=1)[0].strip()
    # accept up to first 3 tokens if they look like a brand (start alnum or capital)
    tokens = first.split()
    if len(tokens)==0:
        return np.nan
    brand_tokens = []
    for t in tokens[:4]:
        if re.match(r'^[A-Za-z0-9&\']', t):
            brand_tokens.append(t)
        else:
            break
    brand = ' '.join(brand_tokens).strip()
    # sanity checks: avoid capturing generic words
    if len(brand) < 2 or brand.lower() in {'item','new','pack','sample'}:
        return np.nan
    return brand

# 6. Apply the extract_brand function
train_new['brand_extracted'] = train_new['catalog_clean'].apply(extract_brand)
print("train_new brand_extracted created.")

# 7. Define the brand_topk_map function (already defined, redefining)
# K and topk_brands should be available from previous cells
def brand_topk_map(b):
    if pd.isna(b):
        return np.nan
    if b in topk_brands: # Use the topk_brands set created from original training data
        return b
    return 'Other'

# 8. Apply the brand_topk_map function
train_new['brand_topK'] = train_new['brand_extracted'].apply(brand_topk_map)
print("train_new brand_topK created.")

# 9. Define the make_text_for_clustering function (already defined, redefining)
def make_text_for_clustering(row):
    brand = row.get('brand_topK') if 'brand_topK' in row else row.get('brand_extracted','')
    brand = '' if pd.isna(brand) else str(brand)
    pname = row.get('product_name_short') if row.get('product_name_short') else ''
    if not pname:
        pname = row.get('catalog_clean','') or ''
    return (brand + ' ' + str(pname)).strip()

# 10. Apply the make_text_for_clustering function
train_new['text_for_clustering'] = train_new.apply(make_text_for_clustering, axis=1)
print("train_new text_for_clustering created.")

train_new catalog_clean created.
train_new product_name_short created.
train_new brand_extracted created.
train_new brand_topK created.
train_new text_for_clustering created.


## Split training data

### Subtask:
Split the original `train` DataFrame into a new training set and a validation set.

**Reasoning**:
Split the original train DataFrame into a new training set and a validation set using train_test_split and print their shapes.

In [73]:
from sklearn.model_selection import train_test_split

train_new, val = train_test_split(train, test_size=0.2, random_state=42)

print("Shape of new training set:", train_new.shape)
print("Shape of validation set:", val.shape)

Shape of new training set: (60000, 20)
Shape of validation set: (15000, 20)


In [11]:
import pandas as pd

train_path = "sample_data/train_preprocessed_for_clustering.csv"
test_path  = "sample_data/test.csv"   # Assuming original test file is named test.csv

df_train = pd.read_csv(train_path)
df_test  = pd.read_csv(test_path)

print("train shape:", df_train.shape)
print("test  shape:", df_test.shape)
print("train sample columns:", df_train.columns.tolist()[:20])
print("test sample columns:", df_test.columns.tolist()[:20])

train shape: (75000, 19)
test  shape: (75000, 3)
train sample columns: ['sample_id', 'catalog_content', 'catalog_clean', 'product_name_short', 'brand_extracted', 'brand_topK', 'text_for_clustering', 'unit_value', 'unit_type', 'pack_count', 'total_units', 'norm_quantity', 'norm_unit', 'total_norm_quantity', 'price', 'price_clean', 'norm_quantity_fill', 'pack_count_fill', 'desc_len']
test sample columns: ['sample_id', 'catalog_content', 'image_link']


**Reasoning**:
The previous step completed the text preprocessing for `train_new`. The next logical steps are to extract, normalize, and fill the numeric fields from the text, and then create the numeric features used for clustering. This corresponds to steps 11-16 in the instructions. I will combine these into a single code block.

In [77]:
# 11. Define the parse_numeric_fields function (already defined, redefining)
import re
import numpy as np
import pandas as pd # Ensure pandas is imported for pd.Series and pd.isna

pack_pat = re.compile(r'\bpack\s*of\s*(\d+)\b', flags=re.I)
pack_pat2 = re.compile(r'\b(\d+)\s*[-]?\s*pack\b', flags=re.I)
mult_pat = re.compile(r'(\d+)\s*[x×]\s*(\d+\.?\d*)\s*(fl_oz|oz|ml|g|kg)?', flags=re.I)
unit_pat = re.compile(r'(\d+\.?\d*)\s*(fl_oz|oz|ml|g|kg|count|ct|piece|pcs|bottle|bottles)\b', flags=re.I)

def parse_numeric_fields(text):
    out = {'pack_count': np.nan, 'unit_value': np.nan, 'unit_type': None, 'total_units': np.nan}
    if pd.isna(text):
        return out
    # pack_count
    m = pack_pat.search(text) or pack_pat2.search(text)
    if m:
        try: out['pack_count'] = int(m.group(1))
        except: out['pack_count'] = np.nan
    # multiplication like "2 x 250 ml"
    m = mult_pat.search(text)
    if m:
        try:
            num = float(m.group(1)); size = float(m.group(2)); unit = (m.group(3) or '').lower()
            out['total_units'] = num * size
            out['unit_value'] = size
            out['unit_type'] = unit if unit else None
        except:
            pass
    # simple unit pattern
    if np.isnan(out['unit_value']):
        m = unit_pat.search(text)
        if m:
            try:
                out['unit_value'] = float(m.group(1))
                out['unit_type'] = m.group(2).lower()
            except:
                pass
    return out

# 12. Apply the parse_numeric_fields function
parsed_new = train_new['catalog_clean'].apply(parse_numeric_fields).apply(pd.Series)
# No longer concatenating parsed_new directly to train_new here

# Create a new DataFrame with a clean index and copy relevant columns
train_new_processed = pd.DataFrame(index=train_new.index)
train_new_processed['catalog_clean'] = train_new['catalog_clean']
# Copy parsed columns to the new processed DataFrame
train_new_processed['pack_count'] = parsed_new['pack_count']
train_new_processed['unit_value'] = parsed_new['unit_value']
train_new_processed['unit_type'] = parsed_new['unit_type']
train_new_processed['total_units'] = parsed_new['total_units']


# 13. Define the normalize_unit function (already defined, redefining)
def normalize_unit(unit_value, unit_type):
    # Ensure scalar values are passed and handle potential Series
    if isinstance(unit_value, pd.Series) or isinstance(unit_type, pd.Series):
        # This should not happen with axis=1, but as a safeguard:
        return (np.nan, None)
    if pd.isna(unit_value) or pd.isna(unit_type):
        return (np.nan, None)
    try:
        val = float(unit_value)
    except (ValueError, TypeError):
        return (np.nan, None)
    u = str(unit_type).lower() # Ensure unit_type is a string

    if u == 'oz':
        # conservative assumption: treat oz as grams (documented!)
        return (val * 28.3495, 'g')
    if u == 'fl_oz':
        return (val * 29.5735, 'ml')
    if u == 'kg':
        return (val * 1000.0, 'g')
    if u == 'g':
        return (val, 'g')
    if u == 'ml':
        return (val, 'ml')
    if u in ('count','ct','piece','pcs','bottle','bottles'):
        return (val, 'count')
    return (val, u)

# 14. Apply the normalize_unit function to the processed DataFrame
norms_new = train_new_processed.apply(lambda r: normalize_unit(r['unit_value'], r['unit_type']), axis=1)
train_new_processed['norm_quantity'] = norms_new.map(lambda x: x[0])
train_new_processed['norm_unit'] = norms_new.map(lambda x: x[1])
print("train_new norm_quantity and norm_unit created.")

# 15. Fill missing total_units and total_norm_quantity
# Calculate the product first, then fill NaNs on the processed DataFrame
calculated_total_units = train_new_processed['unit_value'] * train_new_processed['pack_count']
train_new_processed['total_units'] = train_new_processed['total_units'].fillna(calculated_total_units)
train_new_processed['total_norm_quantity'] = train_new_processed['total_units'].fillna(calculated_total_units)

mask_new = train_new_processed['total_norm_quantity'].isna() & train_new_processed['norm_quantity'].notna() & train_new_processed['pack_count'].notna()
train_new_processed.loc[mask_new, 'total_norm_quantity'] = train_new_processed.loc[mask_new, 'norm_quantity'] * train_new_processed.loc[mask_new, 'pack_count']
print("train_new total_units and total_norm_quantity filled.")

# 16. Create numeric fills for clustering on the processed DataFrame
train_new_processed['norm_quantity_fill'] = train_new_processed.get('norm_quantity', pd.Series(index=train_new_processed.index)).fillna(0.0).astype(float)
train_new_processed['pack_count_fill'] = train_new_processed.get('pack_count', pd.Series(index=train_new_processed.index)).fillna(0.0).astype(float)
train_new_processed['desc_len'] = train_new_processed['catalog_clean'].str.split().str.len().fillna(0).astype(int)
print("train_new numeric fills (norm_quantity_fill, pack_count_fill, desc_len) created.")

# Copy the new processed columns back to the original train_new DataFrame
for col in ['pack_count', 'unit_value', 'unit_type', 'total_units',
            'norm_quantity', 'norm_unit', 'total_norm_quantity',
            'norm_quantity_fill', 'pack_count_fill', 'desc_len']:
    train_new[col] = train_new_processed[col]

train_new norm_quantity and norm_unit created.
train_new total_units and total_norm_quantity filled.
train_new numeric fills (norm_quantity_fill, pack_count_fill, desc_len) created.


In [84]:
# CELL 5 (from plan) — Cluster the new training data with MiniBatchKMeans
from sklearn.cluster import MiniBatchKMeans
import joblib

K = 150   # Use the same K as before, or tune if needed
kmeans_new = MiniBatchKMeans(n_clusters=K, random_state=42, batch_size=5000, n_init=10) # Added n_init for clarity
kmeans_new.fit(X_train_cl_new)

train_new_labels = kmeans_new.predict(X_train_cl_new)
val_labels  = kmeans_new.predict(X_val_cl)

# attach labels to the new dataframes
train_new['cluster'] = train_new_labels
val['cluster']  = val_labels

# Optionally save the new kmeans model
joblib.dump(kmeans_new, "kmeans_clustering_new_train.joblib")

print("Cluster sizes (new train top 10):\n", pd.Series(train_new_labels).value_counts().head(10).to_dict())

ValueError: Length of values (75000) does not match length of index (60000)

In [83]:
# CELL 4 (from plan) — Build final clustering matrices (concatenate SVD + scaled numerics)
import numpy as np
X_train_cl_new = np.concatenate([X_train_svd, X_train_num_s], axis=1)
X_val_cl = np.concatenate([X_val_svd, X_val_num_s], axis=1)

print("Final clustering matrices shapes -> new train:", X_train_cl_new.shape, " val:", X_val_cl.shape)

Final clustering matrices shapes -> new train: (75000, 203)  val: (15000, 203)


## Preprocess Validation Data

### Subtask:
Apply the same preprocessing steps (using the scalers and vectorizers fitted on the new training data) to the validation set.

**Reasoning**:
The previous step completed preprocessing for the new training data. Now, apply the same preprocessing steps to the validation set (`val`), using the transformers (TF-IDF, SVD, StandardScaler) that were fitted on `train_new`.

In [78]:
# Apply the same text cleaning to validation data
# Assuming clean_text, make_product_name_short, extract_brand, brand_topk_map, make_text_for_clustering functions are in scope
# Assuming topk_brands set is in scope from previous steps

val['catalog_clean'] = val['catalog_content'].apply(clean_text)
print("val catalog_clean created.")

val['product_name_short'] = val['catalog_clean'].apply(make_product_name_short)
print("val product_name_short created.")

val['brand_extracted'] = val['catalog_clean'].apply(extract_brand)
print("val brand_extracted created.")

val['brand_topK'] = val['brand_extracted'].apply(brand_topk_map)
print("val brand_topK created.")

val['text_for_clustering'] = val.apply(make_text_for_clustering, axis=1)
print("val text_for_clustering created.")

val catalog_clean created.
val product_name_short created.
val brand_extracted created.
val brand_topK created.
val text_for_clustering created.


In [79]:
# Apply the same numeric field parsing and normalization to validation data
# Assuming parse_numeric_fields and normalize_unit functions are in scope

parsed_val = val['catalog_clean'].apply(parse_numeric_fields).apply(pd.Series)

# Create a new DataFrame with a clean index and copy relevant columns
val_processed = pd.DataFrame(index=val.index)
val_processed['catalog_clean'] = val['catalog_clean']
val_processed['pack_count'] = parsed_val['pack_count']
val_processed['unit_value'] = parsed_val['unit_value']
val_processed['unit_type'] = parsed_val['unit_type']
val_processed['total_units'] = parsed_val['total_units']

norms_val = val_processed.apply(lambda r: normalize_unit(r['unit_value'], r['unit_type']), axis=1)
val_processed['norm_quantity'] = norms_val.map(lambda x: x[0])
val_processed['norm_unit'] = norms_val.map(lambda x: x[1])
print("val norm_quantity and norm_unit created.")

# Fill missing total_units and total_norm_quantity on the processed DataFrame
calculated_total_units_val = val_processed['unit_value'] * val_processed['pack_count']
val_processed['total_units'] = val_processed['total_units'].fillna(calculated_total_units_val)
val_processed['total_norm_quantity'] = val_processed['total_units'].fillna(calculated_total_units_val)

mask_val = val_processed['total_norm_quantity'].isna() & val_processed['norm_quantity'].notna() & val_processed['pack_count'].notna()
val_processed.loc[mask_val, 'total_norm_quantity'] = val_processed.loc[mask_val, 'norm_quantity'] * val_processed.loc[mask_val, 'pack_count']
print("val total_units and total_norm_quantity filled.")

# Create numeric fills for clustering on the processed DataFrame
val_processed['norm_quantity_fill'] = val_processed.get('norm_quantity', pd.Series(index=val_processed.index)).fillna(0.0).astype(float)
val_processed['pack_count_fill'] = val_processed.get('pack_count', pd.Series(index=val_processed.index)).fillna(0.0).astype(float)
val_processed['desc_len'] = val_processed['catalog_clean'].str.split().str.len().fillna(0).astype(int)
print("val numeric fills (norm_quantity_fill, pack_count_fill, desc_len) created.")

# Copy the new processed columns back to the original val DataFrame
for col in ['pack_count', 'unit_value', 'unit_type', 'total_units',
            'norm_quantity', 'norm_unit', 'total_norm_quantity',
            'norm_quantity_fill', 'pack_count_fill', 'desc_len']:
    val[col] = val_processed[col]

val norm_quantity and norm_unit created.
val total_units and total_norm_quantity filled.
val numeric fills (norm_quantity_fill, pack_count_fill, desc_len) created.


In [80]:
# Transform validation text data using TF-IDF fitted on new training data
# Assuming tfidf object is in scope from fitting on train_new
X_val_text = tfidf.transform(val['text_for_clustering'].fillna(''))
print("TF-IDF shape -> val:", X_val_text.shape)

TF-IDF shape -> val: (15000, 36778)


In [81]:
# Transform validation SVD using SVD fitted on new training data
# Assuming svd object is in scope from fitting on train_new_text
X_val_svd = svd.transform(X_val_text)
print("SVD output shape -> val:", X_val_svd.shape)

SVD output shape -> val: (15000, 200)


In [82]:
# Scale validation numeric features using scaler fitted on new training data
# Assuming scaler object is in scope from fitting on train_new_num
num_cols = ['norm_quantity_fill','pack_count_fill','desc_len']
X_val_num = val[num_cols].fillna(0).astype(float).values
X_val_num_s = scaler.transform(X_val_num)
print("Numeric extras shapes -> val:", X_val_num_s.shape)

Numeric extras shapes -> val: (15000, 3)


In [None]:
# Step 8F: list top brand_extracted tokens and inspect a few rows from a suspicious token
top_brands = df['brand_extracted'].value_counts().head(30)
display(top_brands)
# If you see suspicious tokens like "Food to" you can list sample rows:
suspicious = [b for b in top_brands.index if len(b.split())<=2 and any(ch.islower() for ch in b)]
display(suspicious[:10])
# show sample rows for first suspicious candidate (if any)
if suspicious:
    sample_token = suspicious[0]
    display(df[df['brand_extracted']==sample_token][['catalog_clean','brand_extracted','product_name_short']].sample(5, random_state=2).to_dict(orient='records'))

Unnamed: 0_level_0,count
brand_extracted,Unnamed: 1_level_1
Food to Live Organic,450
Amoretti,231
Food to Live,203
Fresh Roasted Coffee,148
Frontier Co,140
Jell,116
Davidson's Organics,110
Amazon Brand,94
Marshalls Creek Spices,91
Tiesta Tea,80


['Amoretti',
 'Frontier Co',
 'Jell',
 "Davidson's Organics",
 'Amazon Brand',
 'Tiesta Tea',
 'Kool',
 'Cheez',
 'Amazon Fresh',
 'Monin']

[{'catalog_clean': 'Item Name: Amoretti - Bloody Mary Craft Puree® 750 mL - Perfect for Brewing, Cocktails, and other Beverages, Made of Real Fruit, No Preservatives, Filtered, Super Concentrated, Fully Pasteurized, TTB Registered',
  'brand_extracted': 'Amoretti',
  'product_name_short': 'Amoretti Bloody Mary Craft Puree® Perfect for Brewing, Cocktails, and other Beverages, Made of Real Fruit, No Preservatives, Filtered, Super Concentrated, Fully Pasteurized, TTB Registered'},
 {'catalog_clean': 'Item Name: Amoretti - Iced Tea Lemonade Beverage Infusion, (10 lbs) - Drink Mix & Water Enhancer with Pump for Flavoring Cocktails, Waters, Teas, Mocktails, and other Beverages, Preservative Free, Gluten Free',
  'brand_extracted': 'Amoretti',
  'product_name_short': 'Amoretti Iced Tea Lemonade Beverage Infusion, (10 lbs) Drink Mix & Water Enhancer with Pump for Flavoring Cocktails, Waters, Teas, Mocktails, and other Beverages, Preservative Free, Gluten Free'},
 {'catalog_clean': 'Item Name: 

In [None]:
# Step 8E: numeric fills for clustering
df['norm_quantity_fill'] = df.get('norm_quantity', pd.Series()).fillna(0.0).astype(float)
df['pack_count_fill'] = df.get('pack_count', pd.Series()).fillna(0.0).astype(float)
df['desc_len'] = df['catalog_clean'].str.split().str.len().fillna(0).astype(int)

# quick stats
display(df[['norm_quantity_fill','pack_count_fill','desc_len']].describe().T)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
norm_quantity_fill,75000.0,237.409442,1004.894327,0.0,0.0,90.7184,340.194,215000.0
pack_count_fill,75000.0,3.313173,24.615385,0.0,0.0,0.0,3.0,2424.0
desc_len,75000.0,16.44676,7.899954,2.0,11.0,14.0,20.0,78.0


In [None]:
# Step 8D: final text_for_clustering
def make_text_for_clustering(row):
    brand = row.get('brand_topK') if 'brand_topK' in row else row.get('brand_extracted','')
    brand = '' if pd.isna(brand) else str(brand)
    pname = row.get('product_name_short') if row.get('product_name_short') else ''
    if not pname:
        pname = row.get('catalog_clean','') or ''
    return (brand + ' ' + str(pname)).strip()

df['text_for_clustering'] = df.apply(make_text_for_clustering, axis=1)

# Show a few examples
display(df['text_for_clustering'].sample(8, random_state=3).tolist())

['Other Lars Own Cloudberry Preserves',
 'Other Ritter Sport Alpine Milk Chocolate Bar',
 'Other High Protein, High Fiber Stuffed Snacks Gluten Free, Vegan, Non GMO, Low Glycemic, Plant Based Zero Added Sugar Healthy Snacks for Adults and Kids Rivalz Snacks',
 'Other Lindt LINDOR White Chocolate Truffles ,',
 'Other Hawaiian Paradise Coffee Sea Salt Caramel Single Serve Cups Compatible with Keurig K Cup Brewers',
 'Other Albers, Enriched Hominy Quick Grits,',
 'Other cookies and brownies parent listing (The Wild Oat Cookie)',
 "Other Andy's seasoning, Yellow fish breading for excellent tasting fish and shrimp Bag"]

In [None]:
# Step 8C: create brand_topK conservative mapping
K = 100   # change if you want more/less granularity
brand_counts = df['brand_extracted'].fillna('UNKNOWN').value_counts()
topk = set(brand_counts.head(K).index.tolist())

def brand_topk_map(b):
    if pd.isna(b):
        return np.nan
    if b in topk:
        return b
    return 'Other'

df['brand_topK'] = df['brand_extracted'].apply(brand_topk_map)

# Check distribution
display(df['brand_topK'].value_counts().head(20).to_dict())
display(df['brand_topK'].isna().mean())

{'Other': 70116,
 'Food to Live Organic': 450,
 'Amoretti': 231,
 'Food to Live': 203,
 'Fresh Roasted Coffee': 148,
 'Frontier Co': 140,
 'Jell': 116,
 "Davidson's Organics": 110,
 'Amazon Brand': 94,
 'Marshalls Creek Spices': 91,
 'Tiesta Tea': 80,
 'Big Dot of Happiness': 79,
 'Kool': 76,
 'Green Mountain Coffee Roasters': 73,
 'Crystal Light On The': 69,
 'Cheez': 66,
 'Amazon Fresh': 61,
 'The Bean Organic Coffee': 61,
 'Monin': 57,
 'The Republic of Tea': 57}

np.float64(0.0029333333333333334)

In [None]:
# Step 8B: conservative brand extraction
import re, numpy as np

def extract_brand(text):
    if pd.isna(text):
        return np.nan
    m = re.search(r'Item Name:\s*(.+)', str(text), flags=re.I)
    if not m:
        return np.nan
    rest = m.group(1)
    # split rest by common separators to get the initial segment
    first = re.split(r'[,\(\-]| - ', rest, maxsplit=1)[0].strip()
    # accept up to first 3 tokens if they look like a brand (start alnum or capital)
    tokens = first.split()
    if len(tokens)==0:
        return np.nan
    brand_tokens = []
    for t in tokens[:4]:
        if re.match(r'^[A-Za-z0-9&\']', t):
            brand_tokens.append(t)
        else:
            break
    brand = ' '.join(brand_tokens).strip()
    # sanity checks: avoid capturing generic words
    if len(brand) < 2 or brand.lower() in {'item','new','pack','sample'}:
        return np.nan
    return brand

df['brand_extracted'] = df['catalog_clean'].apply(extract_brand)
# Show top 30 candidate brands (non-null)
display(df['brand_extracted'].value_counts().head(30).to_dict())

{'Food to Live Organic': 450,
 'Amoretti': 231,
 'Food to Live': 203,
 'Fresh Roasted Coffee': 148,
 'Frontier Co': 140,
 'Jell': 116,
 "Davidson's Organics": 110,
 'Amazon Brand': 94,
 'Marshalls Creek Spices': 91,
 'Tiesta Tea': 80,
 'Big Dot of Happiness': 79,
 'Kool': 76,
 'Green Mountain Coffee Roasters': 73,
 'Crystal Light On The': 69,
 'Cheez': 66,
 'Amazon Fresh': 61,
 'The Bean Organic Coffee': 61,
 'The Republic of Tea': 57,
 'Monin': 57,
 'Pride Of India': 56,
 'From You Flowers': 53,
 "Bob's Red Mill Organic": 52,
 'NOW Foods': 52,
 'Pride of India': 52,
 "Peet's Coffee": 48,
 'NUT CRAVINGS': 47,
 'The Original Donut Shop': 46,
 'Pop': 46,
 'Mr & Mrs T': 46,
 'Malt': 44}

In [None]:
# Step 8A: create product_name_short conservatively
import re
import pandas as pd
# df = df.copy()   # work on a working copy if not already

def make_product_name_short(s, max_len=200):
    if pd.isna(s):
        return ''
    # remove "Item Name:" prefix if present
    out = re.sub(r'Item Name:\s*', '', str(s), flags=re.I)
    # remove parenthetical "(Pack of ...)" and trailing unit e.g. "12 oz" etc.
    out = re.sub(r'\(.*?pack.*?\)', '', out, flags=re.I)
    out = re.sub(r'\b\d+\.?\d*\s*(fl_oz|oz|ml|g|kg|count|ct|pcs|pieces|bottle|bottles)\b', '', out, flags=re.I)
    # remove stray punctuation at ends and collapse spaces
    out = re.sub(r'[-–—]', ' ', out)
    out = re.sub(r'\s{2,}', ' ', out).strip()
    return out[:max_len]

df['product_name_short'] = df['catalog_clean'].apply(make_product_name_short)
# Quick check: show 8 random examples
display(df[['catalog_clean','product_name_short']].sample(8, random_state=2).to_dict(orient='records'))

[{'catalog_clean': 'Item Name: Ka-Me Gluten Free Rice Noodles - Hong Kong Express Noodles Ready To Serve (Pack of 6)',
  'product_name_short': 'Ka Me Gluten Free Rice Noodles Hong Kong Express Noodles Ready To Serve'},
 {'catalog_clean': 'Item Name: LIFE SAVERS Wint-O-Green Breath Mint Hard Candy, Family Size, 23.3 oz Bag',
  'product_name_short': 'LIFE SAVERS Wint O Green Breath Mint Hard Candy, Family Size, Bag'},
 {'catalog_clean': 'Item Name: Ortega Yellow Corn Taco Shells, 4.9 oz, 12 Shells',
  'product_name_short': 'Ortega Yellow Corn Taco Shells, , 12 Shells'},
 {'catalog_clean': 'Item Name: Assorted Fruit Flavored Hard Candy Bulk Bag, 5 lb 10700156805',
  'product_name_short': 'Assorted Fruit Flavored Hard Candy Bulk Bag, 5 lb 10700156805'},
 {'catalog_clean': 'Item Name: Coffee-mate Sugar Free Italian Sweet Creme Liquid Coffee Creamer',
  'product_name_short': 'Coffee mate Sugar Free Italian Sweet Creme Liquid Coffee Creamer'},
 {'catalog_clean': 'Item Name: Pack of 2 Funfetti

In [None]:
def normalize_unit(unit_value, unit_type):
    if pd.isna(unit_value) or pd.isna(unit_type):
        return (np.nan, None)
    try:
        val = float(unit_value)
    except:
        return (np.nan, None)
    u = unit_type.lower()
    if u == 'oz':
        # conservative assumption: treat oz as grams (documented!)
        return (val * 28.3495, 'g')
    if u == 'fl_oz':
        return (val * 29.5735, 'ml')
    if u == 'kg':
        return (val * 1000.0, 'g')
    if u == 'g':
        return (val, 'g')
    if u == 'ml':
        return (val, 'ml')
    if u in ('count','ct','piece','pcs','bottle','bottles'):
        return (val, 'count')
    return (val, u)

norms = df.apply(lambda r: normalize_unit(r['unit_value'], r['unit_type']), axis=1)
df['norm_quantity'] = norms.map(lambda x: x[0])
df['norm_unit'] = norms.map(lambda x: x[1])

# If total_units not filled from parse, fill with unit_value*pack_count where both present
df['total_units'] = df['total_units'].fillna(df['unit_value'] * df['pack_count'])
# and a normalized total if possible
df['total_norm_quantity'] = df['total_units']
mask = df['total_norm_quantity'].isna() & df['norm_quantity'].notna() & df['pack_count'].notna()
df.loc[mask, 'total_norm_quantity'] = df.loc[mask, 'norm_quantity'] * df.loc[mask, 'pack_count']

print("norm_unit top:", df['norm_unit'].value_counts().head(10).to_dict())
print("rows with norm_quantity:", df['norm_quantity'].notna().mean())

norm_unit top: {'g': 41724, 'count': 5419, 'ml': 3872}
rows with norm_quantity: 0.6802


In [None]:
import re
import numpy as np

pack_pat = re.compile(r'\bpack\s*of\s*(\d+)\b', flags=re.I)
pack_pat2 = re.compile(r'\b(\d+)\s*[-]?\s*pack\b', flags=re.I)
mult_pat = re.compile(r'(\d+)\s*[x×]\s*(\d+\.?\d*)\s*(fl_oz|oz|ml|g|kg)?', flags=re.I)
unit_pat = re.compile(r'(\d+\.?\d*)\s*(fl_oz|oz|ml|g|kg|count|ct|piece|pcs|bottle|bottles)\b', flags=re.I)

def parse_numeric_fields(text):
    out = {'pack_count': np.nan, 'unit_value': np.nan, 'unit_type': None, 'total_units': np.nan}
    if pd.isna(text):
        return out
    # pack_count
    m = pack_pat.search(text) or pack_pat2.search(text)
    if m:
        try: out['pack_count'] = int(m.group(1))
        except: out['pack_count'] = np.nan
    # multiplication like "2 x 250 ml"
    m = mult_pat.search(text)
    if m:
        try:
            num = float(m.group(1)); size = float(m.group(2)); unit = (m.group(3) or '').lower()
            out['total_units'] = num * size
            out['unit_value'] = size
            out['unit_type'] = unit if unit else None
        except:
            pass
    # simple unit pattern
    if np.isnan(out['unit_value']):
        m = unit_pat.search(text)
        if m:
            try:
                out['unit_value'] = float(m.group(1))
                out['unit_type'] = m.group(2).lower()
            except:
                pass
    return out

parsed = df['catalog_clean'].apply(parse_numeric_fields).apply(pd.Series)
df = pd.concat([df, parsed], axis=1)
# Show coverage
print("unit_value present:", df['unit_value'].notna().mean())
print("pack_count present:", df['pack_count'].notna().mean())

unit_value present: 0.6827466666666666
pack_count present: 0.3556533333333333


# Task
Calculate the SMAPE on a validation set created by splitting the training data, using both the cluster median baseline and the hybrid prediction approaches. Report the SMAPE for both methods.

## Split training data

### Subtask:
Split the original `train` DataFrame into a new training set and a validation set.


**Reasoning**:
Split the original train DataFrame into a new training set and a validation set using train_test_split and print their shapes.



In [68]:
from sklearn.model_selection import train_test_split

train_new, val = train_test_split(train, test_size=0.2, random_state=42)

print("Shape of new training set:", train_new.shape)
print("Shape of validation set:", val.shape)

Shape of new training set: (60000, 20)
Shape of validation set: (15000, 20)


## Preprocess new training data

### Subtask:
Apply the necessary preprocessing steps (text cleaning, feature extraction, numeric fills, TF-IDF, SVD, scaling) to the new training set.


**Reasoning**:
The previous cell successfully split the training data. The next steps involve applying the same preprocessing steps to the new training set (`train_new`) as were originally applied to the full training set. This includes text cleaning, feature extraction, numeric field parsing, and creating the `text_for_clustering` column. I will combine steps 1-10 from the instructions into a single code block.



In [69]:
# 1. Define the clean_text function (already defined in previous cells, but redefining for clarity within this scope)
import re
import unicodedata
import pandas as pd
import numpy as np

def clean_text(s):
    if pd.isna(s): return s
    s = unicodedata.normalize('NFKC', str(s))
    s = re.sub(r'[\r\n\t]+', ' ', s)        # newlines -> space
    s = re.sub(r'[–—―]', '-', s)            # unify dashes
    s = re.sub(r'\s+', ' ', s).strip()     # collapse spaces
    # unify common units into simple tokens (conservative)
    s = re.sub(r'\bounces?\b', 'oz', flags=re.I, string=s)
    s = re.sub(r'\bfl\s*oz\b', 'fl_oz', flags=re.I, string=s)
    s = re.sub(r'\bounces?\b', 'oz', flags=re.I, string=s)
    s = re.sub(r'\bgrams\b', 'g', flags=re.I, string=s)
    s = re.sub(r'\bkilogram?s\b', 'kg', flags=re.I, string=s)
    return s

# 2. Apply the clean_text function
train_new['catalog_clean'] = train_new['catalog_content'].apply(clean_text)
print("train_new catalog_clean created.")

# 3. Define the make_product_name_short function (already defined, redefining)
def make_product_name_short(s, max_len=200):
    if pd.isna(s):
        return ''
    # remove "Item Name:" prefix if present
    out = re.sub(r'Item Name:\s*', '', str(s), flags=re.I)
    # remove parenthetical "(Pack of ...)" and trailing unit e.g. "12 oz" etc.
    out = re.sub(r'\(.*?pack.*?\)', '', out, flags=re.I)
    out = re.sub(r'\b\d+\.?\d*\s*(fl_oz|oz|ml|g|kg|count|ct|pcs|pieces|bottle|bottles)\b', '', out, flags=re.I)
    # remove stray punctuation at ends and collapse spaces
    out = re.sub(r'[-–—]', ' ', out)
    out = re.sub(r'\s{2,}', ' ', out).strip()
    return out[:max_len]

# 4. Apply the make_product_name_short function
train_new['product_name_short'] = train_new['catalog_clean'].apply(make_product_name_short)
print("train_new product_name_short created.")

# 5. Define the extract_brand function (already defined, redefining)
def extract_brand(text):
    if pd.isna(text):
        return np.nan
    m = re.search(r'Item Name:\s*(.+)', str(text), flags=re.I)
    if not m:
        return np.nan
    rest = m.group(1)
    # split rest by common separators to get the initial segment
    first = re.split(r'[,\(\-]| - ', rest, maxsplit=1)[0].strip()
    # accept up to first 3 tokens if they look like a brand (start alnum or capital)
    tokens = first.split()
    if len(tokens)==0:
        return np.nan
    brand_tokens = []
    for t in tokens[:4]:
        if re.match(r'^[A-Za-z0-9&\']', t):
            brand_tokens.append(t)
        else:
            break
    brand = ' '.join(brand_tokens).strip()
    # sanity checks: avoid capturing generic words
    if len(brand) < 2 or brand.lower() in {'item','new','pack','sample'}:
        return np.nan
    return brand

# 6. Apply the extract_brand function
train_new['brand_extracted'] = train_new['catalog_clean'].apply(extract_brand)
print("train_new brand_extracted created.")

# 7. Define the brand_topk_map function (already defined, redefining)
# K and topk_brands should be available from previous cells
def brand_topk_map(b):
    if pd.isna(b):
        return np.nan
    if b in topk_brands: # Use the topk_brands set created from original training data
        return b
    return 'Other'

# 8. Apply the brand_topk_map function
train_new['brand_topK'] = train_new['brand_extracted'].apply(brand_topk_map)
print("train_new brand_topK created.")

# 9. Define the make_text_for_clustering function (already defined, redefining)
def make_text_for_clustering(row):
    brand = row.get('brand_topK') if 'brand_topK' in row else row.get('brand_extracted','')
    brand = '' if pd.isna(brand) else str(brand)
    pname = row.get('product_name_short') if row.get('product_name_short') else ''
    if not pname:
        pname = row.get('catalog_clean','') or ''
    return (brand + ' ' + str(pname)).strip()

# 10. Apply the make_text_for_clustering function
train_new['text_for_clustering'] = train_new.apply(make_text_for_clustering, axis=1)
print("train_new text_for_clustering created.")

train_new catalog_clean created.
train_new product_name_short created.
train_new brand_extracted created.
train_new brand_topK created.
train_new text_for_clustering created.


**Reasoning**:
The previous step completed the text preprocessing for `train_new`. The next logical steps are to extract, normalize, and fill the numeric fields from the text, and then create the numeric features used for clustering. This corresponds to steps 11-16 in the instructions. I will combine these into a single code block.



In [70]:
# 11. Define the parse_numeric_fields function (already defined, redefining)
import re
import numpy as np
import pandas as pd # Ensure pandas is imported for pd.Series and pd.isna

pack_pat = re.compile(r'\bpack\s*of\s*(\d+)\b', flags=re.I)
pack_pat2 = re.compile(r'\b(\d+)\s*[-]?\s*pack\b', flags=re.I)
mult_pat = re.compile(r'(\d+)\s*[x×]\s*(\d+\.?\d*)\s*(fl_oz|oz|ml|g|kg)?', flags=re.I)
unit_pat = re.compile(r'(\d+\.?\d*)\s*(fl_oz|oz|ml|g|kg|count|ct|piece|pcs|bottle|bottles)\b', flags=re.I)

def parse_numeric_fields(text):
    out = {'pack_count': np.nan, 'unit_value': np.nan, 'unit_type': None, 'total_units': np.nan}
    if pd.isna(text):
        return out
    # pack_count
    m = pack_pat.search(text) or pack_pat2.search(text)
    if m:
        try: out['pack_count'] = int(m.group(1))
        except: out['pack_count'] = np.nan
    # multiplication like "2 x 250 ml"
    m = mult_pat.search(text)
    if m:
        try:
            num = float(m.group(1)); size = float(m.group(2)); unit = (m.group(3) or '').lower()
            out['total_units'] = num * size
            out['unit_value'] = size
            out['unit_type'] = unit if unit else None
        except:
            pass
    # simple unit pattern
    if np.isnan(out['unit_value']):
        m = unit_pat.search(text)
        if m:
            try:
                out['unit_value'] = float(m.group(1))
                out['unit_type'] = m.group(2).lower()
            except:
                pass
    return out

# 12. Apply the parse_numeric_fields function
parsed_new = train_new['catalog_clean'].apply(parse_numeric_fields).apply(pd.Series)
train_new = pd.concat([train_new, parsed_new], axis=1)
print("train_new numeric fields parsed and concatenated.")

# 13. Define the normalize_unit function (already defined, redefining)
def normalize_unit(unit_value, unit_type):
    # Ensure scalar values are passed and handle potential Series
    if isinstance(unit_value, pd.Series) or isinstance(unit_type, pd.Series):
        # This should not happen with axis=1, but as a safeguard:
        return (np.nan, None)
    if pd.isna(unit_value) or pd.isna(unit_type):
        return (np.nan, None)
    try:
        val = float(unit_value)
    except (ValueError, TypeError):
        return (np.nan, None)
    u = str(unit_type).lower() # Ensure unit_type is a string

    if u == 'oz':
        # conservative assumption: treat oz as grams (documented!)
        return (val * 28.3495, 'g')
    if u == 'fl_oz':
        return (val * 29.5735, 'ml')
    if u == 'kg':
        return (val * 1000.0, 'g')
    if u == 'g':
        return (val, 'g')
    if u == 'ml':
        return (val, 'ml')
    if u in ('count','ct','piece','pcs','bottle','bottles'):
        return (val, 'count')
    return (val, u)

# 14. Apply the normalize_unit function
norms_new = train_new.apply(lambda r: normalize_unit(r['unit_value'], r['unit_type']), axis=1)
train_new['norm_quantity'] = norms_new.map(lambda x: x[0])
train_new['norm_unit'] = norms_new.map(lambda x: x[1])
print("train_new norm_quantity and norm_unit created.")

# 15. Fill missing total_units and total_norm_quantity
train_new['total_units'] = train_new['total_units'].fillna(train_new['unit_value'] * train_new['pack_count'])
train_new['total_norm_quantity'] = train_new['total_units'].fillna(train_new['unit_value'] * train_new['pack_count'])

mask_new = train_new['total_norm_quantity'].isna() & train_new['norm_quantity'].notna() & train_new['pack_count'].notna()
train_new.loc[mask_new, 'total_norm_quantity'] = train_new.loc[mask_new, 'norm_quantity'] * train_new.loc[mask_new, 'pack_count']
print("train_new total_units and total_norm_quantity filled.")

# 16. Create numeric fills for clustering
train_new['norm_quantity_fill'] = train_new.get('norm_quantity', pd.Series(index=train_new.index)).fillna(0.0).astype(float)
train_new['pack_count_fill'] = train_new.get('pack_count', pd.Series(index=train_new.index)).fillna(0.0).astype(float)
train_new['desc_len'] = train_new['catalog_clean'].str.split().str.len().fillna(0).astype(int)
print("train_new numeric fills (norm_quantity_fill, pack_count_fill, desc_len) created.")

train_new numeric fields parsed and concatenated.
train_new norm_quantity and norm_unit created.


ValueError: Columns must be same length as key

**Reasoning**:
The previous step failed because I tried to assign a Series generated by `fillna` with a multiplication result back to a DataFrame column using a DataFrame-like assignment. The traceback indicates a `ValueError: Columns must be same length as key`. This likely happened because the result of the `fillna` operation on `train_new['total_units']` combined with the multiplication `train_new['unit_value'] * train_new['pack_count']` somehow resulted in a structure that pandas didn't interpret as a simple Series for assignment to a single column. I need to fix the assignment logic for `total_units` and `total_norm_quantity` filling. I will regenerate the entire code block (steps 11-16) to ensure the fix is applied correctly and include the print statements that were missed in the previous output.



In [71]:
# 11. Define the parse_numeric_fields function (already defined, redefining)
import re
import numpy as np
import pandas as pd # Ensure pandas is imported for pd.Series and pd.isna

pack_pat = re.compile(r'\bpack\s*of\s*(\d+)\b', flags=re.I)
pack_pat2 = re.compile(r'\b(\d+)\s*[-]?\s*pack\b', flags=re.I)
mult_pat = re.compile(r'(\d+)\s*[x×]\s*(\d+\.?\d*)\s*(fl_oz|oz|ml|g|kg)?', flags=re.I)
unit_pat = re.compile(r'(\d+\.?\d*)\s*(fl_oz|oz|ml|g|kg|count|ct|piece|pcs|bottle|bottles)\b', flags=re.I)

def parse_numeric_fields(text):
    out = {'pack_count': np.nan, 'unit_value': np.nan, 'unit_type': None, 'total_units': np.nan}
    if pd.isna(text):
        return out
    # pack_count
    m = pack_pat.search(text) or pack_pat2.search(text)
    if m:
        try: out['pack_count'] = int(m.group(1))
        except: out['pack_count'] = np.nan
    # multiplication like "2 x 250 ml"
    m = mult_pat.search(text)
    if m:
        try:
            num = float(m.group(1)); size = float(m.group(2)); unit = (m.group(3) or '').lower()
            out['total_units'] = num * size
            out['unit_value'] = size
            out['unit_type'] = unit if unit else None
        except:
            pass
    # simple unit pattern
    if np.isnan(out['unit_value']):
        m = unit_pat.search(text)
        if m:
            try:
                out['unit_value'] = float(m.group(1))
                out['unit_type'] = m.group(2).lower()
            except:
                pass
    return out

# 12. Apply the parse_numeric_fields function
parsed_new = train_new['catalog_clean'].apply(parse_numeric_fields).apply(pd.Series)
train_new = pd.concat([train_new, parsed_new], axis=1)
print("train_new numeric fields parsed and concatenated.")

# 13. Define the normalize_unit function (already defined, redefining)
def normalize_unit(unit_value, unit_type):
    # Ensure scalar values are passed and handle potential Series
    if isinstance(unit_value, pd.Series) or isinstance(unit_type, pd.Series):
        # This should not happen with axis=1, but as a safeguard:
        return (np.nan, None)
    if pd.isna(unit_value) or pd.isna(unit_type):
        return (np.nan, None)
    try:
        val = float(unit_value)
    except (ValueError, TypeError):
        return (np.nan, None)
    u = str(unit_type).lower() # Ensure unit_type is a string

    if u == 'oz':
        # conservative assumption: treat oz as grams (documented!)
        return (val * 28.3495, 'g')
    if u == 'fl_oz':
        return (val * 29.5735, 'ml')
    if u == 'kg':
        return (val * 1000.0, 'g')
    if u == 'g':
        return (val, 'g')
    if u == 'ml':
        return (val, 'ml')
    if u in ('count','ct','piece','pcs','bottle','bottles'):
        return (val, 'count')
    return (val, u)

# 14. Apply the normalize_unit function
norms_new = train_new.apply(lambda r: normalize_unit(r['unit_value'], r['unit_type']), axis=1)
train_new['norm_quantity'] = norms_new.map(lambda x: x[0])
train_new['norm_unit'] = norms_new.map(lambda x: x[1])
print("train_new norm_quantity and norm_unit created.")

# 15. Fill missing total_units and total_norm_quantity
# Calculate the product first, then fill NaNs
calculated_total_units = train_new['unit_value'] * train_new['pack_count']
train_new['total_units'] = train_new['total_units'].fillna(calculated_total_units)
train_new['total_norm_quantity'] = train_new['total_units'].copy() # Start with filled total_units

mask_new = train_new['total_norm_quantity'].isna() & train_new['norm_quantity'].notna() & train_new['pack_count'].notna()
train_new.loc[mask_new, 'total_norm_quantity'] = train_new.loc[mask_new, 'norm_quantity'] * train_new.loc[mask_new, 'pack_count']
print("train_new total_units and total_norm_quantity filled.")

# 16. Create numeric fills for clustering
train_new['norm_quantity_fill'] = train_new.get('norm_quantity', pd.Series(index=train_new.index)).fillna(0.0).astype(float)
train_new['pack_count_fill'] = train_new.get('pack_count', pd.Series(index=train_new.index)).fillna(0.0).astype(float)
train_new['desc_len'] = train_new['catalog_clean'].str.split().str.len().fillna(0).astype(int)
print("train_new numeric fills (norm_quantity_fill, pack_count_fill, desc_len) created.")

train_new numeric fields parsed and concatenated.
train_new norm_quantity and norm_unit created.


ValueError: Columns must be same length as key

**Reasoning**:
The previous attempt to fill `total_norm_quantity` also resulted in a `ValueError`. It seems the issue might be with how pandas handles the `.copy()` operation in this context or a lingering effect from the previous assignment attempt. I will try a more explicit way to assign the filled `total_units` to `total_norm_quantity` by accessing the underlying NumPy array or by using `.loc` for assignment. I will regenerate the entire code block (steps 11-16) again with this adjustment.



In [72]:
# 11. Define the parse_numeric_fields function (already defined, redefining)
import re
import numpy as np
import pandas as pd # Ensure pandas is imported for pd.Series and pd.isna

pack_pat = re.compile(r'\bpack\s*of\s*(\d+)\b', flags=re.I)
pack_pat2 = re.compile(r'\b(\d+)\s*[-]?\s*pack\b', flags=re.I)
mult_pat = re.compile(r'(\d+)\s*[x×]\s*(\d+\.?\d*)\s*(fl_oz|oz|ml|g|kg)?', flags=re.I)
unit_pat = re.compile(r'(\d+\.?\d*)\s*(fl_oz|oz|ml|g|kg|count|ct|piece|pcs|bottle|bottles)\b', flags=re.I)

def parse_numeric_fields(text):
    out = {'pack_count': np.nan, 'unit_value': np.nan, 'unit_type': None, 'total_units': np.nan}
    if pd.isna(text):
        return out
    # pack_count
    m = pack_pat.search(text) or pack_pat2.search(text)
    if m:
        try: out['pack_count'] = int(m.group(1))
        except: out['pack_count'] = np.nan
    # multiplication like "2 x 250 ml"
    m = mult_pat.search(text)
    if m:
        try:
            num = float(m.group(1)); size = float(m.group(2)); unit = (m.group(3) or '').lower()
            out['total_units'] = num * size
            out['unit_value'] = size
            out['unit_type'] = unit if unit else None
        except:
            pass
    # simple unit pattern
    if np.isnan(out['unit_value']):
        m = unit_pat.search(text)
        if m:
            try:
                out['unit_value'] = float(m.group(1))
                out['unit_type'] = m.group(2).lower()
            except:
                pass
    return out

# 12. Apply the parse_numeric_fields function
parsed_new = train_new['catalog_clean'].apply(parse_numeric_fields).apply(pd.Series)
train_new = pd.concat([train_new, parsed_new], axis=1)
print("train_new numeric fields parsed and concatenated.")

# 13. Define the normalize_unit function (already defined, redefining)
def normalize_unit(unit_value, unit_type):
    # Ensure scalar values are passed and handle potential Series
    if isinstance(unit_value, pd.Series) or isinstance(unit_type, pd.Series):
        # This should not happen with axis=1, but as a safeguard:
        return (np.nan, None)
    if pd.isna(unit_value) or pd.isna(unit_type):
        return (np.nan, None)
    try:
        val = float(unit_value)
    except (ValueError, TypeError):
        return (np.nan, None)
    u = str(unit_type).lower() # Ensure unit_type is a string

    if u == 'oz':
        # conservative assumption: treat oz as grams (documented!)
        return (val * 28.3495, 'g')
    if u == 'fl_oz':
        return (val * 29.5735, 'ml')
    if u == 'kg':
        return (val * 1000.0, 'g')
    if u == 'g':
        return (val, 'g')
    if u == 'ml':
        return (val, 'ml')
    if u in ('count','ct','piece','pcs','bottle','bottles'):
        return (val, 'count')
    return (val, u)

# 14. Apply the normalize_unit function
norms_new = train_new.apply(lambda r: normalize_unit(r['unit_value'], r['unit_type']), axis=1)
train_new['norm_quantity'] = norms_new.map(lambda x: x[0])
train_new['norm_unit'] = norms_new.map(lambda x: x[1])
print("train_new norm_quantity and norm_unit created.")

# 15. Fill missing total_units and total_norm_quantity
# Calculate the product first, then fill NaNs
calculated_total_units = train_new['unit_value'] * train_new['pack_count']
train_new['total_units'] = train_new['total_units'].fillna(calculated_total_units)
# Use .values to ensure a simple array is assigned
train_new['total_norm_quantity'] = train_new['total_units'].values # Start with filled total_units

mask_new = train_new['total_norm_quantity'].isna() & train_new['norm_quantity'].notna() & train_new['pack_count'].notna()
train_new.loc[mask_new, 'total_norm_quantity'] = train_new.loc[mask_new, 'norm_quantity'] * train_new.loc[mask_new, 'pack_count']
print("train_new total_units and total_norm_quantity filled.")

# 16. Create numeric fills for clustering
train_new['norm_quantity_fill'] = train_new.get('norm_quantity', pd.Series(index=train_new.index)).fillna(0.0).astype(float)
train_new['pack_count_fill'] = train_new.get('pack_count', pd.Series(index=train_new.index)).fillna(0.0).astype(float)
train_new['desc_len'] = train_new['catalog_clean'].str.split().str.len().fillna(0).astype(int)
print("train_new numeric fills (norm_quantity_fill, pack_count_fill, desc_len) created.")

train_new numeric fields parsed and concatenated.
train_new norm_quantity and norm_unit created.


ValueError: cannot reindex on an axis with duplicate labels

# Task
Calculate the SMAPE score for the 'pred_cluster_median' and 'pred_hybrid' columns in "test_predictions.csv" using the 'price_clean' column in "test_out.csv" as the ground truth. Report the SMAPE scores.

## Load data

### Subtask:
Load `test_predictions.csv` and `test_out.csv` into pandas DataFrames.


**Reasoning**:
Load the two CSV files into pandas DataFrames and display their heads and columns to confirm they are loaded correctly.



In [1]:
import pandas as pd

df_predictions = pd.read_csv('test_predictions.csv')
df_actuals = pd.read_csv('test_out.csv')

print("df_predictions head:")
display(df_predictions.head())
print("\ndf_predictions columns:", df_predictions.columns.tolist())

print("\ndf_actuals head:")
display(df_actuals.head())
print("\ndf_actuals columns:", df_actuals.columns.tolist())

df_predictions head:


Unnamed: 0,sample_id,pred_cluster_median,pred_hybrid
0,100179,18.28,18.28
1,245611,18.28,18.28
2,146263,18.28,18.28
3,95658,10.12,18.910109
4,36806,18.28,18.28



df_predictions columns: ['sample_id', 'pred_cluster_median', 'pred_hybrid']

df_actuals head:


Unnamed: 0,sample_id,price
0,100179,25.94581
1,245611,22.815003
2,146263,27.192009
3,95658,17.980817
4,36806,31.610154



df_actuals columns: ['sample_id', 'price']


## Merge data

### Subtask:
Merge the two DataFrames on a common identifier (assuming 'sample_id').


**Reasoning**:
Merge the two DataFrames on a common identifier (assuming 'sample_id').



In [2]:
df_merged = pd.merge(df_predictions, df_actuals, on='sample_id', how='inner')
display(df_merged.head())

Unnamed: 0,sample_id,pred_cluster_median,pred_hybrid,price
0,100179,18.28,18.28,25.94581
1,245611,18.28,18.28,22.815003
2,146263,18.28,18.28,27.192009
3,95658,10.12,18.910109,17.980817
4,36806,18.28,18.28,31.610154


## Define smape function

### Subtask:
Create a Python function to calculate SMAPE based on the provided formula.


**Reasoning**:
Define the SMAPE function as requested in the instructions.



In [3]:
import numpy as np

def smape(y_true, y_pred):
    """
    Calculates the Symmetric Mean Absolute Percentage Error (SMAPE).

    Args:
        y_true: The true values.
        y_pred: The predicted values.

    Returns:
        The SMAPE score.
    """
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    # Add a small epsilon to avoid division by zero where both y_true and y_pred are zero
    epsilon = 1e-8
    ratio = numerator / (denominator + epsilon)
    return np.mean(ratio) * 100

**Reasoning**:
The SMAPE function has been defined. Now, calculate the SMAPE for both prediction columns using the `smape` function and the 'price' column from the merged DataFrame as the true values.



In [4]:
smape_cluster_median = smape(df_merged['price'], df_merged['pred_cluster_median'])
smape_hybrid = smape(df_merged['price'], df_merged['pred_hybrid'])

print(f"SMAPE for cluster median baseline: {smape_cluster_median:.4f}")
print(f"SMAPE for hybrid prediction: {smape_hybrid:.4f}")

SMAPE for cluster median baseline: 38.2202
SMAPE for hybrid prediction: 39.2181


## Summary:

### Data Analysis Key Findings

*   The SMAPE score for the `pred_cluster_median` predictions compared to the actual `price` was 38.2202%.
*   The SMAPE score for the `pred_hybrid` predictions compared to the actual `price` was 39.2181%.

### Insights or Next Steps

*   The `pred_cluster_median` model performed slightly better than the `pred_hybrid` model based on the SMAPE metric.
*   Further analysis could investigate the specific instances where the `pred_hybrid` model performed worse to identify potential areas for improvement.
