In [31]:
import re
import io
import pandas as pd
import numpy as np
from typing import List, Callable, Dict
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
train_df = pd.read_csv("./train_with_features_384.csv")
test_df = pd.read_csv("./test_with_features_384.csv")

In [33]:
train_df.columns

Index(['sample_id', 'catalog_content', 'image_link', 'price', 'parsed_input',
       'hidden_0', 'hidden_1', 'hidden_2', 'hidden_3', 'hidden_4',
       ...
       'hidden_758', 'hidden_759', 'hidden_760', 'hidden_761', 'hidden_762',
       'hidden_763', 'hidden_764', 'hidden_765', 'hidden_766', 'hidden_767'],
      dtype='object', length=773)

In [34]:
train_df.head()

Unnamed: 0,sample_id,catalog_content,image_link,price,parsed_input,hidden_0,hidden_1,hidden_2,hidden_3,hidden_4,...,hidden_758,hidden_759,hidden_760,hidden_761,hidden_762,hidden_763,hidden_764,hidden_765,hidden_766,hidden_767
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89,"La Victoria Green Taco Sauce Mild, 12 Ounce (P...",0.000263,0.044933,-0.029134,0.021077,-0.009784,...,-0.00576,0.039948,0.021019,-0.005191,0.011502,-0.009,0.051639,-0.010311,-0.032146,0.002894
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12,"Salerno Cookies, The Original Butter Cookies, ...",-0.010102,0.034376,-0.007231,0.019871,0.007795,...,-0.063794,0.037696,-0.001306,0.042444,0.022255,-0.002965,0.092735,-0.003079,-0.040803,0.038916
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97,"Bear Creek Hearty Soup Bowl, Creamy Chicken wi...",0.028091,0.038097,-0.011193,0.035106,0.021745,...,-0.031041,0.023984,0.000819,0.018358,0.016486,0.002006,0.043939,-0.050079,-0.042624,0.008628
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34,Judee’s Blue Cheese Powder 11.25 oz - Gluten-F...,-0.009852,0.039714,-0.035323,0.043441,0.007543,...,-0.036955,0.007519,0.030852,0.025755,0.019191,-0.005598,0.020466,-0.031266,-0.051086,0.030734
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49,"kedem Sherry Cooking Wine, 12.7 Ounce - 12 per...",0.010596,0.012416,0.008149,0.038313,0.014897,...,-0.050469,0.034222,0.011103,0.018327,0.022207,-0.013384,0.071318,-0.043456,-0.030218,0.042696


In [5]:
# %pip install scikit-learn

In [36]:
from sklearn.model_selection import train_test_split

# # df → your dataframe
train_df_train, train_df_val = train_test_split(
    train_df,
    test_size=0.01,       # 20% test split
    random_state=42,     # reproducibility
    shuffle=True         # always shuffle before splitting
)

price_threshold = train_df_train["price"].quantile(0.99)
train_df_train = train_df_train[train_df_train["price"] <= price_threshold].reset_index(drop=True)
train_df_val = train_df_val.reset_index(drop=True)

In [37]:
train_df_val.columns

Index(['sample_id', 'catalog_content', 'image_link', 'price', 'parsed_input',
       'hidden_0', 'hidden_1', 'hidden_2', 'hidden_3', 'hidden_4',
       ...
       'hidden_758', 'hidden_759', 'hidden_760', 'hidden_761', 'hidden_762',
       'hidden_763', 'hidden_764', 'hidden_765', 'hidden_766', 'hidden_767'],
      dtype='object', length=773)

In [38]:
# Sequential, readable, and easily-extendable text-only feature engineering pipeline.
# This will run on the CSV you provided earlier and print a preview of new, meaningful features.
# Each step is its own function and returns the dataframe (so steps can be applied in order).
# To add a new feature step, create a function with signature `def step_xxx(df: pd.DataFrame) -> pd.DataFrame:`
# and append it to `pipeline_steps` in `run_text_feature_pipeline`.


# ---------- Helpers ----------
def _clean_text(s: str) -> str:
    if pd.isna(s):
        return ""
    s = str(s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

def _lower(s: str) -> str:
    return _clean_text(s).lower()

# ---------- Step 1: Parse catalog_content ----------
def step_1_parse_catalog(df: pd.DataFrame, catalog_col: str = 'catalog_content') -> pd.DataFrame:
    """
    Parse raw catalog_content into structured fields:
      - item_name (string)
      - bullets (list of strings)
      - value_total (numeric if present)
      - unit_field (string if present)
    """
    def extract_item_name(s: str) -> str:
        m = re.search(r'Item Name:\s*(.*?)(?:\n|$)', str(s), flags=re.I|re.S)
        return _clean_text(m.group(1)) if m else ""
    def extract_bullets(s: str) -> List[str]:
        bullets = re.findall(r'Bullet Point\s*\d+:\s*(.*?)(?:\n|$)', str(s), flags=re.I|re.S)
        return [ _clean_text(b) for b in bullets ]
    def extract_value(s: str) -> float:
        m = re.search(r'Value:\s*([0-9]+(?:\.[0-9]+)?)', str(s), flags=re.I)
        return float(m.group(1)) if m else np.nan
    def extract_unit_field(s: str) -> str:
        m = re.search(r'Unit:\s*([A-Za-z\. ]+)', str(s), flags=re.I)
        return _clean_text(m.group(1)) if m else ""

    df = df.copy()
    df['item_name_raw'] = df[catalog_col].apply(extract_item_name)
    df['bullets_list'] = df[catalog_col].apply(extract_bullets)
    df['value_total'] = df[catalog_col].apply(extract_value)
    df['unit_field_raw'] = df[catalog_col].apply(extract_unit_field)
    return df

# ---------- Step 2: Brand candidate heuristics ----------
PRODUCT_STOP_WORDS = {
    'pack','of','oz','ounce','ounces','fl','fl.','fl oz','pack of','box','bottle',
    'cup','cups','tray','single','serve','packets','packet','bag','box','lb','pound'
}

def step_2_brand_candidate(df: pd.DataFrame, title_col: str = 'item_name_raw') -> pd.DataFrame:
    """
    Create brand candidate features:
      - brand_candidate_15 : first 15 characters (lowercased)
      - brand_candidate_2words : first up-to-two alphabetic words (max 2 words), lowercased
      - brand_candidate (preferred) : prefer 2-words if they don't look like product words
    """
    def candidate_15(s: str) -> str:
        return _lower(s[:15])
    def candidate_2words(s: str) -> str:
        s = _lower(s)
        words = re.findall(r"[a-zA-Z']+", s)
        picked = []
        for w in words[:2]:
            if w in PRODUCT_STOP_WORDS:
                break
            picked.append(w)
        return " ".join(picked) if picked else _lower(words[0]) if words else ""
    def choose_preferred(s: str):
        two = candidate_2words(s)
        if two and len(two) > 0:
            return two
        return candidate_15(s)

    df = df.copy()
    df['brand_candidate_15'] = df[title_col].fillna("").apply(candidate_15)
    df['brand_candidate_2words'] = df[title_col].fillna("").apply(candidate_2words)
    df['brand_candidate'] = df[title_col].fillna("").apply(choose_preferred)
    return df

# ---------- Step 3: Size and pack extraction (text-only) ----------
UNIT_MAP = {
    'oz': 'oz', 'ounce': 'oz', 'ounces': 'oz', 'fl oz': 'fl_oz', 'fl. oz': 'fl_oz',
    'g': 'g', 'gram': 'g', 'grams': 'g', 'kg': 'kg', 'ml': 'ml', 'l': 'l', 'lb': 'lb', 'pound': 'lb'
}

def step_3_extract_size_pack(df: pd.DataFrame, title_col: str = 'item_name_raw') -> pd.DataFrame:
    """
    Extract size (numeric) and unit (normalized), and pack count from title text (or infer from value_total).
    Only uses text fields already present (no price usage).
    """
    def find_size_unit(s: str):
        # common patterns: "12 Ounce", "1.9 Ounce", "12 fl oz", "12fl oz", "12oz"
        s0 = s.lower()
        m = re.search(r'(\d+(?:\.\d+)?)\s*(fl oz|fl\. oz|ounce|ounces|oz|g|grams|gram|kg|ml|l|pound|lb)\b', s0)
        if m:
            val = float(m.group(1))
            raw_unit = m.group(2)
            normalized = UNIT_MAP.get(raw_unit.replace('.', '').strip(), raw_unit)
            return val, normalized
        # fallback: contiguous digits followed by oz or g with no space
        m2 = re.search(r'(\d+(?:\.\d+)?)(oz|g|kg|ml|l)\b', s.lower())
        if m2:
            val = float(m2.group(1))
            normalized = UNIT_MAP.get(m2.group(2), m2.group(2))
            return val, normalized
        return np.nan, ""

    def find_pack_count(s: str):
        s0 = s.lower()
        # explicit "pack of N", "(Pack of N)", "Pack N", "N pack"
        m = re.search(r'pack(?:\s*of)?\s*(\d+)', s0)
        if m:
            return int(m.group(1))
        m2 = re.search(r'\b(\d+)\s*-\s*pack\b', s0)
        if m2:
            return int(m2.group(1))
        # "4-pack" or "6 pack" or "6-pack"
        m3 = re.search(r'(\d+)[\s-]*pack\b', s0)
        if m3:
            return int(m3.group(1))
        return np.nan

    df = df.copy()
    sizes = df[title_col].fillna("").apply(find_size_unit)
    df['item_unit_size_from_title'] = sizes.apply(lambda t: t[0])
    df['item_unit_type_from_title'] = sizes.apply(lambda t: t[1])

    df['pack_count_from_title'] = df[title_col].apply(find_pack_count)
    # infer pack_count using value_total if title missing and value_total available
    def infer_pack(r):
        if not pd.isna(r.get('pack_count_from_title')):
            return r['pack_count_from_title']
        if not pd.isna(r.get('value_total')) and not pd.isna(r.get('item_unit_size_from_title')) and r['item_unit_size_from_title']>0:
            approx = r['value_total'] / r['item_unit_size_from_title']
            if approx>0:
                # round to nearest integer if close
                if abs(round(approx)-approx) / max(approx,1e-6) < 0.05:
                    return int(round(approx))
        return np.nan

    df['pack_count_inferred'] = df.apply(infer_pack, axis=1)
    # final pack count prefer explicit title then inferred then NaN
    df['pack_count_text'] = df['pack_count_from_title'].fillna(df['pack_count_inferred'])

    # normalized unit field preference: title -> unit_field_raw -> unknown
    df['unit_normalized'] = df['item_unit_type_from_title']
    df.loc[df['unit_normalized']== "", 'unit_normalized'] = df['unit_field_raw'].str.lower().fillna("")
    return df

def step_3b_normalize_units(df):
    """
    Normalize all numeric size/value quantities to a common comparable unit.
    - Liquids → milliliters (ml)
    - Solids  → grams (g)
    - Counts  → units (pcs)
    """
    # Common unit conversion tables
    to_ml = {
        "fl oz": 29.5735,
        "ounce": 29.5735,
        "oz": 29.5735,
        "pint": 473.176,
        "quart": 946.353,
        "gallon": 3785.41,
        "ml": 1.0,
        "l": 1000.0,
        "liter": 1000.0
    }
    to_g = {
        "oz": 28.3495,
        "ounce": 28.3495,
        "lb": 453.592,
        "pound": 453.592,
        "g": 1.0,
        "gram": 1.0,
        "kg": 1000.0,
        "kilogram": 1000.0
    }
    to_units = {
        "count": 1.0,
        "piece": 1.0,
        "pack": 1.0,
        "pcs": 1.0,
        "unit": 1.0
    }

    def normalize_row(row):
        unit = str(row.get("unit_normalized", "")).lower().strip()
        value = row.get("item_unit_size_from_title") or row.get("value_total")
        if value is None or pd.isna(value):
            return np.nan, None
        
        # Match the right conversion table
        if unit in to_ml:
            return value * to_ml[unit], "ml"
        elif unit in to_g:
            return value * to_g[unit], "g"
        elif unit in to_units:
            return value * to_units[unit], "unit"
        else:
            return value, None  # unknown unit, keep as is

    df[["normalized_value", "normalized_unit"]] = df.apply(
        lambda r: pd.Series(normalize_row(r)), axis=1
    )

    # # Create price per normalized unit (e.g. per gram or per ml)
    # df["price_per_norm_unit"] = df.apply(
    #     lambda r: r["price"] / r["normalized_value"]
    #     if pd.notna(r["normalized_value"]) and r["normalized_value"] > 0
    #     else np.nan,
    #     axis=1
    # )
    return df


# ---------- Step 4: Lexical & readability features ----------
def step_4_lexical_features(df: pd.DataFrame, title_col: str = 'item_name_raw') -> pd.DataFrame:
    df = df.copy()
    def stats(s: str) -> Dict:
        s_clean = _clean_text(s)
        s_low = s_clean.lower()
        words = re.findall(r"\w+", s_clean)
        word_count = len(words)
        unique_word_count = len(set([w.lower() for w in words]))
        avg_word_len = np.mean([len(w) for w in words]) if words else 0
        digit_count = sum(ch.isdigit() for ch in s_clean)
        punct_count = sum(1 for ch in s_clean if ch in '.,:;()[]/+-&%$')
        uppercase_words = sum(1 for w in s_clean.split() if any(ch.isupper() for ch in w))
        return {
            'title_word_count': word_count,
            'title_unique_word_count': unique_word_count,
            'title_avg_word_len': avg_word_len,
            'title_digit_count': digit_count,
            'title_punct_count': punct_count,
            'title_uppercase_word_count': uppercase_words,
            'title_char_count': len(s_clean)
        }
    lex = df[title_col].fillna("").apply(stats).apply(pd.Series)
    return pd.concat([df.reset_index(drop=True), lex.reset_index(drop=True)], axis=1)

# ---------- Step 5: Bullets features ----------
def step_5_bullets_features(df: pd.DataFrame, bullets_col: str = 'bullets_list') -> pd.DataFrame:
    df = df.copy()
    def bullet_stats(bullets: List[str]):
        if not bullets:
            return {
                'bullet_count': 0,
                'bullets_combined': "",
                'bullets_word_count': 0,
                'bullets_num_digits': 0,
                'bullets_avg_len': 0
            }
        combined = " ".join(bullets)
        words = re.findall(r"\w+", combined)
        nums = sum(ch.isdigit() for ch in combined)
        lens = [len(b) for b in bullets]
        return {
            'bullet_count': len(bullets),
            'bullets_combined': combined,
            'bullets_word_count': len(words),
            'bullets_num_digits': nums,
            'bullets_avg_len': np.mean(lens) if lens else 0
        }
    bfeat = df[bullets_col].apply(bullet_stats).apply(pd.Series)
    return pd.concat([df.reset_index(drop=True), bfeat.reset_index(drop=True)], axis=1)

# ---------- Step 6: Keyword flags (meaningful features) ----------
KEYWORDS = [
    'premium','premium quality','organic','butter','real butter','trusted brand','single serve',
    'family pack','variety pack','classic','original','mild','spicy','creamy','hearty','easy to prepare',
    'zero grams','0 grams','trans fat','low sodium','gluten free','no sugar','sugar free','imported',
    'gourmet','artisan','artisanal','bulk','best seller','new','fresh'
]

def step_6_keyword_flags(df: pd.DataFrame, text_cols: List[str] = None) -> pd.DataFrame:
    """
    Create binary flags for presence of meaningful keywords across provided text columns
    (defaults to item_name_raw + bullets_combined + catalog_content)
    """
    df = df.copy()
    if text_cols is None:
        text_cols = ['item_name_raw', 'bullets_combined', 'catalog_content']

    # build a lowercase combined field for keyword search
    df['__text_search_field'] = df[text_cols].fillna("").agg(" ".join, axis=1).str.lower()

    for kw in KEYWORDS:
        colname = 'kw_' + re.sub(r'[^a-z0-9]+', '_', kw.strip().lower())
        df[colname] = df['__text_search_field'].str.contains(re.escape(kw), regex=True).astype(int)
    # also add a keyword richness score
    kw_cols = [c for c in df.columns if c.startswith('kw_')]
    df['kw_count'] = df[kw_cols].sum(axis=1)
    df.drop(columns=['__text_search_field'], inplace=True)
    return df

# ---------- Step 7: Optional TF-IDF (compact top-n components) ----------
def step_7_tfidf_topn(df: pd.DataFrame, text_col: str = 'item_name_raw', top_n: int = 1) -> pd.DataFrame:
    """
    Fit a small TF-IDF on titles and add top_n TF-IDF feature columns (tfidf_0 ... tfidf_{n-1}).
    This keeps the pipeline text-only but gives dense numeric summarization of the title.
    """
    df = df.copy()
    corpus = df[text_col].fillna("").astype(str).tolist()
    vect = TfidfVectorizer(ngram_range=(1,2), max_features=1000, stop_words='english')
    X = vect.fit_transform(corpus)
    # reduce to top_n dimensions by taking highest-variance columns (simple heuristic)
    # compute column variances
    import numpy as _np
    col_vars = _np.array(X.toarray()).var(axis=0)
    if X.shape[1] == 0:
        # no features produced
        for i in range(top_n):
            df[f'tfidf_{i}'] = 0.0
        return df
    idxs = col_vars.argsort()[::-1][:min(top_n, X.shape[1])]
    Xarr = X.toarray()[:, idxs]
    # insert into df as tfidf_0 ... tfidf_{k-1}
    for i in range(Xarr.shape[1]):
        df[f'tfidf_{i}'] = Xarr[:, i]
    # if less than top_n created, pad remaining with zeros
    for i in range(Xarr.shape[1], top_n):
        df[f'tfidf_{i}'] = 0.0
    return df



# # ---------- Run pipeline on sample data and show a compact preview ----------
# df_features = run_text_feature_pipeline(df_raw)

# # Display a subset of informative columns for preview
# cols_to_show = [
#     'sample_id','item_name_raw','brand_candidate','brand_candidate_15','brand_candidate_2words',
#     'item_unit_size_from_title','item_unit_type_from_title','pack_count_from_title','pack_count_inferred','pack_count_text','unit_normalized',
#     'title_word_count','title_char_count','title_unique_word_count','title_avg_word_len' if 'title_avg_word_len' in df_features.columns else 'title_avg_word_len',
#     'bullet_count','bullets_word_count','bullets_avg_len',
#     'kw_count'
# ]
# # filter only existing columns
# cols_to_show = [c for c in cols_to_show if c in df_features.columns]

# df_preview = df_features[cols_to_show].copy()
# df_preview.reset_index(drop=True, inplace=True)

# # Show the preview (this will be visible in the notebook output)
# df_preview



In [39]:
# def run_train_text_feature_pipeline(df):
#     df = step_1_parse_catalog(df)
#     df = step_2_brand_candidate(df)
#     df = step_3_extract_size_pack(df)
#     df = step_3b_normalize_units(df)   # ⬅️ add this line here
#     df = step_4_lexical_features(df)
#     df = step_5_bullets_features(df)
#     df = step_6_keyword_flags(df)
    
#     return df




In [40]:
for dname, dset in [("train", train_df_train), ("val", train_df_val), ("test", test_df)]:
    dset = step_1_parse_catalog(dset)
    dset = step_2_brand_candidate(dset)
    dset = step_3_extract_size_pack(dset)
    dset = step_3b_normalize_units(dset)
    dset = step_4_lexical_features(dset)
    dset = step_5_bullets_features(dset)
    dset = step_7_tfidf_topn(dset)
    
    if dname == "train":
        train_df_train = dset
    elif dname == "val":
        train_df_val = dset
    else:
        test_df = dset

# # # Compute brand-level maps from train
# brand_stats = train_df_train.groupby("brand_candidate")["price"].agg(["median"])
# brand_stats_dict = brand_stats.to_dict()  # nested dicts: {"mean": {...}, "median": {...}, ...}

# def populate_from_hashmaps(df, brand_stats_dict):
#     for stat_name, mapping in brand_stats_dict.items():
#         df[f"brand_price_{stat_name}"] = df["brand_candidate"].map(mapping)
#     return df

# train_df_train = populate_from_hashmaps(train_df_train, brand_stats_dict)
# train_df_val = populate_from_hashmaps(train_df_val, brand_stats_dict)


# for stat in ["mean", "median", "max", "min"]:
    # train_df_train[f"price_vs_brand_{stat}"] = train_df_train["price"] / train_df_train[f"brand_price_{stat}"]
    # train_df_val[f"price_vs_brand_{stat}"] = train_df_val["price"] / train_df_val[f"brand_price_{stat}"]

    # train_df_train[f"price_vs_brand_{stat}"].replace([np.inf, -np.inf], np.nan, inplace=True)
    # train_df_val[f"price_vs_brand_{stat}"].replace([np.inf, -np.inf], np.nan, inplace=True)

    # train_df_train[f"price_vs_brand_{stat}"].fillna(1.0, inplace=True)
    # train_df_val[f"price_vs_brand_{stat}"].fillna(1.0, inplace=True)

In [41]:
train_df_train.columns

Index(['sample_id', 'catalog_content', 'image_link', 'price', 'parsed_input',
       'hidden_0', 'hidden_1', 'hidden_2', 'hidden_3', 'hidden_4',
       ...
       'title_digit_count', 'title_punct_count', 'title_uppercase_word_count',
       'title_char_count', 'bullet_count', 'bullets_combined',
       'bullets_word_count', 'bullets_num_digits', 'bullets_avg_len',
       'tfidf_0'],
      dtype='object', length=801)

In [42]:
train_df_val.columns

Index(['sample_id', 'catalog_content', 'image_link', 'price', 'parsed_input',
       'hidden_0', 'hidden_1', 'hidden_2', 'hidden_3', 'hidden_4',
       ...
       'title_digit_count', 'title_punct_count', 'title_uppercase_word_count',
       'title_char_count', 'bullet_count', 'bullets_combined',
       'bullets_word_count', 'bullets_num_digits', 'bullets_avg_len',
       'tfidf_0'],
      dtype='object', length=801)

In [43]:
len(test_df)

75000

In [44]:
import numpy as np
import pandas as pd
import lightgbm as lgb

def smape(y_true, y_pred):
    """Calculate Symmetric Mean Absolute Percentage Error."""
    return 100 * np.mean(
        2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true) + 1e-8)
    )

# Prepare features
feature_cols = [
    c for c in train_df_train.columns
    if c not in ["sample_id", "price", "item_name_raw", "catalog_content", "image_link"]
    and train_df_train[c].dtype != "O"
]

print(f"Total features: {len(feature_cols)}")

# Prepare train data
X_train = train_df_train[feature_cols]
y_train = train_df_train["price"]

# Prepare validation data
X_val = train_df_val[feature_cols]
y_val = train_df_val["price"]

X_test = test_df[feature_cols]

# Transform target to log scale
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

# Create LightGBM datasets
train_ds = lgb.Dataset(X_train, label=y_train_log)
val_ds = lgb.Dataset(X_val, label=y_val_log, reference=train_ds)

# Parameters optimized to use MORE features
params = {
    "objective": "regression",
    "metric": "mape",
    "learning_rate": 0.03,  # Lower learning rate for more complex model
    "num_leaves": 64,  # Increased from 31 to handle more features
    "max_depth": 8,  # Control depth to avoid overfitting
    "min_child_samples": 10,  # Allow smaller leaf nodes
    
    # Feature usage parameters - KEY for using more features
    "feature_fraction": 0.5,  # Use ALL features (was 0.9)
    "feature_fraction_bynode": 0.5,  # Use 80% features per split
    
    # Bagging parameters
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    
    # Regularization to prevent overfitting with many features
    "reg_alpha": 0.1,  # L1 regularization
    "reg_lambda": 0.25,  # L2 regularization
    "min_gain_to_split": 0.01,  # Minimum gain to make split
    
    # Other params
    "seed": 42,
    "verbose": -1,
    "force_col_wise": True,  # Better for many features
}

# Train model with more rounds for complex model
model = lgb.train(
    params,
    train_ds,
    valid_sets=[train_ds, val_ds],
    valid_names=["train", "val"],
    num_boost_round=10000,  # Increased from 2000
    callbacks=[
        lgb.early_stopping(150),  # More patience
        lgb.log_evaluation(100)
    ]
)

# Predict on validation set
y_pred_log = model.predict(X_val, num_iteration=model.best_iteration)
y_pred = np.expm1(y_pred_log)

# Calculate SMAPE
score = smape(y_val.values, y_pred)
print(f"\nValidation SMAPE: {score:.2f}%")

# Analyze feature usage
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': model.feature_importance(importance_type='gain'),
    'split_count': model.feature_importance(importance_type='split')
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Count how many features are actually used
features_used = (feature_importance['split_count'] > 0).sum()
print(f"\nFeatures actually used: {features_used} out of {len(feature_cols)} ({100*features_used/len(feature_cols):.1f}%)")

print("\nTop 25 Features:")
print(feature_importance.head(25).to_string(index=False))

test_output = model.predict(X_test, num_iteration=model.best_iteration)
test_output = np.expm1(test_output)

output_df = test_df[['sample_id']]
output_df['price'] = test_output
output_df.to_csv("out.csv", index = False)

Total features: 786
Training until validation scores don't improve for 150 rounds
[100]	train's mape: 0.264284	val's mape: 0.270382
[200]	train's mape: 0.240728	val's mape: 0.259573
[300]	train's mape: 0.224946	val's mape: 0.254169
[400]	train's mape: 0.212628	val's mape: 0.25077
[500]	train's mape: 0.201953	val's mape: 0.248248
[600]	train's mape: 0.192491	val's mape: 0.245781
[700]	train's mape: 0.184021	val's mape: 0.244723
[800]	train's mape: 0.176285	val's mape: 0.242871
[900]	train's mape: 0.169191	val's mape: 0.242253
[1000]	train's mape: 0.162588	val's mape: 0.240906
[1100]	train's mape: 0.156335	val's mape: 0.240204
[1200]	train's mape: 0.15044	val's mape: 0.23914
[1300]	train's mape: 0.144878	val's mape: 0.238504
[1400]	train's mape: 0.139691	val's mape: 0.237708
[1500]	train's mape: 0.13477	val's mape: 0.237154
[1600]	train's mape: 0.129955	val's mape: 0.23643
[1700]	train's mape: 0.125461	val's mape: 0.235499
[1800]	train's mape: 0.121236	val's mape: 0.234908
[1900]	train's

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_df['price'] = test_output


In [45]:
# len(test_output)

In [46]:
# Get top 25 features
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': model.feature_importance(importance_type='gain')
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

print("\nTop 25 Features:")
print(feature_importance.head(25).to_string(index=False))

# Get list of top 25 feature names
top_25_features = feature_importance.head(25)['feature'].tolist()
print(f"\nTop 25 feature names: {top_25_features}")


Top 25 Features:
                   feature   importance
          normalized_value 33545.027619
       pack_count_inferred 31134.526273
           pack_count_text 27818.681184
         title_digit_count 20598.956626
               value_total 17198.951239
          title_char_count 12843.159570
     pack_count_from_title 10584.128904
 item_unit_size_from_title  8576.321956
                hidden_262  8456.419174
                   tfidf_0  7913.744757
          title_word_count  7269.733815
                 hidden_36  7076.022996
           bullets_avg_len  6080.119144
                hidden_384  5399.800886
                hidden_151  5129.810827
        bullets_word_count  4782.037857
        title_avg_word_len  4735.310014
title_uppercase_word_count  4666.063877
         title_punct_count  4626.901746
                hidden_250  4426.859081
                hidden_750  4061.903189
                hidden_341  3664.780169
   title_unique_word_count  3475.428866
        bullets_num_di

In [16]:
# %pip install optuna

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna
from optuna.samplers import TPESampler

def smape(y_true, y_pred):
    """Calculate Symmetric Mean Absolute Percentage Error."""
    return 100 * np.mean(
        2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true) + 1e-8)
    )

# Prepare features
feature_cols = [
    c for c in train_df_train.columns
    if c not in ["sample_id", "price", "item_name_raw", "catalog_content", "image_link"]
    and train_df_train[c].dtype != "O"
]
print(f"Total features: {len(feature_cols)}")

# Prepare train data
X_train = train_df_train[feature_cols]
y_train = train_df_train["price"]

# Prepare validation data
X_val = train_df_val[feature_cols]
y_val = train_df_val["price"]

# Transform target to log scale
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

# Create LightGBM datasets
train_ds = lgb.Dataset(X_train, label=y_train_log)
val_ds = lgb.Dataset(X_val, label=y_val_log, reference=train_ds)


def objective(trial):
    """Optuna objective function for hyperparameter tuning."""
    
    # Suggest objective function
    objective_type = trial.suggest_categorical("objective", [
        "regression",      # L2 loss (MSE)
        "regression_l1",   # L1 loss (MAE) - robust to outliers
        "huber",          # Huber loss - combines L1 and L2
        "fair",           # Fair loss - another robust alternative
        "poisson",        # Poisson regression - for count-like positive data
        "quantile",       # Quantile regression
        "mape",           # Mean Absolute Percentage Error
    ])
    
    # Suggest hyperparameters
    params = {
        "objective": objective_type,
        "metric": "mae",
        "verbosity": -1,
        "seed": 42,
        "force_col_wise": True,
        "feature_pre_filter": False,  # Required for dynamic min_child_samples
        
        # Learning parameters
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 31, 127),
        "max_depth": trial.suggest_int("max_depth", 5, 12),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 50),
        "min_child_weight": trial.suggest_float("min_child_weight", 1e-3, 10.0, log=True),
        
        # Feature parameters
        "feature_fraction": trial.suggest_float("feature_fraction", 0.7, 1.0),
        "feature_fraction_bynode": trial.suggest_float("feature_fraction_bynode", 0.7, 1.0),
        
        # Bagging parameters
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        
        # Regularization
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0.0, 0.1),
        
        # Additional parameters
        "path_smooth": trial.suggest_float("path_smooth", 0.0, 1.0),
    }
    
    # Train model
    model = lgb.train(
        params,
        train_ds,
        valid_sets=[val_ds],
        valid_names=["val"],
        num_boost_round=2000,
        callbacks=[
            lgb.early_stopping(100),
        ]
    )
    
    # Predict and calculate SMAPE
    y_pred_log = model.predict(X_val, num_iteration=model.best_iteration)
    y_pred = np.expm1(y_pred_log)
    score = smape(y_val.values, y_pred)
    
    return score


# Run Optuna optimization
print("\n" + "="*80)
print("Starting Optuna Hyperparameter Tuning")
print("="*80 + "\n")

study = optuna.create_study(
    direction="minimize",
    sampler=TPESampler(seed=42),
    study_name="lgb_price_prediction"
)

study.optimize(
    objective,
    n_trials=100,  # Adjust number of trials as needed
    timeout=None,
    show_progress_bar=True,
    n_jobs=1  # Set to -1 for parallel execution if you have multiple cores
)

print("\n" + "="*80)
print("Optimization Complete!")
print("="*80)
print(f"\nBest SMAPE Score: {study.best_value:.2f}%")
print(f"Best iteration: {study.best_trial.number}")
print("\nBest Parameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

# Train final model with best parameters
print("\n" + "="*80)
print("Training Final Model with Best Parameters")
print("="*80 + "\n")

best_params = study.best_params.copy()
best_params.update({
    "objective": "regression",
    "metric": "mae",
    "seed": 42,
    "verbose": -1,
    "force_col_wise": True,
    "feature_pre_filter": False,  # Required for dynamic min_child_samples
})

final_model = lgb.train(
    best_params,
    train_ds,
    valid_sets=[train_ds, val_ds],
    valid_names=["train", "val"],
    num_boost_round=3000,
    callbacks=[
        lgb.early_stopping(150),
        lgb.log_evaluation(100)
    ]
)

# Final predictions
y_pred_log = final_model.predict(X_val, num_iteration=final_model.best_iteration)
y_pred = np.expm1(y_pred_log)

# Calculate final SMAPE
final_score = smape(y_val.values, y_pred)
print(f"\n{'='*80}")
print(f"Final Validation SMAPE: {final_score:.2f}%")
print(f"{'='*80}\n")

# Analyze feature usage
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': final_model.feature_importance(importance_type='gain'),
    'split_count': final_model.feature_importance(importance_type='split')
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

features_used = (feature_importance['split_count'] > 0).sum()
print(f"Features actually used: {features_used} out of {len(feature_cols)} ({100*features_used/len(feature_cols):.1f}%)")

print("\nTop 25 Features:")
print(feature_importance.head(25).to_string(index=False))

# Save best parameters to file (optional)
import json
with open('best_lgb_params.json', 'w') as f:
    json.dump(best_params, f, indent=4)
print("\n✅ Best parameters saved to 'best_lgb_params.json'")

# Optional: Plot optimization history
try:
    import matplotlib.pyplot as plt
    
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    
    # Plot optimization history
    optuna.visualization.matplotlib.plot_optimization_history(study, ax=axes[0])
    axes[0].set_title('Optimization History')
    
    # Plot parameter importances
    optuna.visualization.matplotlib.plot_param_importances(study, ax=axes[1])
    axes[1].set_title('Parameter Importances')
    
    plt.tight_layout()
    plt.savefig('optuna_results.png', dpi=150, bbox_inches='tight')
    print("✅ Optimization plots saved to 'optuna_results.png'")
    plt.show()
except ImportError:
    print("\n⚠️  Install matplotlib for visualization: pip install matplotlib")

[I 2025-10-11 15:47:36,351] A new study created in memory with name: lgb_price_prediction


Total features: 785

Starting Optuna Hyperparameter Tuning



  0%|          | 0/100 [00:00<?, ?it/s]

Training until validation scores don't improve for 100 rounds


In [14]:
# Get top 25 features
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': model.feature_importance(importance_type='gain')
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

print("\nTop 25 Features:")
print(feature_importance.head(25).to_string(index=False))

# Get list of top 25 feature names
top_25_features = feature_importance.head(25)['feature'].tolist()
print(f"\nTop 25 feature names: {top_25_features}")


Top 25 Features:
                   feature   importance
        brand_price_median 97383.570609
          brand_price_mean 72230.582977
          normalized_value 28725.112673
       price_per_norm_unit 23533.447547
           brand_price_min  7346.459112
               value_total  4002.426605
           brand_price_max  3050.919962
       pack_count_inferred  1270.831382
        title_avg_word_len  1110.407192
          title_char_count  1001.989447
 item_unit_size_from_title   940.833961
     pack_count_from_title   743.565365
        bullets_word_count   639.325641
           bullets_avg_len   529.777716
         title_digit_count   500.485639
         title_punct_count   418.364085
   title_unique_word_count   391.537754
title_uppercase_word_count   362.827658
        bullets_num_digits   345.826939
          title_word_count   301.698374
              bullet_count   164.361400
           pack_count_text    60.898665

Top 25 feature names: ['brand_price_median', 'brand_price_mea

In [29]:
import numpy as np
import pandas as pd
import lightgbm as lgb
# import optuna
from typing import Dict, Tuple, List


# ============================================================================
# METRICS
# ============================================================================

def smape(y_true, y_pred):
    """Calculate Symmetric Mean Absolute Percentage Error."""
    return 100 * np.mean(
        2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true) + 1e-8)
    )


# ============================================================================
# DATA PREPARATION
# ============================================================================

def prepare_features(df, target_col="price", exclude_cols=None):
    """
    Extract features and target from dataframe.
    
    Args:
        df: Input dataframe
        target_col: Name of target column
        exclude_cols: Columns to exclude from features
        
    Returns:
        X (DataFrame), y (Series), feature_cols (list)
    """
    if exclude_cols is None:
        exclude_cols = ["sample_id", "price", "item_name_raw"]
    
    # Select only numeric features
    feature_cols = [
        c for c in df.columns
        if c not in exclude_cols and df[c].dtype != "O"
    ]
    
    X = df[feature_cols].copy()
    y = df[target_col].copy()
    
    return X, y, feature_cols


def transform_target(y, use_log=True):
    """Apply log transformation to target."""
    if use_log:
        return np.log1p(y)
    return y


def inverse_transform_target(y, use_log=True):
    """Inverse transform predictions back to original scale."""
    if use_log:
        return np.expm1(y)
    return y


# ============================================================================
# MODEL TRAINING
# ============================================================================

def get_default_params():
    """Get default LightGBM parameters."""
    return {
        "objective": "regression",
        "metric": "mape",
        "learning_rate": 0.05,
        "num_leaves": 31,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.8,
        "bagging_freq": 5,
        "min_child_samples": 20,
        "reg_alpha": 0.0,
        "reg_lambda": 0.0,
        "seed": 42,
        "verbose": -1,
    }


def train_model(
    X_train, 
    y_train, 
    X_val, 
    y_val, 
    params=None,
    num_boost_round=2000,
    early_stopping_rounds=100,
    verbose_eval=100,
    use_log_target=True
):
    """
    Train LightGBM model.
    
    Args:
        X_train, y_train: Training data
        X_val, y_val: Validation data
        params: LightGBM parameters (uses defaults if None)
        num_boost_round: Maximum number of boosting rounds
        early_stopping_rounds: Early stopping rounds
        verbose_eval: Verbose evaluation frequency
        use_log_target: Whether to use log-transformed target
        
    Returns:
        trained model, training history
    """
    # Transform targets if needed
    y_train_transformed = transform_target(y_train, use_log_target)
    y_val_transformed = transform_target(y_val, use_log_target)
    
    # Use default params if not provided
    if params is None:
        params = get_default_params()
    
    # Create datasets
    train_ds = lgb.Dataset(X_train, label=y_train_transformed)
    val_ds = lgb.Dataset(X_val, label=y_val_transformed, reference=train_ds)
    
    # Train model
    callbacks = []
    if early_stopping_rounds:
        callbacks.append(lgb.early_stopping(early_stopping_rounds))
    if verbose_eval:
        callbacks.append(lgb.log_evaluation(verbose_eval))
    
    model = lgb.train(
        params,
        train_ds,
        valid_sets=[train_ds, val_ds],
        valid_names=["train", "val"],
        num_boost_round=num_boost_round,
        callbacks=callbacks,
    )
    
    return model


def predict(model, X, use_log_target=True):
    """Make predictions and inverse transform if needed."""
    y_pred = model.predict(X, num_iteration=model.best_iteration)
    return inverse_transform_target(y_pred, use_log_target)


def evaluate_model(model, X_val, y_val, use_log_target=True):
    """Evaluate model and return SMAPE score."""
    y_pred = predict(model, X_val, use_log_target)
    score = smape(y_val.values, y_pred)
    return score

In [30]:
def run_pipeline(
    train_df_train,
    train_df_val,
    use_optuna=False,
    n_trials=50,
    custom_params=None
):
    """
    Complete training pipeline.
    
    Args:
        train_df_train: Training dataframe
        train_df_val: Validation dataframe
        use_optuna: Whether to use Optuna for hyperparameter tuning
        n_trials: Number of Optuna trials (if use_optuna=True)
        custom_params: Custom LightGBM parameters (if not using Optuna)
        
    Returns:
        model, validation_score, best_params
    """
    # Prepare data
    X_train, y_train, feature_cols = prepare_features(train_df_train)
    X_val, y_val, _ = prepare_features(train_df_val)
    
    print(f"Training with {len(feature_cols)} features")
    print(f"Train size: {len(X_train)}, Val size: {len(X_val)}")
    
    # Hyperparameter tuning
    if use_optuna:
        print("\n=== Starting Optuna Hyperparameter Tuning ===")
        best_params, study = tune_hyperparameters(
            X_train, y_train, X_val, y_val,
            n_trials=n_trials,
            use_log_target=True
        )
        # Add fixed params
        best_params.update({
            "objective": "regression",
            "metric": "mae",
            "seed": 42,
            "verbose": -1
        })
    else:
        best_params = custom_params if custom_params else get_default_params()
    
    # Train final model
    print("\n=== Training Final Model ===")
    model = train_model(
        X_train, y_train, X_val, y_val,
        params=best_params,
        num_boost_round=2000,
        early_stopping_rounds=100,
        verbose_eval=100,
        use_log_target=True
    )
    
    # Evaluate
    val_score = evaluate_model(model, X_val, y_val, use_log_target=True)
    print(f"\n=== Final Validation SMAPE: {val_score:.2f}% ===")
    
    return model, val_score, best_params



In [31]:
  # # Example 1: Train with default parameters
print("Example 1: Default parameters")
model, score, params = run_pipeline(
    train_df_train,
    train_df_val,
    use_optuna=False
)

Example 1: Default parameters
Training with 27 features
Train size: 15000, Val size: 15000

=== Training Final Model ===
Training until validation scores don't improve for 100 rounds
[100]	train's mape: 0.00401511	val's mape: 0.00401511
[200]	train's mape: 0.00270892	val's mape: 0.00270892
[300]	train's mape: 0.00244483	val's mape: 0.00244483
[400]	train's mape: 0.00227862	val's mape: 0.00227862
[500]	train's mape: 0.0021398	val's mape: 0.0021398
[600]	train's mape: 0.00202009	val's mape: 0.00202009
[700]	train's mape: 0.0019298	val's mape: 0.0019298
[800]	train's mape: 0.00183392	val's mape: 0.00183392
[900]	train's mape: 0.00176103	val's mape: 0.00176103
[1000]	train's mape: 0.00168378	val's mape: 0.00168378
[1100]	train's mape: 0.00162758	val's mape: 0.00162758
[1200]	train's mape: 0.00156752	val's mape: 0.00156752
[1300]	train's mape: 0.00151451	val's mape: 0.00151451
[1400]	train's mape: 0.0014663	val's mape: 0.0014663
[1500]	train's mape: 0.0014133	val's mape: 0.0014133
[1600]	tr

In [15]:
train_df_train.columns

Index(['sample_id', 'catalog_content', 'image_link', 'price', 'item_name_raw',
       'bullets_list', 'value_total', 'unit_field_raw', 'brand_candidate_15',
       'brand_candidate_2words', 'brand_candidate',
       'item_unit_size_from_title', 'item_unit_type_from_title',
       'pack_count_from_title', 'pack_count_inferred', 'pack_count_text',
       'unit_normalized', 'normalized_value', 'normalized_unit',
       'price_per_norm_unit', 'title_word_count', 'title_unique_word_count',
       'title_avg_word_len', 'title_digit_count', 'title_punct_count',
       'title_uppercase_word_count', 'title_char_count', 'bullet_count',
       'bullets_combined', 'bullets_word_count', 'bullets_num_digits',
       'bullets_avg_len', 'brand_price_mean', 'brand_price_median',
       'brand_price_max', 'brand_price_min', 'price_vs_brand_mean',
       'price_vs_brand_median', 'price_vs_brand_max', 'price_vs_brand_min'],
      dtype='object')

In [16]:
def smape(y_true, y_pred):
    return 100 * np.mean(
        2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true) + 1e-8)
    )

In [18]:
train_df_train["log_price"] = np.log1p(train_df_train["price"])
train_df_val["log_price"] = np.log1p(train_df_val["price"])

In [19]:
feature_cols = [
    c for c in train_df_train.columns
    if c not in ["sample_id", "price", "item_name_raw"]
    and train_df_train[c].dtype != "O"  # exclude non-numeric
]
X_train, y_train = train_df_train[feature_cols], train_df["price"]
X_val, y_val = train_df_val[feature_cols], train_df_val["price"]


In [22]:
# %pip install lightgbm

In [24]:
import lightgbm as lgb

train_ds = lgb.Dataset(X_train, label=y_train)
val_ds = lgb.Dataset(X_val, label=y_val, reference=train_ds)

params = {
    "objective": "regression_l1",  # robust loss
    "metric": "mae",
    "learning_rate": 0.05,
    "num_leaves": 31,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "seed": 42,
    "verbose": -1,
}

model = lgb.train(
    params,
    train_ds,
    valid_sets=[train_ds, val_ds],
    valid_names=["train", "val"],
    num_boost_round=2000,
    # early_stopping_rounds=100,
    # verbose_eval=100,
)

TypeError: train() got an unexpected keyword argument 'verbose_eval'

In [None]:
y_pred = np.expm1(model.predict(X_val, num_iteration=model.best_iteration))
score = smape(y_val.values, y_pred)
print(f"Validation SMAPE: {score:.2f}%")


In [None]:
lgb.plot_importance(model, max_num_features=25, importance_type="gain")