In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix, save_npz
import re
import joblib

# --- Utils ---
def extract_ipq_and_text(text):
    if pd.isna(text): 
        return "", 1.0
    s = str(text)
    ipq = 1.0
    m = re.search(r'(\d+)\s*[-x×]*\s*(pack|pcs|pieces|count|units)\b', s, flags=re.I)
    if not m:
        m = re.search(r'pack of\s*(\d+)', s, flags=re.I)
    if not m:
        m = re.search(r'(\d+)\s*[x×]\s*\d+', s)
    if m:
        try:
            ipq = float(m.group(1))
            if ipq <= 0: ipq = 1.0
        except:
            ipq = 1.0
    text_clean = re.sub(r'http\S+','', s)
    text_clean = re.sub(r'[\r\n]+',' ', text_clean)
    return text_clean, ipq

# --- Load data ---
train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')

# Preprocess and extract text + numeric features
for df in (train, test):
    df['catalog_content'] = df['catalog_content'].fillna('').astype(str)
    out = df['catalog_content'].apply(lambda t: extract_ipq_and_text(t))
    df['catalog_text'] = out.apply(lambda x: x[0])
    df['ipq'] = out.apply(lambda x: x[1])
    df['text_len'] = df['catalog_text'].str.len().fillna(0)
    df['word_count'] = df['catalog_text'].str.split().apply(lambda x: len(x) if isinstance(x, list) else 0)

# --- TF-IDF ---
tfidf = TfidfVectorizer(max_features=50000, ngram_range=(1,2), min_df=3)
tfidf.fit(train['catalog_text'])

# Transform train + test
X_train_text = tfidf.transform(train['catalog_text'])
X_test_text  = tfidf.transform(test['catalog_text'])

# Combine with numeric features
num_cols = ['ipq', 'text_len', 'word_count']
X_train_num = csr_matrix(train[num_cols].values)
X_test_num  = csr_matrix(test[num_cols].values)

X_train = hstack([X_train_text, X_train_num]).tocsr()
X_test  = hstack([X_test_text, X_test_num]).tocsr()

print("TF-IDF features ready!")
print("Train shape:", X_train.shape)
print("Test shape :", X_test.shape)

# --- Save features ---
save_npz('X_train.npz', X_train)
save_npz('X_test.npz', X_test)
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')
print("Saved X_train.npz, X_test.npz and tfidf_vectorizer.pkl")


TF-IDF features ready!
Train shape: (75000, 50003)
Test shape : (75000, 50003)
Saved X_train.npz, X_test.npz and tfidf_vectorizer.pkl


In [6]:
# ===============================
# Smart Product Pricing Challenge
# TF-IDF + Numeric Features + LightGBM Regression
# ===============================

import numpy as np
import pandas as pd
from scipy.sparse import load_npz, hstack, csr_matrix
from sklearn.model_selection import KFold
import lightgbm as lgb
import joblib

# -------------------------------
# SMAPE function
# -------------------------------
def smape(y_true, y_pred):
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    denom[denom == 0] = 1e-9
    return np.mean(np.abs(y_pred - y_true) / denom) * 100

# -------------------------------
# Load features
# -------------------------------
X_train = load_npz('X_train.npz')  # saved TF-IDF + numeric features
X_test  = load_npz('X_test.npz')

train = pd.read_csv('train.csv')
test  = pd.read_csv('test.csv')

# Target transform
train['price'] = train['price'].clip(lower=0.01)
y = np.log1p(train['price'].values)  # log-transform for stability

# -------------------------------
# LightGBM parameters
# -------------------------------
lgb_params = {
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'metric': 'l2',
    'learning_rate': 0.05,
    'num_leaves': 127,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.8,
    'bagging_freq': 3,
    'seed': 42,
    'verbose': -1,
    'n_jobs': 8
}

# -------------------------------
# Cross-validation
# -------------------------------
# Cross-validation loop (old LightGBM compatible)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof = np.zeros(len(train))
preds = np.zeros(len(test))

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"\n===== Fold {fold+1} =====")
    
    X_tr, X_val = X_train[tr_idx], X_train[val_idx]
    y_tr, y_val = y[tr_idx], y[val_idx]

    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dval   = lgb.Dataset(X_val, label=y_val)
    
    model = lgb.train(
        lgb_params,
        dtrain,
        num_boost_round=1000,           # adjust if needed
        valid_sets=[dtrain, dval]
        # early_stopping_rounds and verbose_eval removed
    )
    
    oof[val_idx] = model.predict(X_val)
    preds += model.predict(X_test) / kf.n_splits
    joblib.dump(model, f'lgb_fold{fold+1}.pkl')


# -------------------------------
# Evaluate OOF
# -------------------------------
oof_price = np.expm1(oof)
train_price = train['price'].values
print("\nOOF SMAPE:", smape(train_price, oof_price))

# -------------------------------
# Prepare submission
# -------------------------------
pred_price = np.expm1(preds)
pred_price = np.maximum(pred_price, 0.01)  # enforce positive prices

submission = pd.DataFrame({
    'sample_id': test['sample_id'].values,
    'price': pred_price
})

submission.to_csv('test_out.csv', index=False)
print("Saved test_out.csv ✅")



===== Fold 1 =====

===== Fold 2 =====

===== Fold 3 =====

===== Fold 4 =====

===== Fold 5 =====

OOF SMAPE: 50.439717822023766
Saved test_out.csv ✅


In [7]:
import numpy as np

def smape(y_true, y_pred):
    """
    Calculate Symmetric Mean Absolute Percentage Error (SMAPE)
    Formula: SMAPE = 100% * (1/n) * Σ( |y_pred - y_true| / ((|y_true| + |y_pred|) / 2) )
    """
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    diff = np.abs(y_pred - y_true)
    smape_value = np.mean(diff / denominator) * 100
    return smape_value


In [8]:
oof[val_idx] = model.predict(X_val)
fold_smape = smape(y_val, oof[val_idx])
print(f"Fold {fold+1} SMAPE: {fold_smape:.4f}%")


Fold 5 SMAPE: 20.0611%


In [9]:
overall_smape = smape(y, oof)
print(f"Overall SMAPE: {overall_smape:.4f}%")


Overall SMAPE: 20.0280%


In [13]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
import numpy as np
from scipy.sparse import load_npz

# --- Load prepared features ---
X_train = load_npz('X_train.npz')
X_test = load_npz('X_test.npz')
tfidf = joblib.load('tfidf_vectorizer.pkl')

# Target variable
y_train = train['price']


# --- Train a regression model (Ridge works well for TF-IDF) ---
model = Ridge(alpha=2.0, random_state=42)
model.fit(X_train, y_train)

# --- Predict ---
y_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# --- Evaluate using SMAPE ---
def smape(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0
    return np.mean(diff) * 100

smape_value = smape(y_train, y_pred)
overall_score = 100 - smape_value

print(f"SMAPE on training data: {smape_value:.4f}")
print(f"Leaderboard-style score: {overall_score:.4f}")



SMAPE on training data: 79.0590
Leaderboard-style score: 20.9410
