In [None]:
!pip install lightgbm scikit-learn pandas numpy
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb



In [None]:
train = pd.read_csv("/content/Dataset/train.csv")
test = pd.read_csv("/content/Dataset/test.csv")

train["catalog_content"] = train["catalog_content"].fillna("")
test["catalog_content"]  = test["catalog_content"].fillna("")
y = np.log1p(train["price"])

In [None]:
import re

def extract_ipq(text):
    match = re.search(r'(\d+)\s?(pack|pcs|pieces|units|count|x|ml|g|kg)', text.lower())
    if match:
        return float(match.group(1))
    return 1.0

train["IPQ"] = train["catalog_content"].apply(extract_ipq)
test["IPQ"]  = test["catalog_content"].apply(extract_ipq)

train["text_length"] = train["catalog_content"].apply(len)
test["text_length"]  = test["catalog_content"].apply(len)

In [None]:
from scipy import sparse

# Word-level TF-IDF
tfidf_word = TfidfVectorizer(max_features=20000, ngram_range=(1,2), stop_words='english')
X_word = tfidf_word.fit_transform(train["catalog_content"])
X_word_test = tfidf_word.transform(test["catalog_content"])

# Char-level TF-IDF
tfidf_char = TfidfVectorizer(max_features=10000, analyzer='char', ngram_range=(3,5))
X_char = tfidf_char.fit_transform(train["catalog_content"])
X_char_test = tfidf_char.transform(test["catalog_content"])

# Combine both
X_text = sparse.hstack([X_word, X_char])
X_test_text = sparse.hstack([X_word_test, X_char_test])

In [6]:
svd = TruncatedSVD(n_components=200, random_state=42)
X_reduced = svd.fit_transform(X_text)
X_test_reduced = svd.transform(X_test_text)

num_features = np.log1p(train[["IPQ", "text_length"]].values)
num_features_test = np.log1p(test[["IPQ", "text_length"]].values)

X_final = np.hstack([X_reduced, num_features])
X_test_final = np.hstack([X_test_reduced, num_features_test])

In [10]:
# 1️⃣ Clean missing target values
train = train.dropna(subset=["price"]).reset_index(drop=True)
y = np.log1p(train["price"])

# 2️⃣ Extract numeric features again (after dropping NaNs)
train["IPQ"] = train["catalog_content"].apply(extract_ipq)
train["text_length"] = train["catalog_content"].apply(len)
test["IPQ"] = test["catalog_content"].apply(extract_ipq)
test["text_length"] = test["catalog_content"].apply(len)

# 3️⃣ Rebuild TF-IDF features
vectorizer = TfidfVectorizer(max_features=30000, stop_words='english')
X_text = vectorizer.fit_transform(train["catalog_content"])
X_test_text = vectorizer.transform(test["catalog_content"])

# 4️⃣ Rebuild numeric features
from scipy import sparse
num_features = np.log1p(train[["IPQ", "text_length"]].values)
num_features_test = np.log1p(test[["IPQ", "text_length"]].values)
num_features_sparse = sparse.csr_matrix(num_features)
num_features_test_sparse = sparse.csr_matrix(num_features_test)

# 5️⃣ Combine text + numeric
X_final = sparse.hstack([X_text, num_features_sparse])
X_test_final = sparse.hstack([X_test_text, num_features_test_sparse])

# 6️⃣ Split
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(
    X_final, y, test_size=0.2, random_state=42
)

# 7️⃣ Model
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

model = LGBMRegressor(
    objective='regression',
    learning_rate=0.03,
    num_leaves=256,
    n_estimators=5000,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.2,
    reg_lambda=0.3,
    min_child_weight=5,
    random_state=42
)

model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric='mae',
    callbacks=[early_stopping(stopping_rounds=200), log_evaluation(period=200)]
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.554543 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 215877
[LightGBM] [Info] Number of data points in the train set: 15403, number of used features: 4431
[LightGBM] [Info] Start training from score 2.739532
Training until validation scores don't improve for 200 rounds
[200]	valid_0's l1: 0.555775	valid_0's l2: 0.525067
Early stopping, best iteration is:
[179]	valid_0's l1: 0.556505	valid_0's l2: 0.524945


In [11]:
def smape(y_true, y_pred):
    return 100/len(y_true) * np.sum(
        np.abs(y_pred - y_true) / ((np.abs(y_true) + np.abs(y_pred))/2)
    )

val_pred = np.expm1(model.predict(X_valid))
print("Validation SMAPE:", smape(np.expm1(y_valid), val_pred))



Validation SMAPE: 55.69119871633999


In [12]:
pred_test = np.expm1(model.predict(X_test_final))
pred_test = np.maximum(pred_test, 0)

submission = pd.DataFrame({
    "sample_id": test["sample_id"],
    "price": pred_test
})

submission.to_csv("test_out.csv", index=False)
print("✅ Submission file created: test_out.csv")
submission.head()

✅ Submission file created: test_out.csv


Unnamed: 0,sample_id,price
0,100179,13.946297
1,245611,14.725736
2,146263,23.582529
3,95658,21.752908
4,36806,17.394546
