<a href="https://colab.research.google.com/github/Santhosh261005/Machine-Learning/blob/main/ML_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [1]:
!nvidia-smi


Sun Oct 12 05:37:59 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   62C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
import pandas as pd

train_path = "/content/drive/MyDrive/train.csv"
test_path = "/content/drive/MyDrive/test.csv"

train = pd.read_csv(
    train_path,
    engine='python',
    quotechar='"',
    encoding='utf-8',
    on_bad_lines='skip'  # skips malformed rows safely
)

test = pd.read_csv(
    test_path,
    engine='python',
    quotechar='"',
    encoding='utf-8',
    on_bad_lines='skip'
)


print("Train shape:", train.shape)  # should be (75000, 4)

print("Test shape:", test.shape)  # should be (75000, 4)

Train shape: (75000, 4)
Test shape: (75000, 3)


In [None]:
# ===============================
# 1️⃣ Import libraries
# ===============================
import pandas as pd
import numpy as np
import re, html
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge

# ===============================
# 2️⃣ Load train and test CSVs safely
# ===============================
train_path = "/content/drive/MyDrive/train.csv"
test_path = "/content/drive/MyDrive/test.csv"

train = pd.read_csv(
    train_path,
    engine='python',
    quotechar='"',
    encoding='utf-8',
    on_bad_lines='skip'  # skips malformed rows safely
)

test = pd.read_csv(
    test_path,
    engine='python',
    quotechar='"',
    encoding='utf-8',
    on_bad_lines='skip'
)


print("Train shape:", train.shape)  # should be (75000, 4)

print("Test shape:", test.shape)  # should be (75000, 4)

# ===============================
# 3️⃣ Text cleaning function
# ===============================
def clean_text(s):
    if pd.isna(s):
        return ""
    s = str(s)
    s = html.unescape(s)
    s = re.sub(r'<[^>]+>', ' ', s)
    s = s.lower()
    s = re.sub(r'[^a-z0-9\s\.\-\_x]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

for df in [train, test]:
    df['catalog_clean'] = df['catalog_content'].apply(clean_text)

# ===============================
# 4️⃣ Numeric features
# ===============================
def extract_ipq(text):
    if pd.isna(text):
        return 1
    patterns = [
        r'(\d+)\s*(?:pack|packs|pk|pk\.)\b',
        r'pack of (\d+)',
        r'(\d+)\s*(?:pcs|pieces|ct|count)\b',
        r'(\d+)\s*x\b',
        r'(\d+)-pack\b',
        r'(\d+)p\b'
    ]
    for p in patterns:
        m = re.search(p, text, flags=re.I)
        if m:
            try:
                val = int(m.group(1))
                if val > 0 and val < 1000:
                    return val
            except:
                pass
    return 1

for df in [train, test]:
    df['text_len'] = df['catalog_clean'].str.len()
    df['num_words'] = df['catalog_clean'].str.split().map(len)
    df['num_digits'] = df['catalog_clean'].str.count(r'\d+')
    df['ipq'] = df['catalog_clean'].apply(extract_ipq)

# ===============================
# 5️⃣ Keep only numeric prices
# ===============================
def is_float(x):
    try:
        float(x)
        return True
    except:
        return False

train = train[train['price'].apply(is_float)].copy()
train['price'] = train['price'].astype(float)
print("Cleaned train shape:", train.shape)

# ===============================
# 6️⃣ TF-IDF + SVD
# ===============================
all_text = pd.concat([train['catalog_clean'], test['catalog_clean']])
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2), min_df=3)
X_all_tfidf = tfidf.fit_transform(all_text)

svd = TruncatedSVD(n_components=100, random_state=42)
X_all_svd = svd.fit_transform(X_all_tfidf)

numeric_cols = ['text_len','num_words','num_digits','ipq']
X_numeric_train = train[numeric_cols].values
X_numeric_test  = test[numeric_cols].values

X_train_svd = X_all_svd[:len(train)]
X_test_svd  = X_all_svd[len(train):]

X_train_final = np.hstack([X_train_svd, X_numeric_train])
X_test_final  = np.hstack([X_test_svd, X_numeric_test])

y = np.log1p(train['price'].values)

# ===============================
# 7️⃣ Train/Validation split
# ===============================
price_bins = pd.qcut(train['price'], q=10, labels=False, duplicates='drop')
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train_final, y, test_size=0.2, random_state=42, stratify=price_bins
)

# ===============================
# 8️⃣ Define SMAPE
# ===============================
def smape(y_true, y_pred, eps=1e-8):
    a = np.array(y_true)
    p = np.array(y_pred)
    num = np.abs(p - a)
    denom = (np.abs(a) + np.abs(p)) / 2.0
    denom = np.maximum(denom, eps)
    return np.mean(num / denom) * 100

# ===============================
# 9️⃣ Train Ridge Regression
# ===============================
model = Ridge(alpha=1.0, random_state=42)
model.fit(X_tr, y_tr)

val_pred_log = model.predict(X_val)
val_pred = np.expm1(val_pred_log)
y_val_orig = np.expm1(y_val)

print("Validation SMAPE (Ridge): {:.2f}%".format(smape(y_val_orig, val_pred)))

# ===============================
# 🔟 Predict on full 75k test set and save
# ===============================
test_pred_log = model.predict(X_test_final)
test_pred = np.expm1(test_pred_log)
test_pred = np.maximum(test_pred, 0.01)  # ensure positive

submission = test[['sample_id']].copy()
submission['price'] = test_pred
submission.to_csv("/content/test_out.csv", index=False, float_format='%.4f')

print("✅ Submission saved to /content/test_out.csv")
print("Submission rows:", submission.shape[0])
submission.head()


Train shape: (75000, 4)
Test shape: (75000, 3)
Cleaned train shape: (75000, 9)
Validation SMAPE (Ridge): 63.00%
✅ Submission saved to /content/test_out.csv
Submission rows: 75000


Unnamed: 0,sample_id,price
0,100179,14.878142
1,245611,15.078333
2,146263,26.668296
3,95658,13.571757
4,36806,19.743165


Nice — let’s upgrade your pipeline to LightGBM, which almost always gives a big jump vs linear models for tabular + embedding features. I’ll give you a complete Colab-ready code block that:

Loads exactly 75k train/test rows (safe for multi-line text)

Repeats the same cleaning / TF-IDF → SVD → numeric features pipeline you already used

Trains a 5-fold LightGBM CV with early stopping on log1p(price)

Produces out-of-fold (OOF) SMAPE and a final test_out.csv with 75k predictions

In [None]:
# ===============================
# 2️⃣ Load train and test CSVs safely
# ===============================
train_path = "/content/drive/MyDrive/train.csv"
test_path = "/content/drive/MyDrive/test.csv"

train = pd.read_csv(
    train_path,
    engine='python',
    quotechar='"',
    encoding='utf-8',
    on_bad_lines='skip'  # skips malformed rows safely
)

test = pd.read_csv(
    test_path,
    engine='python',
    quotechar='"',
    encoding='utf-8',
    on_bad_lines='skip'
)


print("Train shape:", train.shape)  # should be (75000, 4)

print("Test shape:", test.shape)  # should be (75000, 4)

In [2]:
# ======= Install LightGBM (Colab) =======
# (Uncomment if not already installed)
# !pip install -q lightgbm

# ======= Imports & seed =======
import os, re, html
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation


RND = 42
np.random.seed(RND)

# ======= Utility: SMAPE =======
def smape(y_true, y_pred, eps=1e-8):
    a = np.array(y_true).astype(float)
    p = np.array(y_pred).astype(float)
    num = np.abs(p - a)
    denom = (np.abs(a) + np.abs(p)) / 2.0
    denom = np.maximum(denom, eps)
    return np.mean(num / denom) * 100.0

train_path = "/content/drive/MyDrive/train.csv"
test_path = "/content/drive/MyDrive/test.csv"

train = pd.read_csv(
    train_path,
    engine='python',
    quotechar='"',
    encoding='utf-8',
    on_bad_lines='skip'  # skips malformed rows safely
)

test = pd.read_csv(
    test_path,
    engine='python',
    quotechar='"',
    encoding='utf-8',
    on_bad_lines='skip'
)


print("Train shape:", train.shape)  # should be (75000, 4)

print("Test shape:", test.shape)  # should be (75000, 4)

# ======= 2) Clean text (same as before) =======
def clean_text(s):
    if pd.isna(s):
        return ""
    s = str(s)
    s = html.unescape(s)
    s = re.sub(r'<[^>]+>', ' ', s)
    s = s.lower()
    s = re.sub(r'[^a-z0-9\s\.\-\_x]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

for df in [train, test]:
    df['catalog_clean'] = df['catalog_content'].apply(clean_text)

# ======= 3) Numeric features =======
def extract_ipq(text):
    if pd.isna(text):
        return 1
    patterns = [
        r'(\d+)\s*(?:pack|packs|pk|pk\.)\b',
        r'pack of (\d+)',
        r'(\d+)\s*(?:pcs|pieces|ct|count)\b',
        r'(\d+)\s*x\b',
        r'(\d+)-pack\b',
        r'(\d+)p\b'
    ]
    for p in patterns:
        m = re.search(p, text, flags=re.I)
        if m:
            try:
                val = int(m.group(1))
                if 0 < val < 1000:
                    return val
            except:
                pass
    return 1

for df in [train, test]:
    df['text_len']   = df['catalog_clean'].str.len()
    df['num_words']  = df['catalog_clean'].str.split().map(len)
    df['num_digits'] = df['catalog_clean'].str.count(r'\d+')
    df['ipq']        = df['catalog_clean'].apply(extract_ipq)

# ======= 4) Keep only numeric prices in train =======
def is_float(x):
    try:
        float(x)
        return True
    except:
        return False

train = train[train['price'].apply(is_float)].copy()
train['price'] = train['price'].astype(float)
print("Filtered numeric-price train shape:", train.shape)

# ======= 5) TF-IDF + SVD (fit on train+test) =======
all_text = pd.concat([train['catalog_clean'], test['catalog_clean']])
tfidf = TfidfVectorizer(max_features=15000, ngram_range=(1,2), min_df=3)
X_all_tfidf = tfidf.fit_transform(all_text)

svd = TruncatedSVD(n_components=200, random_state=RND)  # larger SVD -> stronger features
X_all_svd = svd.fit_transform(X_all_tfidf)

# split back
X_train_svd = X_all_svd[:len(train)]
X_test_svd  = X_all_svd[len(train):]

# ======= 6) Final feature matrices =======
num_cols = ['text_len','num_words','num_digits','ipq']
X_num_train = train[num_cols].fillna(0).values
X_num_test  = test[num_cols].fillna(0).values

X_train = np.hstack([X_train_svd, X_num_train])
X_test  = np.hstack([X_test_svd, X_num_test])
y_train = np.log1p(train['price'].values)

print("X_train shape:", X_train.shape, "X_test shape:", X_test.shape)

# ======= 7) LightGBM CV training (5-fold stratified by price bins) =======
# Make price bins for stratification
price_bins = pd.qcut(train['price'], q=10, labels=False, duplicates='drop')

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=RND)
oof_preds = np.zeros(len(train))
test_preds = np.zeros(X_test.shape[0])

lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 127,
    'min_child_samples': 20,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.2,
    'reg_lambda': 0.2,
    'random_state': RND,
    'n_jobs': -1,
    'verbosity': -1,
    'device_type' : 'gpu'
}

for fold, (tr_idx, val_idx) in enumerate(folds.split(X_train, price_bins)):
    print(f"\n=== Fold {fold+1} ===")
    X_tr, X_val = X_train[tr_idx], X_train[val_idx]
    y_tr, y_val = y_train[tr_idx], y_train[val_idx]

    dtrain = lgb.Dataset(X_tr, label=y_tr)
    dval   = lgb.Dataset(X_val, label=y_val)

    model = lgb.train(
        params=lgb_params,
        train_set=dtrain,
        num_boost_round=10000,
        valid_sets=[dtrain, dval],
        valid_names=['train','valid'],
        callbacks=[early_stopping(stopping_rounds=200), log_evaluation(200)]
    )

    oof_preds[val_idx] = model.predict(X_val, num_iteration=model.best_iteration)
    test_preds += model.predict(X_test, num_iteration=model.best_iteration) / 5.0

# ======= 8) Evaluate OOF SMAPE (on original price scale) =======
oof_price = np.expm1(oof_preds)
train_price_orig = np.expm1(y_train)
print("\nOOF SMAPE (LightGBM): {:.4f}%".format(smape(train_price_orig, oof_price)))

# ======= 9) Prepare submission (ensure we predict for exactly the test rows used) =======
test_pred_price = np.expm1(test_preds)
test_pred_price = np.maximum(test_pred_price, 0.01)

submission = test[['sample_id']].copy()
submission['price'] = test_pred_price
print("Submission rows:", submission.shape[0])
submission.to_csv("/content/test_out_lgbm.csv", index=False, float_format='%.4f')
print("Saved /content/test_out_lgbm.csv")


Train shape: (75000, 4)
Test shape: (75000, 3)
Filtered numeric-price train shape: (75000, 9)
X_train shape: (75000, 204) X_test shape: (75000, 204)

=== Fold 1 ===
Training until validation scores don't improve for 200 rounds
[200]	train's rmse: 0.538492	valid's rmse: 0.72922
[400]	train's rmse: 0.419434	valid's rmse: 0.721169
[600]	train's rmse: 0.337634	valid's rmse: 0.717908
[800]	train's rmse: 0.273787	valid's rmse: 0.715602
[1000]	train's rmse: 0.22409	valid's rmse: 0.714504
[1200]	train's rmse: 0.184305	valid's rmse: 0.71371
[1400]	train's rmse: 0.153726	valid's rmse: 0.713472
[1600]	train's rmse: 0.12894	valid's rmse: 0.713078
[1800]	train's rmse: 0.108529	valid's rmse: 0.712861
[2000]	train's rmse: 0.0918299	valid's rmse: 0.712788
[2200]	train's rmse: 0.0790501	valid's rmse: 0.712733
[2400]	train's rmse: 0.0684014	valid's rmse: 0.712663
[2600]	train's rmse: 0.0595746	valid's rmse: 0.712564
[2800]	train's rmse: 0.0524123	valid's rmse: 0.712557
[3000]	train's rmse: 0.0468007	val