In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import os

# Change path according to your folder name in Google Drive
project_path = '/content/drive/MyDrive/student_resource'

# Change directory to that path
os.chdir(project_path)

# Confirm
print("Current working directory:", os.getcwd())

Current working directory: /content/drive/MyDrive/student_resource


In [6]:
!ls dataset

sample_test.csv  sample_test_out.csv  test.csv	train.csv


In [7]:
# Step 4: Import libraries

import pandas as pd   # For data handling
import numpy as np    # For numerical operations
import matplotlib.pyplot as plt   # For plotting graphs
import seaborn as sns  # For advanced visualizations

# FIX: use seaborn‚Äôs theme directly instead of matplotlib style
sns.set_theme(style="whitegrid")

print("Libraries imported¬†successfully!")

Libraries imported¬†successfully!


In [8]:
# Step 5: Load train and test datasets

train_df = pd.read_csv("dataset/train.csv")
test_df = pd.read_csv("dataset/test.csv")

# Check their shapes
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# Display first 5 rows
train_df.head()

Train shape: (75000, 4)
Test shape: (75000, 3)


Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,55858,Item Name: Judee‚Äôs Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


In [9]:
import sys
sys.path.append('/content/drive/MyDrive/student_resource/src')  # Add the folder containing utils.py to Python path

from utils import download_images, download_image

In [10]:
import numpy as np
import pandas as pd

# Load train embeddings if saved
train_text_embeddings = np.load('/content/drive/MyDrive/student_resource/embeddings/train_text_embeddings_75k.npy')
train_image_embeddings = np.load('/content/drive/MyDrive/student_resource/embeddings/train_image_embeddings_75k.npy')

# Extra features
train_df['text_len'] = train_df['catalog_content'].apply(len)
train_df['num_words'] = train_df['catalog_content'].apply(lambda x: len(x.split()))

# Combine all features
X = np.concatenate([
    train_text_embeddings,
    train_image_embeddings,
    train_df[['text_len','num_words']].values
], axis=1)

y = np.log1p(train_df['price'].values)  # log-transform for stability

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (75000, 1026)
y shape: (75000,)


In [2]:
!pip install catboost -q

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m99.2/99.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [11]:
# ============================================================
# GPU-Optimized Ensemble Training (1 Fold Each)
# LightGBM + XGBoost + CatBoost
# ============================================================

import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
import torch

# -----------------------------
# Detect device
# -----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# -----------------------------
# Prepare train/test split (1 fold)
# -----------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"Train shape: {X_train.shape}, Val shape: {X_val.shape}")

# ============================================================
# ‚ø° LightGBM
# ============================================================
print("\nüü¢ Training LightGBM...")

lgb_train = lgb.Dataset(X_train, label=y_train)
lgb_val = lgb.Dataset(X_val, label=y_val)

lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'seed': 42,
    'verbose': -1
}

# Enable GPU if available
if device == "cuda":
    lgb_params['device'] = 'gpu'
    lgb_params['gpu_platform_id'] = 0
    lgb_params['gpu_device_id'] = 0
    print("‚úÖ Using GPU for LightGBM")

lgb_model = lgb.train(
    lgb_params,
    lgb_train,
    valid_sets=[lgb_train, lgb_val],
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(stopping_rounds=50),
               lgb.log_evaluation(100)]
)
print("‚úÖ LightGBM done!")


# ============================================================
# ‚ø¢ XGBoost
# ============================================================
print("\nüîµ Training XGBoost...")

xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'learning_rate': 0.05,
    'max_depth': 8,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42,
}

# Enable GPU if available
if device == "cuda":
    xgb_params['tree_method'] = 'gpu_hist'
    print("‚úÖ Using GPU for XGBoost")

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

xgb_model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=1000,
    evals=[(dtrain, "train"), (dval, "val")],
    early_stopping_rounds=50,
    verbose_eval=100
)
print("‚úÖ XGBoost done!")


# ============================================================
# ‚ø£ CatBoost
# ============================================================
print("\nüü£ Training CatBoost...")

cat_params = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'depth': 8,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': 42,
    'task_type': 'GPU' if device == "cuda" else 'CPU',
    'verbose': 100
}

cat_model = CatBoostRegressor(**cat_params)
cat_model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50)
print("‚úÖ CatBoost done!")

# ============================================================
# Save models for later ensembling
# ============================================================
import joblib
joblib.dump(lgb_model, "/content/lgb_model_gpu.pkl")
joblib.dump(xgb_model, "/content/xgb_model_gpu.pkl")
cat_model.save_model("/content/cat_model_gpu.cbm")

print("\nüéâ All models trained and saved¬†successfully!")


Using device: cuda
Train shape: (60000, 1026), Val shape: (15000, 1026)

üü¢ Training LightGBM...
‚úÖ Using GPU for LightGBM
Training until validation scores don't improve for 50 rounds
[100]	training's rmse: 0.740431	valid_1's rmse: 0.79061
[200]	training's rmse: 0.685735	valid_1's rmse: 0.767987
[300]	training's rmse: 0.644815	valid_1's rmse: 0.756805
[400]	training's rmse: 0.610353	valid_1's rmse: 0.749728
[500]	training's rmse: 0.580497	valid_1's rmse: 0.744975
[600]	training's rmse: 0.553418	valid_1's rmse: 0.741437
[700]	training's rmse: 0.528894	valid_1's rmse: 0.73871
[800]	training's rmse: 0.506197	valid_1's rmse: 0.735861
[900]	training's rmse: 0.485292	valid_1's rmse: 0.734194
[1000]	training's rmse: 0.465084	valid_1's rmse: 0.732531
Did not meet early stopping. Best iteration is:
[1000]	training's rmse: 0.465084	valid_1's rmse: 0.732531
‚úÖ LightGBM done!

üîµ Training XGBoost...
‚úÖ Using GPU for XGBoost



    E.g. tree_method = "hist", device = "cuda"

  self.starting_round = model.num_boosted_rounds()


[0]	train-rmse:0.92719	val-rmse:0.94189
[100]	train-rmse:0.57729	val-rmse:0.76813
[200]	train-rmse:0.45752	val-rmse:0.75093
[300]	train-rmse:0.37633	val-rmse:0.74253
[400]	train-rmse:0.31498	val-rmse:0.73699
[500]	train-rmse:0.26745	val-rmse:0.73360
[600]	train-rmse:0.22606	val-rmse:0.73075
[700]	train-rmse:0.19464	val-rmse:0.72917
[800]	train-rmse:0.16756	val-rmse:0.72777
[900]	train-rmse:0.14514	val-rmse:0.72673
[999]	train-rmse:0.12671	val-rmse:0.72594
‚úÖ XGBoost done!

üü£ Training CatBoost...
0:	learn: 0.9308914	test: 0.9438483	best: 0.9438483 (0)	total: 88.5ms	remaining: 1m 28s
100:	learn: 0.7691850	test: 0.8069969	best: 0.8069969 (100)	total: 4.03s	remaining: 35.9s
200:	learn: 0.7315073	test: 0.7888679	best: 0.7888679 (200)	total: 8.97s	remaining: 35.6s
300:	learn: 0.7023356	test: 0.7780510	best: 0.7780510 (300)	total: 12.6s	remaining: 29.4s
400:	learn: 0.6774354	test: 0.7705950	best: 0.7705950 (400)	total: 16.3s	remaining: 24.3s
500:	learn: 0.6552311	test: 0.7652638	best: 0.7


    E.g. tree_method = "hist", device = "cuda"

  rv = reduce(self.proto)


In [14]:
import joblib

# Save models
joblib.dump(lgb_model, "lgb_model.pkl")
# joblib.dump(xgb_model, "xgb_model.pkl")
# joblib.dump(cat_model, "cat_model.pkl")

['lgb_model.pkl']

In [None]:
import joblib

# Save models
joblib.dump(lgb_model, "lgb_model.pkl")
# joblib.dump(xgb_model, "xgb_model.pkl")
# joblib.dump(cat_model, "cat_model.pkl")

In [16]:
import xgboost as xgb

dtrain = xgb.DMatrix(X_train)
dval = xgb.DMatrix(X_val)

train_pred_xgb = xgb_model.predict(dtrain)
val_pred_xgb   = xgb_model.predict(dval)


    E.g. tree_method = "hist", device = "cuda"

  return func(**kwargs)


In [17]:
import numpy as np
import joblib
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error

# ====================================================
# Load trained base models
# ====================================================
lgb_model = joblib.load("lgb_model.pkl")
xgb_model = joblib.load("xgb_model.pkl")
cat_model = joblib.load("cat_model.pkl")

# ====================================================
# Generate out-of-fold predictions for stacking
# ====================================================
print("üîπ Generating meta-features (level-1 predictions)...")

train_pred_lgb = lgb_model.predict(X_train)
val_pred_lgb = lgb_model.predict(X_val)

train_pred_xgb = xgb_model.predict(X_train)
val_pred_xgb = xgb_model.predict(X_val)

train_pred_cat = cat_model.predict(X_train)
val_pred_cat = cat_model.predict(X_val)

# Stack predictions horizontally ‚Üí meta-features
X_train_meta = np.column_stack((train_pred_lgb, train_pred_xgb, train_pred_cat))
X_val_meta = np.column_stack((val_pred_lgb, val_pred_xgb, val_pred_cat))

# ====================================================
# Train meta-learner (Ridge Regression with CV)
# ====================================================
print("üîπ Training meta-learner (RidgeCV)...")

meta_model = RidgeCV(alphas=np.logspace(-3, 3, 7), cv=5)
meta_model.fit(X_train_meta, y_train)

# ====================================================
# Evaluate stacking performance
# ====================================================
val_meta_pred = meta_model.predict(X_val_meta)
stack_rmse = mean_squared_error(y_val, val_meta_pred, squared=False)

print(f"‚úÖ Stacking Ensemble RMSE: {stack_rmse:.6f}")

# ====================================================
# Save final stacked model
# ====================================================
joblib.dump(meta_model, "stacked_model.pkl")
print("üéØ Final stacked model saved as 'stacked_model.pkl'")


üîπ Generating meta-features (level-1 predictions)...


TypeError: ('Expecting data to be a DMatrix object, got: ', <class 'numpy.ndarray'>)

In [18]:
# LightGBM OOF or trained model predictions
train_pred_lgb = lgb_model.predict(X_train)
val_pred_lgb   = lgb_model.predict(X_val)

# XGBoost (use numpy arrays directly)
train_pred_xgb = xgb_model.predict(X_train)
val_pred_xgb   = xgb_model.predict(X_val)

# CatBoost
train_pred_cat = cat_model.predict(X_train)
val_pred_cat   = cat_model.predict(X_val)

# Stack into meta-features
X_meta_train = np.column_stack([train_pred_lgb, train_pred_xgb, train_pred_cat])
X_meta_val   = np.column_stack([val_pred_lgb, val_pred_xgb, val_pred_cat])

print("Meta-train shape:", X_meta_train.shape)
print("Meta-val shape:", X_meta_val.shape)

TypeError: ('Expecting data to be a DMatrix object, got: ', <class 'numpy.ndarray'>)

In [22]:
import xgboost as xgb

# Create DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train)
dval   = xgb.DMatrix(X_val, label=y_val)

# Set parameters
params = {
    "objective": "reg:squarederror",
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "learning_rate": 0.05,
    "max_depth": 8,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "eval_metric": "rmse"
}

# Train with early stopping
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    evals=[(dtrain, "train"), (dval, "val")],
    early_stopping_rounds=50,
    verbose_eval=100
)

# Predictions
train_pred_xgb = model.predict(dtrain)
val_pred_xgb   = model.predict(dval)



    E.g. tree_method = "hist", device = "cuda"

  self.starting_round = model.num_boosted_rounds()
Parameters: { "predictor" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	train-rmse:0.92719	val-rmse:0.94189
[100]	train-rmse:0.57729	val-rmse:0.76813
[200]	train-rmse:0.45752	val-rmse:0.75093
[300]	train-rmse:0.37633	val-rmse:0.74253
[400]	train-rmse:0.31498	val-rmse:0.73699
[500]	train-rmse:0.26745	val-rmse:0.73360
[600]	train-rmse:0.22606	val-rmse:0.73075
[700]	train-rmse:0.19464	val-rmse:0.72917
[800]	train-rmse:0.16756	val-rmse:0.72777
[900]	train-rmse:0.14514	val-rmse:0.72673
[999]	train-rmse:0.12671	val-rmse:0.72594



    E.g. tree_method = "hist", device = "cuda"

  return func(**kwargs)


In [24]:
import pickle
import joblib
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

# -----------------------------
# 1Ô∏è‚É£ Load all trained models
# -----------------------------
xgb_model = pickle.load(open("/content/xgb_model_gpu.pkl", "rb"))
lgb_model = pickle.load(open("/content/lgb_model_gpu.pkl", "rb"))
cat_model = cb.CatBoostRegressor()
cat_model.load_model("/content/cat_model_gpu.cbm")

# -----------------------------
# 2Ô∏è‚É£ Get base model predictions
# -----------------------------
# assuming you already have X_train, X_val, y_train, y_val

train_pred_xgb = xgb_model.predict(X_train)
train_pred_lgb = lgb_model.predict(X_train)
train_pred_cat = cat_model.predict(X_train)

val_pred_xgb = xgb_model.predict(X_val)
val_pred_lgb = lgb_model.predict(X_val)
val_pred_cat = cat_model.predict(X_val)

# -----------------------------
# 3Ô∏è‚É£ Create new feature set (stacked)
# -----------------------------
X_train_stack = np.column_stack((train_pred_xgb, train_pred_lgb, train_pred_cat))
X_val_stack = np.column_stack((val_pred_xgb, val_pred_lgb, val_pred_cat))

# -----------------------------
# 4Ô∏è‚É£ Train a meta-model
# -----------------------------
meta_model = Ridge(alpha=1.0)
meta_model.fit(X_train_stack, y_train)

# -----------------------------
# 5Ô∏è‚É£ Evaluate stacking performance
# -----------------------------
val_pred_stack = meta_model.predict(X_val_stack)

rmse = mean_squared_error(y_val, val_pred_stack, squared=False)
r2 = r2_score(y_val, val_pred_stack)

print(f"Stacked Model RMSE: {rmse:.4f}")
print(f"Stacked Model R¬≤:   {r2:.4f}")

# -----------------------------
# 6Ô∏è‚É£ (Optional) Save stacked model
# -----------------------------
joblib.dump(meta_model, "stacked_meta_model.pkl")
print("‚úÖ Stacking completed and model saved as stacked_meta_model.pkl")


TypeError: ('Expecting data to be a DMatrix object, got: ', <class 'numpy.ndarray'>)

In [27]:
# ------------------------- Step 0: Imports -------------------------
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# ------------------------- Step 1: Load Features -------------------------
# Load your precomputed embeddings
train_text_embeddings = np.load('/content/drive/MyDrive/student_resource/embeddings/train_text_embeddings_75k.npy')
train_image_embeddings = np.load('/content/drive/MyDrive/student_resource/embeddings/train_image_embeddings_75k.npy')

# Extra features from DataFrame
train_df['text_len'] = train_df['catalog_content'].apply(len)
train_df['num_words'] = train_df['catalog_content'].apply(lambda x: len(x.split()))

# Combine features
X = np.concatenate([
    train_text_embeddings,
    train_image_embeddings,
    train_df[['text_len','num_words']].values
], axis=1)

y = np.log1p(train_df['price'].values)  # log-transform for stability

print("X shape:", X.shape)
print("y shape:", y.shape)

# ------------------------- Step 2: Train/Validation Split -------------------------
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train shape:", X_train.shape, "Val shape:", X_val.shape)

# ------------------------- Step 3: LightGBM GPU Training -------------------------
lgb_model = lgb.LGBMRegressor(
    objective='regression',
    metric='rmse',
    learning_rate=0.05,
    num_leaves=31,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    n_estimators=1000,
    device='gpu',          # GPU training
    verbose=-1
)

# Train with early stopping
lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
    callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]
)

# ------------------------- Step 4: Save Model -------------------------
import joblib
joblib.dump(lgb_model, "lgb_model_single_fold.pkl")
print("‚úÖ LightGBM model trained and saved successfully!")

# ------------------------- Step 5: Predictions -------------------------
train_pred = lgb_model.predict(X_train)
val_pred   = lgb_model.predict(X_val)

print("Training RMSE:", np.sqrt(np.mean((train_pred - y_train)**2)))
print("Validation RMSE:", np.sqrt(np.mean((val_pred - y_val)**2)))


X shape: (75000, 1026)
y shape: (75000,)
Train shape: (60000, 1026) Val shape: (15000, 1026)
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 0.790806
[200]	valid_0's rmse: 0.768999
[300]	valid_0's rmse: 0.757248
[400]	valid_0's rmse: 0.749737
[500]	valid_0's rmse: 0.745122
[600]	valid_0's rmse: 0.741246
[700]	valid_0's rmse: 0.738432
[800]	valid_0's rmse: 0.736553
[900]	valid_0's rmse: 0.734807
[1000]	valid_0's rmse: 0.73348
Did not meet early stopping. Best iteration is:
[995]	valid_0's rmse: 0.73347
‚úÖ LightGBM model trained and saved successfully!




Training RMSE: 0.46554230544162095
Validation RMSE: 0.7334695495685832


In [28]:
# ------------------------- Step 0: Imports -------------------------
import numpy as np
import pandas as pd
import joblib

# ------------------------- Step 1: Load Test Features -------------------------
# Load your precomputed test embeddings
test_text_embeddings = np.load('/content/drive/MyDrive/student_resource/embeddings/train_image_embeddings_75k.npy')
test_image_embeddings = np.load('/content/drive/MyDrive/student_resource/embeddings/train_image_embeddings_75k.npy')

# Extra features from test DataFrame
test_df['text_len'] = test_df['catalog_content'].apply(len)
test_df['num_words'] = test_df['catalog_content'].apply(lambda x: len(x.split()))

# Combine features
X_test = np.concatenate([
    test_text_embeddings,
    test_image_embeddings,
    test_df[['text_len','num_words']].values
], axis=1)

print("X_test shape:", X_test.shape)

# ------------------------- Step 2: Load Trained LightGBM Model -------------------------
lgb_model = joblib.load("lgb_model_single_fold.pkl")
print("‚úÖ Model loaded successfully!")

# ------------------------- Step 3: Predict -------------------------
test_pred_log = lgb_model.predict(X_test)       # predictions in log space
test_pred = np.expm1(test_pred_log)            # revert log transform

# ------------------------- Step 4: Prepare Submission -------------------------
submission = pd.DataFrame({
    'item_id': test_df['item_id'],   # or the appropriate ID column in your dataset
    'predicted_price': test_pred
})

# Save CSV
submission.to_csv('submission_lgb_single_fold.csv', index=False)
print("‚úÖ Submission CSV saved: submission_lgb_single_fold.csv")


X_test shape: (75000, 1026)
‚úÖ Model loaded successfully!




KeyError: 'item_id'

In [29]:
print(test_df.columns)

Index(['sample_id', 'catalog_content', 'image_link', 'text_len', 'num_words'], dtype='object')


In [30]:
import pandas as pd
import numpy as np
import joblib

# Load test embeddings
X_test = np.load('/content/test_embeddings.npy')  # update path if needed
print("X_test shape:", X_test.shape)

# Load trained LightGBM model
lgb_model = joblib.load('/content/lgb_model_single_fold.pkl')  # update path
print("‚úÖ Model loaded successfully!")

# Make predictions
test_pred_log = lgb_model.predict(X_test)
test_pred = np.expm1(test_pred_log)  # inverse of log1p

# Prepare submission
submission = pd.DataFrame({
    'sample_id': test_df['sample_id'],  # use the correct ID column
    'predicted_price': test_pred
})

# Save CSV
submission.to_csv('submission_lgb_single_fold.csv', index=False)
print("‚úÖ Submission CSV saved!")
import pandas as pd
import numpy as np
import joblib

# Load test embeddings
X_test = np.load('/content/test_embeddings.npy')  # update path if needed
print("X_test shape:", X_test.shape)

# Load trained LightGBM model
lgb_model = joblib.load('/content/lgb_model_single_fold.pkl')  # update path
print("‚úÖ Model loaded successfully!")

# Make predictions
test_pred_log = lgb_model.predict(X_test)
test_pred = np.expm1(test_pred_log)  # inverse of log1p

# Prepare submission
submission = pd.DataFrame({
    'sample_id': test_df['sample_id'],  # use the correct ID column
    'predicted_price': test_pred
})

# Save CSV
submission.to_csv('submission_lgb_single_fold.csv', index=False)
print("‚úÖ Submission¬†CSV¬†saved!")


FileNotFoundError: [Errno 2] No such file or directory: '/content/test_embeddings.npy'

In [32]:
# Load embeddings
test_text_embeddings = np.load('/content/test_image_embeddings_75k.npy')
test_image_embeddings = np.load('/content/test_text_embeddings_75k.npy')

# Extra features
test_df['text_len'] = test_df['catalog_content'].apply(len)
test_df['num_words'] = test_df['catalog_content'].apply(lambda x: len(x.split()))

# Combine all features exactly like training
X_test = np.concatenate([
    test_text_embeddings,
    test_image_embeddings,
    test_df[['text_len', 'num_words']].values
], axis=1)

print("X_test shape:", X_test.shape)

X_test shape: (75000, 1026)


In [34]:
import pandas as pd
import numpy as np
import joblib

# ------------------- Step 1: Load test embeddings -------------------
test_text_embeddings = np.load('/content/test_text_embeddings_75k.npy')   # text embeddings
test_image_embeddings = np.load('/content/test_image_embeddings_75k.npy') # image embeddings

# ------------------- Step 2: Prepare extra features -------------------
# Make sure test_df is already loaded
test_df['text_len'] = test_df['catalog_content'].apply(len)
test_df['num_words'] = test_df['catalog_content'].apply(lambda x: len(x.split()))

# ------------------- Step 3: Combine features -------------------
X_test = np.concatenate([
    test_text_embeddings,
    test_image_embeddings,
    test_df[['text_len', 'num_words']].values
], axis=1)

print("X_test shape:", X_test.shape)

# ------------------- Step 4: Load trained LightGBM model -------------------
lgb_model = joblib.load('/content/drive/MyDrive/student_resource/lgb_model_single_fold.pkl')  # replace with your saved model path
print("‚úÖ Model loaded successfully!")

# ------------------- Step 5: Make predictions -------------------
test_pred_log = lgb_model.predict(X_test)    # predictions are in log1p scale
test_pred = np.expm1(test_pred_log)          # inverse of log1p

# ------------------- Step 6: Prepare submission -------------------
submission = pd.DataFrame({
    'sample_id': test_df['sample_id'],  # correct ID column
    'predicted_price': test_pred
})

# ------------------- Step 7: Save CSV -------------------
submission.to_csv('submission_lgb_single_fold_final.csv', index=False)
print("‚úÖ Submission¬†CSV¬†saved!")


X_test shape: (75000, 1026)
‚úÖ Model loaded successfully!




‚úÖ Submission¬†CSV¬†saved!
