In [5]:
import pandas as pd
import lightgbm as lgb
import numpy as np

# --- 1. Load All Necessary Data ---
print("Loading data...")

# Load the features you created
X_train = pd.read_csv("X_train_features.csv")
X_test = pd.read_csv("X_test_features.csv")

# Load original data to get the target variable ('price') and sample IDs
try:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    y_train = train_df['price']
except FileNotFoundError:
    print("Error: Make sure original train.csv and test.csv are available.")
except KeyError:
    print("Error: The column 'price' was not found in train.csv.")

print("Data loaded successfully.")
print(f"Train features shape: {X_train.shape}")
print(f"Test features shape: {X_test.shape}")

# --- 2. CRITICAL CHANGE: Apply Log Transform to the Target Variable ---
# This helps the model handle the skewed distribution of price data and
# significantly improves SMAPE score.
y_train_log = np.log1p(y_train)
print("\nApplied log transform (log1p) to the price data.")


# --- 3. Train the LightGBM Regression Model ---
# Using more robust hyperparameters
model = lgb.LGBMRegressor(
    objective='regression_l1', # MAE is a good objective for price data
    metric='rmse',             # Use RMSE for training, as it's sensitive to large errors
    random_state=42,
    n_estimators=2000,         # Increased estimators
    learning_rate=0.01,        # Reduced learning rate
    num_leaves=40,
    max_depth=7,
    colsample_bytree=0.7,
    n_jobs=-1
)

print("\nTraining the regression model on log-transformed prices...")
# Train on the LOG-TRANSFORMED target
model.fit(X_train, y_train_log)
print("Model training complete.")


# --- 4. Make Predictions and Inverse Transform ---
print("\nMaking predictions on the test data...")
# Predict on the test set, which will give log-scale predictions
predictions_log = model.predict(X_test)

# CRITICAL CHANGE: Convert predictions back to the original price scale
# np.expm1 is the inverse of np.log1p
predictions = np.expm1(predictions_log)

# CONSTRAINT: Ensure all predicted prices are positive
predictions[predictions < 0] = 0
print("Converted predictions back to the original price scale.")


# --- 5. Create the Submission File ---
print("\nCreating the submission file...")
submission_df = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': predictions
})

submission_df.to_csv('submission2.csv', index=False, float_format='%.4f')

print("\nSubmission file 'submission.csv' created successfully!")
print("Submission file head:")
print(submission_df.head())


Loading data...
Data loaded successfully.
Train features shape: (75000, 1666)
Test features shape: (75000, 1666)

Applied log transform (log1p) to the price data.

Training the regression model on log-transformed prices...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.872724 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 424575
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 1665
[LightGBM] [Info] Start training from score 2.708050
Model training complete.

Making predictions on the test data...
Converted predictions back to the original price scale.

Creating the submission file...

Submission file 'submission.csv' created successfully!
Submission file head:
   sample_id      price
0     100179  17.678548
1     245611  20.294969
2     146263  22.746099
3      95658   7.932223
4      36806  18.751184


In [6]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# --- 1. Define the SMAPE Metric Function ---
def smape(y_true, y_pred):
    """
    Calculates the Symmetric Mean Absolute Percentage Error (SMAPE).
    """
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    # Handle the case where both true and predicted are zero
    # to avoid division by zero
    ratio = np.where(denominator == 0, 0, numerator / denominator)
    return np.mean(ratio) * 100

# --- 2. Load Your Data ---
print("Loading features and labels...")
X = pd.read_csv("X_train_features.csv")
train_df = pd.read_csv("train.csv")
y = train_df['price']

# --- 3. Split Data into Training and Validation Sets ---
# Using 80% for training and 20% for validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training data shape: {X_train.shape}")
print(f"Validation data shape: {X_val.shape}")

# --- 4. Train Model on the SPLIT Data ---
print("\nTraining model on the 80% split...")
model = lgb.LGBMRegressor(
    objective='regression_l1',
    random_state=42,
    n_estimators=1000
)
model.fit(X_train, y_train)

# --- 5. Predict on Validation Set and Calculate Score ---
print("Making predictions on the 20% validation set...")
val_predictions = model.predict(X_val)

# Ensure predictions are non-negative
val_predictions[val_predictions < 0] = 0

# Calculate and print the score
validation_smape = smape(y_val, val_predictions)

print("\n-------------------------------------")
print(f"Validation SMAPE Score: {validation_smape:.4f}%")
print("-------------------------------------")
# Lower SMAPE is better

Loading features and labels...
Training data shape: (60000, 1666)
Validation data shape: (15000, 1666)

Training model on the 80% split...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.936320 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 424575
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 1665
[LightGBM] [Info] Start training from score 14.090000
Making predictions on the 20% validation set...

-------------------------------------
Validation SMAPE Score: 59.8039%
-------------------------------------
