In [2]:
import pandas as pd
import numpy as np
import os
import lightgbm as lgb

# --- Scikit-learn for modeling tools ---
# KFold for creating cross-validation splits
from sklearn.model_selection import KFold
# mean_squared_error is a common loss function. We'll use it to train on the log-transformed target.
from sklearn.metrics import mean_squared_error

print("Libraries imported successfully.")

# --- Load our feature-engineered data ---
PROJECT_ROOT = ".." 
PROCESSED_DIR = os.path.join(PROJECT_ROOT, "data", "processed")
FEATURES_PATH = os.path.join(PROCESSED_DIR, "features_v1.parquet")

# Loading the Parquet file
full_df = pd.read_parquet(FEATURES_PATH)

print(f"\nFeature-engineered data loaded successfully from:\n{FEATURES_PATH}")
print(f"Data shape: {full_df.shape}")
display(full_df.head())

Libraries imported successfully.

Feature-engineered data loaded successfully from:
..\data\processed\features_v1.parquet
Data shape: (150000, 20)


Unnamed: 0,sample_id,price,is_train,log_price,extracted_value,extracted_unit,ipq,brand,is_organic,is_gluten_free,is_vegan,is_non-gmo,is_kosher,is_natural,is_gourmet,is_premium,text_len,word_count,avg_word_len,unique_word_count
0,33127,4.89,1,1.773256,72.0,Fl,6,La Victoria Green Taco Sauce Mild,0,0,0,0,0,0,0,0,91,18,5.055555,18
1,198967,13.12,1,2.647592,32.0,Ounce,4,Salerno Cookies,0,0,0,0,0,0,0,0,511,80,6.3875,58
2,261251,1.97,1,1.088562,11.4,Ounce,6,Bear Creek Hearty Soup Bowl,0,0,0,0,0,0,0,0,328,59,5.559322,47
3,55858,30.34,1,3.444895,11.25,Ounce,1,Judee’s Blue Cheese Powder 11.25 oz,0,0,0,0,0,0,0,0,1318,211,6.246445,142
4,292686,66.49,1,4.211979,12.0,Count,1,kedem Sherry Cooking Wine,0,0,0,0,0,0,0,0,155,28,5.535714,18


In [3]:
# --- Prepare the Data ---
# We use the 'is_train' flag we created earlier to split the data back apart.
train_df = full_df[full_df['is_train'] == 1].copy()
test_df = full_df[full_df['is_train'] == 0].copy()

# --- Define Features (X) and Target (y) ---
# Our target is the log-transformed price. This is what the model will predict.
y = train_df['log_price']

# Our features are all the useful columns we have.
# We must drop identifiers and the target variables themselves.
features_to_drop = ['sample_id', 'price', 'log_price', 'is_train']
features = [col for col in train_df.columns if col not in features_to_drop]

# Create our final training and testing feature sets
X = train_df[features]
X_test = test_df[features]


# --- Handle Categorical Features for LightGBM ---
# LightGBM is great because it can handle categories directly, but we need to tell it which ones they are.
categorical_features = ['extracted_unit', 'brand']

# It's good practice to convert these columns to the 'category' data type.
for col in categorical_features:
    X[col] = X[col].astype('category')
    X_test[col] = X_test[col].astype('category')
    
    
print("Data preparation complete.")
print(f"We are training on {len(features)} features.")
print(f"The features are: {features}")
print("\nFirst 5 rows of our final training data (X):")
display(X.head())

Data preparation complete.
We are training on 16 features.
The features are: ['extracted_value', 'extracted_unit', 'ipq', 'brand', 'is_organic', 'is_gluten_free', 'is_vegan', 'is_non-gmo', 'is_kosher', 'is_natural', 'is_gourmet', 'is_premium', 'text_len', 'word_count', 'avg_word_len', 'unique_word_count']

First 5 rows of our final training data (X):


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[col] = X_test[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = X[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_inde

Unnamed: 0,extracted_value,extracted_unit,ipq,brand,is_organic,is_gluten_free,is_vegan,is_non-gmo,is_kosher,is_natural,is_gourmet,is_premium,text_len,word_count,avg_word_len,unique_word_count
0,72.0,Fl,6,La Victoria Green Taco Sauce Mild,0,0,0,0,0,0,0,0,91,18,5.055555,18
1,32.0,Ounce,4,Salerno Cookies,0,0,0,0,0,0,0,0,511,80,6.3875,58
2,11.4,Ounce,6,Bear Creek Hearty Soup Bowl,0,0,0,0,0,0,0,0,328,59,5.559322,47
3,11.25,Ounce,1,Judee’s Blue Cheese Powder 11.25 oz,0,0,0,0,0,0,0,0,1318,211,6.246445,142
4,12.0,Count,1,kedem Sherry Cooking Wine,0,0,0,0,0,0,0,0,155,28,5.535714,18


In [4]:
# Define the number of folds. 5 is a standard and robust choice.
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Create arrays to store our predictions
# oof_predictions will hold the prediction for each training sample when it was in the validation fold.
oof_predictions = np.zeros(len(X))
# test_predictions will hold the average prediction from all 5 models on the test set.
test_predictions = np.zeros(len(X_test))

# This list will store our 5 trained models.
models = []

print(f"--- Starting training with {N_SPLITS}-Fold Cross-Validation ---")

# Loop through each fold
for fold, (train_index, val_index) in enumerate(kf.split(X, y)):
    print(f"\n--- Fold {fold+1}/{N_SPLITS} ---")
    
    # Split the data for this fold
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # --- Define the LightGBM Model ---
    # These are solid baseline parameters.
    lgb_params = {
        'objective': 'regression_l1', # MAE (Mean Absolute Error) objective is robust
        'metric': 'rmse',             # Monitor RMSE during training
        'n_estimators': 2000,         # Maximum number of trees
        'learning_rate': 0.05,        # A good balance of speed and accuracy
        'feature_fraction': 0.8,      # Use 80% of features per tree
        'bagging_fraction': 0.8,      # Use 80% of data per tree
        'bagging_freq': 1,
        'num_leaves': 31,             # Controls model complexity
        'verbose': -1,                # Surpress LightGBM's internal logs
        'n_jobs': -1,                 # Use all CPU cores
        'seed': 42 + fold,            # Different seed for each fold for diversity
        'boosting_type': 'gbdt',
    }
    
    model = lgb.LGBMRegressor(**lgb_params)
    
    # --- Train the Model ---
    # We use early stopping. If the validation score doesn't improve for 100 rounds, training stops.
    # This finds the optimal number of trees automatically.
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              eval_metric='rmse',
              categorical_feature=categorical_features, # Tell LightGBM about our categorical columns
              callbacks=[lgb.early_stopping(100, verbose=False)])
    
    # --- Make Predictions ---
    # Predict on the validation set for this fold
    val_preds = model.predict(X_val)
    oof_predictions[val_index] = val_preds
    
    # Predict on the test set. We will average these predictions later.
    test_predictions += model.predict(X_test) / N_SPLITS
    
    models.append(model)
    
    # --- Evaluate Fold Performance ---
    # Calculate RMSE on the log_price
    fold_rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    print(f"Fold {fold+1} Log RMSE: {fold_rmse:.5f}")

# --- Overall Cross-Validation Score ---
overall_rmse = np.sqrt(mean_squared_error(y, oof_predictions))
print(f"\n\n--- Overall CV Log RMSE: {overall_rmse:.5f} ---")
print("Training complete!")

--- Starting training with 5-Fold Cross-Validation ---

--- Fold 1/5 ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = data[col].cat.set_categories(category)


Fold 1 Log RMSE: 0.80125

--- Fold 2/5 ---
Fold 2 Log RMSE: 0.78410

--- Fold 3/5 ---
Fold 3 Log RMSE: 0.77959

--- Fold 4/5 ---
Fold 4 Log RMSE: 0.76777

--- Fold 5/5 ---
Fold 5 Log RMSE: 0.77798


--- Overall CV Log RMSE: 0.78221 ---
Training complete!


In [5]:
# --- Create the Submission File ---

# Our model predicted the log_price. We must convert it back to the original price scale.
# The inverse of np.log1p() is np.expm1().
predictions = np.expm1(test_predictions)

# --- Safety Check: Ensure all predictions are positive ---
# The competition rules state that prices must be positive.
# np.expm1() will naturally produce positive values, but this is a good safety check.
predictions[predictions < 0] = 0


# Create a new DataFrame for our submission
submission_df = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': predictions
})


# --- Save the Submission File ---
# We will save it in the main project folder for easy access.
SUBMISSION_PATH = os.path.join(PROJECT_ROOT, "submission_baseline_v1.csv")
submission_df.to_csv(SUBMISSION_PATH, index=False)

print(f"✅ Submission file created successfully at:\n{SUBMISSION_PATH}")
print("\nHere's a preview of your submission file:")
display(submission_df.head())

✅ Submission file created successfully at:
..\submission_baseline_v1.csv

Here's a preview of your submission file:


Unnamed: 0,sample_id,price
75000,100179,11.196545
75001,245611,14.659256
75002,146263,15.32471
75003,95658,11.790372
75004,36806,14.502683
