In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline 
from xgboost import XGBRegressor
# Import the RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor 
import numpy as np

# Assuming df_processed is loaded from your CSV
file="parsed_catalog_data (1).csv"
df_processed=pd.read_csv(file)

# Basic data cleaning
df_processed['amount per pack'] = np.log1p(df_processed['amount per pack'].fillna(1))
df_processed['unit'] = df_processed['unit'].fillna('unknown')
df_processed['catalog_info'] = df_processed['catalog_info'].fillna('unknown')
df_processed.dropna(subset=['price', 'quantity'], inplace=True) 

df_processed['brand'] = df_processed['catalog_info'].apply(lambda x: x.split(' ')[0].strip().lower())  
keywords = ['organic', 'sugar free', 'gluten free', 'diet', 'light', 'natural', 'whole grain']
for keyword in keywords:
    df_processed[f'is_{keyword.replace(" ", "_")}'] = df_processed['catalog_info'].str.contains(keyword, case=False).astype(int) 

df_processed['brand_unit_interaction'] = df_processed['brand'] + '_' + df_processed['unit'] #price of ounce is different for different brands

df_processed['total_quantity'] = df_processed['quantity'] * df_processed['amount per pack'] 
df_processed.drop(columns=['quantity', 'amount per pack'], inplace=True, errors='ignore') 


df_processed['price_per_unit'] = df_processed['price'] / df_processed['total_quantity']

# Identify outlier thresholds using quantiles
lower_bound = df_processed['price_per_unit'].quantile(0.01)
upper_bound = df_processed['price_per_unit'].quantile(0.99) 

original_rows = len(df_processed)
df_processed = df_processed[(df_processed['price_per_unit'] >= lower_bound) & (df_processed['price_per_unit'] <= upper_bound)]
print(f"--- Outlier Handling: Removed {original_rows - len(df_processed)} extreme outliers. ---")

# --- Define the Preprocessing Pipeline --- 
text_features = 'catalog_info'
categorical_features = ['unit','brand','brand_unit_interaction']
numerical_features = [col for col in df_processed.columns if 'is_' in col] + ['total_quantity']

# --- FIX IS HERE ---
# Add the column names as the third element in ea- Create and Train the Full Model Pipeline ---

# Split data for training and evaluation  
df_processed['target']= np.log1p(df['price_per_unit'] 
                                
drop_cols=['price','price_per_unit','target']
X = df_processed.drop(drop_cols, axis=1) 
y_raw = df_processed['target'] 
y = np.log1p(y_raw)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=2000,ngram_range =(1, 2)), text_features),
        ('category', TargetEncoder(cols=categorical_features), categorical_features),
        ('numeric', StandardScaler(), numerical_features)
    ],
    remainder='drop'
) 
base_estimators=[
    ('xgb',XGBRegressor(objectives='reg:squarederror',random_state=42),
    ('rgf',RandomForestRegressor(random_state=42),
    ('lgbm',LGBMRegressor(objectives='reg:squarederror',random_state=42)
] 

meta_model = RidgeCV()

stacking= StackingRegressor(
    estimators= base_estimator,
    final_estimator= meta_model, 
    cv=5, 
    n_jobs=-1

)

# Create the full pipeline with the new RandomForestRegressor
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('stacking',stacking         # Controls tree depth
                               ))
# Use all available CPU cores
])

param_dist = {
    'regressor__learning_rate': [0.03, 0.05, 0.1],
    'regressor__n_estimators': [1000, 2000, 3000],
    'regressor__max_depth': [7, 8, 10],
    'regressor__subsample': [0.7, 0.8],
    'regressor__colsample_bytree': [0.7, 0.8],
    'regressor__gamma': [0.1, 0.2],
    'regressor__reg_alpha': [0.05, 0.1]
} 

random_search = RandomizedSearchCV(
    model_pipeline,
    param_distributions=param_dist,
    n_iter=10,  # Number of parameter settings that are sampled. Increase for more thorough search.
    scoring='r2',
    cv=5,       # Use 3-fold cross-validation
    verbose=2,
    random_state=42,
       # Use all available CPU cores
) 
X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)


print("Hyperparameter tuning complete.")
print("Best parameters found: ", random_search.best_params_)

best_model = random_search.best_estimator_


# Use the best model found by the search
best_model = random_search.best_estimator_
# Train the entire pipeline on your training data




print("Training the XGBoost pipeline...")
score = best_model.score(X_test, y_test)   
print(f"Model R² score on test data: {score:.4f}") 


ERROR! Session/line number was not unique in database. History logging moved to new session 665
--- Outlier Handling: Removed 1496 extreme outliers. ---
Fitting 2 folds for each of 5 candidates, totalling 10 fits
[CV] END regressor__colsample_bytree=0.8, regressor__gamma=0.2, regressor__learning_rate=0.1, regressor__max_depth=10, regressor__n_estimators=2000, regressor__reg_alpha=0.05, regressor__subsample=0.7; total time= 8.7min
[CV] END regressor__colsample_bytree=0.8, regressor__gamma=0.2, regressor__learning_rate=0.1, regressor__max_depth=10, regressor__n_estimators=2000, regressor__reg_alpha=0.05, regressor__subsample=0.7; total time= 9.1min
[CV] END regressor__colsample_bytree=0.7, regressor__gamma=0.1, regressor__learning_rate=0.1, regressor__max_depth=7, regressor__n_estimators=1000, regressor__reg_alpha=0.1, regressor__subsample=0.8; total time= 3.5min
[CV] END regressor__colsample_bytree=0.7, regressor__gamma=0.1, regressor__learning_rate=0.1, regressor__max_depth=7, regresso