In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
from tqdm import tqdm
import warnings
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# ----------------------- File Paths -----------------------
TRAIN_FILE_PATH = 'train.csv'
TEST_FILE_PATH = 'test.csv'
SUBMISSION_FILE_PATH = 'submission.csv'

# ==============================================================================
# 1. DATA LOADING AND INITIAL PREPARATION
# ==============================================================================

print("Loading data...")
try:
    df_train = pd.read_csv(TRAIN_FILE_PATH, dtype={'userId': 'str', 'sessionId': 'str'})
    df_test = pd.read_csv(TEST_FILE_PATH, dtype={'userId': 'str', 'sessionId': 'str'})
except FileNotFoundError:
    print("Dummy data created as files not found.")
    train_data = {'date': ['20230101']*1000, 'sessionStart': np.random.randint(1672531200, 1672617600, 1000),
                  'browser': np.random.choice(['Chrome', 'Safari', 'Firefox'], 1000), 'os': np.random.choice(['Windows', 'Macintosh', 'Linux'], 1000),
                  'totalHits': np.random.randint(1, 50, 1000), 'pageViews': np.random.randint(1, 30, 1000),
                  'userId': [f'user_{i}' for i in np.random.randint(0, 100, 1000)], 'sessionId': [f'session_{i}' for i in range(1000)],
                  'purchaseValue': [np.random.rand()*100 if np.random.rand() > 0.9 else 0 for _ in range(1000)]}
    df_train = pd.DataFrame(train_data)
    test_data = train_data.copy()
    del test_data['purchaseValue']
    test_data['sessionId'] = [f'session_test_{i}' for i in range(1000)]
    df_test = pd.DataFrame(test_data)

test_session_ids = df_test['sessionId']

# --- THIS IS THE CORRECTED LINE ---
# Exclude essential columns from being dropped, even if they have only one value
essential_cols = ['date', 'sessionStart']
one_value_cols = [col for col in df_train.columns if df_train[col].nunique(dropna=False) == 1 and col not in essential_cols]
# --- END OF CORRECTION ---

df_train = df_train.drop(columns=one_value_cols)
df_test = df_test.drop(columns=[c for c in one_value_cols if c in df_test.columns], errors='ignore')

df_train['purchaseValue'] = df_train['purchaseValue'].fillna(0).astype(float)
df_train['log_purchaseValue'] = np.log1p(df_train['purchaseValue'])

# ==============================================================================
# 2. FEATURE ENGINEERING
# ==============================================================================

def create_features(df, user_agg_map=None):
    df_copy = df.copy()
    df_copy['date'] = pd.to_datetime(df_copy['date'], errors='coerce', format='%Y%m%d')
    df_copy['sessionMonth'] = df_copy['date'].dt.month
    df_copy['sessionDayOfWeek'] = df_copy['date'].dt.dayofweek
    df_copy['sessionHour'] = pd.to_datetime(df_copy['sessionStart'], unit='s').dt.hour
    df_copy['browser_os_interaction'] = df_copy['browser'].astype(str) + '_' + df_copy['os'].astype(str)
    df_copy['hits_per_pageview'] = df_copy['totalHits'] / (df_copy['pageViews'].fillna(0) + 1e-6)
    if user_agg_map is not None:
        df_copy = pd.merge(df_copy, user_agg_map, on='userId', how='left')
    df_copy = df_copy.drop(columns=['date', 'sessionStart'], errors='ignore')
    return df_copy

print("Creating user-level aggregates from full training data...")
df_train['made_purchase'] = (df_train['purchaseValue'] > 0).astype(int)
user_aggregates = df_train.groupby('userId').agg(
    user_session_count=('sessionId', 'nunique'), user_total_hits=('totalHits', 'sum'),
    user_avg_hits=('totalHits', 'mean'), user_total_pageviews=('pageViews', 'sum'),
    user_avg_pageviews=('pageViews', 'mean'), user_purchase_count=('made_purchase', 'sum'),
    user_total_purchase_value=('purchaseValue', 'sum'),
).reset_index()
user_aggregates['user_conversion_rate'] = user_aggregates['user_purchase_count'] / user_aggregates['user_session_count']
user_aggregates['user_avg_purchase_value'] = user_aggregates['user_total_purchase_value'] / (user_aggregates['user_purchase_count'] + 1e-6)

print("Applying all features to train and test sets...")
X_full_engineered = create_features(df_train.drop(columns=['purchaseValue', 'log_purchaseValue', 'made_purchase']), user_aggregates)
X_test_engineered = create_features(df_test, user_aggregates)
y = df_train['log_purchaseValue']

# ==============================================================================
# 3. PIPELINE DEFINITION (No changes here)
# ==============================================================================
numerical_cols = X_full_engineered.select_dtypes(include=np.number).columns.tolist()
categorical_cols = [col for col in X_full_engineered.select_dtypes(include=['object', 'category']).columns if col not in ['userId', 'sessionId']]

class TargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, smoothing=10):
        self.columns, self.smoothing = columns, smoothing
        self.mappings_, self.global_mean_ = {}, 0
    def fit(self, X, y):
        self.global_mean_ = np.mean(y)
        for col in self.columns:
            df = pd.DataFrame({'feature': X[col], 'target': y})
            agg = df.groupby('feature')['target'].agg(['mean', 'count'])
            smooth_mean = (agg['count'] * agg['mean'] + self.smoothing * self.global_mean_) / (agg['count'] + self.smoothing)
            self.mappings_[col] = smooth_mean.to_dict()
        return self
    def transform(self, X):
        X_copy = X.copy()
        for col in self.columns:
            X_copy[col] = X_copy[col].fillna('missing').map(self.mappings_).fillna(self.global_mean_)
        return X_copy

preprocessing_pipeline = Pipeline([
    ('col_transformer', ColumnTransformer([
        ('target_encoder', TargetEncoder(columns=categorical_cols), categorical_cols),
        ('numerical_scaler', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', RobustScaler())
        ]), numerical_cols)
    ], remainder='drop'))
])

# ==============================================================================
# 4. HYPERPARAMETER TUNING WITH OPTUNA and TQDM (No changes here)
# ==============================================================================
print("\n--- Splitting data for hyperparameter tuning ---")
X_train, X_val, y_train, y_val = train_test_split(
    X_full_engineered, y, test_size=0.25, random_state=42
)

print("Preprocessing data for tuning...")
X_train_processed = preprocessing_pipeline.fit_transform(X_train, y_train)
X_val_processed = preprocessing_pipeline.transform(X_val)

def objective(trial):
    params = {
        'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'n_jobs': -1,
        'random_state': 42, 'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True),
    }
    model = xgb.XGBRegressor(n_estimators=2000, **params)
    model.fit(X_train_processed, y_train,
              eval_set=[(X_val_processed, y_val)],
              early_stopping_rounds=50, verbose=False)
    preds = model.predict(X_val_processed)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    return rmse

print("\n--- Starting Hyperparameter Tuning with Optuna ---")
study = optuna.create_study(direction='minimize')
N_TRIALS = 30
with tqdm(total=N_TRIALS, desc="Optimizing Hyperparameters") as pbar:
    def tqdm_callback(study, trial):
        pbar.update(1)
    study.optimize(objective, n_trials=N_TRIALS, callbacks=[tqdm_callback])

print(f"\n--- Optuna Study Complete ---")
print(f"Number of finished trials: {len(study.trials)}")
print(f"Best trial's RMSE: {study.best_value:.4f}")
print("Best trial's parameters: ")
for key, value in study.best_params.items():
    print(f"    {key}: {value}")

# ==============================================================================
# 5. TRAIN FINAL MODEL WITH BEST PARAMETERS (No changes here)
# ==============================================================================
print("\n\n--- Training Final Model on 100% of the Data with Best Parameters ---")
print("Preprocessing full training data...")
X_full_processed = preprocessing_pipeline.fit_transform(X_full_engineered, y)

best_params = study.best_params
final_xgb_params = {
    'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'random_state': 42,
    'n_jobs': -1, **best_params
}

print("Determining optimal number of estimators with early stopping...")
temp_model = xgb.XGBRegressor(n_estimators=2000, **final_xgb_params)
temp_model.fit(X_train_processed, y_train,
               eval_set=[(X_val_processed, y_val)],
               early_stopping_rounds=50, verbose=False)
optimal_n_estimators = temp_model.best_iteration

print(f"Optimal number of estimators found: {optimal_n_estimators}")
final_xgb_params['n_estimators'] = optimal_n_estimators

print(f"Training final model for {optimal_n_estimators} rounds...")
final_model = xgb.XGBRegressor(**final_xgb_params)
final_model.fit(X_full_processed, y)
print("Final model trained.")

print("\n--- Generating predictions on the test set ---")
X_test_processed = preprocessing_pipeline.transform(X_test_engineered)
test_preds_log = final_model.predict(X_test_processed)

test_preds_real = np.expm1(test_preds_log)
test_preds_real[test_preds_real < 0] = 0

submission_df = pd.DataFrame({
    'id': df_test.index,
    'purchaseValue': test_preds_real
})

submission_df.to_csv(SUBMISSION_FILE_PATH, index=False)
print(f"\nSubmission file created successfully at: '{SUBMISSION_FILE_PATH}'")
print("Top 5 rows of the submission file:")
print(submission_df.head())

[I 2025-07-24 14:33:27,778] A new study created in memory with name: no-name-ee7a4fca-f155-4b52-b55b-9e32ad939688


Loading data...
Dummy data created as files not found.
Creating user-level aggregates from full training data...
Applying all features to train and test sets...

--- Splitting data for hyperparameter tuning ---
Preprocessing data for tuning...

--- Starting Hyperparameter Tuning with Optuna ---


Optimizing Hyperparameters:   0%|          | 0/30 [00:00<?, ?it/s][W 2025-07-24 14:33:27,793] Trial 0 failed with parameters: {'booster': 'dart', 'learning_rate': 0.026646935925302404, 'max_depth': 6, 'subsample': 0.7962727479528825, 'colsample_bytree': 0.5153529832016173, 'gamma': 2.7270689315288854e-06, 'lambda': 0.00017150648028317985, 'alpha': 0.11222788594653742} because of the following error: TypeError("fit() got an unexpected keyword argument 'early_stopping_rounds'").
Traceback (most recent call last):
  File "/Users/shrinarayan/Desktop/Prediction-PurchaseValues/venv/lib/python3.9/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/dm/98lvl5s96xn5qcs0j_dnn9l80000gn/T/ipykernel_54457/4253649522.py", line 150, in objective
    model.fit(X_train_processed, y_train,
  File "/Users/shrinarayan/Desktop/Prediction-PurchaseValues/venv/lib/python3.9/site-packages/xgboost/core.py", line 726, in inner_f
    return func(

TypeError: fit() got an unexpected keyword argument 'early_stopping_rounds'