In [4]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold
import xgboost as xgb
from sklearn.metrics import roc_auc_score, mean_squared_error, r2_score
import optuna
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)

# ==============================================================================
# PART 1: ADVANCED FEATURE ENGINEERING
# ==============================================================================

class AdvancedFeatureEngineering(BaseEstimator, TransformerMixin):
    """
    Creates a rich set of features for the model.
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        
        # Date/Time Engineering
        X_copy['date'] = pd.to_datetime(X_copy['date'], format='%Y%m%d')
        X_copy['sessionYear'] = X_copy['date'].dt.year
        X_copy['sessionMonth'] = X_copy['date'].dt.month
        X_copy['sessionDayOfWeek'] = X_copy['date'].dt.dayofweek
        X_copy['sessionHour'] = pd.to_datetime(X_copy['sessionStart'], unit='s').dt.hour
        X_copy['is_weekend'] = (X_copy['sessionDayOfWeek'] >= 5).astype(int)
        
        # Interaction Features
        X_copy['month_day_interaction'] = X_copy['sessionMonth'].astype(str) + '_' + X_copy['sessionDayOfWeek'].astype(str)
        X_copy['browser_os_interaction'] = X_copy['browser'].astype(str) + '_' + X_copy['os'].astype(str)
        
        # Ratio Features (handle division by zero)
        X_copy['hits_per_pageview'] = X_copy['totalHits'] / (X_copy['pageViews'] + 1e-6)
        
        # Binning AdWords Page
        X_copy['ad_page_binned'] = X_copy['trafficSource.adwordsClickInfo.page'].apply(
            lambda p: 1 if p == 1.0 else (2 if pd.notna(p) else 0)
        )
        
        cols_to_drop = ['date', 'sessionStart', 'userId', 'sessionId', 'trafficSource.adwordsClickInfo.page']
        X_copy = X_copy.drop(columns=cols_to_drop, errors='ignore')
        
        return X_copy

# TargetEncoder class remains the same
class TargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns; self.mappings_ = {}; self.global_mean_ = 0
    def fit(self, X, y):
        X_fit, y_fit = X.copy(), y.copy()
        self.global_mean_ = np.mean(y_fit)
        for col in self.columns:
            X_fit[col] = X_fit[col].fillna('missing')
            self.mappings_[col] = y_fit.groupby(X_fit[col]).mean().to_dict()
        return self
    def transform(self, X):
        X_transform = X.copy()
        for col in self.columns:
            X_transform[col] = X_transform[col].fillna('missing')
            X_transform[col] = X_transform[col].map(self.mappings_[col]).fillna(self.global_mean_)
        return X_transform

# ==============================================================================
# PART 2: DATA PREPARATION
# ==============================================================================
TRAIN_FILE_PATH = './dataset/train_data.csv'
TEST_FILE_PATH = './dataset/test_data.csv'

print("Loading and preparing data...")
df = pd.read_csv(TRAIN_FILE_PATH, dtype={'fullVisitorId': 'str'})
one_value_cols = [col for col in df.columns if df[col].nunique(dropna=False) == 1]
df = df.drop(columns=one_value_cols)

df['purchaseValue'] = df['purchaseValue'].fillna(0) / 1e6
df['made_purchase'] = (df['purchaseValue'] > 0).astype(int)
df['log_purchaseValue'] = np.log1p(df['purchaseValue'])

X = df.drop(columns=['purchaseValue', 'made_purchase', 'log_purchaseValue'])
y = df[['made_purchase', 'log_purchaseValue']]

# --- Define Column Groups for Preprocessing ---
temp_engineered_df = AdvancedFeatureEngineering().fit_transform(X)
numerical_cols = ['sessionNumber', 'pageViews', 'totalHits', 'sessionYear', 'sessionMonth', 'sessionDayOfWeek', 'sessionHour', 'hits_per_pageview']
categorical_cols = [col for col in temp_engineered_df.columns if col not in numerical_cols]

# ==============================================================================
# PART 3: HYPERPARAMETER TUNING WITH OPTUNA
# ==============================================================================

# --- Tuning the Classifier ---
print("\n--- Tuning Classifier with Optuna ---")
X_engineered = AdvancedFeatureEngineering().fit_transform(X)
y_clf_target = y['made_purchase']

def objective_clf(trial):
    params = {
        'objective': 'binary:logistic', 'eval_metric': 'logloss', 'random_state': 42,
        'n_estimators': trial.suggest_int('n_estimators', 400, 1000),
        'max_depth': trial.suggest_int('max_depth', 4, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
    }
    
    cv = KFold(n_splits=4, shuffle=True, random_state=42)
    cv_scores = []
    
    for train_idx, val_idx in cv.split(X_engineered):
        X_train, X_val = X_engineered.iloc[train_idx], X_engineered.iloc[val_idx]
        y_train, y_val = y_clf_target.iloc[train_idx], y_clf_target.iloc[val_idx]

        preprocessor = ColumnTransformer([('num', Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), numerical_cols), ('cat', TargetEncoder(columns=categorical_cols), categorical_cols)], remainder='drop')
        
        preprocessor.fit(X_train, y_train)
        X_train_processed = preprocessor.transform(X_train)
        X_val_processed = preprocessor.transform(X_val)

        model = xgb.XGBClassifier(**params)
        model.fit(X_train_processed, y_train)
        preds = model.predict_proba(X_val_processed)[:, 1]
        cv_scores.append(roc_auc_score(y_val, preds))
        
    return np.mean(cv_scores)

study_clf = optuna.create_study(direction='maximize')
study_clf.optimize(objective_clf, n_trials=30) # Run 30 trials
best_clf_params = study_clf.best_params

# --- Tuning the Regressor ---
print("\n--- Tuning Regressor with Optuna ---")
X_buyers_engineered = X_engineered[y['made_purchase'] == 1]
y_reg_target = y.loc[y['made_purchase'] == 1, 'log_purchaseValue']

def objective_reg(trial):
    params = {
        'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'random_state': 42,
        'n_estimators': trial.suggest_int('n_estimators', 400, 1000),
        'max_depth': trial.suggest_int('max_depth', 4, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
    }

    cv = KFold(n_splits=4, shuffle=True, random_state=42)
    cv_scores = []

    for train_idx, val_idx in cv.split(X_buyers_engineered):
        X_train, X_val = X_buyers_engineered.iloc[train_idx], X_buyers_engineered.iloc[val_idx]
        y_train, y_val = y_reg_target.iloc[train_idx], y_reg_target.iloc[val_idx]

        preprocessor = ColumnTransformer([('num', Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), numerical_cols), ('cat', TargetEncoder(columns=categorical_cols), categorical_cols)], remainder='drop')

        preprocessor.fit(X_train, y_train)
        X_train_processed = preprocessor.transform(X_train)
        X_val_processed = preprocessor.transform(X_val)

        model = xgb.XGBRegressor(**params)
        model.fit(X_train_processed, y_train)
        preds = model.predict(X_val_processed)
        cv_scores.append(np.sqrt(mean_squared_error(y_val, preds)))

    return np.mean(cv_scores)

study_reg = optuna.create_study(direction='minimize')
study_reg.optimize(objective_reg, n_trials=30) # Run 30 trials
best_reg_params = study_reg.best_params

# ==============================================================================
# PART 4: FINAL MODEL TRAINING AND SUBMISSION
# ==============================================================================

print("\n--- Training Final Models with Best Parameters ---")
# Build and fit the final classifier pipeline on ALL training data
final_clf_pipeline = Pipeline([
    ('engineering', AdvancedFeatureEngineering()),
    ('preprocessing', ColumnTransformer([('num', Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), numerical_cols), ('cat', TargetEncoder(columns=categorical_cols), categorical_cols)], remainder='drop')),
    ('classifier', xgb.XGBClassifier(**best_clf_params, random_state=42))
])
final_clf_pipeline.fit(X, y['made_purchase'])

# Build and fit the final regressor pipeline on ALL buyer data
final_reg_pipeline = Pipeline([
    ('engineering', AdvancedFeatureEngineering()),
    ('preprocessing', ColumnTransformer([('num', Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), numerical_cols), ('cat', TargetEncoder(columns=categorical_cols), categorical_cols)], remainder='drop')),
    ('regressor', xgb.XGBRegressor(**best_reg_params, random_state=42))
])
final_reg_pipeline.fit(X[y['made_purchase'] == 1], y.loc[y['made_purchase'] == 1, 'log_purchaseValue'])

print("\n--- Generating Final Kaggle Submission ---")
try:
    kaggle_test_df = pd.read_csv(TEST_FILE_PATH, dtype={'fullVisitorId': 'str'})
    
    kaggle_prob_purchase = final_clf_pipeline.predict_proba(kaggle_test_df)[:, 1]
    kaggle_log_value_pred = final_reg_pipeline.predict(kaggle_test_df)
    
    kaggle_value_pred = np.expm1(kaggle_log_value_pred)
    kaggle_final_predictions_dollars = kaggle_prob_purchase * kaggle_value_pred
    kaggle_final_predictions_dollars[kaggle_final_predictions_dollars < 0] = 0
    
    kaggle_final_predictions_scaled = kaggle_final_predictions_dollars * 1e6
    
    # 👇 Changed 'ID' to lowercase 'id'
    submission_df = pd.DataFrame({'id': kaggle_test_df.index, 'purchaseValue': kaggle_final_predictions_scaled})
    
    # Save to /kaggle/working explicitly for Kaggle
    submission_df.to_csv('/kaggle/working/submission_final.csv', index=False)
    print("✅ Submission file 'submission_final.csv' created successfully.")
    
except FileNotFoundError:
    print(f"\nKaggle '{TEST_FILE_PATH}' not found.")
except Exception as e:
    print(f"\nAn error occurred during submission generation: {e}")

Loading and preparing data...

--- Tuning Classifier with Optuna ---


[I 2025-07-22 20:15:28,425] A new study created in memory with name: no-name-5cf6981e-7351-466a-ad9f-9af1ddf872e3
[I 2025-07-22 20:15:39,136] Trial 0 finished with value: 0.9895436234394619 and parameters: {'n_estimators': 497, 'max_depth': 5, 'learning_rate': 0.045867062470863355, 'subsample': 0.8646651669650627, 'colsample_bytree': 0.7662024815888056, 'gamma': 5.712279949855512e-08}. Best is trial 0 with value: 0.9895436234394619.
[I 2025-07-22 20:15:48,539] Trial 1 finished with value: 0.9891107746816205 and parameters: {'n_estimators': 442, 'max_depth': 5, 'learning_rate': 0.020557432052152273, 'subsample': 0.9862210971509013, 'colsample_bytree': 0.7687252505865382, 'gamma': 1.5415536274811217e-05}. Best is trial 0 with value: 0.9895436234394619.
[I 2025-07-22 20:16:00,308] Trial 2 finished with value: 0.9896494527815003 and parameters: {'n_estimators': 498, 'max_depth': 6, 'learning_rate': 0.028312955743009086, 'subsample': 0.7516512280047092, 'colsample_bytree': 0.891630788741750


--- Tuning Regressor with Optuna ---


[I 2025-07-22 20:23:46,363] Trial 0 finished with value: 0.9417748036024891 and parameters: {'n_estimators': 864, 'max_depth': 7, 'learning_rate': 0.010884057037511748, 'subsample': 0.7401576999868297, 'colsample_bytree': 0.9043608237512015}. Best is trial 0 with value: 0.9417748036024891.
[I 2025-07-22 20:23:52,194] Trial 1 finished with value: 0.9804922913372645 and parameters: {'n_estimators': 672, 'max_depth': 5, 'learning_rate': 0.012723902142819248, 'subsample': 0.9509688684519855, 'colsample_bytree': 0.9585241629560958}. Best is trial 0 with value: 0.9417748036024891.
[I 2025-07-22 20:24:13,484] Trial 2 finished with value: 0.8268046482477835 and parameters: {'n_estimators': 966, 'max_depth': 8, 'learning_rate': 0.05827460234722526, 'subsample': 0.7588544733288076, 'colsample_bytree': 0.9329530292556752}. Best is trial 2 with value: 0.8268046482477835.
[I 2025-07-22 20:24:21,686] Trial 3 finished with value: 0.9913209019796123 and parameters: {'n_estimators': 443, 'max_depth': 4


--- Training Final Models with Best Parameters ---

--- Generating Final Kaggle Submission ---

An error occurred during submission generation: Cannot save file into a non-existent directory: '/kaggle/working'
