In [9]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error
import warnings

# Silence every FutureWarning and UserWarning for the rest of the session so the
# cell output stays readable. NOTE(review): this also hides warnings raised by
# pandas / scikit-learn / xgboost that may point at real problems.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# ==============================================================================
# PART 1: FEATURE ENGINEERING & PREPROCESSING CLASSES (UNCHANGED)
# ==============================================================================

class AdvancedFeatureEngineering(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds calendar, interaction and ratio features.

    Expects a DataFrame containing the columns ``date``, ``sessionStart``,
    ``browser``, ``os``, ``totalHits``, ``pageViews`` and
    ``trafficSource.adwordsClickInfo.page``. The input frame is never
    modified; a transformed copy is returned.
    """

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (present for the sklearn API)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with derived features added and raw columns dropped.

        Added columns: ``sessionYear``, ``sessionMonth``, ``sessionDayOfWeek``,
        ``sessionHour``, ``is_weekend``, ``month_day_interaction``,
        ``browser_os_interaction``, ``hits_per_pageview``, ``ad_page_binned``.
        Dropped columns (when present): ``date``, ``sessionStart``, ``userId``,
        ``sessionId``, ``trafficSource.adwordsClickInfo.page``.
        """
        def bin_ad_page(page):
            # 1 -> ad shown on page 1, 2 -> any other known page, 0 -> missing.
            if page == 1.0:
                return 1
            if pd.notna(page):
                return 2
            return 0

        frame = X.copy()

        # Calendar parts come from the ``date`` column, parsed as YYYYMMDD.
        session_date = pd.to_datetime(frame['date'], format='%Y%m%d')
        frame['date'] = session_date
        frame['sessionYear'] = session_date.dt.year
        frame['sessionMonth'] = session_date.dt.month
        frame['sessionDayOfWeek'] = session_date.dt.dayofweek

        # ``sessionStart`` is interpreted as seconds since the Unix epoch.
        session_start = pd.to_datetime(frame['sessionStart'], unit='s')
        frame['sessionHour'] = session_start.dt.hour

        # dayofweek is 0 (Monday) .. 6 (Sunday), so 5 and 6 are the weekend.
        frame['is_weekend'] = frame['sessionDayOfWeek'].ge(5).astype(int)

        # String-concatenated interaction features.
        month_text = frame['sessionMonth'].astype(str)
        weekday_text = frame['sessionDayOfWeek'].astype(str)
        frame['month_day_interaction'] = month_text + '_' + weekday_text
        frame['browser_os_interaction'] = (
            frame['browser'].astype(str) + '_' + frame['os'].astype(str)
        )

        # Missing page views count as zero; the epsilon avoids division by zero.
        page_views = frame['pageViews'].fillna(0).add(1e-6)
        frame['hits_per_pageview'] = frame['totalHits'].div(page_views)

        frame['ad_page_binned'] = (
            frame['trafficSource.adwordsClickInfo.page'].apply(bin_ad_page)
        )

        # Raw inputs and identifiers are removed once the features exist.
        consumed = [
            'date',
            'sessionStart',
            'userId',
            'sessionId',
            'trafficSource.adwordsClickInfo.page',
        ]
        return frame.drop(columns=consumed, errors='ignore')

class TargetEncoder(BaseEstimator, TransformerMixin):
    """Replace each categorical value with the mean target seen for that value.

    Missing values are treated as their own category (``'missing'``).
    Categories that were not seen during ``fit`` are encoded with the global
    target mean.

    Parameters
    ----------
    columns : list of str or None, default=None
        Columns to encode. When ``None``, every column of the frame passed to
        ``fit`` is encoded.

    Attributes
    ----------
    mappings_ : dict
        ``{column: {category: mean target}}`` learned by ``fit``.
    global_mean_ : float
        Mean of the target passed to ``fit``; fallback for unseen categories.
    """

    def __init__(self, columns=None):
        # Only constructor arguments are stored here; learned state (trailing
        # underscore) is created in fit(), per the scikit-learn convention.
        self.columns = columns

    def fit(self, X, y):
        """Learn the per-category target means.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing the columns to encode.
        y : pandas.Series or array-like
            Target values. A Series is aligned with ``X`` by index; any other
            array-like is matched to the rows of ``X`` by position.

        Returns
        -------
        self
        """
        if isinstance(y, pd.Series):
            target = y
        else:
            target = pd.Series(np.asarray(y).ravel(), index=X.index)

        columns = list(X.columns) if self.columns is None else list(self.columns)

        self.global_mean_ = target.mean()
        # Build a fresh dict so a refit never keeps mappings from an earlier fit.
        mappings = {}
        for col in columns:
            categories = X[col].fillna('missing')
            mappings[col] = target.groupby(categories).mean().to_dict()
        self.mappings_ = mappings
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns replaced by target means.

        Raises
        ------
        AttributeError
            If called before ``fit``.
        """
        X_transform = X.copy()
        for col, mapping in self.mappings_.items():
            categories = X_transform[col].fillna('missing')
            # Categories absent from the mapping become NaN and then fall back
            # to the global mean.
            X_transform[col] = categories.map(mapping).fillna(self.global_mean_)
        return X_transform

# ==============================================================================
# PART 2: DATA PREPARATION
# ==============================================================================
# Input CSV locations and the path of the submission file written at the end.
TRAIN_FILE_PATH = './dataset/train_data.csv'
TEST_FILE_PATH = './dataset/test_data.csv'
SUBMISSION_FILE_PATH = 'submission.csv'

print("Loading and preparing data...")
# Identifier columns are read as strings so they are not parsed as numbers.
df = pd.read_csv(TRAIN_FILE_PATH, dtype={'userId': 'str', 'sessionId': 'str'})

# Columns with a single distinct value (NaN counts as a value) carry no signal.
distinct_counts = df.nunique(dropna=False)
one_value_cols = distinct_counts.index[distinct_counts == 1].tolist()
df = df.drop(columns=one_value_cols)

# Missing purchases count as 0; values are divided by 1e6 before the log
# transform (presumably the raw values are in micro-units -- confirm).
scaled_purchase = df['purchaseValue'].fillna(0) / 1e6
df['purchaseValue'] = scaled_purchase
df['log_purchaseValue'] = np.log1p(scaled_purchase)

# Both target representations travel together through the split.
target_columns = ['log_purchaseValue', 'purchaseValue']
y = df[target_columns]
X = df.drop(columns=target_columns)

# 80/20 hold-out split with a fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# ==============================================================================
# PART 3: PREPROCESSING & BULLETPROOF VALIDATION
# ==============================================================================
print("\n--- Preprocessing Data and Validating Model ---")

# The feature engineering is run once on the full feature frame only to learn
# which columns it produces; nothing fitted here is reused.
temp_engineered_df = AdvancedFeatureEngineering().fit_transform(X)
numerical_cols = [
    'sessionNumber',
    'pageViews',
    'totalHits',
    'sessionYear',
    'sessionMonth',
    'sessionDayOfWeek',
    'sessionHour',
    'hits_per_pageview',
    'ad_page_binned',
    'is_weekend',
]
# Every engineered column that is not listed as numerical is target-encoded.
numeric_lookup = set(numerical_cols)
categorical_cols = [col for col in temp_engineered_df.columns if col not in numeric_lookup]

# Numerical branch: median imputation followed by standardisation.
numeric_branch = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
column_encoder = ColumnTransformer([
    ('num', numeric_branch, numerical_cols),
    ('cat', TargetEncoder(columns=categorical_cols), categorical_cols),
])
preprocessor = Pipeline(steps=[
    ('engineering', AdvancedFeatureEngineering()),
    ('col_transformer', column_encoder),
])

print("Fitting preprocessor and transforming data...")
# Fit on the training split only; the validation split is merely transformed.
X_train_processed = preprocessor.fit_transform(X_train, y_train['log_purchaseValue'])
X_val_processed = preprocessor.transform(X_val)

print("Training model and recording validation performance...")
xgb_settings = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'random_state': 42,
    'n_jobs': -1,
    'n_estimators': 2500,
    'learning_rate': 0.01,
    'max_depth': 10,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
}
model = xgb.XGBRegressor(**xgb_settings)

# No early stopping is requested: all rounds are trained while the validation
# RMSE is recorded, and the best round is picked from that history afterwards.
model.fit(
    X_train_processed,
    y_train['log_purchaseValue'],
    eval_set=[(X_val_processed, y_val['log_purchaseValue'])],
    verbose=False,  # keep the training log out of the cell output
)

# evals_result() is a nested dict: {'validation_0': {'rmse': [round_1, round_2, ...]}}
results = model.evals_result()
validation_rmse = results['validation_0']['rmse']
best_index = np.argmin(validation_rmse)
best_rmse = validation_rmse[best_index]
best_iteration = best_index + 1  # history is 0-indexed, round counts are 1-based
print(f"Validation complete. Best iteration found at: {best_iteration} with RMSE: {best_rmse:.4f}")


# ==============================================================================
# PART 4: FINAL MODEL TRAINING AND SUBMISSION
# ==============================================================================
print("\n--- Training Final Model on ALL Data ---")

# Same hyper-parameters as the validated model, but with the number of boosting
# rounds set to the best round found on the validation split.
final_model_params = {**model.get_params(), 'n_estimators': best_iteration}
final_regressor = xgb.XGBRegressor(**final_model_params)

# The pipeline reuses the `preprocessor` object built for validation.
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', final_regressor),
])

print(f"Training final pipeline for {best_iteration} rounds...")
final_pipeline.fit(X, y['log_purchaseValue'])

print("\n--- Generating Final Kaggle Submission ---")
try:
    kaggle_test_df = pd.read_csv(TEST_FILE_PATH, dtype={'userId': 'str', 'sessionId': 'str'})
    # Drop the constant columns removed from the training data; names that do
    # not exist in the test file are skipped.
    kaggle_test_df = kaggle_test_df.drop(columns=one_value_cols, errors='ignore')

    final_log_pred = final_pipeline.predict(kaggle_test_df)

    # Undo the log1p transform, floor negative predictions at zero, then undo
    # the 1e6 scaling that was applied to the training target.
    final_value_pred = np.expm1(final_log_pred)
    negative_mask = final_value_pred < 0
    final_value_pred[negative_mask] = 0
    final_predictions_scaled = final_value_pred * 1e6

    # NOTE(review): 'id' is the row index of the test file -- confirm this is
    # the identifier the submission format expects.
    submission_df = pd.DataFrame({
        'id': kaggle_test_df.index,
        'purchaseValue': final_predictions_scaled,
    })
    submission_df.to_csv(SUBMISSION_FILE_PATH, index=False)

    print(f"✅ Submission file '{SUBMISSION_FILE_PATH}' created successfully.")
    print("Top 5 rows of submission file:")
    print(submission_df.head())

except FileNotFoundError:
    print(f"\nKaggle '{TEST_FILE_PATH}' not found.")
except Exception as e:
    # Best-effort: report the failure instead of aborting the cell.
    print(f"\nAn error occurred during submission generation: {e}")

Loading and preparing data...

--- Preprocessing Data and Validating Model ---
Fitting preprocessor and transforming data...
Training model and recording validation performance...
Validation complete. Best iteration found at: 2499 with RMSE: 0.7481

--- Training Final Model on ALL Data ---
Training final pipeline for 2499 rounds...

--- Generating Final Kaggle Submission ---
✅ Submission file 'submission.csv' created successfully.
Top 5 rows of submission file:
   id  purchaseValue
0   0   2.445515e+07
1   1   1.609249e+05
2   2   1.413633e+03
3   3   5.831444e+04
4   4   2.876357e+04
