In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import warnings
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# ----------------------- File Paths -----------------------
TRAIN_FILE_PATH = './dataset/train_data.csv'
TEST_FILE_PATH = './dataset/test_data.csv'
SUBMISSION_FILE_PATH = 'submission.csv'

# ==============================================================================
# 1. DATA LOADING AND INITIAL PREPARATION
# ==============================================================================

print("Loading data...")
df_train = pd.read_csv(TRAIN_FILE_PATH, dtype={'userId': 'str', 'sessionId': 'str'})
df_test = pd.read_csv(TEST_FILE_PATH, dtype={'userId': 'str', 'sessionId': 'str'})
test_session_ids = df_test['sessionId'] # Storing for potential use later, although id is index now

# --- Initial Feature Cleaning (Common for Train and Test) ---
one_value_cols = [col for col in df_train.columns if df_train[col].nunique(dropna=False) == 1]
df_train = df_train.drop(columns=one_value_cols)
df_test = df_test.drop(columns=[c for c in one_value_cols if c in df_test.columns], errors='ignore')

# --- Target Variable Transformation ---
df_train['purchaseValue'] = df_train['purchaseValue'].fillna(0).astype(float)
df_train['log_purchaseValue'] = np.log1p(df_train['purchaseValue'])

# ==============================================================================
# 2. FEATURE ENGINEERING (REVISED & ROBUST)
# ==============================================================================

def create_user_behavior_aggregates(df):
    """
    Creates user-level aggregates based on BEHAVIOR, not purchase value.
    This prevents target leakage.
    """
    print("Creating non-leaky user-level aggregates...")
    user_aggregates = df.groupby('userId').agg(
        user_session_count=('sessionId', 'nunique'),
        user_total_hits=('totalHits', 'sum'),
        user_avg_hits_per_session=('totalHits', 'mean'),
        user_total_pageviews=('pageViews', 'sum'),
        user_avg_pageviews_per_session=('pageViews', 'mean'),
        user_unique_days_visited=('date', 'nunique'),
        user_first_session_ts=('sessionStart', pd.Series.min),
        user_last_session_ts=('sessionStart', pd.Series.max),
    ).reset_index()

    # Calculate time-based features for the user
    user_aggregates['user_activity_span_seconds'] = user_aggregates['user_last_session_ts'] - user_aggregates['user_first_session_ts']
    user_aggregates = user_aggregates.drop(columns=['user_first_session_ts', 'user_last_session_ts'])
    
    return user_aggregates

def create_session_and_interaction_features(df, user_agg_map=None):
    """
    Applies session-level features and new interaction features.
    Merges pre-calculated user-level aggregates.
    """
    df_copy = df.copy()
    
    # Fill NA in key columns for safe string operations
    for col in ['browser', 'os', 'userChannel', 'deviceType', 'geoNetwork.continent']:
        df_copy[col] = df_copy[col].fillna('missing')

    # Session-level time features
    df_copy['date'] = pd.to_datetime(df_copy['date'], errors='coerce', format='%Y%m%d')
    df_copy['sessionMonth'] = df_copy['date'].dt.month
    df_copy['sessionDayOfWeek'] = df_copy['date'].dt.dayofweek
    df_copy['sessionHour'] = pd.to_datetime(df_copy['sessionStart'], unit='s').dt.hour
    
    # Session-level engagement features
    df_copy['hits_per_pageview'] = df_copy['totalHits'] / (df_copy['pageViews'].fillna(0) + 1e-6)
    
    # --- NEW INTERACTION FEATURES ---
    df_copy['browser_os_interaction'] = df_copy['browser'].astype(str) + '_' + df_copy['os'].astype(str)
    df_copy['channel_device_interaction'] = df_copy['userChannel'].astype(str) + '_' + df_copy['deviceType'].astype(str)
    df_copy['continent_browser_interaction'] = df_copy['geoNetwork.continent'].astype(str) + '_' + df_copy['browser'].astype(str)
    
    # Merge pre-calculated user-level features
    if user_agg_map is not None:
        df_copy = pd.merge(df_copy, user_agg_map, on='userId', how='left')
        
        # --- NEW: Features comparing current session to user's history ---
        df_copy['session_hits_vs_user_avg'] = df_copy['totalHits'] - df_copy['user_avg_hits_per_session']
        df_copy['session_pvs_vs_user_avg'] = df_copy['pageViews'] - df_copy['user_avg_pageviews_per_session']


    # Drop original/intermediate columns
    df_copy = df_copy.drop(columns=['date', 'sessionStart'], errors='ignore')
    return df_copy

# ==============================================================================
# 3. PIPELINE DEFINITION (PREPROCESSING ONLY)
# ==============================================================================

# Custom TargetEncoder remains the same - it's a great way to handle categoricals
class TargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, smoothing=10):
        self.columns, self.smoothing = columns, smoothing
        self.mappings_, self.global_mean_ = {}, 0
    def fit(self, X, y):
        self.global_mean_ = np.mean(y)
        for col in self.columns:
            df = pd.DataFrame({'feature': X[col], 'target': y})
            agg = df.groupby('feature')['target'].agg(['mean', 'count'])
            smooth_mean = (agg['count'] * agg['mean'] + self.smoothing * self.global_mean_) / (agg['count'] + self.smoothing)
            self.mappings_[col] = smooth_mean.to_dict()
        return self
    def transform(self, X):
        X_copy = X.copy()
        for col in self.columns:
            # Handle potential missing values during transform
            X_copy[col] = X_copy[col].fillna('missing').map(self.mappings_).fillna(self.global_mean_)
        return X_copy

# The preprocessing pipeline definition will be instantiated later, after we know the column names.

# ==============================================================================
# 4. VALIDATION ON A HOLD-OUT SET (CORRECTED PROCESS)
# ==============================================================================

print("\n--- Splitting data for local validation (BEFORE feature engineering) ---")
# Split the ORIGINAL dataframe to prevent any data leakage during feature creation
train_df, val_df = train_test_split(
    df_train, test_size=0.2, random_state=42
)

# --- Feature Engineering on the Split Data ---
# 1. Create user aggregates *only from the training part*
user_aggregates_val = create_user_behavior_aggregates(train_df)

# 2. Apply all features to the train and validation sets
X_train = create_session_and_interaction_features(train_df.drop(columns=['purchaseValue', 'log_purchaseValue']), user_aggregates_val)
X_val = create_session_and_interaction_features(val_df.drop(columns=['purchaseValue', 'log_purchaseValue']), user_aggregates_val)
y_train, y_val = train_df['log_purchaseValue'], val_df['log_purchaseValue']

# Define column types based on the engineered features
numerical_cols = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_cols = [col for col in X_train.select_dtypes(include=['object', 'category']).columns if col not in ['userId', 'sessionId']]

# Instantiate and fit the preprocessing pipeline
preprocessing_pipeline = Pipeline([
    ('col_transformer', ColumnTransformer([
        ('target_encoder', TargetEncoder(columns=categorical_cols), categorical_cols),
        ('numerical_scaler', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', RobustScaler())
        ]), numerical_cols)
    ], remainder='drop'))
])

print("Preprocessing data for validation...")
X_train_processed = preprocessing_pipeline.fit_transform(X_train, y_train)
X_val_processed = preprocessing_pipeline.transform(X_val)

print("Training validation model with Early Stopping...")
xgb_params_val = {
    'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'random_state': 42,
    'n_jobs': -1, 'n_estimators': 2000, 'max_depth': 8, 'learning_rate': 0.02, # Slightly more robust params
    'subsample': 0.8, 'colsample_bytree': 0.8, 'early_stopping_rounds': 50
}
validation_model = xgb.XGBRegressor(**xgb_params_val)
validation_model.fit(X_train_processed, y_train,
                     eval_set=[(X_val_processed, y_val)],
                     verbose=False)

print("Evaluating model on hold-out validation set...")
val_preds_log = validation_model.predict(X_val_processed)
val_preds_real = np.expm1(val_preds_log)
val_preds_real[val_preds_real < 0] = 0
y_val_real = np.expm1(y_val)
r2 = r2_score(y_val_real, val_preds_real)
mae = mean_absolute_error(y_val_real, val_preds_real)
rmse = np.sqrt(mean_squared_error(y_val_real, val_preds_real))

print(f"\n--- Validation Set Performance (Robust) ---")
print(f"R² Score: {r2:.4f}")
print(f"MAE:      {mae:,.2f}")
print(f"RMSE:     {rmse:,.2f}")

# ==============================================================================
# 5. TRAIN FINAL MODEL ON ALL DATA AND CREATE SUBMISSION 
# ==============================================================================

print("\n\n--- Training Final Model on 100% of the Data ---")
# 1. Create user aggregates from the ENTIRE training set for maximum information
user_aggregates_full = create_user_behavior_aggregates(df_train)

# 2. Apply all features to the full train and test sets
X_full_engineered = create_session_and_interaction_features(df_train.drop(columns=['purchaseValue', 'log_purchaseValue']), user_aggregates_full)
X_test_engineered = create_session_and_interaction_features(df_test, user_aggregates_full)
y = df_train['log_purchaseValue']

# 3. Re-fit the preprocessing pipeline on ALL training data
print("Preprocessing full training and test data...")
X_full_processed = preprocessing_pipeline.fit_transform(X_full_engineered, y)
X_test_processed = preprocessing_pipeline.transform(X_test_engineered)

# Use parameters WITHOUT early stopping for the final model
# Using validation_model.best_iteration ensures we train for the optimal number of rounds.
final_n_estimators = validation_model.best_iteration if validation_model.best_iteration > 0 else 500
xgb_params_final = {
    'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'random_state': 42,
    'n_jobs': -1, 'n_estimators': final_n_estimators, 
    'max_depth': 8, 'learning_rate': 0.02, 'subsample': 0.8, 'colsample_bytree': 0.8
}

print(f"Training final model for {final_n_estimators} rounds...")
final_model = xgb.XGBRegressor(**xgb_params_final)
final_model.fit(X_full_processed, y)
print("Final model trained.")

print("\n--- Generating predictions on the test set ---")
test_preds_log = final_model.predict(X_test_processed)

# Post-processing: Convert from log scale and ensure non-negativity
test_preds_real = np.expm1(test_preds_log)
test_preds_real[test_preds_real < 0] = 0

# Creating submission file in the specified format
submission_df = pd.DataFrame({
    'id': df_test.index,
    'purchaseValue': test_preds_real
})

submission_df.to_csv(SUBMISSION_FILE_PATH, index=False)
print(f"\nSubmission file created successfully at: '{SUBMISSION_FILE_PATH}'")
print("Top 5 rows of the submission file:")
print(submission_df.head())

Loading data...

--- Splitting data for local validation (BEFORE feature engineering) ---
Creating non-leaky user-level aggregates...
Preprocessing data for validation...
Training validation model with Early Stopping...
Evaluating model on hold-out validation set...

--- Validation Set Performance (Robust) ---
R² Score: 0.0198
MAE:      27,007,723.83
RMSE:     215,249,181.42


--- Training Final Model on 100% of the Data ---
Creating non-leaky user-level aggregates...
Preprocessing full training and test data...
Training final model for 235 rounds...
Final model trained.

--- Generating predictions on the test set ---

Submission file created successfully at: 'submission.csv'
Top 5 rows of the submission file:
   id  purchaseValue
0   0   6.442287e+06
1   1   8.692483e-01
2   2   3.447260e-02
3   3   0.000000e+00
4   4   1.154217e+00


In [2]:
print("Checking for user overlap...")
train_users = set(df_train['userId'])
test_users = set(df_test['userId'])
overlap_count = len(train_users.intersection(test_users))
print(f"Number of users in Train: {len(train_users)}")
print(f"Number of users in Test: {len(test_users)}")
print(f"Number of users in BOTH Train and Test: {overlap_count}")
print(f"Overlap Percentage: {100 * overlap_count / len(test_users):.2f}%")

Checking for user overlap...
Number of users in Train: 100499
Number of users in Test: 27657
Number of users in BOTH Train and Test: 5879
Overlap Percentage: 21.26%
