In [3]:
# ==============================================================================
# VERSION 4.1: Fix for older XGBoost versions by removing early stopping during tuning
# ==============================================================================
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
import xgboost as xgb
from sklearn.metrics import roc_auc_score, mean_squared_error
import optuna
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# --- Define File Paths ---
TRAIN_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/train_data.csv'
TEST_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/test_data.csv'


# ==============================================================================
# PART 1: ADVANCED FEATURE ENGINEERING TRANSFORMER
# ==============================================================================

class AdvancedFeatureEngineering(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that derives session-level features from raw rows.

    Adds calendar/clock fields, string interaction features, a
    hits-per-pageview ratio and a binned AdWords page indicator, then removes
    the raw columns those were derived from.
    NOTE: userId is deliberately kept; the GroupKFold splits need it.
    """

    def fit(self, X, y=None):
        """Learn nothing from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of X with engineered columns appended and raw ones dropped."""
        frame = X.copy()

        # --- Calendar / clock features ---
        # 'date' is parsed as YYYYMMDD; 'sessionStart' as a Unix timestamp in seconds.
        session_date = pd.to_datetime(frame['date'], format='%Y%m%d')
        frame['sessionYear'] = session_date.dt.year
        frame['sessionMonth'] = session_date.dt.month
        frame['sessionDayOfWeek'] = session_date.dt.dayofweek
        session_start = pd.to_datetime(frame['sessionStart'], unit='s')
        frame['sessionHour'] = session_start.dt.hour
        # dayofweek is 0=Monday ... 6=Sunday, so 5 and 6 are the weekend.
        frame['is_weekend'] = frame['sessionDayOfWeek'].ge(5).astype(int)

        # --- Pairwise interaction features: "<left>_<right>" as strings ---
        interaction_specs = (
            ('month_day_interaction', 'sessionMonth', 'sessionDayOfWeek'),
            ('browser_os_interaction', 'browser', 'os'),
            ('geo_channel_interaction', 'geoNetwork.continent', 'userChannel'),
            ('device_channel_interaction', 'deviceType', 'userChannel'),
        )
        for new_col, left_col, right_col in interaction_specs:
            frame[new_col] = frame[left_col].astype(str) + '_' + frame[right_col].astype(str)

        # --- Ratio feature; the small epsilon avoids division by zero ---
        frame['hits_per_pageview'] = frame['totalHits'].div(frame['pageViews'] + 1e-6)

        # --- AdWords page bin: 1 = first page, 2 = any other page, 0 = missing ---
        def _bin_ad_page(page):
            if page == 1.0:
                return 1
            return 2 if pd.notna(page) else 0

        frame['ad_page_binned'] = frame['trafficSource.adwordsClickInfo.page'].apply(_bin_ad_page)

        # Raw columns that are now redundant, plus the session identifier.
        obsolete_cols = ['date', 'sessionStart', 'sessionId', 'trafficSource.adwordsClickInfo.page']
        return frame.drop(columns=obsolete_cols, errors='ignore')

class TargetEncoder(BaseEstimator, TransformerMixin):
    """
    Smoothed mean-target encoder for categorical columns.

    Each category is replaced by a blend of its own target mean and the global
    target mean::

        (n * category_mean + smoothing * global_mean) / (n + smoothing)

    where ``n`` is the number of fitted rows in that category. Missing values
    are treated as their own ``'missing'`` category, and categories that were
    not seen during ``fit`` are encoded as the global mean.

    Parameters
    ----------
    columns : list of str or None, default None
        Columns to encode. ``None`` encodes every column of the frame passed
        to ``fit``.
    smoothing : float, default 10
        Weight given to the global mean; larger values pull rare categories
        harder towards it.

    Attributes
    ----------
    mappings_ : dict
        ``{column: {category: encoded value}}``, rebuilt on every ``fit``.
    global_mean_ : float
        Mean of the target seen in the last ``fit``.
    """

    def __init__(self, columns=None, smoothing=10):
        self.columns = columns
        self.smoothing = smoothing
        # Initialised here only so the attributes exist before fitting, as in
        # the original implementation; both are recomputed by fit().
        self.mappings_ = {}
        self.global_mean_ = 0

    def fit(self, X, y):
        """
        Learn the smoothed per-category target means.

        X must be a DataFrame. y may be a Series (aligned to X by index, as
        before) or any 1-D array-like (aligned to X by position).
        """
        if isinstance(y, pd.Series):
            target = y
        else:
            # Array-like targets carry no index; align them to X row by row.
            target = pd.Series(np.asarray(y), index=X.index)

        columns = list(X.columns) if self.columns is None else list(self.columns)

        self.global_mean_ = np.mean(target)
        # Start from a clean slate so a refit never keeps stale columns.
        self.mappings_ = {}

        for col in columns:
            categories = X[col].fillna('missing')
            stats = target.groupby(categories).agg(['mean', 'count'])
            counts = stats['count']
            means = stats['mean']

            # Shrink each category mean towards the global mean.
            smooth_mean = (counts * means + self.smoothing * self.global_mean_) / (counts + self.smoothing)
            self.mappings_[col] = smooth_mean.to_dict()

        return self

    def transform(self, X):
        """Return a copy of X with every encoded column replaced by its learned value."""
        columns = list(self.mappings_) if self.columns is None else list(self.columns)

        encoded = X.copy()
        for col in columns:
            categories = encoded[col].fillna('missing')
            # Categories absent from the mapping become NaN, then the global mean.
            encoded[col] = categories.map(self.mappings_[col]).fillna(self.global_mean_)
        return encoded

# ==============================================================================
# PART 2: DATA PREPARATION (NOW WITH USER-LEVEL FEATURES)
# ==============================================================================

def create_user_level_features(df_train, df_test):
    """
    Build per-user aggregate features from the training data and merge them
    onto both the training and the test frame.

    The training frame additionally gains two target columns:
    ``purchaseValue`` (missing values set to 0, then divided by 1e6) and
    ``made_purchase`` (1 when the rescaled value is positive, else 0).

    Parameters
    ----------
    df_train : DataFrame
        Must contain userId, sessionId, totalHits, pageViews and purchaseValue.
        It is not modified; a copy is used internally.
    df_test : DataFrame
        Must contain userId. It is not modified.

    Returns
    -------
    (DataFrame, DataFrame)
        New train and test frames with the user-level columns attached via a
        left merge on userId. Users that do not occur in the training data
        get NaN in those columns.
    """
    print("Creating user-level features...")

    # Work on a copy: rescaling the caller's frame in place would divide
    # purchaseValue by 1e6 again if the function were called a second time.
    train = df_train.copy()
    train['purchaseValue'] = train['purchaseValue'].fillna(0) / 1e6
    train['made_purchase'] = (train['purchaseValue'] > 0).astype(int)

    # NOTE(review): the purchase-based aggregates below are computed from the
    # target over the very rows they are merged back onto, so every training
    # row's features include its own label (target leakage). Validation scores
    # built on these columns will be optimistic; consider computing them
    # out-of-fold or from strictly earlier sessions only.
    user_aggregates = train.groupby('userId').agg(
        user_session_count=('sessionId', 'nunique'),
        user_total_hits=('totalHits', 'sum'),
        user_avg_hits=('totalHits', 'mean'),
        user_total_pageviews=('pageViews', 'sum'),
        user_avg_pageviews=('pageViews', 'mean'),
        user_purchase_count=('made_purchase', 'sum'),
        user_total_purchase_value=('purchaseValue', 'sum'),
    ).reset_index()

    # Conversion rate per session, and mean value per purchase (the epsilon
    # keeps users without any purchase from dividing by zero).
    user_aggregates['user_conversion_rate'] = (
        user_aggregates['user_purchase_count'] / user_aggregates['user_session_count']
    )
    user_aggregates['user_avg_purchase_value'] = (
        user_aggregates['user_total_purchase_value'] / (user_aggregates['user_purchase_count'] + 1e-6)
    )

    # Left merges keep every original row, in order.
    train = pd.merge(train, user_aggregates, on='userId', how='left')
    test = pd.merge(df_test, user_aggregates, on='userId', how='left')

    print("User-level features created and merged.")
    return train, test


print("Loading and preparing data...")
df_train = pd.read_csv(TRAIN_FILE_PATH, dtype={'fullVisitorId': 'str'})
df_test = pd.read_csv(TEST_FILE_PATH, dtype={'fullVisitorId': 'str'})

# A column with exactly one distinct value in the training data (NaN counts as
# a value) carries no signal: remove it from train and, where present, from test.
one_value_cols = [
    name
    for name, distinct in df_train.nunique(dropna=False).items()
    if distinct == 1
]
df_train = df_train.drop(columns=one_value_cols)
df_test = df_test.drop(columns=one_value_cols, errors='ignore')

# Attach the per-user aggregates (this also adds the rescaled purchaseValue
# and the made_purchase flag to the training frame).
df_train, df_test = create_user_level_features(df_train, df_test)

# Targets: the binary purchase flag and log1p of the purchase value.
df_train['log_purchaseValue'] = np.log1p(df_train['purchaseValue'])
target_cols = ['made_purchase', 'log_purchaseValue']
X = df_train.drop(columns=['purchaseValue'] + target_cols)
y = df_train[target_cols]

# --- Column groups for preprocessing, derived from the engineered frame ---
temp_engineered_df = AdvancedFeatureEngineering().fit_transform(X)
session_level_numerical = ['sessionNumber', 'pageViews', 'totalHits', 'hits_per_pageview']
user_level_numerical = [
    'user_session_count',
    'user_total_hits',
    'user_avg_hits',
    'user_total_pageviews',
    'user_avg_pageviews',
    'user_purchase_count',
    'user_total_purchase_value',
    'user_conversion_rate',
    'user_avg_purchase_value',
]
numerical_cols = session_level_numerical + user_level_numerical

# Everything that is neither numerical nor the userId grouping key is treated
# as categorical; that includes the derived time features.
non_categorical = set(numerical_cols) | {'userId'}
categorical_cols = [
    col for col in temp_engineered_df.columns if col not in non_categorical
]

print(f"\nIdentified {len(numerical_cols)} numerical features.")
print(f"Identified {len(categorical_cols)} categorical features.")


# ==============================================================================
# PART 3: HYPERPARAMETER TUNING WITH OPTUNA (USING GROUPKFOLD)
# ==============================================================================

def objective_clf(trial):
    """
    Optuna objective for the purchase classifier.

    Samples XGBoost hyperparameters from ``trial``, scores them with 4-fold
    GroupKFold (grouped by userId) on the module-level ``X_engineered`` / ``y``,
    and returns the mean validation ROC AUC. Preprocessing is refitted inside
    every fold on that fold's training rows only.
    """
    fixed_params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'random_state': 42,
        'n_jobs': -1,
    }
    sampled_params = {
        'n_estimators': trial.suggest_int('n_estimators', 400, 1200),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
    }
    params = {**fixed_params, **sampled_params}

    labels = y['made_purchase']
    user_ids = X_engineered['userId']
    splitter = GroupKFold(n_splits=4)

    fold_aucs = []
    for fit_rows, holdout_rows in splitter.split(X_engineered, labels, user_ids):
        X_fit = X_engineered.iloc[fit_rows]
        X_holdout = X_engineered.iloc[holdout_rows]
        y_fit = labels.iloc[fit_rows]
        y_holdout = labels.iloc[holdout_rows]

        # Median-impute and scale the numeric columns, target-encode the rest;
        # any column in neither list (userId) is dropped.
        numeric_steps = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ])
        fold_preprocessor = ColumnTransformer(
            [
                ('num', numeric_steps, numerical_cols),
                ('cat', TargetEncoder(columns=categorical_cols), categorical_cols),
            ],
            remainder='drop',
        )
        fold_preprocessor.fit(X_fit, y_fit)
        X_fit_ready = fold_preprocessor.transform(X_fit)
        X_holdout_ready = fold_preprocessor.transform(X_holdout)

        # Trained for the full n_estimators; no early stopping is used.
        classifier = xgb.XGBClassifier(**params)
        classifier.fit(X_fit_ready, y_fit)
        purchase_proba = classifier.predict_proba(X_holdout_ready)[:, 1]
        fold_aucs.append(roc_auc_score(y_holdout, purchase_proba))

    return np.mean(fold_aucs)

def objective_reg(trial):
    """
    Optuna objective for the purchase-value regressor.

    Uses only the rows with ``made_purchase == 1`` from the module-level
    ``X_engineered`` / ``y``. Samples XGBoost hyperparameters from ``trial``,
    scores them with 4-fold GroupKFold (grouped by userId) and returns the
    mean validation RMSE of the log1p purchase value.
    """
    fixed_params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'random_state': 42,
        'n_jobs': -1,
    }
    sampled_params = {
        'n_estimators': trial.suggest_int('n_estimators', 400, 1200),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
    }
    params = {**fixed_params, **sampled_params}

    # Restrict both features and target to the sessions that ended in a purchase.
    is_buyer = y['made_purchase'] == 1
    buyer_features = X_engineered[is_buyer]
    buyer_log_values = y.loc[is_buyer, 'log_purchaseValue']

    user_ids = buyer_features['userId']
    splitter = GroupKFold(n_splits=4)

    fold_rmses = []
    for fit_rows, holdout_rows in splitter.split(buyer_features, buyer_log_values, user_ids):
        X_fit = buyer_features.iloc[fit_rows]
        X_holdout = buyer_features.iloc[holdout_rows]
        y_fit = buyer_log_values.iloc[fit_rows]
        y_holdout = buyer_log_values.iloc[holdout_rows]

        # Median-impute and scale the numeric columns, target-encode the rest;
        # any column in neither list (userId) is dropped.
        numeric_steps = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ])
        fold_preprocessor = ColumnTransformer(
            [
                ('num', numeric_steps, numerical_cols),
                ('cat', TargetEncoder(columns=categorical_cols), categorical_cols),
            ],
            remainder='drop',
        )
        fold_preprocessor.fit(X_fit, y_fit)
        X_fit_ready = fold_preprocessor.transform(X_fit)
        X_holdout_ready = fold_preprocessor.transform(X_holdout)

        # Trained for the full n_estimators; no early stopping is used.
        regressor = xgb.XGBRegressor(**params)
        regressor.fit(X_fit_ready, y_fit)
        predicted_log_values = regressor.predict(X_holdout_ready)
        fold_rmses.append(np.sqrt(mean_squared_error(y_holdout, predicted_log_values)))

    return np.mean(fold_rmses)


# --- Run Optuna Studies ---
# Each study gets an explicitly seeded TPE sampler (TPE is Optuna's default
# algorithm). Without a seed the suggested hyperparameters differ on every run,
# even though every model is trained with random_state=42.
N_TRIALS = 20
print(f"\n--- Tuning Classifier with Optuna ({N_TRIALS} trials) ---")
# Engineered once up front; both objectives read this frame as a module global.
X_engineered = AdvancedFeatureEngineering().fit_transform(X)
study_clf = optuna.create_study(
    direction='maximize',  # objective_clf returns mean ROC AUC
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_clf.optimize(objective_clf, n_trials=N_TRIALS)
best_clf_params = study_clf.best_params

print(f"\n--- Tuning Regressor with Optuna ({N_TRIALS} trials) ---")
study_reg = optuna.create_study(
    direction='minimize',  # objective_reg returns mean RMSE
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_reg.optimize(objective_reg, n_trials=N_TRIALS)
best_reg_params = study_reg.best_params


# ==============================================================================
# PART 4: FINAL MODEL TRAINING AND SUBMISSION
# ==============================================================================

print("\n--- Training Final Models with Best Parameters ---")


def _build_final_pipeline(step_name, estimator):
    """Return an unfitted engineering -> preprocessing -> estimator pipeline.

    Fresh transformer instances are created on every call so that the
    classifier and regressor pipelines never share fitted state.
    """
    numeric_steps = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    preprocessing = ColumnTransformer(
        [
            ('num', numeric_steps, numerical_cols),
            ('cat', TargetEncoder(columns=categorical_cols), categorical_cols),
        ],
        remainder='drop',  # drops userId, which is not a model feature
    )
    return Pipeline([
        ('engineering', AdvancedFeatureEngineering()),
        ('preprocessing', preprocessing),
        (step_name, estimator),
    ])


# Stage 1: probability that a session ends in a purchase, fitted on all rows.
final_clf_pipeline = _build_final_pipeline(
    'classifier',
    xgb.XGBClassifier(**best_clf_params, random_state=42, n_jobs=-1),
)
final_clf_pipeline.fit(X, y['made_purchase'])
print("Final Classifier trained.")

# Stage 2: log1p purchase value, fitted on purchasing sessions only.
X_buyers = X[y['made_purchase'] == 1]
y_buyers_log_value = y.loc[y['made_purchase'] == 1, 'log_purchaseValue']

final_reg_pipeline = _build_final_pipeline(
    'regressor',
    xgb.XGBRegressor(**best_reg_params, random_state=42, n_jobs=-1),
)
final_reg_pipeline.fit(X_buyers, y_buyers_log_value)
print("Final Regressor trained.")


print("\n--- Generating Final Kaggle Submission ---")
# Best-effort: a failure here is reported but does not abort the run, so the
# fitted pipelines stay available. The test CSV itself was already read during
# data loading, so no file-not-found handling for it is needed here.
try:
    # Give the test frame exactly the training columns, in training order;
    # reindex adds any column the test frame lacks, filled with NaN.
    train_cols = X.columns
    df_test = df_test.reindex(columns=train_cols)

    # Expected value = P(purchase) * predicted value given a purchase.
    kaggle_prob_purchase = final_clf_pipeline.predict_proba(df_test)[:, 1]
    kaggle_log_value_pred = final_reg_pipeline.predict(df_test)
    kaggle_value_pred = np.expm1(kaggle_log_value_pred)  # inverse of log1p
    kaggle_final_predictions_dollars = kaggle_prob_purchase * kaggle_value_pred
    # expm1 of a negative log prediction is negative; clamp those to zero.
    kaggle_final_predictions_dollars[kaggle_final_predictions_dollars < 0] = 0
    # Undo the division by 1e6 that was applied to the training target.
    kaggle_final_predictions_scaled = kaggle_final_predictions_dollars * 1e6

    # NOTE(review): ID is taken from the row index of df_test; confirm this
    # matches the identifier the submission format expects.
    submission_df = pd.DataFrame({'ID': df_test.index, 'purchaseValue': kaggle_final_predictions_scaled})
    submission_df.to_csv('submission_improved.csv', index=False)
    print("Submission file 'submission_improved.csv' created successfully.")
    print(submission_df.head())

except Exception as e:
    print(f"\nAn error occurred during submission generation: {e}")

Loading and preparing data...
Creating user-level features...
User-level features created and merged.

Identified 13 numerical features.
Identified 36 categorical features.

--- Tuning Classifier with Optuna (20 trials) ---


[I 2025-07-15 13:52:39,498] A new study created in memory with name: no-name-7db65dbb-8cfd-4337-90aa-a4d7e4e7b52b
[I 2025-07-15 13:52:58,948] Trial 0 finished with value: 0.9997992123761983 and parameters: {'n_estimators': 1177, 'max_depth': 8, 'learning_rate': 0.06364487553493839, 'subsample': 0.8581157869329789, 'colsample_bytree': 0.7352708468974155, 'gamma': 0.11854878045838543}. Best is trial 0 with value: 0.9997992123761983.
[I 2025-07-15 13:53:13,591] Trial 1 finished with value: 0.9998096543775419 and parameters: {'n_estimators': 746, 'max_depth': 5, 'learning_rate': 0.018393932913427637, 'subsample': 0.9630656697122516, 'colsample_bytree': 0.894917280636048, 'gamma': 0.019787482458871124}. Best is trial 1 with value: 0.9998096543775419.
[I 2025-07-15 13:53:26,545] Trial 2 finished with value: 0.9998090919023246 and parameters: {'n_estimators': 555, 'max_depth': 5, 'learning_rate': 0.04524996394918677, 'subsample': 0.6985366176149823, 'colsample_bytree': 0.9052815949365604, 'ga


--- Tuning Regressor with Optuna (20 trials) ---


[I 2025-07-15 13:59:41,306] Trial 0 finished with value: 0.32131670355637076 and parameters: {'n_estimators': 1200, 'max_depth': 10, 'learning_rate': 0.03008227921110725, 'subsample': 0.9960032474756653, 'colsample_bytree': 0.7314583931481613}. Best is trial 0 with value: 0.32131670355637076.
[I 2025-07-15 13:59:47,525] Trial 1 finished with value: 0.311307064881424 and parameters: {'n_estimators': 502, 'max_depth': 7, 'learning_rate': 0.021206995531355397, 'subsample': 0.8613872518251569, 'colsample_bytree': 0.9826650187444967}. Best is trial 1 with value: 0.311307064881424.
[I 2025-07-15 13:59:57,660] Trial 2 finished with value: 0.3248924072685753 and parameters: {'n_estimators': 432, 'max_depth': 10, 'learning_rate': 0.06240998578832945, 'subsample': 0.7964398185228835, 'colsample_bytree': 0.6186295110290995}. Best is trial 1 with value: 0.311307064881424.
[I 2025-07-15 14:00:15,723] Trial 3 finished with value: 0.32574040353289463 and parameters: {'n_estimators': 1024, 'max_depth'


--- Training Final Models with Best Parameters ---
Final Classifier trained.
Final Regressor trained.

--- Generating Final Kaggle Submission ---
Submission file 'submission_improved.csv' created successfully.
   ID  purchaseValue
0   0   3.247984e+07
1   1   1.561607e+03
2   2   4.326103e+02
3   3   2.941875e+05
4   4   9.960168e+02
