In [12]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GroupKFold
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

In [13]:
# ----------------------- File Paths -----------------------
TRAIN_FILE_PATH ='/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/train_data.csv'
TEST_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/test_data.csv'

In [14]:
# ------------------ Feature Engineering -------------------
class AdvancedFeatureEngineering(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): return self
    def transform(self, X):
        X_copy = X.copy()
        X_copy['date'] = pd.to_datetime(X_copy['date'], errors='coerce', format='%Y%m%d')
        X_copy['sessionYear'] = X_copy['date'].dt.year
        X_copy['sessionMonth'] = X_copy['date'].dt.month
        X_copy['sessionDayOfWeek'] = X_copy['date'].dt.dayofweek
        X_copy['sessionHour'] = pd.to_datetime(X_copy['sessionStart'], unit='s').dt.hour
        X_copy['is_weekend'] = (X_copy['sessionDayOfWeek'] >= 5).astype(int)
        X_copy['month_day_interaction'] = X_copy['sessionMonth'].astype(str) + '_' + X_copy['sessionDayOfWeek'].astype(str)
        X_copy['browser_os_interaction'] = X_copy['browser'].astype(str) + '_' + X_copy['os'].astype(str)
        X_copy['geo_channel_interaction'] = X_copy['geoNetwork.continent'].astype(str) + '_' + X_copy['userChannel'].astype(str)
        X_copy['device_channel_interaction'] = X_copy['deviceType'].astype(str) + '_' + X_copy['userChannel'].astype(str)
        X_copy['hits_per_pageview'] = X_copy['totalHits'] / (X_copy['pageViews'] + 1e-6)
        X_copy['ad_page_binned'] = X_copy['trafficSource.adwordsClickInfo.page'].apply(lambda p: 1 if p == 1.0 else (2 if pd.notna(p) else 0))
        X_copy = X_copy.drop(columns=['date', 'sessionStart', 'sessionId', 'trafficSource.adwordsClickInfo.page'], errors='ignore')
        return X_copy

In [15]:
class TargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, smoothing=10):
        self.columns, self.smoothing = columns, smoothing
        self.mappings_, self.global_mean_ = {}, 0
    def fit(self, X, y):
        self.global_mean_ = np.mean(y)
        for col in self.columns:
            X[col] = X[col].fillna('missing')
            agg = y.groupby(X[col]).agg(['mean', 'count'])
            smooth_mean = (agg['count'] * agg['mean'] + self.smoothing * self.global_mean_) / (agg['count'] + self.smoothing)
            self.mappings_[col] = smooth_mean.to_dict()
        return self
    def transform(self, X):
        for col in self.columns:
            X[col] = X[col].fillna('missing').map(self.mappings_[col]).fillna(self.global_mean_)
        return X

In [16]:
# ------------------- User-Level Features ------------------

def create_user_level_features(df_train, df_test):
    print("Creating user-level features...")
    df_train['purchaseValue'] = df_train['purchaseValue'].fillna(0) / 1e6
    df_train['made_purchase'] = (df_train['purchaseValue'] > 0).astype(int)
    combined_df = pd.concat([df_train.drop(['purchaseValue', 'made_purchase'], axis=1), df_test], axis=0)
    user_agg = df_train.groupby('userId').agg(
        user_session_count=('sessionId', 'nunique'),
        user_total_hits=('totalHits', 'sum'),
        user_avg_hits=('totalHits', 'mean'),
        user_total_pageviews=('pageViews', 'sum'),
        user_avg_pageviews=('pageViews', 'mean'),
        user_purchase_count=('made_purchase', 'sum'),
        user_total_purchase_value=('purchaseValue', 'sum'),
    ).reset_index()
    user_agg['user_conversion_rate'] = user_agg['user_purchase_count'] / user_agg['user_session_count']
    user_agg['user_avg_purchase_value'] = user_agg['user_total_purchase_value'] / (user_agg['user_purchase_count'] + 1e-6)
    df_train = pd.merge(df_train, user_agg, on='userId', how='left')
    df_test = pd.merge(df_test, user_agg, on='userId', how='left')
    return df_train, df_test

In [17]:
# ------------------------ Load Data ------------------------

print("Loading and preparing data...")
df_train = pd.read_csv(TRAIN_FILE_PATH, dtype={'fullVisitorId': 'str'})
df_test = pd.read_csv(TEST_FILE_PATH, dtype={'fullVisitorId': 'str'})
one_value_cols = [col for col in df_train.columns if df_train[col].nunique(dropna=False) == 1]
df_train = df_train.drop(columns=one_value_cols)
df_test = df_test.drop(columns=[c for c in one_value_cols if c in df_test.columns], errors='ignore')
df_train, df_test = create_user_level_features(df_train, df_test)
df_train['log_purchaseValue'] = np.log1p(df_train['purchaseValue'])
X = df_train.drop(columns=['purchaseValue', 'made_purchase', 'log_purchaseValue'])
y = df_train['log_purchaseValue']

Loading and preparing data...
Creating user-level features...


In [18]:
# --------------------- Identify Columns --------------------

X_engineered = AdvancedFeatureEngineering().fit_transform(X)
user_level_numerical = [
    'user_session_count', 'user_total_hits', 'user_avg_hits',
    'user_total_pageviews', 'user_avg_pageviews', 'user_purchase_count',
    'user_total_purchase_value', 'user_conversion_rate', 'user_avg_purchase_value'
]
session_level_numerical = ['sessionNumber', 'pageViews', 'totalHits', 'hits_per_pageview']
numerical_cols = session_level_numerical + user_level_numerical
categorical_cols = [col for col in X_engineered.columns if col not in numerical_cols and col != 'userId']

In [19]:
# -------------------- Fixed Hyperparams --------------------

xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'random_state': 42,
    'n_jobs': -1,
    'n_estimators': 2000,
    'max_depth': 16,
    'learning_rate': 0.0125,
    'subsample': 0.85,
    'colsample_bytree': 0.85
}

In [20]:
# ------------------- Train Final Model ---------------------

print("\n--- Training Final XGBoost Regressor ---")
final_pipeline = Pipeline([
    ('engineering', AdvancedFeatureEngineering()),
    ('preprocessing', ColumnTransformer([
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', RobustScaler())
        ]), numerical_cols),
        ('cat', TargetEncoder(columns=categorical_cols), categorical_cols)
    ], remainder='drop')),
    ('regressor', xgb.XGBRegressor(**xgb_params))
])


--- Training Final XGBoost Regressor ---


In [21]:
final_pipeline.fit(X, y)
print("Final model trained.")

Final model trained.


In [22]:
from sklearn.metrics import r2_score, mean_absolute_error

# Predict on training data
train_preds_log = final_pipeline.predict(X)
train_preds_real = np.expm1(train_preds_log)
y_real = np.expm1(y)

# Evaluation Metrics
r2 = r2_score(y_real, train_preds_real)
mae = mean_absolute_error(y_real, train_preds_real)

print(f"\n--- Model Evaluation on Training Set ---")
print(f"R² Score: {r2:.4f}")
print(f"MAE: {mae:,.2f}")


--- Model Evaluation on Training Set ---
R² Score: 1.0000
MAE: 0.02
