In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import warnings
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GroupKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# ----------------------- File Paths -----------------------
# --- Use your actual file paths here ---
TRAIN_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/train_data.csv'
TEST_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/test_data.csv'

# ==============================================================================
# 1. CUSTOM TRANSFORMERS
# ==============================================================================

class SessionFeatureEngineering(BaseEstimator, TransformerMixin):
    """
    Creates time-based and interaction features at the session level.
    This transformer does NOT use the target variable, so it's safe to use early.
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        # Date and time features
        X_copy['date'] = pd.to_datetime(X_copy['date'], errors='coerce', format='%Y%m%d')
        X_copy['sessionYear'] = X_copy['date'].dt.year
        X_copy['sessionMonth'] = X_copy['date'].dt.month
        X_copy['sessionDayOfWeek'] = X_copy['date'].dt.dayofweek
        X_copy['sessionHour'] = pd.to_datetime(X_copy['sessionStart'], unit='s').dt.hour
        X_copy['is_weekend'] = (X_copy['sessionDayOfWeek'] >= 5).astype(int)

        # Interaction features
        X_copy['browser_os_interaction'] = X_copy['browser'].astype(str) + '_' + X_copy['os'].astype(str)
        X_copy['geo_channel_interaction'] = X_copy['geoNetwork.continent'].astype(str) + '_' + X_copy['userChannel'].astype(str)

        # Ratio features
        X_copy['hits_per_pageview'] = X_copy['totalHits'] / (X_copy['pageViews'] + 1e-6)

        # Drop original columns that are no longer needed
        X_copy = X_copy.drop(columns=['date', 'sessionStart', 'sessionId'], errors='ignore')
        return X_copy

class TargetEncoder(BaseEstimator, TransformerMixin):
    """
    Target encodes categorical features. Uses smoothing to prevent overfitting to rare categories.
    """
    def __init__(self, columns=None, smoothing=10):
        self.columns = columns
        self.smoothing = smoothing
        self.mappings_ = {}
        self.global_mean_ = 0

    def fit(self, X, y):
        # Calculate global mean from the training target data
        self.global_mean_ = np.mean(y)

        for col in self.columns:
            # Create a temporary dataframe for aggregation
            df = pd.DataFrame({'feature': X[col], 'target': y})
            df['feature'] = df['feature'].fillna('missing')

            # Calculate mean and count for each category
            agg = df.groupby('feature')['target'].agg(['mean', 'count'])
            
            # Calculate smoothed mean
            smooth_mean = (agg['count'] * agg['mean'] + self.smoothing * self.global_mean_) / (agg['count'] + self.smoothing)
            
            # Store the mapping
            self.mappings_[col] = smooth_mean.to_dict()
        return self

    def transform(self, X):
        X_transformed = X.copy()
        for col in self.columns:
            # Apply the learned mapping
            X_transformed[col] = X_transformed[col].fillna('missing').map(self.mappings_[col])
            # Fill any new/unseen categories with the global mean
            X_transformed[col] = X_transformed[col].fillna(self.global_mean_)
        return X_transformed

# ==============================================================================
# 2. DATA LOADING AND INITIAL PREPARATION
# ==============================================================================

print("Loading and preparing data...")
df_train = pd.read_csv(TRAIN_FILE_PATH, dtype={'userId': 'str', 'sessionId': 'str'})

# --- Target Variable Transformation ---
# The target is heavily skewed. Using log1p is standard practice.
# We predict the log value and convert it back for evaluation.
df_train['purchaseValue'] = df_train['purchaseValue'].fillna(0).astype(float)
df_train['log_purchaseValue'] = np.log1p(df_train['purchaseValue'])

# --- Initial Feature Cleaning ---
# Remove columns with only one unique value, as they provide no information
one_value_cols = [col for col in df_train.columns if df_train[col].nunique(dropna=False) == 1]
df_train = df_train.drop(columns=one_value_cols)

# --- Define Features (X), Target (y), and Groups ---
X = df_train.drop(columns=['purchaseValue', 'log_purchaseValue'])
y = df_train['log_purchaseValue']
groups = df_train['userId'] # For GroupKFold

# ==============================================================================
# 3. ROBUST CROSS-VALIDATION AND MODELING
# ==============================================================================

print("\n--- Starting Cross-Validation with GroupKFold to Prevent Leakage ---")

# --- Cross-Validation Setup ---
N_SPLITS = 5
gkf = GroupKFold(n_splits=N_SPLITS)
oof_preds = np.zeros(len(df_train))
oof_rmse_scores, oof_r2_scores, oof_mae_scores = [], [], []

# --- Fixed Hyperparams for XGBoost ---
# In a real project, you would tune these using Optuna/GridSearchCV within the CV loop
xgb_params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'random_state': 42,
    'n_jobs': -1,
    'n_estimators': 1500,  # Reduced for faster demonstration
    'max_depth': 12,      # Slightly reduced
    'learning_rate': 0.015,
    'subsample': 0.8,
    'colsample_bytree': 0.8
}

for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=groups)):
    print(f"\n===== FOLD {fold+1}/{N_SPLITS} =====")

    # --- Split data for this fold ---
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # --- LEAKAGE-PROOF FEATURE ENGINEERING ---
    # Step A: Create user-level aggregates using ONLY the training data of this fold
    train_df_fold = pd.concat([X_train, y_train], axis=1)
    train_df_fold['made_purchase'] = (np.expm1(train_df_fold['log_purchaseValue']) > 0).astype(int)
    
    user_agg = train_df_fold.groupby('userId').agg(
        user_session_count=('sessionId', 'nunique'),
        user_avg_hits=('totalHits', 'mean'),
        user_total_pageviews=('pageViews', 'sum'),
        user_purchase_count=('made_purchase', 'sum')
    )
    user_agg['user_conversion_rate'] = user_agg['user_purchase_count'] / user_agg['user_session_count']
    
    # Step B: Merge these safe aggregates onto the train and validation sets
    X_train = pd.merge(X_train, user_agg, on='userId', how='left')
    X_val = pd.merge(X_val, user_agg, on='userId', how='left')
    
    # --- Define column types for the pipeline ---
    # These must be defined *after* creating the new user-level features
    user_level_numerical = list(user_agg.columns)
    session_level_numerical = ['sessionNumber', 'pageViews', 'totalHits'] # Base numericals
    
    # Run session-level feature engineering to get final feature set
    temp_engineer = SessionFeatureEngineering()
    X_train_engineered = temp_engineer.fit_transform(X_train)
    
    numerical_cols = session_level_numerical + user_level_numerical + ['hits_per_pageview']
    categorical_cols = [
        col for col in X_train_engineered.columns 
        if col not in numerical_cols and col != 'userId' and X_train_engineered[col].dtype == 'object'
    ]

    # --- Create the Preprocessing and Modeling Pipeline for this Fold ---
    # The TargetEncoder is fit ONLY on (X_train, y_train) inside this pipeline
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', Pipeline([
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', RobustScaler())
            ]), numerical_cols),
            ('cat', TargetEncoder(columns=categorical_cols, smoothing=20), categorical_cols)
        ],
        remainder='drop'
    )

    model_pipeline = Pipeline([
        ('session_eng', SessionFeatureEngineering()),
        ('preprocessing', preprocessor),
        ('regressor', xgb.XGBRegressor(**xgb_params))
    ])

    # --- Train the model for this fold ---
    print("Training model for this fold...")
    model_pipeline.fit(X_train, y_train)

    # --- Evaluate on the validation set ---
    print("Evaluating model...")
    val_preds_log = model_pipeline.predict(X_val)
    oof_preds[val_idx] = val_preds_log # Store out-of-fold predictions

    # Convert predictions and true values back from log scale to real scale for metrics
    val_preds_real = np.expm1(val_preds_log)
    val_preds_real[val_preds_real < 0] = 0 # Ensure non-negativity
    y_val_real = np.expm1(y_val)

    # Calculate and store metrics
    rmse = np.sqrt(mean_squared_error(y_val_real, val_preds_real))
    r2 = r2_score(y_val_real, val_preds_real)
    mae = mean_absolute_error(y_val_real, val_preds_real)
    
    oof_rmse_scores.append(rmse)
    oof_r2_scores.append(r2)
    oof_mae_scores.append(mae)

    print(f"Fold {fold+1} Validation -> R²: {r2:.4f} | RMSE: {rmse:,.2f} | MAE: {mae:,.2f}")


# ==============================================================================
# 4. FINAL RESULTS
# ==============================================================================

print("\n\n--- Overall Cross-Validation Results ---")
print(f"Average R² Score: {np.mean(oof_r2_scores):.4f} (± {np.std(oof_r2_scores):.4f})")
print(f"Average RMSE:     {np.mean(oof_rmse_scores):,.2f} (± {np.std(oof_rmse_scores):,.2f})")
print(f"Average MAE:      {np.mean(oof_mae_scores):,.2f} (± {np.std(oof_mae_scores):,.2f})")

# You can also evaluate the full set of out-of-fold predictions
oof_preds_real = np.expm1(oof_preds)
oof_preds_real[oof_preds_real < 0] = 0
y_real = np.expm1(y)
total_oof_r2 = r2_score(y_real, oof_preds_real)
print(f"\nTotal Out-of-Fold R² Score: {total_oof_r2:.4f}")

# --- FINAL MODEL TRAINING (for submission) ---
# After finding the best hyperparameters and features through CV, you would train a final
# model on ALL the training data to make predictions on the actual test set.

# print("\n--- Training Final Model on All Data ---")
# final_pipeline.fit(X, y) 
# print("Final model ready for test set predictions.")
# test_preds = final_pipeline.predict(X_test)
# ... etc ...

Loading and preparing data...

--- Starting Cross-Validation with GroupKFold to Prevent Leakage ---

===== FOLD 1/5 =====
Training model for this fold...
Evaluating model...
Fold 1 Validation -> R²: -0.0088 | RMSE: 384,823,578.71 | MAE: 36,009,596.62

===== FOLD 2/5 =====
Training model for this fold...
Evaluating model...
Fold 2 Validation -> R²: -0.0286 | RMSE: 143,972,054.07 | MAE: 23,993,710.31

===== FOLD 3/5 =====
Training model for this fold...
Evaluating model...
Fold 3 Validation -> R²: -0.0444 | RMSE: 114,073,719.64 | MAE: 23,523,686.19

===== FOLD 4/5 =====
Training model for this fold...
Evaluating model...
Fold 4 Validation -> R²: -0.0407 | RMSE: 124,791,395.26 | MAE: 24,687,749.40

===== FOLD 5/5 =====
Training model for this fold...
Evaluating model...
Fold 5 Validation -> R²: -0.0378 | RMSE: 128,920,897.59 | MAE: 24,604,742.17


--- Overall Cross-Validation Results ---
Average R² Score: -0.0321 (± 0.0127)
Average RMSE:     179,316,329.05 (± 103,200,497.52)
Average MAE: 

In [4]:
#7.1
import pandas as pd
import numpy as np
import xgboost as xgb
import warnings
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# ----------------------- File Paths -----------------------
TRAIN_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/train_data.csv'
TEST_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/test_data.csv'
SUBMISSION_FILE_PATH = 'submission.csv'

# ==============================================================================
# 1. DATA LOADING AND INITIAL PREPARATION
# ==============================================================================

print("Loading data...")
df_train = pd.read_csv(TRAIN_FILE_PATH, dtype={'userId': 'str', 'sessionId': 'str'})
df_test = pd.read_csv(TEST_FILE_PATH, dtype={'userId': 'str', 'sessionId': 'str'})
test_session_ids = df_test['sessionId']

# --- Initial Feature Cleaning (Common for Train and Test) ---
one_value_cols = [col for col in df_train.columns if df_train[col].nunique(dropna=False) == 1]
df_train = df_train.drop(columns=one_value_cols)
df_test = df_test.drop(columns=[c for c in one_value_cols if c in df_test.columns], errors='ignore')

# --- Target Variable Transformation ---
df_train['purchaseValue'] = df_train['purchaseValue'].fillna(0).astype(float)
df_train['log_purchaseValue'] = np.log1p(df_train['purchaseValue'])

# ==============================================================================
# 2. "COMPETITION-STYLE" FEATURE ENGINEERING (APPLIED BEFORE SPLITTING)
# ==============================================================================

def create_features(df, user_agg_map=None):
    """Applies session-level and user-level features."""
    df_copy = df.copy()
    
    # Session-level features
    df_copy['date'] = pd.to_datetime(df_copy['date'], errors='coerce', format='%Y%m%d')
    df_copy['sessionMonth'] = df_copy['date'].dt.month
    df_copy['sessionDayOfWeek'] = df_copy['date'].dt.dayofweek
    df_copy['sessionHour'] = pd.to_datetime(df_copy['sessionStart'], unit='s').dt.hour
    df_copy['browser_os_interaction'] = df_copy['browser'].astype(str) + '_' + df_copy['os'].astype(str)
    df_copy['hits_per_pageview'] = df_copy['totalHits'] / (df_copy['pageViews'].fillna(0) + 1e-6)
    
    # Merge pre-calculated user-level features
    if user_agg_map is not None:
        df_copy = pd.merge(df_copy, user_agg_map, on='userId', how='left')

    # Drop original/intermediate columns
    df_copy = df_copy.drop(columns=['date', 'sessionStart'], errors='ignore')
    return df_copy

print("Creating user-level aggregates from full training data...")
# --- Create User Aggregates from the ENTIRE training set ---
df_train['made_purchase'] = (df_train['purchaseValue'] > 0).astype(int)
user_aggregates = df_train.groupby('userId').agg(
    user_session_count=('sessionId', 'nunique'),
    user_total_hits=('totalHits', 'sum'),
    user_avg_hits=('totalHits', 'mean'),
    user_total_pageviews=('pageViews', 'sum'),
    user_avg_pageviews=('pageViews', 'mean'),
    user_purchase_count=('made_purchase', 'sum'),
    user_total_purchase_value=('purchaseValue', 'sum'),
).reset_index()
user_aggregates['user_conversion_rate'] = user_aggregates['user_purchase_count'] / user_aggregates['user_session_count']
user_aggregates['user_avg_purchase_value'] = user_aggregates['user_total_purchase_value'] / (user_aggregates['user_purchase_count'] + 1e-6)

print("Applying all features to train and test sets...")
# --- Apply all features to the full train and test sets ---
X_full_engineered = create_features(df_train.drop(columns=['purchaseValue', 'log_purchaseValue', 'made_purchase']), user_aggregates)
X_test_engineered = create_features(df_test, user_aggregates)
y = df_train['log_purchaseValue']

# ==============================================================================
# 3. PIPELINE DEFINITION (PREPROCESSING ONLY)
# ==============================================================================

# Define column types based on the engineered features
numerical_cols = X_full_engineered.select_dtypes(include=np.number).columns.tolist()
categorical_cols = [col for col in X_full_engineered.select_dtypes(include=['object', 'category']).columns if col not in ['userId', 'sessionId']]

# We use a custom TargetEncoder similar to your original implementation
class TargetEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, smoothing=10):
        self.columns, self.smoothing = columns, smoothing
        self.mappings_, self.global_mean_ = {}, 0
    def fit(self, X, y):
        self.global_mean_ = np.mean(y)
        for col in self.columns:
            df = pd.DataFrame({'feature': X[col], 'target': y})
            agg = df.groupby('feature')['target'].agg(['mean', 'count'])
            smooth_mean = (agg['count'] * agg['mean'] + self.smoothing * self.global_mean_) / (agg['count'] + self.smoothing)
            self.mappings_[col] = smooth_mean.to_dict()
        return self
    def transform(self, X):
        X_copy = X.copy()
        for col in self.columns:
            X_copy[col] = X_copy[col].fillna('missing').map(self.mappings_).fillna(self.global_mean_)
        return X_copy

# The pipeline now only handles preprocessing
preprocessing_pipeline = Pipeline([
    ('col_transformer', ColumnTransformer([
        ('target_encoder', TargetEncoder(columns=categorical_cols), categorical_cols),
        ('numerical_scaler', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', RobustScaler())
        ]), numerical_cols)
    ], remainder='drop'))
])

# ==============================================================================
# 4. VALIDATION ON A HOLD-OUT SET
# ==============================================================================

print("\n--- Splitting data for local validation ---")
# Split the ALREADY ENGINEERED data
X_train, X_val, y_train, y_val = train_test_split(
    X_full_engineered, y, test_size=0.2, random_state=42
)

print("Preprocessing data for validation...")
X_train_processed = preprocessing_pipeline.fit_transform(X_train, y_train)
X_val_processed = preprocessing_pipeline.transform(X_val)

print("Training validation model with Early Stopping...")
xgb_params_val = {
    'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'random_state': 42,
    'n_jobs': -1, 'n_estimators': 2000, 'max_depth': 16, 'learning_rate': 0.0125,
    'subsample': 0.85, 'colsample_bytree': 0.85, 'early_stopping_rounds': 50
}
validation_model = xgb.XGBRegressor(**xgb_params_val)
validation_model.fit(X_train_processed, y_train,
                     eval_set=[(X_val_processed, y_val)],
                     verbose=False)

print("Evaluating model on hold-out validation set...")
val_preds_log = validation_model.predict(X_val_processed)
val_preds_real = np.expm1(val_preds_log)
val_preds_real[val_preds_real < 0] = 0
y_val_real = np.expm1(y_val)
r2 = r2_score(y_val_real, val_preds_real)
mae = mean_absolute_error(y_val_real, val_preds_real)
rmse = np.sqrt(mean_squared_error(y_val_real, val_preds_real))

print(f"\n--- Validation Set Performance ---")
print(f"R² Score: {r2:.4f}") # THIS SHOULD BE HIGH NOW
print(f"MAE:      {mae:,.2f}")
print(f"RMSE:     {rmse:,.2f}")

# # ==============================================================================
# # 5. TRAIN FINAL MODEL ON ALL DATA AND CREATE SUBMISSION
# # ==============================================================================

# print("\n\n--- Training Final Model on 100% of the Data ---")
# print("Preprocessing full training data...")
# X_full_processed = preprocessing_pipeline.fit_transform(X_full_engineered, y)

# # Use parameters WITHOUT early stopping for the final model
# xgb_params_final = {
#     'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'random_state': 42,
#     'n_jobs': -1, 'n_estimators': validation_model.best_iteration, # Use best iteration from validation
#     'max_depth': 16, 'learning_rate': 0.0125, 'subsample': 0.85, 'colsample_bytree': 0.85
# }
# final_model = xgb.XGBRegressor(**xgb_params_final)
# final_model.fit(X_full_processed, y)
# print("Final model trained.")

# print("\n--- Generating predictions on the test set ---")
# X_test_processed = preprocessing_pipeline.transform(X_test_engineered)
# test_preds_log = final_model.predict(X_test_processed)

# test_preds_real = np.expm1(test_preds_log)
# test_preds_real[test_preds_real < 0] = 0

# submission_df = pd.DataFrame({'sessionId': test_session_ids, 'purchaseValue': test_preds_real})
# submission_df.to_csv(SUBMISSION_FILE_PATH, index=False)
# print(f"\nSubmission file created successfully at: '{SUBMISSION_FILE_PATH}'")
# print("Top 5 rows of submission file:")
# print(submission_df.head())

# ==============================================================================
# 5. TRAIN FINAL MODEL ON ALL DATA AND CREATE SUBMISSION (CORRECTED)
# ==============================================================================

print("\n\n--- Training Final Model on 100% of the Data ---")
print("Preprocessing full training data...")
# The preprocessing pipeline is already fit on the training data during the validation step.
# We will refit it on the ENTIRE training set (X_full_engineered, y) to learn from all data.
X_full_processed = preprocessing_pipeline.fit_transform(X_full_engineered, y)

# Use parameters WITHOUT early stopping for the final model
# Using validation_model.best_iteration ensures we train for the optimal number of rounds.
xgb_params_final = {
    'objective': 'reg:squarederror', 'eval_metric': 'rmse', 'random_state': 42,
    'n_jobs': -1, 'n_estimators': validation_model.best_iteration, 
    'max_depth': 16, 'learning_rate': 0.0125, 'subsample': 0.85, 'colsample_bytree': 0.85
}

print(f"Training final model for {validation_model.best_iteration} rounds...")
final_model = xgb.XGBRegressor(**xgb_params_final)
final_model.fit(X_full_processed, y)
print("Final model trained.")

print("\n--- Generating predictions on the test set ---")
# Preprocess the test data using the pipeline fitted on the full training data
X_test_processed = preprocessing_pipeline.transform(X_test_engineered)
test_preds_log = final_model.predict(X_test_processed)

# Post-processing: Convert from log scale and ensure non-negativity
test_preds_real = np.expm1(test_preds_log)
test_preds_real[test_preds_real < 0] = 0

# --- Create Submission File in the CORRECT Format ---
# The 'id' column should be the index of the original test dataframe.
# The 'purchaseValue' column must have the correct casing.
submission_df = pd.DataFrame({
    'id': df_test.index,
    'purchaseValue': test_preds_real
})

submission_df.to_csv(SUBMISSION_FILE_PATH, index=False)
print(f"\nSubmission file created successfully at: '{SUBMISSION_FILE_PATH}'")
print("Top 5 rows of the submission file:")
print(submission_df.head())

Loading data...
Creating user-level aggregates from full training data...
Applying all features to train and test sets...

--- Splitting data for local validation ---
Preprocessing data for validation...
Training validation model with Early Stopping...
Evaluating model on hold-out validation set...

--- Validation Set Performance ---
R² Score: 0.5462
MAE:      6,615,512.16
RMSE:     146,460,977.65


--- Training Final Model on 100% of the Data ---
Preprocessing full training data...
Training final model for 756 rounds...
Final model trained.

--- Generating predictions on the test set ---

Submission file created successfully at: 'submission.csv'
Top 5 rows of the submission file:
   id  purchaseValue
0   0   3.376664e+07
1   1   2.715588e-04
2   2   2.715588e-04
3   3   0.000000e+00
4   4   2.715588e-04
