In [None]:

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import optuna
import warnings

# Suppress every FutureWarning and UserWarning for the rest of the session.
# NOTE(review): these filters are global, so they also hide warnings raised
# by pandas / scikit-learn / xgboost that may point at real problems.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

# --- Define File Paths ---
# Absolute paths to the train/test CSV files read by the data-preparation step.
# NOTE(review): these are machine-specific locations; adjust before running elsewhere.
TRAIN_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/train_data.csv'
TEST_FILE_PATH = '/Users/shrinarayan/Desktop/Prediction-PurchaseValues/dataset/test_data.csv'


class AdvancedFeatureEngineering(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives calendar, interaction and ratio features.

    Nothing is learned in ``fit``; ``transform`` works on a copy of the input
    frame, adds the derived columns and drops the raw columns they replace.
    """

    def fit(self, X, y=None):
        """Return ``self`` unchanged; this transformer has no fitted state."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with engineered features added.

        Reads the columns ``date`` (parsed with format ``%Y%m%d``),
        ``sessionStart`` (parsed as epoch seconds, no timezone conversion),
        ``browser``, ``os``, ``geoNetwork.continent``, ``userChannel``,
        ``deviceType``, ``totalHits``, ``pageViews`` and
        ``trafficSource.adwordsClickInfo.page``; a missing one raises
        ``KeyError``.

        The columns ``date``, ``sessionStart``, ``sessionId`` and
        ``trafficSource.adwordsClickInfo.page`` are dropped from the result
        when present.
        """
        X_copy = X.copy()

        # Calendar features.
        X_copy['date'] = pd.to_datetime(X_copy['date'], format='%Y%m%d')
        X_copy['sessionYear'] = X_copy['date'].dt.year
        X_copy['sessionMonth'] = X_copy['date'].dt.month
        X_copy['sessionDayOfWeek'] = X_copy['date'].dt.dayofweek
        X_copy['sessionHour'] = pd.to_datetime(X_copy['sessionStart'], unit='s').dt.hour
        # dayofweek is 0 (Monday) .. 6 (Sunday), so 5 and 6 are the weekend.
        X_copy['is_weekend'] = (X_copy['sessionDayOfWeek'] >= 5).astype(int)

        # String-concatenated categorical interactions.
        X_copy['month_day_interaction'] = X_copy['sessionMonth'].astype(str) + '_' + X_copy['sessionDayOfWeek'].astype(str)
        X_copy['browser_os_interaction'] = X_copy['browser'].astype(str) + '_' + X_copy['os'].astype(str)
        X_copy['geo_channel_interaction'] = X_copy['geoNetwork.continent'].astype(str) + '_' + X_copy['userChannel'].astype(str)
        X_copy['device_channel_interaction'] = X_copy['deviceType'].astype(str) + '_' + X_copy['userChannel'].astype(str)

        # The small epsilon keeps the ratio finite when pageViews is 0.
        X_copy['hits_per_pageview'] = X_copy['totalHits'] / (X_copy['pageViews'] + 1e-6)

        # Bin the ad page: 1 -> page exactly 1.0, 2 -> any other non-null page,
        # 0 -> missing. Vectorized instead of a per-row Python lambda.
        ad_page = X_copy['trafficSource.adwordsClickInfo.page']
        is_first_page = (ad_page == 1.0).to_numpy()
        has_page = ad_page.notna().to_numpy()
        X_copy['ad_page_binned'] = np.where(is_first_page, 1, np.where(has_page, 2, 0))

        cols_to_drop = ['date', 'sessionStart', 'sessionId', 'trafficSource.adwordsClickInfo.page']
        return X_copy.drop(columns=cols_to_drop, errors='ignore')

class TargetEncoder(BaseEstimator, TransformerMixin):
    """Smoothed mean-target encoder.

    For every encoded column, each category is replaced by

        (count * category_mean + smoothing * global_mean) / (count + smoothing)

    computed on the data passed to ``fit``. Missing values are treated as the
    category ``'missing'``; categories not seen during ``fit`` are encoded as
    the global target mean.

    Parameters
    ----------
    columns : sequence of column labels or None
        Columns to encode. ``None`` encodes every column of the frame.
    smoothing : number
        Weight of the global mean in the smoothed estimate.

    Attributes (set by ``fit``)
    ---------------------------
    mappings_ : dict mapping column -> {category: smoothed mean}
    global_mean_ : mean of the target seen during ``fit``
    """

    def __init__(self, columns=None, smoothing=10):
        # Only hyper-parameters are stored here; learned state is created in
        # fit() so that every fit starts from a clean slate.
        self.columns = columns
        self.smoothing = smoothing

    def _encoded_columns(self, X):
        """Return the list of columns to encode for the frame ``X``."""
        return list(X.columns) if self.columns is None else list(self.columns)

    def fit(self, X, y):
        """Learn the smoothed per-category target means from ``X`` and ``y``."""
        # Align the target to X by position so plain arrays work as well as
        # pandas Series.
        y_fit = pd.Series(np.asarray(y), index=X.index)
        self.global_mean_ = y_fit.mean()
        self.mappings_ = {}
        for col in self._encoded_columns(X):
            keys = X[col].fillna('missing')
            agg = y_fit.groupby(keys).agg(['mean', 'count'])
            smooth_mean = (
                agg['count'] * agg['mean'] + self.smoothing * self.global_mean_
            ) / (agg['count'] + self.smoothing)
            self.mappings_[col] = smooth_mean.to_dict()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the encoded columns replaced by their learned means."""
        X_transform = X.copy()
        for col in self._encoded_columns(X):
            keys = X_transform[col].fillna('missing')
            # Unseen categories map to NaN and fall back to the global mean.
            X_transform[col] = keys.map(self.mappings_[col]).fillna(self.global_mean_)
        return X_transform

# ==============================================================================
# PART 2: DATA PREPARATION (Unchanged)
# ==============================================================================

def create_user_level_features(df_train, df_test):
    """Build per-user aggregate features from the training data and merge them in.

    ``purchaseValue`` is filled with 0 where missing and divided by 1e6, and a
    binary ``made_purchase`` flag is derived from it. Aggregates are computed
    per ``userId`` on the training rows only and left-merged onto both
    frames, so test users that never appear in training get NaN aggregates.

    Parameters
    ----------
    df_train : DataFrame with ``userId``, ``sessionId``, ``totalHits``,
        ``pageViews`` and ``purchaseValue`` columns. Not modified.
    df_test : DataFrame with a ``userId`` column. Not modified.

    Returns
    -------
    (df_train, df_test) : new frames. The training frame carries the rescaled
        ``purchaseValue``, ``made_purchase`` and the aggregate columns; the
        test frame carries the aggregate columns.
    """
    print("Creating user-level features...")
    # Work on a copy so the caller's frame is not rescaled in place (calling
    # the function twice on the same frame would otherwise divide by 1e6 twice).
    df_train = df_train.copy()
    df_train['purchaseValue'] = df_train['purchaseValue'].fillna(0) / 1e6
    df_train['made_purchase'] = (df_train['purchaseValue'] > 0).astype(int)

    # NOTE(review): user_purchase_count, user_total_purchase_value and the two
    # ratios derived from them are computed from the target over all training
    # rows and then merged back onto those same rows. That exposes each row's
    # own target to the model (target leakage) - confirm this is intended.
    user_aggregates = df_train.groupby('userId').agg(
        user_session_count=('sessionId', 'nunique'),
        user_total_hits=('totalHits', 'sum'),
        user_avg_hits=('totalHits', 'mean'),
        user_total_pageviews=('pageViews', 'sum'),
        user_avg_pageviews=('pageViews', 'mean'),
        user_purchase_count=('made_purchase', 'sum'),
        user_total_purchase_value=('purchaseValue', 'sum'),
    ).reset_index()
    user_aggregates['user_conversion_rate'] = (
        user_aggregates['user_purchase_count'] / user_aggregates['user_session_count']
    )
    # The small epsilon avoids division by zero for users with no purchases.
    user_aggregates['user_avg_purchase_value'] = (
        user_aggregates['user_total_purchase_value']
        / (user_aggregates['user_purchase_count'] + 1e-6)
    )

    df_train = pd.merge(df_train, user_aggregates, on='userId', how='left')
    df_test = pd.merge(df_test, user_aggregates, on='userId', how='left')
    print("User-level features created and merged.")
    return df_train, df_test

# Read both CSVs, drop columns that hold a single value in the training data,
# attach the user-level aggregates and build the log-scale regression target.
print("Loading and preparing data...")
df_train, df_test = (
    pd.read_csv(csv_path, dtype={'fullVisitorId': 'str'})
    for csv_path in (TRAIN_FILE_PATH, TEST_FILE_PATH)
)

# Columns with exactly one distinct value (NaN counted as a value) carry no signal.
_distinct_counts = df_train.nunique(dropna=False)
one_value_cols = _distinct_counts.index[_distinct_counts == 1].tolist()
df_train = df_train.drop(columns=one_value_cols)
# errors='ignore' skips any of those columns the test file does not have.
df_test = df_test.drop(columns=one_value_cols, errors='ignore')

df_train, df_test = create_user_level_features(df_train, df_test)

# The model is trained on log1p(purchaseValue); the raw and helper target
# columns are removed from the feature frame.
df_train['log_purchaseValue'] = np.log1p(df_train['purchaseValue'])
y = df_train['log_purchaseValue']
X = df_train.drop(columns=['purchaseValue', 'made_purchase', 'log_purchaseValue'])

# Run the feature engineering once to learn which columns it produces.
temp_engineered_df = AdvancedFeatureEngineering().fit_transform(X)
session_level_numerical = ['sessionNumber', 'pageViews', 'totalHits', 'hits_per_pageview']
user_level_numerical = [
    'user_session_count',
    'user_total_hits',
    'user_avg_hits',
    'user_total_pageviews',
    'user_avg_pageviews',
    'user_purchase_count',
    'user_total_purchase_value',
    'user_conversion_rate',
    'user_avg_purchase_value',
]
numerical_cols = session_level_numerical + user_level_numerical

# Everything that is neither numerical nor the grouping key is treated as categorical.
_non_categorical = set(numerical_cols) | {'userId'}
categorical_cols = [col for col in temp_engineered_df.columns if col not in _non_categorical]
print(f"\nIdentified {len(numerical_cols)} numerical features.")
print(f"Identified {len(categorical_cols)} categorical features.")

# ==============================================================================
# PART 3: HYPERPARAMETER TUNING FOR SINGLE REGRESSOR
# ==============================================================================

def objective_reg(trial):
    """Optuna objective: mean grouped-CV RMSE of an XGBoost regressor.

    Hyperparameters are sampled from ``trial``. The module-level
    ``X_engineered`` and ``y`` are split with a 4-fold ``GroupKFold`` keyed on
    ``userId``, so all rows of a user fall on the same side of a split. The
    preprocessor is fitted on the training part of each fold only.

    Returns the mean of the per-fold RMSE values.
    """
    # Sampled in the same order on every trial.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 400, 1200),  # 2000
        'max_depth': trial.suggest_int('max_depth', 4, 10),  # 16
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),  # 0.0125
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),  # 0.85
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),  # 0.85
    }
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'random_state': 42,
        'n_jobs': -1,
        **sampled,
    }

    # Tuning uses every training row, not only the sessions with a purchase.
    splitter = GroupKFold(n_splits=4)
    user_ids = X_engineered['userId']
    fold_rmse = []

    for fit_rows, holdout_rows in splitter.split(X_engineered, y, user_ids):
        X_fit, y_fit = X_engineered.iloc[fit_rows], y.iloc[fit_rows]
        X_holdout, y_holdout = X_engineered.iloc[holdout_rows], y.iloc[holdout_rows]

        numeric_branch = Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ])
        preprocessor = ColumnTransformer(
            [
                ('num', numeric_branch, numerical_cols),
                ('cat', TargetEncoder(columns=categorical_cols), categorical_cols),
            ],
            remainder='drop',
        )

        preprocessor.fit(X_fit, y_fit)
        fit_matrix = preprocessor.transform(X_fit)
        holdout_matrix = preprocessor.transform(X_holdout)

        # No early stopping: every trial trains the full n_estimators.
        booster = xgb.XGBRegressor(**params)
        booster.fit(fit_matrix, y_fit)
        holdout_pred = booster.predict(holdout_matrix)
        fold_rmse.append(np.sqrt(mean_squared_error(y_holdout, holdout_pred)))

    return np.mean(fold_rmse)

# --- Run Optuna Study ---
N_TRIALS = 30  # Increased trials slightly for a more thorough search
print(f"\n--- Tuning Single Regressor with Optuna ({N_TRIALS} trials) ---")
# objective_reg reads this module-level frame, so it must exist before optimize().
X_engineered = AdvancedFeatureEngineering().fit_transform(X)
# Seed the (default) TPE sampler so repeated runs explore the same sequence of
# hyperparameters, matching the fixed random_state=42 used for the models.
study_reg = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_reg.optimize(objective_reg, n_trials=N_TRIALS)
best_reg_params = study_reg.best_params

# ==============================================================================
# PART 4: FINAL MODEL TRAINING AND SUBMISSION
# ==============================================================================

print("\n--- Training Final Single Regressor Model with Best Parameters ---")
# Assemble the end-to-end pipeline (feature engineering -> preprocessing ->
# XGBoost) with the tuned hyperparameters.
_final_numeric_branch = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
_final_preprocessor = ColumnTransformer(
    [
        ('num', _final_numeric_branch, numerical_cols),
        ('cat', TargetEncoder(columns=categorical_cols), categorical_cols),
    ],
    remainder='drop',
)
_final_regressor = xgb.XGBRegressor(random_state=42, n_jobs=-1, **best_reg_params)

final_reg_pipeline = Pipeline([
    ('engineering', AdvancedFeatureEngineering()),
    ('preprocessing', _final_preprocessor),
    ('regressor', _final_regressor),
])

# Fit on the complete training set; the target y is log_purchaseValue.
final_reg_pipeline.fit(X, y)
print("Final Single Regressor trained.")


print("\n--- Generating Final Kaggle Submission ---")
try:
    # Give the test frame exactly the training feature columns, in the same
    # order; columns the test data lacks are created and filled with NaN.
    train_cols = X.columns
    df_test = df_test.reindex(columns=train_cols)

    # The model predicts log1p(purchase value).
    kaggle_log_value_pred = final_reg_pipeline.predict(df_test)

    # Back to the original scale; the regressor can emit slightly negative
    # values, which are not valid purchase amounts, so floor them at 0.
    kaggle_value_pred = np.clip(np.expm1(kaggle_log_value_pred), 0, None)

    # Undo the /1e6 scaling applied to the training target.
    kaggle_final_predictions_scaled = kaggle_value_pred * 1e6

    # Create submission file
    submission_df = pd.DataFrame({'ID': df_test.index, 'purchaseValue': kaggle_final_predictions_scaled})
    submission_df.to_csv('submission_single_regressor.csv', index=False)
    print("Submission file 'submission_single_regressor.csv' created successfully.")
    print(submission_df.head())

except OSError as e:
    # The test CSV is not opened in this step, so a file-system error here
    # comes from writing the submission file.
    print(f"\nCould not write 'submission_single_regressor.csv': {e}")
except Exception as e:
    # Best-effort step: report the failure instead of aborting the session.
    print(f"\nAn error occurred during submission generation: {e}")

Loading and preparing data...
Creating user-level features...
User-level features created and merged.

Identified 13 numerical features.
Identified 36 categorical features.

--- Tuning Single Regressor with Optuna (30 trials) ---


[I 2025-07-16 19:48:20,707] A new study created in memory with name: no-name-87af6514-f769-4dde-bd9c-1faff056b00e
[I 2025-07-16 19:48:34,011] Trial 0 finished with value: 0.30318617901507333 and parameters: {'n_estimators': 414, 'max_depth': 8, 'learning_rate': 0.06624010103191053, 'subsample': 0.8928469657991477, 'colsample_bytree': 0.7131835896032724}. Best is trial 0 with value: 0.30318617901507333.
[I 2025-07-16 19:48:45,041] Trial 1 finished with value: 0.30672136290848784 and parameters: {'n_estimators': 640, 'max_depth': 4, 'learning_rate': 0.055980537907327654, 'subsample': 0.6421132696192603, 'colsample_bytree': 0.6909041657074161}. Best is trial 0 with value: 0.30318617901507333.
[I 2025-07-16 19:49:22,214] Trial 2 finished with value: 0.3057044515132742 and parameters: {'n_estimators': 1110, 'max_depth': 10, 'learning_rate': 0.011824912923344663, 'subsample': 0.8420187968526013, 'colsample_bytree': 0.6319182021918974}. Best is trial 0 with value: 0.30318617901507333.
[I 2025


--- Training Final Single Regressor Model with Best Parameters ---
Final Single Regressor trained.

--- Generating Final Kaggle Submission ---
Submission file 'submission_single_regressor.csv' created successfully.
   ID  purchaseValue
0   0   2.559731e+07
1   1   1.676521e+03
2   2   0.000000e+00
3   3   1.066896e+06
4   4   0.000000e+00
